summaryrefslogtreecommitdiff
path: root/src/third_party
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party')
-rw-r--r--src/third_party/wiredtiger/.gitignore0
-rw-r--r--src/third_party/wiredtiger/.hgignore24
-rw-r--r--src/third_party/wiredtiger/.hgtags24
-rw-r--r--src/third_party/wiredtiger/LICENSE17
-rw-r--r--src/third_party/wiredtiger/NEWS1785
-rw-r--r--src/third_party/wiredtiger/README21
-rw-r--r--src/third_party/wiredtiger/RELEASE_INFO10
-rw-r--r--src/third_party/wiredtiger/SConscript105
-rw-r--r--src/third_party/wiredtiger/SConstruct282
-rw-r--r--src/third_party/wiredtiger/api/leveldb/Makefile.am81
-rw-r--r--src/third_party/wiredtiger/api/leveldb/basho/perf_count.cc657
-rw-r--r--src/third_party/wiredtiger/api/leveldb/basho/perf_count.h298
-rw-r--r--src/third_party/wiredtiger/api/leveldb/config.hin22
-rw-r--r--src/third_party/wiredtiger/api/leveldb/dummy.cc28
-rw-r--r--src/third_party/wiredtiger/api/leveldb/hyper_wt.cc415
-rw-r--r--src/third_party/wiredtiger/api/leveldb/hyperleveldb/AUTHORS15
-rw-r--r--src/third_party/wiredtiger/api/leveldb/hyperleveldb/LICENSE28
-rw-r--r--src/third_party/wiredtiger/api/leveldb/hyperleveldb/replay_iterator.h67
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/AUTHORS8
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/LICENSE27
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/db/dbformat.h233
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/db/skiplist.h379
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch.cc110
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch_internal.h53
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/cache.h110
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/comparator.h74
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/db.h350
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/env.h349
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/filter_policy.h78
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/iterator.h105
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/options.h258
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/slice.h127
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/status.h111
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/write_batch.h142
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/port/port.h38
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/arena.h68
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.cc163
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.h311
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/comparator.cc80
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/env.cc96
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/env_posix.cc625
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.cc80
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.h47
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/options.cc26
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/posix_logger.h98
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/random.h72
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb/util/status.cc74
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb_test.cc141
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb_wt.cc810
-rw-r--r--src/third_party/wiredtiger/api/leveldb/leveldb_wt.h460
-rw-r--r--src/third_party/wiredtiger/api/leveldb/rocks_wt.cc315
-rw-r--r--src/third_party/wiredtiger/api/leveldb/rocksdb/LICENSE35
-rw-r--r--src/third_party/wiredtiger/api/leveldb/rocksdb/PATENTS23
-rw-r--r--src/third_party/wiredtiger/api/leveldb/rocksdb/write_batch.cc275
-rwxr-xr-xsrc/third_party/wiredtiger/autogen.sh4
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/Makefile.am16
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/README3
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/config.c736
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/config_opt.h38
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/doxy.c110
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/misc.c113
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/evict-btree.wtperf8
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm.wtperf9
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/fruit-lsm.wtperf22
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/fruit-short.wtperf20
-rwxr-xr-xsrc/third_party/wiredtiger/bench/wtperf/runners/get_ckpt.py42
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/insert-rmw.wtperf9
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/large-lsm.wtperf11
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/long-txn-btree.wtperf9
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/long-txn-lsm.wtperf9
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/medium-btree.wtperf8
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm-async.wtperf9
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm-compact.wtperf9
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm.wtperf8
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm-noprefix.wtperf10
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm.wtperf10
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-long.wtperf16
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/multi-btree.wtperf17
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/parallel-pop-btree.wtperf9
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/parallel-pop-lsm.wtperf9
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/shared-cache-stress.wtperf12
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/small-btree.wtperf8
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/small-lsm.wtperf8
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test1-1b-lsm.wtperf18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test1-2b-lsm.wtperf18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test1-500m-lsm.wtperf18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test1-50m-lsm.wtperf17
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test2-1b-lsm.wtperf18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test2-2b-lsm.wtperf18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test2-500m-lsm.wtperf18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test2-50m-lsm.wtperf17
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test3-1b-lsm.wtperf20
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test3-2b-lsm.wtperf20
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test3-500m-lsm.wtperf20
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test3-50m-lsm.wtperf19
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test4-1b-lsm.wtperf18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test4-2b-lsm.wtperf18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test4-500m-lsm.wtperf18
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/test4-50m-lsm.wtperf17
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/update-btree.wtperf8
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/update-checkpoint-btree.wtperf12
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/update-checkpoint-lsm.wtperf12
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/update-large-lsm.wtperf9
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/update-lsm.wtperf8
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/voxer-10k-short.wtperf19
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/voxer-10k.wtperf20
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/voxer-130k-short.wtperf19
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/voxer-130k.wtperf20
-rwxr-xr-xsrc/third_party/wiredtiger/bench/wtperf/runners/wtperf_ckpt.sh136
-rwxr-xr-xsrc/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh168
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh204
-rwxr-xr-xsrc/third_party/wiredtiger/bench/wtperf/smoke.sh4
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/track.c324
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/wtperf.c2298
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/wtperf.h247
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i172
-rw-r--r--src/third_party/wiredtiger/build_darwin/wiredtiger_config.h151
-rw-r--r--src/third_party/wiredtiger/build_freebsd/wiredtiger_config.h151
-rw-r--r--src/third_party/wiredtiger/build_linux/wiredtiger_config.h151
-rw-r--r--src/third_party/wiredtiger/build_posix/Make.base81
-rw-r--r--src/third_party/wiredtiger/build_posix/Make.subdirs28
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_check_class.m4144
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_check_junit.m472
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_java_options.m448
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_jni_include_dir.m4128
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_pkg_swig.m4135
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_prog_jar.m452
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_prog_java.m4115
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_prog_java_works.m4134
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_prog_javac.m479
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_prog_javac_works.m472
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/ax_try_compile_java.m455
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/cond-if.m414
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/options.m4228
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/types.m447
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/version-set.m414
-rw-r--r--src/third_party/wiredtiger/build_posix/aclocal/version.m42
-rw-r--r--src/third_party/wiredtiger/build_posix/configure.ac.in179
-rwxr-xr-xsrc/third_party/wiredtiger/build_posix/makemake35
-rwxr-xr-xsrc/third_party/wiredtiger/build_posix/reconf79
-rw-r--r--src/third_party/wiredtiger/build_posix/wiredtiger.pc.in11
-rw-r--r--src/third_party/wiredtiger/build_solaris/wiredtiger_config.h146
-rw-r--r--src/third_party/wiredtiger/build_win/wiredtiger_config.h151
-rw-r--r--src/third_party/wiredtiger/dist/api_config.py332
-rw-r--r--src/third_party/wiredtiger/dist/api_data.py863
-rw-r--r--src/third_party/wiredtiger/dist/api_err.py111
-rw-r--r--src/third_party/wiredtiger/dist/db.py24
-rw-r--r--src/third_party/wiredtiger/dist/dist.py35
-rw-r--r--src/third_party/wiredtiger/dist/extlist9
-rw-r--r--src/third_party/wiredtiger/dist/filelist166
-rw-r--r--src/third_party/wiredtiger/dist/filelist.win167
-rw-r--r--src/third_party/wiredtiger/dist/flags.py183
-rw-r--r--src/third_party/wiredtiger/dist/java_doc.py44
-rw-r--r--src/third_party/wiredtiger/dist/log.py263
-rw-r--r--src/third_party/wiredtiger/dist/log_data.py63
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/README.Debian8
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/README.source9
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/changelog5
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/compat1
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/control36
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/copyright26
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/docs2
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/files3
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.dirs2
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.install2
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.substvars1
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/libwiredtiger.dirs1
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/libwiredtiger.install2
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/libwiredtiger.postinst.debhelper5
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/libwiredtiger.postrm.debhelper5
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/libwiredtiger.substvars2
-rwxr-xr-xsrc/third_party/wiredtiger/dist/package/debian/rules13
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/shlibs.local1
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/source/format1
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/watch8
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.dirs1
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.install1
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.substvars2
-rw-r--r--src/third_party/wiredtiger/dist/package/debian/wiredtiger.doc-base12
-rw-r--r--src/third_party/wiredtiger/dist/package/wiredtiger.spec58
-rw-r--r--src/third_party/wiredtiger/dist/s_all84
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_copyright101
-rw-r--r--src/third_party/wiredtiger/dist/s_copyright.list41
-rw-r--r--src/third_party/wiredtiger/dist/s_define34
-rw-r--r--src/third_party/wiredtiger/dist/s_define.list131
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_docs216
-rw-r--r--src/third_party/wiredtiger/dist/s_funcs29
-rw-r--r--src/third_party/wiredtiger/dist/s_funcs.list44
-rw-r--r--src/third_party/wiredtiger/dist/s_getopt16
-rw-r--r--src/third_party/wiredtiger/dist/s_longlines17
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_prototypes41
-rw-r--r--src/third_party/wiredtiger/dist/s_readme54
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_release55
-rw-r--r--src/third_party/wiredtiger/dist/s_release.list9
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_release_docs79
-rw-r--r--src/third_party/wiredtiger/dist/s_stat33
-rw-r--r--src/third_party/wiredtiger/dist/s_string37
-rw-r--r--src/third_party/wiredtiger/dist/s_string.ok1147
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_style183
-rw-r--r--src/third_party/wiredtiger/dist/s_symbols56
-rw-r--r--src/third_party/wiredtiger/dist/s_symbols.list19
-rw-r--r--src/third_party/wiredtiger/dist/s_tags44
-rw-r--r--src/third_party/wiredtiger/dist/s_typedef80
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_version60
-rw-r--r--src/third_party/wiredtiger/dist/s_whitespace30
-rwxr-xr-xsrc/third_party/wiredtiger/dist/s_win21
-rw-r--r--src/third_party/wiredtiger/dist/serial.py189
-rw-r--r--src/third_party/wiredtiger/dist/stat.py183
-rw-r--r--src/third_party/wiredtiger/dist/stat_data.py410
-rwxr-xr-xsrc/third_party/wiredtiger/dist/style.py44
-rw-r--r--src/third_party/wiredtiger/examples/c/Makefile.am32
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_access.c98
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_all.c1125
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_async.c223
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_call_center.c248
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_config.c90
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_config_parse.c165
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_cursor.c227
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_data_source.c661
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_extending.c132
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_file.c72
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_hello.c75
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_log.c344
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_pack.c85
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_process.c78
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_schema.c309
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_scope.c174
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_stat.c223
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_thread.c126
-rw-r--r--src/third_party/wiredtiger/examples/java/Makefile.am21
-rw-r--r--src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_access.java93
-rw-r--r--src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_all.java1009
-rw-r--r--src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_async.java222
-rw-r--r--src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java299
-rw-r--r--src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_cursor.java239
-rw-r--r--src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_log.java376
-rw-r--r--src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_schema.java333
-rw-r--r--src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_stat.java252
-rw-r--r--src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_thread.java142
-rwxr-xr-xsrc/third_party/wiredtiger/examples/python/ex_access.py47
-rw-r--r--src/third_party/wiredtiger/ext/collators/reverse/Makefile.am10
-rw-r--r--src/third_party/wiredtiger/ext/collators/reverse/reverse_collator.c74
-rw-r--r--src/third_party/wiredtiger/ext/compressors/bzip2/Makefile.am6
-rw-r--r--src/third_party/wiredtiger/ext/compressors/bzip2/bzip2_compress.c407
-rw-r--r--src/third_party/wiredtiger/ext/compressors/nop/Makefile.am9
-rw-r--r--src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c187
-rw-r--r--src/third_party/wiredtiger/ext/compressors/snappy/Makefile.am10
-rw-r--r--src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c244
-rw-r--r--src/third_party/wiredtiger/ext/compressors/zlib/Makefile.am10
-rw-r--r--src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c426
-rw-r--r--src/third_party/wiredtiger/ext/datasources/helium/Makefile.am11
-rw-r--r--src/third_party/wiredtiger/ext/datasources/helium/README125
-rw-r--r--src/third_party/wiredtiger/ext/datasources/helium/helium.c3449
-rw-r--r--src/third_party/wiredtiger/lang/java/Makefile.am91
-rw-r--r--src/third_party/wiredtiger/lang/java/java_doc.i59
-rw-r--r--src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/AsyncCallback.java42
-rw-r--r--src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java184
-rw-r--r--src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackInputStream.java340
-rw-r--r--src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackOutputStream.java264
-rw-r--r--src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackUtil.java69
-rw-r--r--src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerException.java39
-rw-r--r--src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java41
-rw-r--r--src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerPanicException.java42
-rw-r--r--src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerRollbackException.java41
-rw-r--r--src/third_party/wiredtiger/lang/java/wiredtiger.i1849
-rw-r--r--src/third_party/wiredtiger/lang/python/Makefile.am28
-rwxr-xr-xsrc/third_party/wiredtiger/lang/python/run-ex_access5
-rw-r--r--src/third_party/wiredtiger/lang/python/setup.py57
-rw-r--r--src/third_party/wiredtiger/lang/python/wiredtiger.i1155
-rw-r--r--src/third_party/wiredtiger/lang/python/wiredtiger/fpacking.py118
-rw-r--r--src/third_party/wiredtiger/lang/python/wiredtiger/intpack-test.py35
-rw-r--r--src/third_party/wiredtiger/lang/python/wiredtiger/intpacking.py136
-rw-r--r--src/third_party/wiredtiger/lang/python/wiredtiger/packing-test.py38
-rw-r--r--src/third_party/wiredtiger/lang/python/wiredtiger/packing.py149
-rw-r--r--src/third_party/wiredtiger/src/async/async_api.c604
-rw-r--r--src/third_party/wiredtiger/src/async/async_op.c359
-rw-r--r--src/third_party/wiredtiger/src/async/async_worker.c359
-rw-r--r--src/third_party/wiredtiger/src/block/block_addr.c202
-rw-r--r--src/third_party/wiredtiger/src/block/block_ckpt.c842
-rw-r--r--src/third_party/wiredtiger/src/block/block_compact.c221
-rw-r--r--src/third_party/wiredtiger/src/block/block_ext.c1437
-rw-r--r--src/third_party/wiredtiger/src/block/block_map.c65
-rw-r--r--src/third_party/wiredtiger/src/block/block_mgr.c433
-rw-r--r--src/third_party/wiredtiger/src/block/block_open.c330
-rw-r--r--src/third_party/wiredtiger/src/block/block_read.c212
-rw-r--r--src/third_party/wiredtiger/src/block/block_session.c305
-rw-r--r--src/third_party/wiredtiger/src/block/block_slvg.c190
-rw-r--r--src/third_party/wiredtiger/src/block/block_vrfy.c514
-rw-r--r--src/third_party/wiredtiger/src/block/block_write.c269
-rw-r--r--src/third_party/wiredtiger/src/bloom/bloom.c351
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_compact.c215
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curnext.c468
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curprev.c560
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c1025
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c1104
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_delete.c339
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_discard.c422
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_evict.c1297
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_handle.c770
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_huffman.c340
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_io.c304
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_misc.c128
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_ovfl.c270
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_page.c734
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_read.c88
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_ret.c116
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_slvg.c2520
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_stat.c190
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_sync.c373
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_upgrade.c22
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_vrfy.c666
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c739
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_walk.c285
-rw-r--r--src/third_party/wiredtiger/src/btree/col_modify.c223
-rw-r--r--src/third_party/wiredtiger/src/btree/col_srch.c199
-rw-r--r--src/third_party/wiredtiger/src/btree/rec_evict.c468
-rw-r--r--src/third_party/wiredtiger/src/btree/rec_split.c1121
-rw-r--r--src/third_party/wiredtiger/src/btree/rec_track.c904
-rw-r--r--src/third_party/wiredtiger/src/btree/rec_write.c5521
-rw-r--r--src/third_party/wiredtiger/src/btree/row_key.c500
-rw-r--r--src/third_party/wiredtiger/src/btree/row_modify.c346
-rw-r--r--src/third_party/wiredtiger/src/btree/row_srch.c553
-rw-r--r--src/third_party/wiredtiger/src/config/config.c745
-rw-r--r--src/third_party/wiredtiger/src/config/config_api.c105
-rw-r--r--src/third_party/wiredtiger/src/config/config_check.c370
-rw-r--r--src/third_party/wiredtiger/src/config/config_collapse.c380
-rw-r--r--src/third_party/wiredtiger/src/config/config_concat.c71
-rw-r--r--src/third_party/wiredtiger/src/config/config_def.c744
-rw-r--r--src/third_party/wiredtiger/src/config/config_ext.c44
-rw-r--r--src/third_party/wiredtiger/src/config/config_upgrade.c32
-rw-r--r--src/third_party/wiredtiger/src/conn/api_strerror.c43
-rw-r--r--src/third_party/wiredtiger/src/conn/api_version.c24
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c1573
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_cache.c174
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_cache_pool.c639
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_ckpt.c228
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_dhandle.c694
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_handle.c142
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_log.c284
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_open.c244
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_stat.c540
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_sweep.c187
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_backup.c540
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_bulk.c287
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_config.c65
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_ds.c524
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_dump.c400
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_file.c471
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_index.c447
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_json.c931
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_log.c380
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_metadata.c444
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_stat.c574
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_std.c625
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_table.c808
-rw-r--r--src/third_party/wiredtiger/src/include/api.h128
-rw-r--r--src/third_party/wiredtiger/src/include/async.h128
-rw-r--r--src/third_party/wiredtiger/src/include/bitstring.i316
-rw-r--r--src/third_party/wiredtiger/src/include/block.h337
-rw-r--r--src/third_party/wiredtiger/src/include/bloom.h28
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h1015
-rw-r--r--src/third_party/wiredtiger/src/include/btree.h155
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i1216
-rw-r--r--src/third_party/wiredtiger/src/include/buf.i133
-rw-r--r--src/third_party/wiredtiger/src/include/cache.h139
-rw-r--r--src/third_party/wiredtiger/src/include/cache.i174
-rw-r--r--src/third_party/wiredtiger/src/include/cell.i816
-rw-r--r--src/third_party/wiredtiger/src/include/column.i201
-rw-r--r--src/third_party/wiredtiger/src/include/compact.h12
-rw-r--r--src/third_party/wiredtiger/src/include/config.h85
-rw-r--r--src/third_party/wiredtiger/src/include/connection.h270
-rw-r--r--src/third_party/wiredtiger/src/include/cursor.h380
-rw-r--r--src/third_party/wiredtiger/src/include/cursor.i277
-rw-r--r--src/third_party/wiredtiger/src/include/dhandle.h73
-rw-r--r--src/third_party/wiredtiger/src/include/dlh.h15
-rw-r--r--src/third_party/wiredtiger/src/include/error.h141
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h650
-rw-r--r--src/third_party/wiredtiger/src/include/flags.h88
-rw-r--r--src/third_party/wiredtiger/src/include/gcc.h152
-rw-r--r--src/third_party/wiredtiger/src/include/hardware.h60
-rw-r--r--src/third_party/wiredtiger/src/include/intpack.i371
-rw-r--r--src/third_party/wiredtiger/src/include/lint.h56
-rw-r--r--src/third_party/wiredtiger/src/include/log.h177
-rw-r--r--src/third_party/wiredtiger/src/include/lsm.h232
-rw-r--r--src/third_party/wiredtiger/src/include/meta.h58
-rw-r--r--src/third_party/wiredtiger/src/include/misc.h221
-rw-r--r--src/third_party/wiredtiger/src/include/misc.i32
-rw-r--r--src/third_party/wiredtiger/src/include/msvc.h70
-rw-r--r--src/third_party/wiredtiger/src/include/mutex.h73
-rw-r--r--src/third_party/wiredtiger/src/include/mutex.i368
-rw-r--r--src/third_party/wiredtiger/src/include/os.h72
-rw-r--r--src/third_party/wiredtiger/src/include/os_windows.h60
-rw-r--r--src/third_party/wiredtiger/src/include/packing.i685
-rw-r--r--src/third_party/wiredtiger/src/include/posix.h47
-rw-r--r--src/third_party/wiredtiger/src/include/queue.h559
-rw-r--r--src/third_party/wiredtiger/src/include/schema.h101
-rw-r--r--src/third_party/wiredtiger/src/include/serial.i329
-rw-r--r--src/third_party/wiredtiger/src/include/session.h156
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h332
-rw-r--r--src/third_party/wiredtiger/src/include/txn.h139
-rw-r--r--src/third_party/wiredtiger/src/include/txn.i382
-rw-r--r--src/third_party/wiredtiger/src/include/verify_build.h75
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in3463
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger_ext.h398
-rw-r--r--src/third_party/wiredtiger/src/include/wt_internal.h337
-rw-r--r--src/third_party/wiredtiger/src/log/log.c1243
-rw-r--r--src/third_party/wiredtiger/src/log/log_auto.c437
-rw-r--r--src/third_party/wiredtiger/src/log/log_slot.c354
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_cursor.c1519
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_manager.c667
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_merge.c489
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_meta.c238
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_stat.c162
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_tree.c1266
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_work_unit.c625
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_worker.c167
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_apply.c62
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_ckpt.c528
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_ext.c103
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_table.c206
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_track.c365
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_turtle.c318
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_abort.c26
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_alloc.c238
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_dir.c94
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_dlopen.c83
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_errno.c22
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_exist.c37
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_fallocate.c97
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_filesize.c55
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_flock.c37
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_fsync.c54
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_ftruncate.c26
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_getline.c48
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_getopt.c150
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_map.c136
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c157
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c227
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_once.c20
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_open.c253
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_path.c28
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_priv.c19
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_remove.c66
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_rename.c38
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_rw.c86
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_sleep.c23
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_strtouq.c24
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_thread.c59
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_time.c53
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_yield.c18
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_dir.c111
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_dlopen.c86
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_errno.c27
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_exist.c32
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_fallocate.c53
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_filesize.c56
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_flock.c46
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_fsync.c40
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_ftruncate.c40
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_map.c106
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_mtx_cond.c155
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_mtx_rw.c126
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_once.c39
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_open.c219
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_path.c34
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_priv.c19
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_remove.c68
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_rename.c51
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_rw.c98
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_sleep.c18
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_thread.c51
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_time.c62
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_vsnprintf.c31
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_yield.c18
-rw-r--r--src/third_party/wiredtiger/src/packing/pack_api.c137
-rw-r--r--src/third_party/wiredtiger/src/packing/pack_impl.c96
-rw-r--r--src/third_party/wiredtiger/src/packing/pack_stream.c296
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_create.c595
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_drop.c204
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_list.c204
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_open.c510
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_plan.c394
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_project.c474
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_rename.c276
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_stat.c114
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_truncate.c183
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_util.c84
-rw-r--r--src/third_party/wiredtiger/src/schema/schema_worker.c134
-rw-r--r--src/third_party/wiredtiger/src/session/session_api.c1054
-rw-r--r--src/third_party/wiredtiger/src/session/session_compact.c236
-rw-r--r--src/third_party/wiredtiger/src/session/session_dhandle.c478
-rw-r--r--src/third_party/wiredtiger/src/session/session_salvage.c58
-rw-r--r--src/third_party/wiredtiger/src/support/cksum.c1306
-rw-r--r--src/third_party/wiredtiger/src/support/err.c527
-rw-r--r--src/third_party/wiredtiger/src/support/filename.c49
-rw-r--r--src/third_party/wiredtiger/src/support/global.c118
-rw-r--r--src/third_party/wiredtiger/src/support/hash_city.c323
-rw-r--r--src/third_party/wiredtiger/src/support/hash_fnv.c161
-rw-r--r--src/third_party/wiredtiger/src/support/hazard.c244
-rw-r--r--src/third_party/wiredtiger/src/support/hex.c215
-rw-r--r--src/third_party/wiredtiger/src/support/huffman.c899
-rw-r--r--src/third_party/wiredtiger/src/support/mutex.c257
-rw-r--r--src/third_party/wiredtiger/src/support/pow.c130
-rw-r--r--src/third_party/wiredtiger/src/support/rand.c69
-rw-r--r--src/third_party/wiredtiger/src/support/scratch.c319
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c567
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c554
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ckpt.c944
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ext.c104
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_log.c500
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c491
-rw-r--r--src/third_party/wiredtiger/src/utilities/util.h50
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_backup.c205
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_compact.c59
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_cpyright.c35
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_create.c53
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_drop.c50
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_dump.c701
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_list.c193
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_load.c595
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_load.h27
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_load_json.c573
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_loadtext.c157
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_main.c262
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_misc.c146
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_printlog.c65
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_read.c101
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_rename.c60
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_salvage.c68
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_stat.c103
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_upgrade.c63
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_verbose.c62
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_verify.c119
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_write.c107
-rw-r--r--src/third_party/wiredtiger/tools/stat_data.py66
-rw-r--r--src/third_party/wiredtiger/tools/statlog.py124
-rw-r--r--src/third_party/wiredtiger/tools/wt_nvd3_util.py46
-rw-r--r--src/third_party/wiredtiger/tools/wtperf_graph.py234
-rw-r--r--src/third_party/wiredtiger/tools/wtperf_stats.py174
-rw-r--r--src/third_party/wiredtiger/tools/wtstats.py236
540 files changed, 130811 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/.gitignore b/src/third_party/wiredtiger/.gitignore
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/src/third_party/wiredtiger/.gitignore
diff --git a/src/third_party/wiredtiger/.hgignore b/src/third_party/wiredtiger/.hgignore
new file mode 100644
index 00000000000..724585de3ac
--- /dev/null
+++ b/src/third_party/wiredtiger/.hgignore
@@ -0,0 +1,24 @@
+~$
+\.l?o$
+\.swp$
+\.pyc$
+\.class$
+\.dSYM$
+/Makefile.in$
+/tags$
+^configure.*$
+^build.*/(\.deps|COPYING|ChangeLog|INSTALL|Makefile.*|NEWS|README)
+^build.*/(aclocal\.m4|config\..*|configure|stamp-h1|prototype.chk|w.*\.h)
+^build.*/(ex.*|lib.*|test.*|wt_.*|_wiredtiger.so)
+^build_posix/gnu-support/(compile|depcomp|install-sh|ltmain.sh|missing)
+^build_posix/autom4te.cache
+^docs/(doxygen.log|installdox|search)
+^docs/[^/]*\.(css|html|js|png)$
+^docs/java
+^docs/latex
+^docs/python
+^docs/swig
+^lang/python/(wiredtiger/__init__.py|wiredtiger_wrap.c)
+^releases
+^src/server
+^test/bt/(CONFIG|__rand|__wt.bdb|__wt.run|__wt.wt|db|t|vgout\..*)
diff --git a/src/third_party/wiredtiger/.hgtags b/src/third_party/wiredtiger/.hgtags
new file mode 100644
index 00000000000..19fa553550b
--- /dev/null
+++ b/src/third_party/wiredtiger/.hgtags
@@ -0,0 +1,24 @@
+9e6541afe6e8c6567ac9b2e108e409371623f636 release-1.0
+9e6541afe6e8c6567ac9b2e108e409371623f636 1.0.0
+9e6541afe6e8c6567ac9b2e108e409371623f636 release-1.0
+0000000000000000000000000000000000000000 release-1.0
+98e12f19a801998ff8afe1762116a141f1487b1f 1.1.0
+349c43bd3bf439fb64bcb481bb8651a655edc471 1.1.1
+a792d468bedd7b37be9cfff545582ae8ff54ff6f 1.1.3
+8054de4cb42988cd54b395cc834a6f8ab25298f7 1.1.4
+ef844093bec2ac38945fd04487dc3a051f4b9136 1.1.5
+12cf1d5546df25ac323f0400d4764e67ad5802e2 1.2.0
+9046bcab74eba90a2cb05af28026ec4a74e4fb9c 1.2.1
+50cb97d00c6238ebef64e290616e8cec9995687f 1.2.2
+ef3ccde04cb28060319be900a2d31c88071933f6 1.3.0
+945a898eb714bb8d46c088928d81b2135eefc18e 1.3.1
+961b8482202543635417399aca5b1093e5ba5cbd 1.3.2
+01380d42b30c74937b7d801062d744823a47fd3d 1.3.3
+df87effe7cd3239e3666a76312bae77b92090d98 1.3.4
+8b91f84675fd67259b1f513e3f84786501cbc16c 1.3.6
+27cec73582030254a2752cc3213bb89825dc5183 1.3.7
+edc4643f811d706cbbb6400d048bf56602aed963 1.4.2
+aff8aabe571be6db68e8bf44bf7670df5d55d1ff 1.5.0
+32e357c7de239cd9184f6c9b592353e5165b65a9 1.5.1
+b5c9f28d72fe1f835d24fe427e211a539f8709fe 1.5.2
+03ab950d31edfe1b77aa6e7259e64dbdd3e17fe2 1.6.5
diff --git a/src/third_party/wiredtiger/LICENSE b/src/third_party/wiredtiger/LICENSE
new file mode 100644
index 00000000000..1b2196a4300
--- /dev/null
+++ b/src/third_party/wiredtiger/LICENSE
@@ -0,0 +1,17 @@
+Copyright (c) 2008-2014 WiredTiger, Inc.
+ All rights reserved.
+
+This program is free software: you can redistribute it and/or modify it under
+the terms of either version 2 or version 3 of the GNU General Public License
+as published by the Free Software Foundation.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+For a license to use the WiredTiger software under conditions other than those
+described by the GNU General Public License, or for technical support for this
+software, contact WiredTiger, Inc. at info@wiredtiger.com.
+
+For further information, see the licensing section in the documentation.
diff --git a/src/third_party/wiredtiger/NEWS b/src/third_party/wiredtiger/NEWS
new file mode 100644
index 00000000000..3d793f3fcf4
--- /dev/null
+++ b/src/third_party/wiredtiger/NEWS
@@ -0,0 +1,1785 @@
+WiredTiger release 2.4.1, TBD
+------------------------------------
+
+The WiredTiger 2.4.1 release is currently in active development:
+
+New features and API changes:
+* Add a new WT_SESSION::transaction_pinned_range method that helps users
+ identify when a session is keeping a transaction ID pinned for a long time.
+ [#1314]
+
+
+WiredTiger release 2.4.0, 2014-10-15
+------------------------------------
+
+The WiredTiger 2.4.0 release contains significant new features, API changes
+and many bug fixes.
+
+New features and API changes:
+
+* Cursors keep their position across transaction boundaries. That is,
+ WT_SESSION::begin_transaction and WT_SESSION::commit_transaction no longer
+ reset cursors. [#1181]
+
+* Change cursor behavior so that when an operation returns WT_NOTFOUND, the
+ cursor is now left pointing to the original key/value pair. [#1209]
+
+* Initial support for building WiredTiger on Windows.
+
+* Add ability to customize a collator for specific data sources or with
+ application managed metadata. See upgrading documentation for more
+ information. [#1165]
+
+* Enhance extension mechanism in WiredTiger to support loading extensions from
+ the application binary - not just a separate library. [#1174]
+
+* Replace WT_SESSION::create "lsm=(merge_threads)" configuration option with
+ ::wiredtiger_open "lsm_manager=(worker_thread_max)". See upgrading documentation
+ for more information.
+
+* Enhancements to the WiredTiger Python API build process. [#1188]
+
+* Add ability to dump and load WiredTiger databases in JSON format. [#1154]
+
+* Add ability to automatically checkpoint based on the volume of log records
+ generated since the last checkpoint. This is enabled using the
+ ::wiredtiger_open configuration option "checkpoint=(log_size=size)" [#1170]
+
+* Enhance functionality allowing users to write content into the WiredTiger
+ transaction log. [#1171][#1175]
+
+* Enhance the WiredTiger HyperLevelDB implementation to support log replay.
+ [#1106][#1155]
+
+Other significant changes:
+
+* Fix several bugs in the shared cache implementation. [#1180][#1176]
+
+* Fix a bug where the public URI field in a cursor did not match the string
+ passed to WT_SESSION::open_cursor. [#1235]
+
+* Fix several bugs in salvage. [#1222][#1169]
+
+* Several bug fixes and enhancements for WT_CONNECTION::reconfigure.
+ [#1214][#1172]
+
+* Fix several bugs in raw compression implementation, particularly for data
+ that compresses extremely well. [#1191]
+
+* Several bug fixes and enhancements to WiredTiger LevelDB interface.
+
+* Switch default build from using adaptive pthread mutexes to default pthread
+ mutexes.
+
+
+WiredTiger release 2.3.1, 2014-08-14
+------------------------------------
+
+The WiredTiger 2.3.1 release contains mainly performance enhancements and bug
+fixes.
+
+Changes to the WiredTiger API:
+
+* Fix a bug in WT_CURSOR::set_value that could lead to undefined behavior with
+ some value formats.
+
+* Make the asynchronous API generally available [#1139]
+
+* Add log cursors for replay and verification. Make generated log record and
+ operation types public. [#1106]
+
+* Allow eviction worker threads to be started and stopped dynamically.
+ Applications that use the `eviction_workers` configuration should see the
+ upgrading documentation on how to use this feature.
+ [#1116, #1143, #1158]
+
+Other significant changes:
+
+* Improve performance and reduce latency during checkpoints and LSM merges.
+ Remove uses of the checkpoint lock other than serializing checkpoints:
+ compact holds the schema lock, so it doesn't need to hold the checkpoint
+ lock, the new WT_BTREE handle close lock prevents checkpoints from colliding
+ with handle close, so LSM doesn't need the checkpoint lock either.
+
+* Some minor cleanups, setting the internal session's name in a few places.
+ [#1073]
+
+* Grab the live lock when loading a checkpoint in diagnostic mode: that could
+ race with a read. [#1102]
+
+* Instead of keeping a list of file URIs for checkpoint to flush, open a handle
+ and stash it. [#1114]
+
+* Add a new OS-layer function __wt_fsync_async to flush a file without waiting
+ for the results, call it from the Btree flush-leaves code so pages start
+ flushing while we're working the rest of the checkpoint. [#1136, #1152]
+
+* Wait for the handle flush lock when writing the leaf pages instead of
+ returning EBUSY. [#1136]
+
+* Add a wtperf page to the documentation, describe how to simulate workloads
+ and view statistics. [#1147]
+
+* Flag new structures not listed in PREDEFINE. [#1148]
+
+* Return EBUSY if no async handles available and fix ex_async to look for it.
+ [#1153]
+
+* Fix some problems with navigation in the reference guide.
+
+* Bump the number of slots for internal sessions: we have a lot more than 2
+ now. Add a test for `session_max` settings, make sure we add enough to
+ account for at least the default internal sessions.
+
+* Remove tcbench: we're no longer maintaining it.
+
+
+WiredTiger release 2.3.0, 2014-07-29
+------------------------------------
+
+The WiredTiger 2.3.0 release contains significant new features, performance
+enhancements and bug fixes. Significant changes are described below.
+
+Changes to the WiredTiger API (see upgrading documentation for details):
+
+* Add a LevelDB API implementation for WiredTiger. This includes support for
+ stock LevelDB as well as Basho, HyperLevelDB and RocksDB versions of the API.
+ To build the LevelDB API include --enable-leveldb in the configure command,
+ to specify compatibility with an alternative LevelDB API use
+ --enable-leveldb=[basho,hyper,rocksdb]. [#1028]
+
+* Add ability to build some common extensions into the WiredTiger library.
+ This means that the libraries for those extensions don't need to be
+ dynamically loaded at runtime. Currently supported extensions are Snappy
+ compression and zlib compression. The option can be enabled by passing
+ --with-builtins=[snappy,zlib] to the configure command line.
+
+* Add a new configuration to wiredtiger_open: statistics_log=(on_close=true),
+ that causes a set of statistics to be logged on WT_CONNECTION::close. [#1086]
+
+* Add a new configuration to wiredtiger_open: exclusive, that causes the open
+ to fail if the database already exists.
+
+Other significant changes:
+
+* Performance improvement for high throughput workloads using multiple
+ eviction threads. Performance of some workloads improves by over 15% [#1087]
+
+* Significant performance optimizations for queries, giving up to 20%
+ throughput improvement for in-memory query workloads.
+ https://github.com/wiredtiger/wiredtiger/wiki/Query-throughput
+
+* Fix an off-by-one bug that could lead to ENOMEM during commit with logging.
+ [#1104][#1121]
+
+* Allow bulk loads to multiple files to complete in parallel. [#1114][#1126]
+
+
+WiredTiger release 2.2.1, 2014-06-24
+------------------------------------
+
+The WiredTiger 2.2.1 release contains mainly performance enhancements and bug
+fixes. Significant changes are described below.
+
+Changes to the WiredTiger API (see upgrading documentation for details):
+
+* Change the order in which configuration setting mechanisms are applied by
+ wiredtiger_open. [#1010][#1034]
+
+* Split the global transaction_sync configuration into two parts: a sync method
+ (dsync, fsync or none), and an enabled flag (false by default). [#1074]
+
+* Add ability to sync with per transaction granularity. [#1074]
+
+* Update WiredTiger Java API to throw WiredTigerException consistently. [#1011]
+
+* Add ability to dump and load databases using JSON format. [#740][#1049]
+
+Other significant changes:
+
+* Various performance improvements to the main cursor search routine including
+ reductions in how often we need to copy data and profiling based optimizations
+ for tight search loops. [#1050][#1070]
+
+* Fix a bug in recovery with missing files (e.g., after a hotbackup that raced
+ with file creation). [#1042][#1045]
+
+* Several bug fixes and performance enhancements related to LSM trees and
+ snapshot isolation transactions. [#1057][#1060][#1075]
+
+* Several performance tuning enhancements to LSM trees around locking,
+ throttling and switching chunks. [#1051]
+
+* Algorithmic improvements to LSM tree compact operation. It is now faster
+ and more reliable. [#1063]
+
+* Create a separate thread to manage open file handles - which means that:
+ - Application threads are less likely to be responsible for closing handles
+ - Multi threaded workloads don't open/close handles more often than necessary
+ [#1018]
+
+
+WiredTiger release 2.2.0, 2014-05-21
+------------------------------------
+
+The WiredTiger 2.2.0 release contains new features, performance enhancements
+and bug fixes. Significant changes include:
+
+Changes relevant for upgrading applications:
+
+Update the table create API to disable prefix compression by default.
+Applications generally see better performance without prefix compression;
+choosing space saving over performance is up to the application. [#981]
+
+Change the default leaf_page_max setting from 1MB to 32KB. Choosing a large
+default leaf_page_max led to poor performance in out of cache workloads.
+
+Remove the `--enable-debug` option to configure. It is more standard to set
+the `CFLAGS="-g"` variable instead.
+
+Save the wiredtiger_open configuration when a database is created, so that
+settings like cache size, extensions and logging are set consistently by all
+subsequent users of the database.
+
+Add an `--enable-verbose` option to configure. In order to access the verbose
+message functionality available as part of the wiredtiger_open and
+WT_CONNECTION::reconfigure APIs, it is necessary to pass the `--enable-verbose`
+option to configure.
+
+Enhance the metadata cursor implementation (i.e: cursors created with a
+"metadata:" prefix) so that they can be used to inspect metadata for internal
+tables and now support altering the metadata. Add a new "read_only" flag to
+cursor configuration that defaults to false for metadata cursors.
+
+Fix several bugs in raw compression, including one that could cause data
+corruption and some that triggered poor performance.
+[#984][#991][#1007][#1008][#1013]
+
+Improve the performance of recovery - we no longer need to scan all log files
+looking for the last checkpoint.
+
+Improve performance of read-only transactions, by deferring the allocation
+of transaction IDs. [#978]
+
+Several bug fixes in hot backup related to log
+files, including:
+ * Always choose the right metadata version in the backup [#972]
+ * Don't require that hot backup copies log files in order [#976]
+ * Always copy log files before data files [#976]
+ * Fix a bug where recovery returned an error if the last log record was
+ incomplete [#994]
+
+Speed up checkpoints by doing a better job of skipping pages that can't
+contain changes that need to be included. [#954][#963][#1001]
+
+Add ability to store zero length data items into LSM trees. [#540]
+
+Add an asynchronous data access/manipulation API to WiredTiger. [#933]
+
+Add the ability to configure multiple eviction server threads, to help with
+keeping space available in the cache. [#918]
+
+Add the ability to reconfigure the checkpoint and statistics log servers.
+[#997][#1004]
+
+Improve the performance of retrieving data for in cache workloads. [#970]
+
+Improve the structure of the in-memory tree we are generating, by allowing
+internal pages to be split. This significantly improves query performance
+in some workloads. [#876]
+
+Work around a bug in posix_fallocate on Linux, where it could corrupt already
+written data.
+
+Add the ability to leak memory on connection close via new leak_memory option
+to WT_CONNECTION::close API. This allows for faster shutdown if a process is
+going to exit when the WiredTiger connection is closing.
+
+Allow salvage to run on any table type.
+
+WiredTiger release 2.1.2, 2014-03-27
+------------------------------------
+
+The WiredTiger 2.1.2 release contains performance enhancements and bug fixes.
+Significant changes include:
+
+Update the configuration settings for shared_cache to make the distinction
+between cache_size and shared_cache less confusing. See upgrading
+documentation for more information.
+
+Various performance enhancements to improve the performance of checkpoints.
+
+Fix a bug that could cause a hang with small caches under heavy load. [#894]
+
+WiredTiger release 2.1.1, 2014-03-04
+------------------------------------
+
+The WiredTiger 2.1.1 release contains new features, performance enhancements
+and bug fixes. Significant changes include:
+
+Fix a bug where a page could be marked clean when it contained uncommitted
+changes. This bug could cause undefined behavior in transaction rollback
+under load.
+
+Fix a bug with shared caches when rebalancing between connections.
+
+Add a new public API to WiredTiger that provides the ability to parse
+WiredTiger compatible configuration strings. See the upgrading documentation
+for further information. [#873]
+
+A number of performance enhancements to the LSM implementation, particularly
+for long running workloads.
+
+A number of performance enhancements and bug fixes to cache eviction code.
+
+Add an option to use direct I/O when reading from checkpoints. To enable
+the functionality add "direct_io=[checkpoint]" to your wiredtiger_open
+configuration string. [#847]
+
+
+WiredTiger release 2.1.0, 2014-02-04
+------------------------------------
+
+The WiredTiger 2.1.0 release contains new features, performance enhancements
+and bug fixes. Significant changes include:
+
+The WT_ITEM structure was changed so that the size field is a size_t rather
+than a uint32_t. See upgrading documentation for details.
+
+A change to the compress_raw interface around repeating the call with more
+records. See upgrading documentation for details.
+
+In LSM trees, the memory_page_max setting is ignored. The effective setting
+is double the chunk size. [#861][#859]
+
+Add support for zlib compression. [#855] [#865]
+
+Various enhancements to how WiredTiger generates tree structures in memory to
+help maintain consistent performance as table size grows. [#851]
+
+Add support for Levyx Inc Helium as an external data source in WiredTiger
+[#849][#850]
+
+Improve insert performance when a table contains many identical overflow
+items.
+
+Various performance enhancements to btree searches. [#838][#839][#840]
+
+Add support for newer versions of autoconf up to 1.14. [#599][#841]
+
+Improve multi-threaded throughput of durable log writes, including changing
+the default wiredtiger_open transaction_sync configuration from dsync to
+fsync, see the upgrading documentation for further information. [#831][#832]
+
+In the Python and Java APIs, automatically close handles to prevent invalid
+accesses by applications. [#649][#800][#830]
+
+Various enhancements to the LSM merge algorithm, including improvements to how
+files are selected for merging, and throttling based on whether merges are
+keeping up (to limit write amplification). Made the minimum number of chunks
+chosen to merge configurable. [#817][#819][#822]
+
+
+WiredTiger release 2.0.1, 2013-12-12
+------------------------------------
+
+The WiredTiger 2.0.1 release contains major new features, numerous performance
+enhancements and bug fixes.
+
+Significant changes include:
+
+* WiredTiger now supports fine-grained durability via Write Ahead Logging (WAL).
+ Logging is enabled with the "log=(enabled)" configuration string to
+ wiredtiger_open. If the connection is not shut down cleanly and logging is
+ enabled, WiredTiger will automatically run recovery the next time it is
+ opened, rolling forward changes in the log until the last commit.
+ [#605]
+
+* Many enhancements to the LSM implementation to improve the throughput and
+ reduce maximum operation latency including:
+ - Algorithmic improvements when multiple merge threads are configured.
+ - Improvements to bloom filter lookup speed.
+ - Enhancements to internal cursor management, to reduce search overhead.
+ - Prioritize switching to a new level 0 chunk in utility threads, to avoid
+ application thread pauses.
+ - More advanced logic in choosing when to create bloom filters.
+
+* LSM specific WT_SESSION::create configuration option enhancements. Including:
+ - Move existing options into their own group, and strip leading lsm_ prefix.
+ - Add a new merge_max configuration option.
+ - Update the default chunk_size to be 10MB.
+ - Increase the default bloom filter bit and hash counts.
+ - Clean up files left after interrupted merges.
+ See the upgrading documentation for details.
+ [#784, #785, #786, #802]
+
+* WT_SESSION::compact can now be used to merge LSM trees into a small number
+ of chunks on disk.
+ [#792]
+
+* Enhanced the Java API, so that when WiredTiger automatically closes a
+ handle, the handle is automatically invalidated for the Java application.
+ [#485]
+
+* Add a script that can create an interactive web page to view statistics
+ from a WiredTiger statistics log. Based on D3: http://d3js.org/
+
+* Enhancements to the wtperf performance testing tool to add new features
+
+
+WiredTiger release 1.6.6, 2013-11-19
+------------------------------------
+
+The WiredTiger 1.6.6 release is a bugfix and performance tuning release.
+
+This release of WiredTiger contains a database format change. Database files
+from previous releases will need to be upgraded.
+
+A special note: the WiredTiger code base is now being regularly reviewed
+using the Coverity Static Analysis Verification Engine. We'd like to
+thank Coverity for their on-going support of Open Source projects like
+WiredTiger!
+
+Significant changes include:
+
+* Performance changes include: limiting operations done inside update
+ serialization primitives, removing unnecessary memory barriers, replacing
+ spinlocks with atomic instructions, padding structures to avoid false
+ cache sharing, switching from per-file mutexes to per-page mutexes,
+ pre-allocating structures to avoid memory allocation while holding
+ mutexes, and using adaptive mutexes where available.
+ [#707, #718, #719]
+
+* A number of LSM stability and performance improvements: changes include
+ better merge algorithms, reduced locking, and higher concurrency.
+
+* A number of table compaction performance improvements, including changes
+ allowing compaction to no longer read unnecessary file blocks into the
+ cache, require fewer passes over the file and support concurrent
+ checkpoints and eviction. This change required an underlying file
+ format change, see the upgrading documentation for details.
+ [#756, #761]
+
+* WiredTiger statistics have been significantly improved:
+
+ Statistics logging has been changed to aggregate information from all
+ open handles. [#709, #717]
+
+ For performance reasons, statistics are now disabled by default, see
+ the upgrading documentation for details. [#715]
+
+ Statistics configuration has been changed so the connection and cursor
+ configuration are consistent, with matching changes to the "wt stat"
+ command-line utility; see the upgrading documentation for details.
+
+* Update WT_EVENT_HANDLER interface to contain a new "handle close"
+ interface and to pass a WT_SESSION handle into all callbacks, see the
+ upgrading documentation for details. [#649]
+
+ Add timestamp, process ID and thread ID to messages generated via
+ WT_EVENT_HANDLER interface. [#753]
+
+* WiredTiger eviction improvements, supporting larger data-to-cache size
+ ratios. [#754]
+
+* Various fixes for handling overflow records. [#726, #743]
+
+* Overflow records are no longer tracked during bulk-loads, significantly
+ increasing bulk-load performance for some data sets.
+
+
+WiredTiger release 1.6.5, 2013-10-09
+------------------------------------
+
+This is primarily a bugfix and performance tuning release. The main changes are:
+
+* Change the default statistics_fast configuration from false to true.
+
+* Change WT_CURSOR::insert to not hold a position. [#673]
+
+* Disallow WT_SESSION::compact operations on LSM trees.
+
+* The 'sync' setting to wiredtiger_open has been renamed 'checkpoint_sync'.
+
+* Add a "metadata:" cursor type. [#660]
+
+* Fix race in the cache's dirty byte tracking. [#635, #699]
+
+* Fix a bug scanning through a memory-mapped file with overflow items. [#701]
+
+* Use hardware checksum instructions when available. [#582, #702]
+
+* Several bug fixes related to tracking active transaction IDs and detection of
+ obsolete updates with high concurrency workloads. [#639, #643, #657, #683]
+
+* Fix several bugs in LSM including races on shutdown and Bloom filter
+ creation. [#686, #687, #688].
+
+* Fix a bug in LSM where we were not including Bloom filter files in backups.
+ [#684]
+
+* Optimize the LSM throttle and merge algorithms. [#676]
+
+* Make hot backups work concurrently with files being bulk-loaded. [#570, #653]
+
+* Add full support for snapshot isolation to LSM: only switch LSM chunks if all
+ changes are globally visible and detect conflicts between transactions across
+ file switches. [#629]
+
+
+WiredTiger release 1.6.4, 2013-08-20
+------------------------------------
+
+This is primarily a bugfix and performance tuning release. The main changes are:
+
+* Make prefix compression of keys conditional on the amount of space saved.
+ A database format change was required for this enhancement. See upgrading
+ documentation for details. [#624]
+
+* The default behavior of the wt utility's load command has been changed to
+ overwrite existing data.
+
+* Add a WT_SESSION.create prefix_compression_min configuration option with a
+ default value of 4. [#624]
+
+* Fix "make install" of Python API. [#598]
+
+* Require platform support for atomic read/write of 64 bit values. [#553]
+
+* Support transaction semantics for custom data source implementations. Enhance
+ Memrata data source to support transactions.
+
+* Changes to the wtperf testing tool related to how configuration options are
+ specified.
+
+* Enhance cursor key/value memory management to be more efficient, consistent,
+ and have stricter checking of inputs and outputs.
+
+* Increase the likelihood of being able to evict hot pages. [#604]
+
+* Reference on-page keys instead of copying them to allocated memory. This
+ saves space in the cache and overhead when reading pages into cache.
+ [#592] and [#600]
+
+* Add a btree search optimization that skips matching prefixes. [#595]
+
+* Turn off Huffman encoding for keys on row-store internal pages. [#592]
+
+* Add concurrent logging infrastructure that will be used to support write
+ ahead logging in a future release.
+
+
+WiredTiger release 1.6.3, 2013-07-12
+------------------------------------
+
+This is a bugfix and performance tuning release. The main changes are:
+
+* Change the default cursor overwrite configuration so that it is consistent
+ across all data sources. This change may alter the behavior of existing
+ applications without triggering any compilation or runtime warnings. See
+ the upgrade documentation for details. [#512]
+
+* Require platform support for 64 bit atomic operations. [#553]
+
+
+WiredTiger release 1.6.2, 2013-06-18
+------------------------------------
+
+This is a bugfix and performance tuning release. The main changes are:
+
+* Fix a race in the WiredTiger pseudo random number generator that was leading
+ to poor distribution of numbers.
+
+* Change the default compression configuration to "uncompressed".
+
+* Fix a race between checkpoints and LSM that could result in a crash. [#543]
+
+* Add an option to output version information at runtime. Configure by
+ including "verbose=[version]" in the wiredtiger_open connection
+ configuration string. [#564]
+
+* Add a configurable prefix to error messages. [#527]
+
+* Add two new extension APIs, one to return a transaction ID, one to return
+ if a transaction ID is visible to the current transaction.
+
+* Add standard metadata functions to the extension API and make extension data
+ sources responsible for their own metadata entries.
+
+* Add a new extension function __wt_ext_config_strget that returns the
+ configuration value from a single string.
+
+
+WiredTiger release 1.6.1, 2013-05-31
+------------------------------------
+
+This is a bugfix and performance tuning release. The main changes are:
+
+* Fix the compress_raw API so that it uses platform independent types. See the
+ upgrade guide for further information. [#561]
+
+* Add an explicit enable setting to shared_cache configuration. See the
+ upgrade guide for further information.
+
+* Fix several bugs in hot backup, including race conditions between backup and
+ table drop (and other schema level operations). [#556] [#557]
+
+* Allow any data source type for indices as well as column groups. [#545]
+
+* Preload btree internal pages into file system cache when opening a table.
+
+* Change the default allocation size to 4KB so that DIRECT_IO with 4KB blocks
+ works. [#547]
+
+* Fix some bugs related to tracking the oldest active transaction. [#552]
+
+* Fix a bug in the extension API when using multiple databases.
+
+* Disallow named checkpoints on LSM trees - they aren't supported. [#546]
+
+* Fix support for custom collators with LSM trees. [#544]
+
+* Build fixes for gcc 4.1.2.
+
+See the upgrade documentation for details of API changes that may require
+altering existing applications.
+
+
+WiredTiger release 1.6.0, 2013-05-16
+------------------------------------
+
+This release contains new features, bug fixes and performance improvements.
+The significant changes are highlighted below:
+
+* Fix a bug where configuring direct I/O could cause checksum errors at
+ runtime. NOTE: database file format change. [#526]
+
+* Fix a race that allowed checkpoints to be deleted while hot backups are
+ running. [#515]
+
+* Scale to events per second in graphs generated from statistics log output.
+ [#518]
+
+* Changes to reduce the latency of LSM operations.
+
+* Add a new terminate callback to extension interfaces that is called when the
+ WiredTiger connection is closed. [#530]
+
+* Various optimizations and bug fixes to cache management and eviction code.
+
+* Update various statistics.
+
+* Fix a bug where using a combination of read-committed and snapshot
+ transactions could result in inconsistent values being returned. [#539]
+
+* Fix a bug where using LSM trees with compression enabled could result in an
+ invalid system call. [#535]
+
+* Enhance statistics logging so that it can dump "lsm:" statistics.
+
+See the upgrade documentation for information about database format changes
+in this release.
+
+
+WiredTiger release 1.5.3, 2013-04-26
+------------------------------------
+
+This release contains some major new features along with numerous bug fixes
+and performance improvements. The significant changes are highlighted
+below:
+
+* Enhance the extension data source API to facilitate implementation of new
+ data stores in WiredTiger.
+
+* Add support for the STEC / Memrata KVS data source.
+
+* Add a Berkeley DB data source via the WiredTiger extension API.
+
+* Various enhancements to cache eviction management. Mostly to avoid stalls in
+ application threads.
+
+* Fixes to shared cache pool implementation, so resources are more
+ aggressively reallocated.
+
+* Add new statistics.
+
+* Implement automatic insert throttling in LSM - enabled by default.
+
+* Configuration strings are now case sensitive.
+
+* Enhance LSM merge algorithms to be more efficient as trees grow very large.
+
+See the upgrade documentation for details of API changes that may require
+altering existing applications.
+
+
+WiredTiger release 1.5.2, 2013-03-28
+------------------------------------
+
+This is a bugfix release. The main changes are:
+
+[#493] Fix get_key/value in the Java API for complex cursors.
+
+* Fix a leak in eviction detected by valgrind.
+
+* Stop trying to cache the oldest reader: we only use it for eviction and only update it when required.
+
+* Track cursor creation in the statistics (creating a cursor per operation isn't a good idea).
+
+
+WiredTiger release 1.5.1, 2013-03-25
+------------------------------------
+
+This is a bugfix and performance tuning release. The main changes are:
+
+* Fix several bugs in LSM:
+ - the logic for setting the "no eviction" flag on LSM chunks was reversed,
+ causing unnecessary eviction once the cache became full;
+ - calling session.checkpoint while writing to an LSM tree could confuse
+ the logic around switching to new chunks; and
+ - fix a possible NULL pointer indirection when switching chunks.
+
+* Make WT_ASSERT a no-op when not in DIAGNOSTIC mode.
+
+* Panic if we find a block on the wrong list, that's not something we can
+ recover from.
+
+* If a page is reconciled (causing its on-disk blocks to be freed and
+ potentially recycled), and then a subsequent collapse of a stack of
+ split-merge pages replaces that page with a page that has not yet been
+ reconciled, we can potentially free the same blocks twice. The fix is to
+ clear the page's WT_REF.addr field at the time we free the blocks, so
+ future reconciliations will ignore the original disk blocks.
+
+* Fix a bug in the dump utility that allowed index URIs.
+
+* Tweak merge to build better trees with random insert workloads.
+
+* Don't use a stale value for the oldest reader transaction ID.
+
+* Track the size of the WT_REF array in internal pages (including
+ WT_ADDRs). Also add an estimate of per-allocation overhead.
+
+* Fix a bug where URIs containing absolute paths were not being parsed
+ correctly.
+
+* Add a RMW insert mode to wtbench.
+
+[#427] Improve cleanup after a failed wiredtiger_open call.
+
+[#484] Don't allow true/false values in config strings where integers are
+ expected.
+
+[#486] Move the cache full check for autocommit transactions out of the
+ rollback path (since we don't reset cursors there), to after we
+ close a cursor.
+
+[#488] Fix an assertion failure if we try to do eviction without ever having done an update.
+
+
+WiredTiger release 1.5.0, 2013-03-14
+------------------------------------
+
+This release contains some major new features along with numerous bug fixes
+and performance improvements. The significant changes are highlighted
+below:
+
+* Add a Java API.
+
+* Create a thread to do automatic checkpoints, configured by passing
+ "checkpoint=(wait=X)" to wiredtiger_open.
+
+* Add support for periodically logging statistics to a file and a tool to
+ generate graphs based on those logs. Configured by passing
+ "statistics_log=(wait=X)" to wiredtiger_open.
+
+* Several changes to minimize the impact of checkpoints on other threads.
+
+* When reading from checkpoints, use mmap by default.
+
+* Enhance eviction so that internal pages take up less space.
+
+* Add maximum filesystem buffer cache settings to wiredtiger_open called
+ "os_cache_max" and "os_cache_dirty_max". After doing the specified
+ amount of reads or writes, WiredTiger will call fadvise and/or
+ sync_file_range to drop pages from the filesystem cache. This is an
+ alternative to direct I/O with less impact on performance.
+
+* Make run-time statistics optional, defaulted to "off".
+
+* Change how we detect if shared cache is used. It used to rely on a name,
+ now it will be used if the shared_cache configuration option is included.
+
+* Add the ability to specify a per-connection reserved size for cache
+ pools. Ensure cache pool reconfiguration is honoured quickly.
+
+* Rework hazard pointer coupling during cursor walks to be more efficient.
+
+* Add a cache_eviction_walk statistic to track the pages we walk and a
+ cache_eviction_force statistic to track the count of pages queued for
+ forced eviction.
+
+* Fixes to reduce the number of operations on shared data that were causing
+ bottlenecks in read only workloads.
+
+* Add streaming pack / unpack to the API.
+
+* Add some basic reconciliation stats to the connection stats.
+
+* In LSM, keep trying to switch if there is an error: it may be transient.
+
+* Minor clean up and enhancement for the reconciliation statistics, add a
+ set of compression statistics, both to the data-source statistics.
+
+* Compaction cannot run at the same time as a checkpoint: the problem is
+ that checkpoints review page reconciliation information and checkpoints
+ update page reconciliation information. Lock out checkpoints while
+ compaction is running.
+
+
+WiredTiger release 1.4.2, 2013-01-14
+------------------------------------
+
+[#387] Fast-path "S" and "u" formats in cursor.get_key and cursor.get_value.
+
+[#407] Allow non-conflicting updates to complete concurrently.
+
+[#418] Add code in to prioritize eviction of pages that are larger than a
+certain threshold. This avoids taking a performance hit when a huge page
+needs to be reconciled. Add a new memory_max_page configuration option.
+
+[#419] If a page splits, it potentially creates a merge-split internal page
+and we potentially walk that page during fast-delete. The WT_REF.addr field
+doesn't point to a cell in that case and we'll drop core.
+
+[#424] Add clarification wording for boolean configuration strings.
+
+[#425] Perform checkpoints in the calling thread, don't block eviction: when
+evicting in a file that is being checkpointed, only evict clean pages. Also
+do compaction in the calling thread instead of interrupting the eviction
+thread to do the work.
+
+[#426] Fixes for automake 1.3.x. Allow examples to run in parallel: give
+each a unique home directory.
+
+Make the tree build without HAVE_VERBOSE.
+
+Fix some issues with LSM rename and add a Python test.
+
+Track when cursors refer to memory returned by WiredTiger, copy it if
+required before dropping hazard pointers that might be protecting it.
+
+Verify shouldn't ever modify the file -- don't bother checking for dirty
+pages, just discard everything.
+
+When rolling forward to resolve key prefix compression, don't copy the key,
+we only need a reference to it, should speed up tables with lots of key
+prefix compression.
+
+Requested changes for the WT_COMPRESSOR::compress_raw method API: pass in the
+configured object's page size as a convenience, and if
+WT_COMPRESSOR::pre_size is set, use it to determine the size of the
+destination buffer, rather than using the object's page size as the maximum
+needed.
+
+
+WiredTiger release 1.4.1, 2012-12-12
+------------------------------------
+
+This is a bugfix, cleanup and performance tuning release. The significant
+changes are highlighted below:
+
+[215] Add a __wt_panic function that shuts down all of the WiredTiger APIs.
+ Also add a new error return WT_PANIC which means there has been an error
+ in the WiredTiger engine, and it should be restarted.
+
+[409] Fix a bug populating column groups with complex schema. Also allow empty
+ column lists in projection cursors.
+
+[150] Add description of how to do index-only searches to the documentation.
+
+[392] Move examples/c/ex_test_perf.c to bench/wtperf.
+
+[322] Add support for statistics on schema-level objects, i.e., tables,
+ column groups, indices.
+
+* Enhance statistics, including changing the name of some statistics.
+
+* Fix a bug in the eviction server that could cause it to abort, leaving the
+ system unusable.
+
+
+WiredTiger release 1.4.0, 2012-12-03
+------------------------------------
+
+This release adds several major new features, a number of performance
+improvements and bug fixes. The significant changes are outlined below:
+
+New features and API changes:
+
+[242] Track the percentage of cache that is dirty, trigger eviction to bound
+ it. This can be used to bound how much data checkpoints write.
+
+[324] Add support for WT_COMPRESS::compress_raw, which lets the compression
+ routine select how many rows are included in each disk block.
+
+[381] Add statistics to track read and write amplification (application data
+ size versus I/O size)
+
+* Add a trigger configuration option to WT_SESSION::compact API.
+
+* Make WT_SESSION::create's checksum configuration 3-state: on, off, or
+ uncompressed blocks only.
+
+Bug fixes:
+
+* Fix build issues on Solaris.
+
+* Fix a bug calculating the generation of an LSM merge.
+
+* Fix WiredTiger dump and load for tables.
+
+* Fix a memory leak in checkpoints.
+
+* Improve accuracy of cache memory tracking with overflow items.
+
+
+WiredTiger release 1.3.8, 2012-11-22
+------------------------------------
+
+This release improves the performance of LSM trees, changes how statistics are
+reported and adds a shared cache implementation:
+
+New features and API changes:
+
+[232] Add a "size of checkpoint" statistic.
+
+* Add a shared cache pool implementation. Manages a single cache among
+ multiple databases within a process.
+
+* Merge statistics from file and LSM sources into a "data source" statistic
+ structure. Rename and regroup some shared statistics. Add a helper to
+ the Python API to lookup in a cursor in a simple expression.
+
+* Add support for sub groups of options in configuration strings.
+
+Performance tuning for LSM trees:
+
+* Don't try to merge with a chunk that is much larger than a small chunk.
+
+* After an LSM merge, fault in some pages before the new tree goes live to
+ avoid stalling application threads.
+
+* Don't automatically fail inserts if the write generation check fails:
+ compare keys instead.
+
+* Switch the LSM tree lock to a read/write lock, so cursors can read the
+ state of the tree in parallel.
+
+Bug fixes:
+
+* Fix a bug where we could write past the end of a buffer after it was grown.
+
+
+WiredTiger release 1.3.7, 2012-11-09
+------------------------------------
+
+This release fixes a bug and improves performance with Bloom filters:
+
+* Drop any old Bloom filter before creating a new one -- we may have been
+ interrupted in between creating it and updating the metadata. Write the
+ metadata after creating missing Bloom filters.
+
+* Use a separate thread for creation of Bloom filters for the newest,
+ unmerged LSM chunks.
+
+* Changes to the ex_test_perf example: change the default configuration to
+ 4KB pages and disable prefix compression. Change the "-i" command line
+ option to be a simple count of records to insert. Clean up error
+ handling and add option to populate using multiple threads.
+
+* Clarify the docs for the default buffer_alignment setting.
+
+
+WiredTiger release 1.3.6, 2012-11-06
+------------------------------------
+
+This is a bugfix and performance tuning release. The changes are as follows:
+
+* Rename the WiredTiger installed modules to libwiredtiger_XXX. Don't install
+ the nop and reverse collator modules.
+
+* Replace test/format's bzip configuration string with compression, which can
+ take one of four arguments (none, bzip, ext, snappy), change format to run
+ snappy compression if the library is available.
+
+* Rename the builtin block compressor names from "bzip2_compress" to "bzip2",
+ and from "snappy_compress" to "snappy".
+
+* Support multiple LSM merge threads with the "lsm_merge_threads" config key.
+ Use IDs rather than array index to mark the start chunk in a merge, in case
+ we race with another thread.
+
+* Cache the hash values used for Bloom filter lookups, rather than hashing for
+ each Bloom filter in an LSM tree.
+
+* Only switch trees in an LSM cursor if the primary chunk is on disk.
+
+* Add a per-btree cache priority, currently only used to make it more likely
+ for Bloom filter pages to stay in cache.
+
+* Only evict pages with read generations in the bottom quarter of the range we
+ see. Fix a 32-bit wrapping bug in assigning read generations.
+
+* For update-only LSM cursors, only open a cursor in the primary chunk.
+
+* LSM: Report errors from the checkpoint thread.
+
+* LSM: only save a Bloom URI in the metadata after it is successfully created.
+
+* LSM: Create missing Bloom filters when reading from an LSM tree if
+ "lsm_bloom_newest" is set.
+
+* LSM: Include all of the chosen chunks in a merge. Only pin the current chunk
+ in an LSM cursor if it is writeable.
+
+
+WiredTiger release 1.3.5, 2012-10-26
+------------------------------------
+
+This is a bugfix and performance tuning release. The changes are as follows:
+
+[#370] Document that applications are responsible for figuring out their
+ upgrade path if they might swap out compression engines.
+
+[#371] When a single session was used to reconcile multiple btrees, one of
+ which had dictionaries configured and one of which didn't, we failed to
+ clear the dictionary when starting page reconciliation. Be consistent,
+ never use anything other than the btree handle's configuration to decide
+ if we're using a dictionary in a reconciliation run.
+
+[#372] Fix several potential integer overflow bugs.
+
+[#373] Fix a bug where calls that performed an operation on multiple objects
+ (such as creating a table that implicitly creates a column group)
+ could leave the metadata incomplete if a process exited without
+ calling `WT_CONNECTION::close`.
+ Hold the schema lock while opening tables. Fixes the error "cannot be
+ opened until all column groups are created" message when create calls
+ race with open_cursor.
+
+[#374] Fix a race that caused crashes when using the Python API with
+ multi-threaded code.
+
+[#375] Fix a bug in __wt_cond_wait - so that it returns after timeout expires.
+
+* Protect the list of LSM trees with the schema lock to avoid races during
+ create.
+
+* Update ex_test_perf to output statistics during populate and improve timing
+ accuracy.
+
+* Skew eviction in favor of leaf pages - which improves read-only performance
+ for large LSM trees.
+
+* Hold the LSM tree lock while gathering statistics.
+
+* Fix a bug in bulk load of bitmap files.
+
+* Fix a related bug in the bloom code that uses bitmap stores.
+
+* Don't attempt to drop the first chunk of an LSM tree before creating it.
+
+* Instead of entering a fake key cell after the last cell on the page just
+ in case the page ends with a key cell which has no value, use the end of
+ the page to detect that case.
+
+* Cache cursor key/value formats in Python, to save a native call from every
+ get_key/value.
+
+* Don't sync the directory after open if the global "sync" flag is false.
+
+* Fix a race for LSM trees that could happen if two threads race to open a
+ cursor and drop the LSM tree.
+
+WiredTiger release 1.3.4, 2012-10-19
+------------------------------------
+
+This release includes several important new features, including:
+
+* support for online compaction of files;
+* support for tables, column groups and indices that use LSM trees for
+ storage; and
+* improved statistics and configuration for LSM trees and Bloom filters.
+
+In addition, there are some significant performance improvements and bug
+fixes. The full list of changes is:
+
+[#248] Add support for online compaction.
+
+[#310] Fixed a bug where overflow blocks could be accessed by a
+ long-running reader after they had been freed in a checkpoint.
+
+[#358] Allocate checkpoint blocks from the live system's list of available
+ blocks rather than always extending the file.
+
+[#361] Sync the directory after creating a file: this is apparently
+ required for durability on Linux, according to the Linux fsync man
+ page.
+
+[#362] Don't check if a page is on the avail or discard lists if we're
+ salvaging the file, that is okay.
+
+[#363] Remove obsolete code dealing with forced eviction.
+
+[#366] Fake checkpoints may have the delete flag set, ignore them when
+ rolling checkpoints forward.
+
+[#367] All metadata reads should ignore the application's transactional
+ context.
+
+[#369] Support LSM as a data source for tables, column groups and indices.
+
+* Add tuning options for LSM bloom filters, including controlling whether
+ the oldest level in the tree has a Bloom filter, whether newly-created
+ (level 0) files have Bloom filters, and passing arbitrary file
+ configuration for Bloom filters.
+
+* Add a merge generation to LSM chunks. Add a statistic that reports the
+ highest merge generation in a tree.
+
+* Add a new LSM statistic tracking searches that could benefit from bloom
+ filters.
+
+* Enable LSM statistics in the "wt stat" utility.
+
+* Interrupt LSM merge operations, rather than waiting on close.
+
+* Wait for a while before looking for LSM major merges, in case merges
+ catch up with inserts.
+
+* Fix LSM index searches. The main issue was LSM search_near was not
+ always returning the closest key to the search key, which calling code
+ expects. It now tries hard to find the smallest cursor larger than the
+ search key, and only if no larger record exists does it return the
+ largest record smaller than the search key.
+
+* Reset any old cursor position before an LSM search. This limits hazard
+ references in an LSM search to a single chunk.
+
+* Fix a memory leak in an error path in Bloom filters.
+
+* Tweak the search loops in hazard_{set,clear} in favor of
+ last-in-first-out ordering.
+
+* If there are many files open, some hotter than others, walk more files
+ looking for pages to evict.
+
+* Don't stop evicting until we reach the target, have eviction wake up
+ periodically regardless of whether the application signals it. This
+ latter requires a "timed condition wait" operation.
+
+* Tweaks to file handle flags for out-of-cache read performance on Linux
+ (disable readahead and access time updates).
+
+* Replace the WT_SESSION::dumpfile method with configuration strings to
+ WT_SESSION::verify.
+
+* Fix a bug where we weren't skipping unnecessary default checkpoints
+ because we weren't handling the generational number included in the
+ internal checkpoint name.
+
+* Add a "force" configuration flag to WT_SESSION::checkpoint, object
+ compaction needs it because the work it wants done is done by the block
+ manager.
+
+* Make compact and checkpoint operate on a table's indices.
+
+* When doing a page truncate, lock down the page before we unpack the
+ on-page cell -- it's possible the page could be instantiated, modified
+ and reconciled while we're sleeping, in which case the WT_REF.addr field
+ would no longer point on-page.
+
+
+WiredTiger release 1.3.3, 2012-10-11
+------------------------------------
+
+This is a bugfix and performance tuning release, primarily related to LSM
+trees. The changes are as follows:
+
+[#350] Checkpoint the metadata after successful schema-level operations.
+ Otherwise, if a process exits without closing the connection or
+ running a checkpoint, created objects exist but there is no record
+ in the metadata.
+
+[#351] Don't put checkpoint extent blocks on the available list, blocks on
+ it are considered for truncation; they have to go on the "checkpoint
+ available" list.
+
+* Choose LSM merges based on a measure of efficiency (levels collapsed per
+ record), rather than simply choosing a minor or a major merge. Tweak the
+ merge heuristic so we don't end up with runs of smaller chunks in the
+ middle of the tree.
+
+* Add a connection-wide flag to disable LSM merges.
+
+* Don't create Bloom filters for the oldest chunk in the system. Add the
+ ability to disable Bloom filters entirely.
+
+* Fix fast-path for bit values in WT_CURSOR::set_value.
+
+* Clean up allocation of LSM chunk IDs.
+
+* Update bloom_get so that it doesn't hold a cursor position.
+
+* Respect the page size for fixed-length column stores, remembering there
+ are 8 bits per byte.
+
+* Support bulk loading a bitmap into a fixed-length column store, update
+ Bloom filter code to use this.
+
+* Add an example program, ex_test_perf, to demonstrate basic LSM usage.
+
+* Add a new statistics cursor type "statistics:lsm". Update ex_stat.c to
+ demonstrate usage.
+
+* Add a statistics_fast flag to file statistics cursors. Update LSM
+ statistics so that they aggregate some cache statistics. Add ability to
+ open a statistics cursor on a checkpoint.
+
+* Walk a constant number of pages for LRU eviction.
+
+* Move the cache full check to after an update operation completes, when it
+ is no longer holding hazard references. This improves behavior with
+ small caches.
+
+
+WiredTiger release 1.3.2, 2012-10-03
+------------------------------------
+
+This is a bugfix and performance tuning release, primarily related to LSM
+trees. The changes are as follows:
+
+* Implement minor merges for LSM trees, prefer them to major merges.
+
+* Update hazard references, so the active array grows as needed. Change
+ the default hazard_max to 1000.
+
+* Abort transactions if the cache is so full that they cannot make
+ progress.
+
+* Fix a bug where verify could crash if an empty checkpoint exists.
+
+* Make the maximum number of chunks for merges configurable, rather than
+ deriving a value from the number of hazard references available.
+
+* Switch to an atomic add to allocate transaction IDs. This fixes a subtle
+ race before where two threads could temporarily have the same ID in the
+ global state table. If one of the threads timed out and the other thread
+ committed its transaction with that ID, the commit would not become
+ visible immediately. This could lead to deadlock errors in workloads
+ that are logically conflict-free.
+
+* Have auto-commit transactions retry deadlocks. This requires that we
+ keep the user's key and value in the cursor.
+
+* Simplify the code handling updated records in variable-length
+ column-store reconciliation.
+
+* Never wait for eviction when holding the schema lock. This avoids
+ deadlocks between opening a column store file and taking a checkpoint.
+
+* Take care with the loop termination when walking files for eviction. We
+ were making one extra call into __wt_tree_walk, which would leave a leaf
+ page in the WT_REF_EVICT_WALK state, unable to be evicted. In some
+ workloads, including LSM loads, we could end up with many files all
+ consisting of a single leaf page, none of which could be evicted.
+
+* Pause updates when the cache is full.
+
+* In files marked as "out of cache", don't wait for eviction when reading a
+ page.
+
+* Fix the record count calculation for minor merges. This was leading to
+ no Bloom filter being created for minor merges after running for some
+ time, leading to merges taking increasingly long to complete.
+
+* Only sleep in the LSM checkpoint thread if no work is done.
+
+* Add sanity check of cache size to LSM open.
+
+[#338] Create fake checkpoints until an object is modified, so that a
+ checkpoint between the cursor create and the bulk load doesn't make
+ it impossible to do a bulk-load on the cursor.
+
+
+WiredTiger release 1.3.1, 2012-09-25
+------------------------------------
+
+This is a bugfix release, primarily related to LSM trees. The changes are
+as follows:
+
+[#309] Implement auto-commit of transactions at the API. As well as
+ ensuring the atomicity of complex operations, this change simplified
+ code that simulated auto-commit internally and fixed a number of
+ bugs.
+
+[#321] Bulk-cursors no longer block checkpoints. We can't write files that
+ are being bulk-loaded, so change checkpoint to create checkpoints in
+ the metadata that, if accessed, look like empty files.
+
+ Tighten down the requirements for bulk-load, the only thing that can
+ be bulk-loaded now is a newly created tree, not any empty file.
+
+[#329] Add dictionary support to variable-length column store objects.
+ Support large row-store reconciliation dictionaries: add a skiplist
+ as the indexing mechanism.
+
+[#333] Fix a leak of the in-memory transaction log structure and the LSM
+ data source handle.
+
+[#334] Fix a memory leak where a page's replacement address wasn't being
+ freed.
+
+* Check that LSM trees are not configured as column stores.
+
+* Fix a race when starting the LSM worker thread. It was possible for the
+ thread to exit immediately if it started fast enough.
+
+* Two fixes for LSM, one to ensure that cursors read from a checkpoint if
+ one is available. The other to reduce the number of empty chunks that can
+ be created initially.
+
+* Fix a bug that disabled bloom filters.
+
+* The configure script checks for Python support in SWIG.
+
+* If a drop operation fails to acquire all of the handle locks it needs,
+ make sure it releases the primary handle lock.
+
+* Fix a number of other minor bugs and memory leaks.
+
+
+WiredTiger release 1.3.0, 2012-09-17
+------------------------------------
+
+This release contains a number of major new features, including:
+
+* support for LSM trees with Bloom filters;
+* support for hot backups; and
+* support for fast truncation of files.
+
+In addition, there are some critical bug fixes. We recommend that all users
+upgrade. Here is the full list of changes:
+
+[#143] Implement random record lookups.
+
+[#168] Add support for LSM trees.
+
+[#168] Add support for Bloom filters in LSM trees.
+
+[#198] Handle page-generation wraparound.
+
+[#236] Implement hot backups.
+
+[#244] Index cursors for column-store objects may not be created using the
+ record number as the index key.
+
+[#247] Add a fast-path for WT_SESSION::truncate that avoids reading most
+ data to be deleted.
+
+[#259] Performance hack for cursor open: don't parse the configuration
+ strings for a default value if the application didn't specify a
+ configuration string.
+
+[#262] Disable dump on child cursors: only the top-level cursor is wrapped
+ in a dump cursor.
+
+[#266] Deal with new / dropped indices in __wt_schema_open_index.
+
+[#269] Checkpoint handles must not be open when they are overwritten.
+
+[#271] Add support for a reserved checkpoint name "WiredTigerCheckpoint"
+ that opens the object's last checkpoint.
+
+[#271] Add the ability to access unnamed checkpoints.
+
+[#274] Change cursor.equals to return a standard error value and store the
+ cursor equality result in a separate argument.
+
+[#275] If an exclusive handle is required for an operation and it is not
+ available, fail immediately: don't block.
+
+[#276] Fix methods that return integer parameters from Python. This
+ includes cursor.equals and cursor.search_near.
+
+[#277] Acquire the schema lock when creating the metadata file. We're
+ single-threaded, so it isn't protecting against anything, but the
+ handle management code expects to have the schema lock.
+
+[#279] Some optimizations for __wt_config_gets_defno. Specifically, if
+ we're dealing with a simple stack of config strings, just parse the
+ application string rather than the full list of defaults.
+
+[#279] Split the description string into a set of structures, to reduce the
+ number of string comparisons and manipulation that's required.
+
+[#282] Remove the cursor.reconfigure method, and replace it with
+ documentation showing how to "reconfigure" cursors using the
+ session.open_cursor method to duplicate them with different
+ configuration strings.
+
+[#284] Fix for a hazard reference race, where page eviction races with the
+ creation of the hazard reference, we have to check the pointer
+ itself as well as the state of the pointer.
+
+[#285] We can clear the tree's modified flag on checkpoint, as long as the
+ checkpoint writes all modifications. Clear the tree's modified
+ flag before we start the checkpoint, but reset it as necessary if
+ reconciliation is unable to write all of the changes in a page.
+
+[#287] Fix __wt_config_check to handle overlapping config values correctly.
+
+[#289] Add support for read-committed isolation, make it the default. Add
+ a session-level "isolation" setting.
+
+[#294] If txn_commit fails, document the transaction was rolled-back.
+
+[#295] Expand the documentation on using cursors without explicit
+ transactions.
+
+[#300] Include all changes whenever closing a file, don't check for
+ visibility. If updates are skipped while evicting a page, give up.
+
+[#305] Have "wt dump" fail more gracefully if the object doesn't exist.
+
+[#310] When freeing a tracked address in reconciliation, clear it to avoid
+ freeing the same address again on error.
+
+[#314] Replace cursor.equals with cursor.compare
+
+[#319] Clear the bulk_load_ok flag when closing handles.
+
+
+* Add an "ancient transaction" statistic so we can find out if they're
+ actually occurring in the field.
+
+* Add a "was object ever modified" flag to the btree handle, and use it to
+ avoid writing read-only objects during internal checkpoints, issue
+
+* Add per-connection statistics counters for transaction checkpoint, begin,
+ commit and rollback. Add per-btree statistics counters for update
+ conflicts.
+
+* Another fixed-length column-store implicit record fix: if the earliest
+ row in the object is row 10, and it's on an append list, we still must
+ return rows 1-9, they've been implicitly created.
+
+* Bulk cursors: disallow cursor.{equals,next,prev,reset,search,
+ search_near,update,remove}; only close and insert are supported.
+
+* Change session.truncate to support any cursor position for range
+ truncation, not just keys that are known to exist.
+
+* Checkpoint has to flush the metadata file, but only after it's flushed
+ all of the other files.
+
+* Discard obsolete WT_UPDATE structures during updates.
+
+* Document that duplicated cursors are positioned at the same point as the
+ cursor that was duplicated.
+
+* Fix a (very unlikely) deadlock at startup, if an application issues a
+ checkpoint before the eviction server has managed to open its session.
+
+* Fix a core dump if we verify a file that's corrupted such that we are
+ unable to load any checkpoints at all, and the per-checkpoint bit map is
+ never set.
+
+* If a page selected for eviction cannot be freed because it has some
+ recent updates, try instead to free memory by trimming old updates.
+
+* If a thread fails to evict a page, try to bump its snapshot. This avoids
+ the common case of read-committed threads getting stuck because one
+ thread falls behind (e.g., because we can't evict during a checkpoint).
+
+* If an exclusive table create fails, return EEXIST.
+
+* If we try to remove a file that doesn't exist, don't complain, return
+ success.
+
+* If we're repeatedly taking a checkpoint with the same name, skip the work
+ for read-only objects.
+
+* Instead of flagging the empty tree's leaf page empty as part of creating
+ an empty tree in memory, set the page as modified (to force
+ reconciliation); if the leaf page is still empty at that time, then we'll
+ figure it out during that reconciliation. This fixes a memory leak where
+ the leaf page of an empty tree wasn't being freed.
+
+* It's not unreasonable to open a cursor on a non-existent table, don't
+ complain, just return not-found.
+
+* Move dist/RELEASE to the top level of the tree.
+
+* Optimization: don't repeatedly look up btree handles for schema
+ operations.
+
+* Return keys from all operations: don't keep pointing to the application's
+ key.
+
+* Update btree usage of 64 bitstring implementation, so it's cleaner.
+
+* Update the bitstring implementation to use 64 bit length strings.
+
+* Updates performed without an active transaction should become visible
+ with the current transaction ID.
+
+* Upgrade to doxygen 1.8.x
+
+* Use a real snapshot transaction for checkpoints. Otherwise, the snapshot
+ can be updated in between checkpointing multiple files (when updating the
+ metadata).
+
+
+WiredTiger release 1.2.2, 2012-06-20
+------------------------------------
+
+This is a bugfix release. The changes are as follows:
+
+* Defer making free pages available until the end of a checkpoint, in case
+ there is a failure after processing some files.
+
+* When checking the value of the "isolation" key, don't assume it is NUL
+ terminated. This bug could cause transactions to run with incorrect
+ isolation.
+
+* Fix two bugs with snapshot isolation:
+
+ 1. reset the isolation level when the transaction completes;
+ 2. when checking visibility, check item's ID against the maximum snapshot ID
+ (not the transaction's ID).
+
+
+WiredTiger release 1.2.1, 2012-06-15
+------------------------------------
+
+This is a bugfix release. The changes are as follows:
+
+* Avoid a deadlock between eviction and checkpoint on the connection spinlock.
+
+* Allocate "desc" buffers in heap memory so that they are correctly aligned
+ (fixes direct_io support on Linux).
+
+* Initialize the snapshot-avail list after cleaning it out, else we'll try and
+ print a NULL pointer in VERBOSE mode.
+
+
+WiredTiger release 1.2.0, 2012-06-04
+------------------------------------
+
+This release contains many bugfixes and improvements. The major changes are:
+
+[#138] Add support for transactions with coarse-grained durability.
+ Transactions provide atomicity guarantees and rollback, and uncommitted
+ changes are never written to disk. There is no on-disk log, so
+ committed changes only become durable when the next checkpoint
+ completes. Checkpoints are implemented by creating
+ transactionally-consistent snapshots within data files.
+
+[#156] Fully support operations that make schema changes with multiple
+ sessions open concurrently.
+
+[#159] Disable internal page key suffix compression if a custom collator is
+ configured. This avoids issues with collators that require complete
+ keys.
+
+[#167] Add support for durable snapshots within files. While a snapshot is
+ active, the pages used by the snapshot will not be overwritten. If a
+ file is accessed after a crash or application exit without calling
+ WT_CONNECTION::close, any changes made after the last snapshot will be
+ silently ignored.
+
+[#214, #216]
+ Fixes for forcing eviction with small caches.
+
+WiredTiger release 1.1.5, 2012-04-26
+------------------------------------
+
+Don't update a WT_REF after it has been unlocked.
+
+Add an operation to set a flag atomically, use it to avoid racing on page flags.
+
+Fix a race between sync and reading that could cause a segfault.
+
+
+WiredTiger release 1.1.4, 2012-04-16
+------------------------------------
+
+Check the versions of autoconf, automake and libtool to avoid failures when
+trying to build from the github tree with versions that are too old.
+
+[#191] Create the schema table as part of creating the environment so that
+ application threads don't race trying to create it later.
+
+[#193] Split-merge pages have to be reconciled to mark their parents dirty
+
+[#194] The dump utility should only output configuration that can be passed to
+ WT_SESSION::create.
+
+Eviction fixes for out-of-cache update workloads:
+
+* Fix an unlikely bug where the EVICT_LRU flag was cleared when a page in
+ the LRU queue was overwritten with itself during a walk. This led to an
+ assertion failure when the page was later evicted.
+
+* Clear all unused eviction queue entries while holding the lru_lock.
+
+* Split WT_PAGE->flags so that there is no possibility of racing:
+ (1) Move WT_PAGE_REC_* flags into WT_PAGE_MODIFY;
+ (2) Use atomic operations to set and clear the remaining (2) page flags.
+
+Move the test/format threads setting into the CONFIG file.
+
+
+WiredTiger release 1.1.3, 2012-04-04
+------------------------------------
+
+Fix the "exclusive" config for WT_SESSION::create. [#181]
+1. Make it work for files within a single session.
+2. Make it work for files across sessions.
+3. Make other data sources consistent with files.
+
+Fix an eviction bug introduced into 1.1.2: when evicting a page with children,
+remove the children from the LRU eviction queue. Reduce the impact of clearing
+a page from the LRU queue by marking pages on the queue with a flag
+(WT_PAGE_EVICT_LRU).
+
+During an eviction walk, pin pages up to the root so there is no need to spin
+when attempting to lock a parent page. Use the EVICT_LRU page flag to avoid
+putting a page on the LRU queue multiple times.
+
+Layer dump cursors on top of any cursor type.
+
+Add a section on replacing the default system memory allocator to the tuning
+page.
+
+Typo in usage method for "wt write".
+
+Don't report range errors for config values that aren't well-formed integers.
+
+
+WiredTiger release 1.1.2, 2012-03-20
+------------------------------------
+
+Add public-domain copyright notices to the extension code.
+
+test/format can now run multi-threaded, fixed two bugs it found:
+(1) When iterating backwards through a skiplist, we could race with an insert.
+(2) If eviction fails for a page, we have to assume that eviction has unlocked
+ the reference.
+
+Scan row-store leaf pages twice when reading to reduce the overhead of the
+index array.
+
+Eviction race fixes:
+(1) Call __rec_review with WT_REFs: don't look at the page until we've checked
+ the state.
+(2) Clear the eviction point if we hit it when discarding a child page, not
+ just the parent.
+
+Eviction tuning changes, particularly for read-only, out-of-cache workloads.
+
+Only notify the eviction server if an application thread doesn't find any pages
+to evict, and then only once.
+
+Only spin on the LRU lock if there might be pages in the LRU queue to evict.
+
+Keep the current eviction point in memory and make the eviction walk run
+concurrent with LRU eviction.
+
+Every test now has err/out captured, and it is checked to assure it is empty at
+the end of every test.
+
+
+WiredTiger release 1.1.1, 2012-03-12
+------------------------------------
+
+Default to a verbose build (that can be switched off by running `configure
+--enable-silent-rules`).
+
+Account for all memory allocated when reading a page into cache. Total memory
+usage is now much closer to the cache size when using many small keys and
+values.
+
+Have application threads trigger a retry forced page eviction rather than
+blocking eviction. This allows rec_evict.c to simply set the WT_REF state to
+WT_REF_MEM after all failures, and fixes a bug where pages on the forced
+eviction queue would end up with state WT_REF_MEM, meaning they could be chosen
+for eviction multiple times.
+
+Grow existing scratch buffers in preference to allocating new ones.
+
+Fix a race between threads reading in and then modifying a page.
+
+Get rid of the pinned flag: it is no longer used.
+
+Fix a race where btree files weren't completely closed before they could be
+re-opened. This behavior can be triggered by using a new session on every
+operation (see the new -S flag to the test/thread program). [#178]
+
+When connections are closed, create a session and discard the btree handles.
+This fixes a long-standing bug in closing a connection: if for any reason there
+are btree handles still open, we need a real session handle to close them.
+
+Really close btree handles: otherwise we can't safely remove or rename them.
+Fixes test failures in test_base02 (among others).
+
+Wait for application threads in LRU eviction to drain before walking a file.
+
+Fix a buffer size calculation when updating the root address of a file.
+
+Documentation fix: 10% of 1MB is 100KB.
+
+
+WiredTiger release 1.1.0, 2012-02-28
+------------------------------------
+
+Add checks to the session.truncate method to ensure the start/stop
+cursors reference the same object and have been initialized.
+
+Implement cursor duplication via WT_SESSION::open_cursor. [#161]
+
+Switch to quiet builds by default.
+
+Fix with automake version < 1.11, use foreign mode so that fewer
+top-level files are required.
+
+If a session or connection method is about to return WT_NOTFOUND (some
+underlying object was not found), map it to ENOENT, only cursor methods
+return WT_NOTFOUND. [#163]
+
+Save and restore session->btree in schema ops to simplify calling code.
+[#164]
+
+Note the wiredtiger_open config string "multiprocess" is not yet
+supported.
+
+Move "root:F" and "version:F" entries for files into the value for
+"file:F", so there is only a single record per file.
+[NOTE: SCHEMA CHANGE]
+
+When parsing config strings, continue to the end of the string in case
+of repeated keys. [#124]
+
+Don't require shared libraries unless Python is configured.
+
+Add support for direct I/O, with the config "direct_io=(data,log)".
+Build with _GNU_SOURCE on Linux to enable O_DIRECT.
+
+Don't keep the last page of column stores pinned: it prevented eviction
+of large trees created from scratch.
+
+Allow application threads to evict pages from any tree: maintain a count
+of threads doing LRU in each tree and wait for activity to drain when
+closing.
diff --git a/src/third_party/wiredtiger/README b/src/third_party/wiredtiger/README
new file mode 100644
index 00000000000..a064230f8ba
--- /dev/null
+++ b/src/third_party/wiredtiger/README
@@ -0,0 +1,21 @@
+WiredTiger 2.4.1: (October 16, 2014)
+
+This is version 2.4.1 of WiredTiger.
+
+WiredTiger release packages and documentation can be found at:
+
+ http://source.wiredtiger.com/
+
+Information on configuring, building and installing WiredTiger can be
+found at:
+
+ http://source.wiredtiger.com/2.4.1/install.html
+
+WiredTiger licensing information can be found at:
+
+ http://source.wiredtiger.com/license.html
+
+For general questions and discussion, please use the WiredTiger mailing
+list:
+
+ http://groups.google.com/group/wiredtiger-users
diff --git a/src/third_party/wiredtiger/RELEASE_INFO b/src/third_party/wiredtiger/RELEASE_INFO
new file mode 100644
index 00000000000..d0536282f61
--- /dev/null
+++ b/src/third_party/wiredtiger/RELEASE_INFO
@@ -0,0 +1,10 @@
+WIREDTIGER_VERSION_MAJOR=2
+WIREDTIGER_VERSION_MINOR=4
+WIREDTIGER_VERSION_PATCH=1
+WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH"
+
+WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"`
+
+WIREDTIGER_VERSION_STRING="WiredTiger $WIREDTIGER_VERSION: ($WIREDTIGER_RELEASE_DATE)"
+
+WIREDTIGER_VERSION_UNIQUE_NAME=`printf "_%d%03d" $WIREDTIGER_VERSION_MAJOR $WIREDTIGER_VERSION_MINOR`
diff --git a/src/third_party/wiredtiger/SConscript b/src/third_party/wiredtiger/SConscript
new file mode 100644
index 00000000000..5d19eaf5046
--- /dev/null
+++ b/src/third_party/wiredtiger/SConscript
@@ -0,0 +1,105 @@
+# -*- mode: python; -*-
+import re
+import textwrap
+
+Import("env windows darwin solaris linux freebsd")
+
+env = env.Clone()
+env.InjectThirdPartyIncludePaths(libraries=['snappy'])
+
+env.Append(CPPPATH=[
+ "src/include",
+ ])
+
+if windows:
+ env.Append(CPPPATH=["build_win"])
+elif darwin:
+ env.Append(CPPPATH=["build_darwin"])
+elif solaris:
+ env.Append(CPPPATH=["build_solaris"])
+elif freebsd:
+ env.Append(CPPPATH=["build_freebsd"])
+elif linux:
+ env.Append(CPPPATH=["build_linux"])
+ env.Append(CPPDEFINES=["_GNU_SOURCE"])
+else:
+ print("Wiredtiger is not supported on this platform. " +
+ "Please generate an approriate wiredtiger_config.h")
+ Exit(1)
+
+useZlib = False
+useSnappy = True
+
+version_file = 'build_posix/aclocal/version-set.m4'
+
+VERSION_MAJOR = None
+VERSION_MINOR = None
+VERSION_PATCH = None
+VERSION_STRING = None
+
+# Read the version information from the version-set.m4 file
+for l in open(File(version_file).srcnode().abspath):
+ if re.match(r'^VERSION_[A-Z]+', l):
+ exec(l)
+
+if (VERSION_MAJOR == None or
+ VERSION_MINOR == None or
+ VERSION_PATCH == None or
+ VERSION_STRING == None):
+ print "Failed to find version variables in " + version_file
+ Exit(1)
+
+# System headers substituted for the @wiredtiger_includes_decl@ placeholder;
+# <inttypes.h> is only included when _WIN32 is not defined.
+wiredtiger_includes = """
+ #include <sys/types.h>
+ #ifndef _WIN32
+ #include <inttypes.h>
+ #endif
+ #include <stdarg.h>
+ #include <stdint.h>
+ #include <stdio.h>
+ """
+wiredtiger_includes = textwrap.dedent(wiredtiger_includes)
+# Placeholder -> value map applied to src/include/wiredtiger.in.
+replacements = {
+ '@VERSION_MAJOR@' : VERSION_MAJOR,
+ '@VERSION_MINOR@' : VERSION_MINOR,
+ '@VERSION_PATCH@' : VERSION_PATCH,
+ '@VERSION_STRING@' : VERSION_STRING,
+ '@uintmax_t_decl@': "",
+ '@uintptr_t_decl@': "",
+ '@off_t_decl@' : 'typedef int64_t wt_off_t;' if windows else "typedef off_t wt_off_t;",
+ '@wiredtiger_includes_decl@': wiredtiger_includes
+}
+
+# Generate wiredtiger.h from the template by substituting the placeholders.
+env.Substfile(
+ target='wiredtiger.h',
+ source=[
+ 'src/include/wiredtiger.in',
+ ],
+ SUBST_DICT=replacements)
+
+#
+# WiredTiger library
+#
+filelistfile = "dist/filelist.win" if windows else 'dist/filelist'
+
+wtsources = []
+
+with open(File(filelistfile).srcnode().abspath) as filelist:
+ wtsources = [line.strip()
+ for line in filelist
+ if not line.startswith("#") and len(line.strip()) >= 1]
+
+if useZlib:
+ env.Append(CPPDEFINES=['HAVE_BUILTIN_EXTENSION_ZLIB'])
+ wtsources.append("ext/compressors/zlib/zlib_compress.c")
+
+if useSnappy:
+ env.Append(CPPDEFINES=['HAVE_BUILTIN_EXTENSION_SNAPPY'])
+ wtsources.append("ext/compressors/snappy/snappy_compress.c")
+
+wtlib = env.Library(
+ target="wiredtiger",
+ source=wtsources)
+
+env.Depends(wtlib, [filelistfile, version_file])
+
diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct
new file mode 100644
index 00000000000..c8b94d31469
--- /dev/null
+++ b/src/third_party/wiredtiger/SConstruct
@@ -0,0 +1,282 @@
+# -*- mode: python; -*-
+import re
+import os
+import distutils.sysconfig
+
+EnsureSConsVersion( 2, 0, 0 )
+
+if not os.sys.platform == "win32":
+ print ("SConstruct is only supported for Windows, use build_posix for other platforms")
+ Exit(1)
+
+AddOption("--with-berkeley-db", dest="bdb", type="string", nargs=1, action="store",
+ help="Berkeley DB install path, ie, /usr/local")
+
+AddOption("--enable-zlib", dest="zlib", type="string", nargs=1, action="store",
+ help="Use zlib compression")
+
+AddOption("--enable-snappy", dest="snappy", type="string", nargs=1, action="store",
+ help="Use snappy compression")
+
+AddOption("--enable-swig", dest="swig", type="string", nargs=1, action="store",
+ help="Build python extension, specify location of swig.exe binary")
+
+AddOption("--dynamic-crt", dest="dynamic-crt", action="store_true", default=False,
+ help="Link with the MSVCRT DLL version")
+
+# Base construction environment for the Windows (MSVC) build.
+env = Environment(
+ CPPPATH = ["#/src/include/",
+ "#/build_win",
+ "#/test/windows",
+ "#/.",
+ distutils.sysconfig.get_python_inc()
+ ],
+ #CPPDEFINES = ["HAVE_DIAGNOSTIC", "HAVE_VERBOSE"],
+ CFLAGS = [
+ "/Z7", # Generate debugging symbols
+ "/wd4090", # Ignore warning about mismatched const qualifiers
+ "/wd4996", # Ignore warning C4996 (use of deprecated functions)
+ "/W3", # Warning level 3
+ "/we4013", # Error on undefined functions
+ "/TC", # Compile as C code
+ #"/Od", # Disable optimization
+ "/Ob1", # inline expansion
+ "/O2", # optimize for speed
+ "/GF", # enable string pooling
+ "/EHsc", # extern "C" does not throw
+ #"/RTC1", # enable stack checks
+ "/GS", # enable security checks
+ "/Gy", # separate functions for linker
+ "/Zc:wchar_t", # treat wchar_t as a native type
+ "/Gd", # default to the __cdecl calling convention
+ # C runtime: DLL version (/MD) with --dynamic-crt, otherwise static (/MT)
+ "/MD" if GetOption("dynamic-crt") else "/MT",
+ ],
+ LINKFLAGS = [
+ "/DEBUG", # Generate debug symbols
+ "/INCREMENTAL:NO", # Disable incremental linking
+ "/OPT:REF", # Remove dead code
+ "/DYNAMICBASE", # Allow address space layout randomization
+ "/NXCOMPAT", # Mark the image compatible with Data Execution Prevention
+ ],
+ LIBPATH=[ distutils.sysconfig.PREFIX + r"\libs"],
+ tools=["default", "swig"],
+ SWIGFLAGS=['-python',
+ "-threads",
+ "-O",
+ "-nodefaultctor",
+ "-nodefaultdtor"
+ ],
+ # Location of the swig binary as given by --enable-swig (None when unset)
+ SWIG=GetOption("swig")
+)
+
+useZlib = GetOption("zlib")
+useSnappy = GetOption("snappy")
+useBdb = GetOption("bdb")
+wtlibs = []
+
+conf = Configure(env)
+if not conf.CheckCHeader('stdlib.h'):
+ print 'stdlib.h must be installed!'
+ Exit(1)
+
+if useZlib:
+ conf.env.Append(CPPPATH=[useZlib + "/include"])
+ conf.env.Append(LIBPATH=[useZlib + "/lib"])
+ if conf.CheckCHeader('zlib.h'):
+ conf.env.Append(CPPDEFINES=["HAVE_BUILTIN_EXTENSION_ZLIB"])
+ wtlibs.append("zlib")
+ else:
+ print 'zlib.h must be installed!'
+ Exit(1)
+
+if useSnappy:
+ conf.env.Append(CPPPATH=[useSnappy + "/include"])
+ conf.env.Append(LIBPATH=[useSnappy + "/lib"])
+ if conf.CheckCHeader('snappy-c.h'):
+ conf.env.Append(CPPDEFINES=['HAVE_BUILTIN_EXTENSION_SNAPPY'])
+ wtlibs.append("snappy")
+ else:
+ print 'snappy-c.h must be installed!'
+ Exit(1)
+
+if useBdb:
+ conf.env.Append(CPPPATH=[useBdb+ "/include"])
+ conf.env.Append(LIBPATH=[useBdb+ "/lib"])
+ if not conf.CheckCHeader('db.h'):
+ print 'db.h must be installed!'
+ Exit(1)
+
+env = conf.Finish()
+
+def GenerateWiredTigerH(target, source, env):
+ # Read the version information from the RELEASE_INFO file
+ for l in open('build_posix/aclocal/version-set.m4'):
+ if re.match(r'^VERSION_', l):
+ exec(l)
+
+ print VERSION_STRING
+
+ replacements = {
+ '@VERSION_MAJOR@' : VERSION_MAJOR,
+ '@VERSION_MINOR@' : VERSION_MINOR,
+ '@VERSION_PATCH@' : VERSION_PATCH,
+ '@VERSION_STRING@' : VERSION_STRING,
+ '@uintmax_t_decl@': "",
+ '@uintptr_t_decl@': "",
+ '@off_t_decl@' : 'typedef int64_t wt_off_t;',
+ '@wiredtiger_includes_decl@':
+ """#include <sys/types.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>"""
+ }
+
+ wt = open("src/include/wiredtiger.in")
+ out = open("wiredtiger.h", "w")
+ for l in wt:
+ lr = l
+ for r in replacements.items():
+ lr = lr.replace(r[0], str(r[1]))
+ out.write(lr)
+
+ wt.close()
+ out.close()
+
+
+#
+# WiredTiger library
+#
+filelist = open(r"dist\filelist.win")
+wtsources = [line.strip()
+ for line in filelist
+ if not line.startswith("#") and len(line) > 1]
+filelist.close()
+
+if useZlib:
+ wtsources.append("ext/compressors/zlib/zlib_compress.c")
+
+if useSnappy:
+ wtsources.append("ext/compressors/snappy/snappy_compress.c")
+
+env.Command('wiredtiger.h', 'src/include/wiredtiger.in', GenerateWiredTigerH)
+
+wtlib = env.Library("wiredtiger", wtsources)
+
+# The "wt" command-line utility: linked against the WiredTiger library plus
+# any optional compression libraries collected in wtlibs.
+env.Program("wt", [
+ "src/utilities/util_backup.c",
+ "src/utilities/util_cpyright.c",
+ "src/utilities/util_compact.c",
+ "src/utilities/util_create.c",
+ "src/utilities/util_drop.c",
+ "src/utilities/util_dump.c",
+ "src/utilities/util_list.c",
+ "src/utilities/util_load.c",
+ "src/utilities/util_load_json.c",
+ "src/utilities/util_loadtext.c",
+ "src/utilities/util_main.c",
+ "src/utilities/util_misc.c",
+ "src/utilities/util_printlog.c",
+ "src/utilities/util_read.c",
+ "src/utilities/util_rename.c",
+ "src/utilities/util_salvage.c",
+ "src/utilities/util_stat.c",
+ "src/utilities/util_upgrade.c",
+ "src/utilities/util_verbose.c",
+ "src/utilities/util_verify.c",
+ "src/utilities/util_write.c"],
+ LIBS=[wtlib] + wtlibs)
+
+if GetOption("swig"):
+ env.SharedLibrary('_wiredtiger',
+ [ 'lang\python\wiredtiger.i'],
+ SHLIBSUFFIX=".pyd",
+ LIBS=[wtlib])
+
+# Shim library of functions to emulate POSIX on Windows
+shim = env.Library("window_shim",
+ ["test/windows/windows_shim.c"])
+
+# Test programs.  The commented-out entries below are not built by this
+# script.
+env.Program("t_bloom",
+ "test/bloom/test_bloom.c",
+ LIBS=[wtlib])
+
+#env.Program("t_checkpoint",
+ #["test/checkpoint/checkpointer.c",
+ #"test/checkpoint/test_checkpoint.c",
+ #"test/checkpoint/workers.c"],
+ #LIBS=[wtlib])
+
+env.Program("t_huge",
+ "test/huge/huge.c",
+ LIBS=[wtlib])
+
+#env.Program("t_fops",
+ #["test/fops/file.c",
+ #"test/fops/fops.c",
+ #"test/fops/t.c"],
+ #LIBS=[wtlib])
+
+# test/format is only built when --with-berkeley-db was given.
+if useBdb:
+ benv = env.Clone()
+
+ # Define BERKELEY_DB_PATH as a quoted string; backslashes in the path are
+ # doubled before it is wrapped in escaped double quotes.
+ benv.Append(CPPDEFINES=['BERKELEY_DB_PATH=\\"' + useBdb.replace("\\", "\\\\") + '\\"'])
+
+ # NOTE(review): "libdb61" is hard-coded; presumably the Berkeley DB 6.1
+ # library name -- confirm before using another Berkeley DB version.
+ benv.Program("t_format",
+ ["test/format/backup.c",
+ "test/format/bdb.c",
+ "test/format/bulk.c",
+ "test/format/compact.c",
+ "test/format/config.c",
+ "test/format/ops.c",
+ "test/format/salvage.c",
+ "test/format/t.c",
+ "test/format/util.c",
+ "test/format/wts.c"],
+ LIBS=[wtlib, shim, "libdb61"])
+
+#env.Program("t_thread",
+ #["test/thread/file.c",
+ #"test/thread/rw.c",
+ #"test/thread/stats.c",
+ #"test/thread/t.c"],
+ #LIBS=[wtlib])
+
+#env.Program("t_salvage",
+ #["test/salvage/salvage.c"],
+ #LIBS=[wtlib])
+
+# The wtperf benchmark links the POSIX shim in addition to the library.
+env.Program("wtperf", [
+ "bench/wtperf/config.c",
+ "bench/wtperf/misc.c",
+ "bench/wtperf/track.c",
+ "bench/wtperf/wtperf.c",
+ ],
+ LIBS=[wtlib, shim] )
+
+examples = [
+ "ex_access",
+ "ex_all",
+ "ex_async",
+ "ex_call_center",
+ "ex_config",
+ "ex_config_parse",
+ "ex_cursor",
+ "ex_data_source",
+ "ex_extending",
+ "ex_file",
+ "ex_hello",
+ "ex_log",
+ "ex_pack",
+ "ex_process",
+ "ex_schema",
+ "ex_scope",
+ "ex_stat",
+ "ex_thread",
+ ]
+
+for ex in examples:
+ if(ex in ['ex_async', 'ex_thread']):
+ env.Program(ex, "examples/c/" + ex + ".c", LIBS=[wtlib, shim])
+ else:
+ env.Program(ex, "examples/c/" + ex + ".c", LIBS=[wtlib])
+
diff --git a/src/third_party/wiredtiger/api/leveldb/Makefile.am b/src/third_party/wiredtiger/api/leveldb/Makefile.am
new file mode 100644
index 00000000000..2cfd9d945a5
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/Makefile.am
@@ -0,0 +1,81 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(srcdir)/leveldb -I$(srcdir)/leveldb/include
+
+lib_LTLIBRARIES = libwiredtiger_leveldb.la
+
+noinst_PROGRAMS = leveldb_test
+
+# Setup the LevelDB headers to be installed in a wiredtiger/leveldb
+# subdirectory, so we don't interfere with other LevelDB installs.
+if HAVE_HYPERLEVELDB
+leveldbincludedir = $(includedir)/wiredtiger/hyperleveldb
+else
+if HAVE_ROCKSDB
+leveldbincludedir = $(includedir)/wiredtiger/rocksdb
+else
+leveldbincludedir = $(includedir)/wiredtiger/leveldb
+endif
+endif
+leveldbinclude_HEADERS = \
+ leveldb_wt_config.h \
+ leveldb/include/leveldb/cache.h \
+ leveldb/include/leveldb/comparator.h\
+ leveldb/include/leveldb/db.h \
+ leveldb/include/leveldb/env.h \
+ leveldb/include/leveldb/filter_policy.h \
+ leveldb/include/leveldb/iterator.h \
+ leveldb/include/leveldb/options.h \
+ leveldb/include/leveldb/slice.h \
+ leveldb/include/leveldb/status.h \
+ leveldb/include/leveldb/write_batch.h
+
+if HAVE_BASHOLEVELDB
+AM_CPPFLAGS += -I$(srcdir)/leveldb/include/leveldb -I$(srcdir)/basho
+leveldbinclude_HEADERS += \
+ basho/perf_count.h
+endif
+if HAVE_HYPERLEVELDB
+AM_CPPFLAGS += -I$(srcdir)/leveldb/include/leveldb -I$(srcdir)/hyperleveldb
+leveldbinclude_HEADERS += \
+ hyperleveldb/replay_iterator.h
+endif
+
+libwiredtiger_leveldb_la_LDFLAGS = -release @VERSION@
+libwiredtiger_leveldb_la_LIBADD = $(top_builddir)/libwiredtiger_static.la
+libwiredtiger_leveldb_la_SOURCES = \
+ leveldb_wt.cc \
+ leveldb/util/coding.cc leveldb/util/comparator.cc leveldb/util/env.cc leveldb/util/env_posix.cc \
+ leveldb/util/logging.cc leveldb/util/options.cc leveldb/util/status.cc
+
+if HAVE_BASHOLEVELDB
+libwiredtiger_leveldb_la_SOURCES += basho/perf_count.cc
+endif
+if HAVE_HYPERLEVELDB
+libwiredtiger_leveldb_la_SOURCES += hyper_wt.cc
+endif
+if HAVE_ROCKSDB
+libwiredtiger_leveldb_la_SOURCES += rocks_wt.cc rocksdb/write_batch.cc
+else
+libwiredtiger_leveldb_la_SOURCES += leveldb/db/write_batch.cc
+endif
+
+# Build exactly one additional library: librocksdb.la when HAVE_ROCKSDB is
+# set, libleveldb.la otherwise.
+if HAVE_ROCKSDB
+pkglib_LTLIBRARIES = librocksdb.la
+else
+pkglib_LTLIBRARIES = libleveldb.la
+endif
+
+# The variables for both library names are defined unconditionally; both
+# reuse the libwiredtiger_leveldb source list.
+libleveldb_la_LDFLAGS = -release @VERSION@
+libleveldb_la_LIBADD = $(top_builddir)/libwiredtiger_static.la
+libleveldb_la_SOURCES = $(libwiredtiger_leveldb_la_SOURCES)
+
+librocksdb_la_LDFLAGS = -release @VERSION@
+librocksdb_la_LIBADD = $(top_builddir)/libwiredtiger_static.la
+librocksdb_la_SOURCES = $(libwiredtiger_leveldb_la_SOURCES)
+
+leveldb_test_SOURCES = leveldb_test.cc
+leveldb_test_LDADD = libwiredtiger_leveldb.la
+
+TESTS = $(noinst_PROGRAMS)
+
+clean-local:
+ rm -rf WTLDB_HOME
diff --git a/src/third_party/wiredtiger/api/leveldb/basho/perf_count.cc b/src/third_party/wiredtiger/api/leveldb/basho/perf_count.cc
new file mode 100644
index 00000000000..0e666ac1dc0
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/basho/perf_count.cc
@@ -0,0 +1,657 @@
+// -------------------------------------------------------------------
+//
+// perf_count.cc: performance counters LevelDB
+//
+// Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain
+// a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#include <limits.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <syslog.h>
+#include <memory.h>
+#include <errno.h>
+
+#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
+#include "perf_count.h"
+#endif
+
+#include "util/coding.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+#ifdef OS_SOLARIS
+# include <atomic.h>
+#endif
+
+
+namespace leveldb
+{
+
+// always have something active in gPerfCounters, eliminates
+// need to test for "is shared object attached yet"
+static PerformanceCounters LocalStartupCounters;
+PerformanceCounters * gPerfCounters(&LocalStartupCounters);
+
+
+    // Construct a writable counter set: current version, full counter
+    //  count, every counter zero except the two "smallest" slots.
+    SstCounters::SstCounters()
+        : m_IsReadOnly(false),
+          m_Version(eSstCountVersion),
+          m_CounterSize(eSstCountEnumSize)
+    {
+        size_t slot;
+
+        // zero every counter slot
+        for (slot=0; slot<sizeof(m_Counter)/sizeof(m_Counter[0]); ++slot)
+            m_Counter[slot]=0;
+
+        // "smallest" slots start at the maximum value
+        m_Counter[eSstCountValueSmallest]=ULLONG_MAX;
+        m_Counter[eSstCountKeySmallest]=ULLONG_MAX;
+
+    }   // SstCounters::SstCounters
+
+
+    // Append the serialized counters to Dst: version and counter count as
+    //  varint32, followed by eSstCountEnumSize counters as varint64.
+    void
+    SstCounters::EncodeTo(
+        std::string & Dst) const
+    {
+        unsigned idx;
+
+        // header
+        PutVarint32(&Dst, m_Version);
+        PutVarint32(&Dst, m_CounterSize);
+
+        // counter values
+        idx=0;
+        while (idx<eSstCountEnumSize)
+        {
+            PutVarint64(&Dst, m_Counter[idx]);
+            ++idx;
+        }   // while
+    }   // SstCounters::EncodeTo
+
+
+    // Load counters from the serialized form produced by EncodeTo and mark
+    //  this object read only.
+    //   src: encoded block (version, counter count, then the counters)
+    //   Returns an OK Status on success, Status::Corruption if the version
+    //   is newer than eSstCountVersion or the data ends early.
+    Status
+    SstCounters::DecodeFrom(
+        const Slice& src)
+    {
+        Slice cursor;
+        bool good;
+        unsigned loop;
+
+        cursor=src;
+        m_IsReadOnly=true;
+        good=GetVarint32(&cursor, &m_Version);
+        good=good && (m_Version<=eSstCountVersion);
+
+        // allow lesser number of stats to be read; never read more than
+        //  this build has room for
+        good=good && GetVarint32(&cursor, &m_CounterSize);
+        if (good && eSstCountEnumSize < m_CounterSize)
+            m_CounterSize=eSstCountEnumSize;
+
+        // read only the counters the block actually holds: bounding the
+        //  loop by eSstCountEnumSize made every shorter block fail
+        for (loop=0; good && loop<m_CounterSize; ++loop)
+        {
+            good=GetVarint64(&cursor, &m_Counter[loop]);
+        }   // for
+
+        // report bad input instead of silently returning OK
+        if (!good)
+            return(Status::Corruption("SstCounters::DecodeFrom", "bad or truncated counter block"));
+
+        return(Status());
+
+    }   // SstCounters::DecodeFrom
+
+
+    // Add one to counter Index and return its new value.  Returns 0 and
+    //  changes nothing if the object is read only or Index is out of range.
+    uint64_t
+    SstCounters::Inc(
+        unsigned Index)
+    {
+        if (m_IsReadOnly || m_CounterSize<=Index)
+            return(0);
+
+        ++m_Counter[Index];
+
+        return(m_Counter[Index]);
+    }   // SstCounters::Inc
+
+
+    // Add Amount to counter Index and return its new value.  Returns 0 and
+    //  changes nothing if the object is read only or Index is out of range.
+    uint64_t
+    SstCounters::Add(
+        unsigned Index,
+        uint64_t Amount)
+    {
+        if (m_IsReadOnly || m_CounterSize<=Index)
+            return(0);
+
+        m_Counter[Index]+=Amount;
+
+        return(m_Counter[Index]);
+    }   // SstCounters::Add
+
+
+    // Return the current value of counter Index, or 0 when Index is out of
+    //  range.
+    uint64_t
+    SstCounters::Value(
+        unsigned Index) const
+    {
+        if (m_CounterSize<=Index)
+            return(0);
+
+        return(m_Counter[Index]);
+    }   // SstCounters::Value
+
+
+    // Overwrite counter Index with Value; an out of range Index is ignored.
+    void
+    SstCounters::Set(
+        unsigned Index,
+        uint64_t Value)
+    {
+        if (m_CounterSize<=Index)
+            return;
+
+        m_Counter[Index]=Value;
+    }   // SstCounters::Set
+
+
+    // Print the header fields and the first m_CounterSize counters to
+    //  stdout.
+    void
+    SstCounters::Dump() const
+    {
+        unsigned idx;
+
+        // header fields
+        printf("SstCounters:\n");
+        printf(" m_IsReadOnly: %u\n", m_IsReadOnly);
+        printf(" m_Version: %u\n", m_Version);
+        printf(" m_CounterSize: %u\n", m_CounterSize);
+
+        // one line per counter
+        idx=0;
+        while (idx<m_CounterSize)
+        {
+            printf(" Counter[%2u]: %" PRIu64 "\n", idx, m_Counter[idx]);
+            ++idx;
+        }   // while
+
+    }   // SstCounters::Dump
+
+
+    // only used for local static objects, not shared memory objects
+    PerformanceCounters::PerformanceCounters()
+    {
+        // zero all counters; the cast removes "volatile" for memset
+        memset((void*)m_Counter, 0, sizeof(m_Counter));
+
+        // stamp the object with this build's layout
+        m_CounterSize=ePerfCountEnumSize;
+        m_Version=ePerfVersion;
+
+    }   // PerformanceCounters::PerformanceCounters
+
+
+ // Attach to the shared memory segment keyed by ePerfKey (creating or
+ // recreating it when writable) and publish it through gPerfCounters.
+ // IsReadOnly: true attaches with SHM_RDONLY and never creates, removes
+ // or initializes the segment.
+ // Returns the mapped PerformanceCounters, or NULL on failure with the
+ // errno value saved in m_LastError.
+ PerformanceCounters *
+ PerformanceCounters::Init(
+ bool IsReadOnly)
+ {
+ PerformanceCounters * ret_ptr;
+ bool should_create, good;
+ int ret_val, id;
+ struct shmid_ds shm_info;
+ size_t open_size;
+
+ ret_ptr=NULL;
+ memset(&shm_info, 0, sizeof(shm_info));
+ good=true;
+ open_size=sizeof(PerformanceCounters);
+
+ // first id attempt, minimal request
+ // (size 0, no IPC_CREAT: only finds a segment that already exists)
+ id=shmget(ePerfKey, 0, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+ if (-1!=id)
+ ret_val=shmctl(id, IPC_STAT, &shm_info);
+ else
+ ret_val=-1;
+
+ // does the shared memory already exists (and of proper size if writing)
+ // ret_val is 0 only when the segment exists and IPC_STAT succeeded
+ should_create=(0!=ret_val || (shm_info.shm_segsz < sizeof(PerformanceCounters))) && !IsReadOnly;
+
+ // should old shared memory be deleted?
+ // (reached when the existing segment is too small for a writer)
+ if (should_create && 0==ret_val)
+ {
+ ret_val=shmctl(id, IPC_RMID, &shm_info);
+ good=(0==ret_val);
+ if (0!=ret_val)
+ syslog(LOG_ERR, "shmctl IPC_RMID failed [%d, %m]", errno);
+ } // if
+
+ // else open the size that exists
+ else if (0==ret_val)
+ {
+ open_size=shm_info.shm_segsz;
+ } // else if
+
+ // attempt to attach/create to shared memory instance
+ if (good)
+ {
+ int flags;
+
+ // only writers pass IPC_CREAT
+ if (IsReadOnly)
+ flags = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+ else
+ flags = IPC_CREAT | S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+
+ m_PerfSharedId=shmget(ePerfKey, open_size, flags);
+ good=(-1!=m_PerfSharedId);
+ } // if
+
+ // map shared memory instance
+ if (good)
+ {
+ ret_ptr=(PerformanceCounters *)shmat(m_PerfSharedId, NULL, (IsReadOnly ? SHM_RDONLY : 0));
+ // shmat reports failure as (void*)-1, not NULL
+ if ((void*)-1 != ret_ptr)
+ {
+ // initialize?
+ // (new segment, or existing segment written by a different version)
+ if (should_create || ePerfVersion!=ret_ptr->m_Version)
+ {
+ if (!IsReadOnly)
+ {
+ memset(ret_ptr, 0, sizeof(PerformanceCounters));
+ ret_ptr->m_Version=ePerfVersion;
+ ret_ptr->m_CounterSize=ePerfCountEnumSize;
+ } // if
+
+ // bad version match to existing segment
+ // (a read only caller cannot reinitialize it, so fail with EINVAL)
+ else
+ {
+ good=false;
+ errno=EINVAL;
+ } // else
+ } // if
+ } // if
+ else
+ {
+ good=false;
+ // NOTE(review): errno is copied to m_LastError only after this
+ // syslog call; presumably syslog leaves errno intact -- confirm
+ syslog(LOG_ERR, "shmat failed [%d, %m]", errno);
+ } // else
+
+ if (good)
+ {
+ // make this available process wide
+ gPerfCounters=ret_ptr;
+ } // if
+ else
+ {
+ ret_ptr=NULL;
+ m_LastError=errno;
+ } // else
+ } // if
+ else
+ {
+ // IPC_RMID or shmget failed: record why
+ m_LastError=errno;
+ ret_ptr=NULL;
+ } // else
+
+ return(ret_ptr);
+
+ }; // PerformanceCounters::Init
+
+
+ // Detach a counter block previously returned by Init().
+ // Counts: mapping to release.
+ // Returns 0 on success, EINVAL for NULL or the built-in local counters,
+ // otherwise the errno left by shmdt().
+ int
+ PerformanceCounters::Close(
+     PerformanceCounters * Counts)
+ {
+     // nothing to detach for NULL or for the local startup object
+     if (NULL==Counts || &LocalStartupCounters==Counts)
+         return(EINVAL);
+
+     // keep gPerf valid
+     if (Counts==gPerfCounters)
+         gPerfCounters=&LocalStartupCounters;
+
+     if (0!=shmdt(Counts))
+         return(errno);
+
+     return(0);
+ } // PerformanceCounters::Close
+
+
+ // Atomically add one to a counter.
+ // Index: position in m_Counter.
+ // Returns the counter as read after the increment, or 0 when Index is
+ // out of range.
+ uint64_t
+ PerformanceCounters::Inc(
+     unsigned Index)
+ {
+     uint64_t ret_val;
+
+     ret_val=0;
+     if (Index<m_CounterSize)
+     {
+         volatile uint64_t * val_ptr;
+
+         val_ptr=&m_Counter[Index];
+
+#if ULONG_MAX != 4294967295UL
+#ifdef OS_SOLARIS
+         atomic_inc_64(val_ptr);
+#else
+         __sync_add_and_fetch(val_ptr, 1);
+#endif
+#else
+         // hack fest for 64 bit semi-atomic on 32bit machine:
+         //  bump the first 32 bit word, carry into the second on wrap
+         //  (i.e. the first word is treated as the low half)
+         uint32_t ret_32;
+         volatile uint32_t * ptr_32;
+
+         // point at the counter's own storage; "&val_ptr" would address the
+         //  local pointer variable and leave the counter untouched
+         ptr_32=(volatile uint32_t *)val_ptr;
+         ret_32=__sync_add_and_fetch(ptr_32, 1);
+         if (0==ret_32)
+         {
+             ++ptr_32;
+             __sync_add_and_fetch(ptr_32, 1);
+         } // if
+#endif
+         ret_val=*val_ptr;
+     } // if
+
+     return(ret_val);
+ } // PerformanceCounters::Inc
+
+
+ // Atomically subtract one from a counter.
+ // Index: position in m_Counter.
+ // Returns the counter as read after the decrement, or 0 when Index is
+ // out of range.
+ uint64_t
+ PerformanceCounters::Dec(
+     unsigned Index)
+ {
+     uint64_t ret_val;
+
+     ret_val=0;
+     if (Index<m_CounterSize)
+     {
+         volatile uint64_t * val_ptr;
+
+         val_ptr=&m_Counter[Index];
+
+#if ULONG_MAX != 4294967295UL
+#ifdef OS_SOLARIS
+         atomic_dec_64(val_ptr);
+#else
+         __sync_sub_and_fetch(val_ptr, 1);
+#endif
+#else
+         // hack fest for 64 bit semi-atomic on 32bit machine:
+         //  drop the first 32 bit word, borrow from the second on wrap
+         //  (i.e. the first word is treated as the low half)
+         uint32_t ret_32;
+         volatile uint32_t * ptr_32;
+
+         // point at the counter's own storage; "&val_ptr" would address the
+         //  local pointer variable and leave the counter untouched
+         ptr_32=(volatile uint32_t *)val_ptr;
+         ret_32=__sync_sub_and_fetch(ptr_32, 1);
+         if (0xFFFFFFFF==ret_32)
+         {
+             ++ptr_32;
+             __sync_sub_and_fetch(ptr_32, 1);
+         } // if
+#endif
+         ret_val=*val_ptr;
+     } // if
+
+     return(ret_val);
+ } // PerformanceCounters::Dec
+
+
+ // Atomically add Amount to a counter.
+ // Index: position in m_Counter; Amount: value to add.
+ // Returns the counter after the addition, or 0 when Index is out of range.
+ uint64_t
+ PerformanceCounters::Add(
+     unsigned Index,
+     uint64_t Amount)
+ {
+     uint64_t ret_val;
+
+     ret_val=0;
+     if (Index<m_CounterSize)
+     {
+         volatile uint64_t * val_ptr;
+
+         val_ptr=&m_Counter[Index];
+
+#if ULONG_MAX != 4294967295UL
+#ifdef OS_SOLARIS
+         ret_val=atomic_add_64_nv(val_ptr, Amount);
+#else
+         ret_val=__sync_add_and_fetch(val_ptr, Amount);
+#endif
+#else
+         // hack fest for 64 bit semi-atomic on 32bit machine:
+         //  add the low half to the first word, then the high half plus
+         //  any carry to the second (first word is treated as the low half)
+         uint32_t old_32, low_add, high_add;
+         volatile uint32_t * ptr_32;
+
+         // point at the counter's own storage; "&val_ptr" would address the
+         //  local pointer variable and leave the counter untouched
+         ptr_32=(volatile uint32_t *)val_ptr;
+         low_add=(uint32_t)(Amount & 0xFFFFFFFF);
+         high_add=(uint32_t)(Amount >> 32);
+
+         // fetch-and-add hands back the pre-add value atomically, so the
+         //  carry test cannot race with another thread's update
+         old_32=__sync_fetch_and_add(ptr_32, low_add);
+         if ((uint32_t)(old_32+low_add)<old_32)
+             ++high_add;
+
+         if (0!=high_add)
+         {
+             ++ptr_32;
+             __sync_add_and_fetch(ptr_32, high_add);
+         } // if
+
+         ret_val=*val_ptr;
+#endif
+     } // if
+
+     return(ret_val);
+ } // PerformanceCounters::Add
+
+
+ // Read one counter.
+ // Index: position in m_Counter.
+ // Returns the current value, or 0 when Index is outside the array.
+ uint64_t
+ PerformanceCounters::Value(
+     unsigned Index) const
+ {
+     // out-of-range requests read as zero
+     if (m_CounterSize<=Index)
+         return(0);
+
+     return(m_Counter[Index]);
+ } // PerformanceCounters::Value
+
+
+ // Overwrite one counter with a plain (non read-modify-write) store.
+ // Index: position in m_Counter; Amount: new contents.
+ // An Index outside the array is ignored.
+ void
+ PerformanceCounters::Set(
+     unsigned Index,
+     uint64_t Amount)
+ {
+     // nothing to store for an out-of-range slot
+     if (m_CounterSize<=Index)
+         return;
+
+     // m_Counter is declared volatile, so this is a volatile store
+     m_Counter[Index]=Amount;
+ } // PerformanceCounters::Set
+
+
+ // Address of one counter for callers that want to read it directly.
+ // Index: position in m_Counter.
+ // Returns a pointer to the counter, or to the shared m_BogusCounter
+ // when Index is out of range (never NULL).
+ volatile const uint64_t *
+ PerformanceCounters::GetPtr(
+     unsigned Index) const
+ {
+     // out-of-range callers all share the bogus slot
+     if (m_CounterSize<=Index)
+         return(&m_BogusCounter);
+
+     return(&m_Counter[Index]);
+ } // PerformanceCounters::GetPtr
+
+
+ // Printable name of a counter.
+ // Index: counter number.
+ // Returns the entry from m_PerfCounterNames, or "???" when Index is not
+ // below ePerfCountEnumSize.
+ const char *
+ PerformanceCounters::GetNamePtr(
+     unsigned Index)
+ {
+     // unknown counter numbers get a placeholder
+     if (!(Index<ePerfCountEnumSize))
+         return("???");
+
+     return(m_PerfCounterNames[Index]);
+ } // PerformanceCounters::GetNamePtr
+
+
+ // Storage for the static data members of PerformanceCounters.
+ // m_PerfSharedId holds the id from the last shmget() in Init() (-1 until then);
+ // m_LastError holds the errno recorded by the last failed Init();
+ // m_BogusCounter is the slot GetPtr() hands out for out-of-range indexes.
+ int PerformanceCounters::m_PerfSharedId=-1;
+ int PerformanceCounters::m_LastError=0;
+ volatile uint64_t PerformanceCounters::m_BogusCounter=0;
+ // Printable counter names, indexed by counter number (used by GetNamePtr,
+ // LookupCounter and Dump).  NOTE(review): entries appear to follow the
+ // numbering of PerformanceCountersEnum one-for-one; keep both in step.
+ const char * PerformanceCounters::m_PerfCounterNames[]=
+ {
+ "ROFileOpen",
+ "ROFileClose",
+ "ROFileUnmap",
+ "RWFileOpen",
+ "RWFileClose",
+ "RWFileUnmap",
+ "ApiOpen",
+ "ApiGet",
+ "ApiWrite",
+ "WriteSleep",
+ "WriteWaitImm",
+ "WriteWaitLevel0",
+ "WriteNewMem",
+ "WriteError",
+ "WriteNoWait",
+ "GetMem",
+ "GetImm",
+ "GetVersion",
+ "SearchLevel[0]",
+ "SearchLevel[1]",
+ "SearchLevel[2]",
+ "SearchLevel[3]",
+ "SearchLevel[4]",
+ "SearchLevel[5]",
+ "SearchLevel[6]",
+ "TableCached",
+ "TableOpened",
+ "TableGet",
+ "BGCloseUnmap",
+ "BGCompactImm",
+ "BGNormal",
+ "BGCompactLevel0",
+ "BlockFiltered",
+ "BlockFilterFalse",
+ "BlockCached",
+ "BlockRead",
+ "BlockFilterRead",
+ "BlockValidGet",
+ "Debug[0]",
+ "Debug[1]",
+ "Debug[2]",
+ "Debug[3]",
+ "Debug[4]",
+ "ReadBlockError",
+ "DBIterNew",
+ "DBIterNext",
+ "DBIterPrev",
+ "DBIterSeek",
+ "DBIterSeekFirst",
+ "DBIterSeekLast",
+ "DBIterDelete",
+ "eleveldbDirect",
+ "eleveldbQueued",
+ "eleveldbDequeued",
+ "elevelRefCreate",
+ "elevelRefDelete",
+ "ThrottleGauge",
+ "ThrottleCounter",
+ "ThrottleMicros0",
+ "ThrottleKeys0",
+ "ThrottleBacklog0",
+ "ThrottleCompacts0",
+ "ThrottleMicros1",
+ "ThrottleKeys1",
+ "ThrottleBacklog1",
+ "ThrottleCompacts1",
+ "BGWriteError",
+ "ThrottleWait",
+ "ThreadError",
+ "BGImmDirect",
+ "BGImmQueued",
+ "BGImmDequeued",
+ "BGImmWeighted",
+ "BGUnmapDirect",
+ "BGUnmapQueued",
+ "BGUnmapDequeued",
+ "BGUnmapWeighted",
+ "BGLevel0Direct",
+ "BGLevel0Queued",
+ "BGLevel0Dequeued",
+ "BGLevel0Weighted",
+ "BGCompactDirect",
+ "BGCompactQueued",
+ "BGCompactDequeued",
+ "BGCompactWeighted",
+ "FileCacheInsert",
+ "FileCacheRemove",
+ "BlockCacheInsert",
+ "BlockCacheRemove",
+ "ApiDelete"
+ };
+
+
+ // Translate a counter name into its index.
+ // Name: NUL-terminated name to look for (exact, case-sensitive match).
+ // Returns the index of the first matching entry in m_PerfCounterNames,
+ // or -1 when Name is NULL, empty, or unknown.
+ int
+ PerformanceCounters::LookupCounter(
+     const char * Name)
+ {
+     // a null or empty name never matches
+     if (NULL==Name || '\0'==*Name)
+         return(-1);
+
+     // linear scan, first hit wins
+     for (int pos=0; pos<ePerfCountEnumSize; ++pos)
+     {
+         if (0==strcmp(m_PerfCounterNames[pos], Name))
+             return(pos);
+     } // for
+
+     return(-1);
+ } // PerformanceCounters::LookupCounter
+
+ // Print the version, counter count, and every named counter to stdout.
+ //
+ // One line is printed per compiled-in counter name.  Values are fetched
+ // through Value(), which bounds the index by m_CounterSize: an attached
+ // object that carries fewer counters than this build knows about prints 0
+ // for the missing ones instead of reading past its m_Counter contents.
+ void
+ PerformanceCounters::Dump()
+ {
+     unsigned loop;
+
+     printf(" m_Version: %u\n", m_Version);
+     printf(" m_CounterSize: %u\n", m_CounterSize);
+
+     for (loop=0; loop<ePerfCountEnumSize; ++loop)
+     {
+         printf(" %s: %" PRIu64 "\n", m_PerfCounterNames[loop], Value(loop));
+     } // loop
+ } // PerformanceCounters::Dump
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/basho/perf_count.h b/src/third_party/wiredtiger/api/leveldb/basho/perf_count.h
new file mode 100644
index 00000000000..b0f4abf9b66
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/basho/perf_count.h
@@ -0,0 +1,298 @@
+// -------------------------------------------------------------------
+//
+// perf_count.h: performance counters LevelDB
+//
+// Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved.
+//
+// This file is provided to you under the Apache License,
+// Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain
+// a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// -------------------------------------------------------------------
+
+#ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
+#define STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
+
+#include "leveldb_wt_config.h"
+
+#include <stdint.h>
+#include <string>
+#include "status.h"
+
+namespace leveldb {
+
+// Index names for the SstCounters::m_Counter array, followed by the
+// array-size marker and a version constant.
+enum SstCountEnum
+{
+ //
+ // array index values/names
+ //
+ eSstCountKeys=0, //!< how many keys in this sst
+ eSstCountBlocks=1, //!< how many blocks in this sst
+ eSstCountCompressAborted=2,//!< how many blocks attempted compression and aborted use
+ eSstCountKeySize=3, //!< byte count of all keys
+ eSstCountValueSize=4, //!< byte count of all values
+ eSstCountBlockSize=5, //!< byte count of all blocks (pre-compression)
+ eSstCountBlockWriteSize=6, //!< post-compression size, or BlockSize if no compression
+ eSstCountIndexKeys=7, //!< how many keys in the index block
+ eSstCountKeyLargest=8, //!< largest key in sst
+ eSstCountKeySmallest=9, //!< smallest key in sst
+ eSstCountValueLargest=10, //!< largest value in sst
+ eSstCountValueSmallest=11, //!< smallest value in sst
+ eSstCountDeleteKey=12, //!< tombstone count
+ eSstCountBlockSizeUsed=13, //!< Options::block_size used with this file
+ eSstCountUserDataSize=14, //!< post-compression size of non-metadata (user keys/values/block overhead)
+
+ // must follow last index name to represent size of array
+ // (it takes the value "previous enumerator + 1", so the explicitly
+ //  numbered entry above it must stay the highest index)
+ eSstCountEnumSize, //!< size of the array described by the enum values
+
+ // not an array index: explicitly numbered, so it does not affect the size above
+ eSstCountVersion=1
+
+}; // enum SstCountEnum
+
+
+// Fixed-size block of 64 bit counters for one sst file, indexed by
+// SstCountEnum, together with a version number and the count of counters
+// actually held.  Value() and Set() ignore indexes at or beyond
+// m_CounterSize (Value() then returns 0).
+class SstCounters
+{
+protected:
+ bool m_IsReadOnly; //!< set when data decoded from a file
+ uint32_t m_Version; //!< object revision identification
+ uint32_t m_CounterSize; //!< number of objects in m_Counter
+
+ uint64_t m_Counter[eSstCountEnumSize]; //!< counter storage, one slot per SstCountEnum index
+
+public:
+ // constructors / destructor
+ SstCounters();
+
+ // Put data into disk form
+ void EncodeTo(std::string & Dst) const;
+
+ // Populate member data from prior EncodeTo block
+ Status DecodeFrom(const Slice& src);
+
+ // increment the counter
+ uint64_t Inc(unsigned Index);
+
+ // add value to the counter
+ uint64_t Add(unsigned Index, uint64_t Amount);
+
+ // return value of a counter (0 when Index is out of range)
+ uint64_t Value(unsigned Index) const;
+
+ // set a value (ignored when Index is out of range)
+ void Set(unsigned Index, uint64_t);
+
+ // return number of counters
+ uint32_t Size() const {return(m_CounterSize);};
+
+ // printf all values
+ void Dump() const;
+
+}; // class SstCounters
+
+
+extern struct PerformanceCounters * gPerfCounters;
+
+
+// Index names for the PerformanceCounters::m_Counter array, followed by the
+// array-size marker, the structure version, and the shared memory key.
+enum PerformanceCountersEnum
+{
+ //
+ // array index values/names
+ // (enum explicitly numbered to allow future edits / moves / inserts)
+ //
+ ePerfROFileOpen=0, //!< PosixMmapReadableFile open
+ ePerfROFileClose=1, //!< closed
+ ePerfROFileUnmap=2, //!< unmap without close
+
+ ePerfRWFileOpen=3, //!< PosixMmapFile open
+ ePerfRWFileClose=4, //!< closed
+ ePerfRWFileUnmap=5, //!< unmap without close
+
+ ePerfApiOpen=6, //!< Count of DB::Open completions
+ ePerfApiGet=7, //!< Count of DBImpl::Get completions
+ ePerfApiWrite=8, //!< Count of write completions (presumably DBImpl::Write; this comment originally repeated "Get")
+
+ ePerfWriteSleep=9, //!< DBImpl::MakeRoomForWrite called sleep
+ ePerfWriteWaitImm=10, //!< DBImpl::MakeRoomForWrite called Wait on Imm compact
+ ePerfWriteWaitLevel0=11,//!< DBImpl::MakeRoomForWrite called Wait on Level0 compact
+ ePerfWriteNewMem=12, //!< DBImpl::MakeRoomForWrite created new memory log
+ ePerfWriteError=13, //!< DBImpl::MakeRoomForWrite saw bg_error_
+ ePerfWriteNoWait=14, //!< DBImpl::MakeRoomForWrite took no action
+
+ ePerfGetMem=15, //!< DBImpl::Get read from memory log
+ ePerfGetImm=16, //!< DBImpl::Get read from previous memory log
+ ePerfGetVersion=17, //!< DBImpl::Get read from Version object
+
+ // code ASSUMES the levels are in numerical order,
+ // i.e. based off of ePerfSearchLevel0
+ ePerfSearchLevel0=18, //!< Version::Get read searched one or more files here
+ ePerfSearchLevel1=19, //!< Version::Get read searched one or more files here
+ ePerfSearchLevel2=20, //!< Version::Get read searched one or more files here
+ ePerfSearchLevel3=21, //!< Version::Get read searched one or more files here
+ ePerfSearchLevel4=22, //!< Version::Get read searched one or more files here
+ ePerfSearchLevel5=23, //!< Version::Get read searched one or more files here
+ ePerfSearchLevel6=24, //!< Version::Get read searched one or more files here
+
+ ePerfTableCached=25, //!< TableCache::FindTable found table in cache
+ ePerfTableOpened=26, //!< TableCache::FindTable had to open table file
+ ePerfTableGet=27, //!< TableCache::Get used to retrieve a key
+
+ ePerfBGCloseUnmap=28, //!< PosixEnv::BGThread started Unmap/Close job
+ ePerfBGCompactImm=29, //!< PosixEnv::BGThread started compaction of Imm
+ ePerfBGNormal=30, //!< PosixEnv::BGThread started normal compaction job
+ ePerfBGCompactLevel0=31,//!< PosixEnv::BGThread started compaction of Level0
+
+ ePerfBlockFiltered=32, //!< Table::BlockReader search stopped due to filter
+ ePerfBlockFilterFalse=33,//!< Table::BlockReader gave a false positive for match
+ ePerfBlockCached=34, //!< Table::BlockReader found block in cache
+ ePerfBlockRead=35, //!< Table::BlockReader read block from disk
+ ePerfBlockFilterRead=36,//!< Table::ReadMeta filter loaded from file
+ ePerfBlockValidGet=37, //!< Table::InternalGet has valid iterator
+
+ ePerfDebug0=38, //!< Developer debug counters, moveable
+ ePerfDebug1=39, //!< Developer debug counters, moveable
+ ePerfDebug2=40, //!< Developer debug counters, moveable
+ ePerfDebug3=41, //!< Developer debug counters, moveable
+ ePerfDebug4=42, //!< Developer debug counters, moveable
+
+ ePerfReadBlockError=43, //!< crc or compression error in ReadBlock (format.cc)
+
+ ePerfIterNew=44, //!< Count of DBImpl::NewDBIterator calls
+ ePerfIterNext=45, //!< Count of DBIter::Next calls
+ ePerfIterPrev=46, //!< Count of DBIter::Prev calls
+ ePerfIterSeek=47, //!< Count of DBIter::Seek calls
+ ePerfIterSeekFirst=48, //!< Count of DBIter::SeekFirst calls
+ ePerfIterSeekLast=49, //!< Count of DBIter::SeekLast calls
+ ePerfIterDelete=50, //!< Count of DBIter::~DBIter
+
+ ePerfElevelDirect=51, //!< eleveldb's FindWaitingThread went direct to thread
+ ePerfElevelQueued=52, //!< eleveldb's FindWaitingThread queued work item
+ ePerfElevelDequeued=53, //!< eleveldb's worker took item from backlog queue
+
+ ePerfElevelRefCreate=54,//!< eleveldb RefObject constructed
+ ePerfElevelRefDelete=55,//!< eleveldb RefObject destructed
+
+ ePerfThrottleGauge=56, //!< current throttle value
+ ePerfThrottleCounter=57,//!< running throttle by seconds
+
+ ePerfThrottleMicros0=58,//!< level 0 micros spent compacting
+ ePerfThrottleKeys0=59, //!< level 0 keys processed
+ ePerfThrottleBacklog0=60,//!< backlog at time of posting (level0)
+ ePerfThrottleCompacts0=61,//!< number of level 0 compactions
+
+ ePerfThrottleMicros1=62,//!< level 1+ micros spent compacting
+ ePerfThrottleKeys1=63, //!< level 1+ keys processed
+ ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+)
+ ePerfThrottleCompacts1=65,//!< number of level 1+ compactions
+
+ ePerfBGWriteError=66, //!< error in write/close, see syslog
+
+ ePerfThrottleWait=67, //!< milliseconds of throttle wait
+ ePerfThreadError=68, //!< system error on thread related call, no LOG access
+
+ ePerfBGImmDirect=69, //!< count Imm compactions happened directly
+ ePerfBGImmQueued=70, //!< count Imm compactions placed on queue
+ ePerfBGImmDequeued=71, //!< count Imm compactions removed from queue
+ ePerfBGImmWeighted=72, //!< total microseconds item spent on queue
+
+ ePerfBGUnmapDirect=73, //!< count Unmap operations happened directly
+ ePerfBGUnmapQueued=74, //!< count Unmap operations placed on queue
+ ePerfBGUnmapDequeued=75,//!< count Unmap operations removed from queue
+ ePerfBGUnmapWeighted=76,//!< total microseconds item spent on queue
+
+ ePerfBGLevel0Direct=77, //!< count Level0 compactions happened directly
+ ePerfBGLevel0Queued=78, //!< count Level0 compactions placed on queue
+ ePerfBGLevel0Dequeued=79,//!< count Level0 compactions removed from queue
+ ePerfBGLevel0Weighted=80,//!< total microseconds item spent on queue
+
+ ePerfBGCompactDirect=81, //!< count generic compactions happened directly
+ ePerfBGCompactQueued=82, //!< count generic compactions placed on queue
+ ePerfBGCompactDequeued=83,//!< count generic compactions removed from queue
+ ePerfBGCompactWeighted=84,//!< total microseconds item spent on queue
+
+ ePerfFileCacheInsert=85, //!< total bytes inserted into file cache
+ ePerfFileCacheRemove=86, //!< total bytes removed from file cache
+
+ ePerfBlockCacheInsert=87, //!< total bytes inserted into block cache
+ ePerfBlockCacheRemove=88, //!< total bytes removed from block cache
+
+ ePerfApiDelete=89, //!< Count of DB::Delete
+
+ // must follow last index name to represent size of array
+ // (ASSUMES previous enum is highest value)
+ ePerfCountEnumSize, //!< size of the array described by the enum values
+
+ // not array indexes: explicitly numbered, so they do not affect the size above
+ ePerfVersion=1, //!< structure versioning
+ ePerfKey=41207 //!< random number as shared memory identifier
+};
+
+//
+// Do NOT use virtual functions. This structure will be aligned at different
+// locations in multiple processes. Things can get messy with virtuals.
+
+// Block of 64 bit performance counters indexed by PerformanceCountersEnum.
+// Init() maps an instance out of a shared memory segment; the constructor
+// is only for ordinary local instances.  The non-static data members
+// (m_Version, m_CounterSize, m_Counter) are what lives in the segment.
+struct PerformanceCounters
+{
+public:
+ static int m_LastError; //!< errno recorded by the most recent failed Init()
+
+protected:
+ uint32_t m_Version; //!< object revision identification
+ uint32_t m_CounterSize; //!< number of objects in m_Counter
+
+ volatile uint64_t m_Counter[ePerfCountEnumSize]; //!< counter storage, one slot per enum index
+
+ static const char * m_PerfCounterNames[]; //!< printable name per counter index
+ static int m_PerfSharedId; //!< id returned by the last shmget() in Init()
+ static volatile uint64_t m_BogusCounter; //!< for out of range GetPtr calls
+
+public:
+ // only called for local object, not for shared memory
+ PerformanceCounters();
+
+ //!< does executable's idea of version match shared object?
+ bool VersionTest()
+ {return(ePerfCountEnumSize<=m_CounterSize && ePerfVersion==m_Version);};
+
+ //!< mostly for perf_count_test.cc
+ void SetVersion(uint32_t Version, uint32_t CounterSize)
+ {m_Version=Version; m_CounterSize=CounterSize;};
+
+ // attach to the shared memory counters (creating them unless IsReadOnly);
+ // returns NULL on failure and records the error in m_LastError
+ static PerformanceCounters * Init(bool IsReadOnly);
+ // detach a block returned by Init(); returns 0, EINVAL, or the shmdt() errno
+ static int Close(PerformanceCounters * Counts);
+
+ // atomically add / subtract one; return the counter as read afterwards
+ // (0 when Index is out of range)
+ uint64_t Inc(unsigned Index);
+ uint64_t Dec(unsigned Index);
+
+ // add value to the counter
+ uint64_t Add(unsigned Index, uint64_t Amount);
+
+ // return value of a counter
+ uint64_t Value(unsigned Index) const;
+
+ // set a value
+ void Set(unsigned Index, uint64_t);
+
+ // address of a counter; out of range indexes get &m_BogusCounter
+ volatile const uint64_t * GetPtr(unsigned Index) const;
+
+ // printable name of a counter; "???" for out of range indexes
+ static const char * GetNamePtr(unsigned Index);
+
+ // index of the counter with the given name, or -1 when not found
+ int LookupCounter(const char * Name);
+
+ // printf version, size, and all counter values
+ void Dump();
+
+}; // struct PerformanceCounters
+
+extern PerformanceCounters * gPerfCounters;
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/config.hin b/src/third_party/wiredtiger/api/leveldb/config.hin
new file mode 100644
index 00000000000..131b68969d3
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/config.hin
@@ -0,0 +1,22 @@
+/* api/leveldb/config.hin. Generated by autoheader, then hand-edited. */
+
+/* Build the LevelDB API with Basho LevelDB support. */
+#undef HAVE_BASHOLEVELDB
+
+/* Snappy support automatically loaded. */
+#undef HAVE_BUILTIN_EXTENSION_SNAPPY
+
+/* Zlib support automatically loaded. */
+#undef HAVE_BUILTIN_EXTENSION_ZLIB
+
+/* Define to 1 for diagnostic tests. */
+#undef HAVE_DIAGNOSTIC
+
+/* Build the LevelDB API with HyperLevelDB support. */
+#undef HAVE_HYPERLEVELDB
+
+/* Define to 1 if you have the `snappy' library (-lsnappy). */
+#undef HAVE_LIBSNAPPY
+
+/* Build the LevelDB API with RocksDB support. */
+#undef HAVE_ROCKSDB
diff --git a/src/third_party/wiredtiger/api/leveldb/dummy.cc b/src/third_party/wiredtiger/api/leveldb/dummy.cc
new file mode 100644
index 00000000000..d56f03b544b
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/dummy.cc
@@ -0,0 +1,28 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* Nothing to see, just keep build tools happy. */
diff --git a/src/third_party/wiredtiger/api/leveldb/hyper_wt.cc b/src/third_party/wiredtiger/api/leveldb/hyper_wt.cc
new file mode 100644
index 00000000000..95c82289e18
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/hyper_wt.cc
@@ -0,0 +1,415 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "leveldb_wt.h"
+#include <errno.h>
+#include <sstream>
+#include <sys/param.h>
+#include <sys/stat.h>
+
+using leveldb::ReplayIterator;
+using leveldb::Status;
+
+// Fill in missing methods from the interface: empty out-of-line bodies for
+// the ReplayIterator constructor and destructor, which this file must supply.
+ReplayIterator::ReplayIterator() {}
+ReplayIterator::~ReplayIterator() {}
+
+// ReplayIterator implementation backed by a WiredTiger "log:" cursor.
+// Positioning calls skip every log record for which WT_VALID_OPERATION is
+// false, so callers only ever see modification records.
+class ReplayIteratorImpl : public ReplayIterator {
+ public:
+  // Open a log cursor and position on the first modification record.
+  // A failure to open the cursor is reported through status().
+  ReplayIteratorImpl(OperationContext *context) :
+    context_(context), cursor_(NULL) {
+    OpenLogCursor();
+    // Position on first record.  valid_ will be set appropriately.
+    Next();
+  }
+
+  // Open a log cursor and position according to timestamp (see SeekTo).
+  // A failure to open the cursor is reported through status().
+  ReplayIteratorImpl(OperationContext *context, const std::string& timestamp) :
+    context_(context), cursor_(NULL) {
+    OpenLogCursor();
+    // Position on requested record.  valid_ will be set appropriately.
+    SeekTo(timestamp);
+  }
+
+  // An iterator is either positioned at a deleted key, present key/value pair,
+  // or not valid.  This method returns true iff the iterator is valid.
+  virtual bool Valid();
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next();
+
+  // Position at the first key in the source that at or past target for this
+  // pass.  Note that this is unlike the Seek call, as the ReplayIterator is
+  // unsorted.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  // Per Robert at Hyperdex, the SkipTo functions are hacky optimizations
+  // for LevelDB and its key layout.  It is okay for them to be no-ops.
+  virtual void SkipTo(const Slice& target) { }
+  virtual void SkipToLast() { }
+  virtual void SeekTo(const std::string& timestamp);
+  virtual void SeekToLast();
+
+  // Return true if the current entry points to a key-value pair.  If this
+  // returns false, it means the current entry is a deleted entry.
+  virtual bool HasValue() {
+    assert(Valid());
+    return (optype == WT_LOGOP_ROW_PUT || optype == WT_LOGOP_COL_PUT);
+  }
+
+  // Compare the log positions of this iterator and other using the
+  // underlying cursors.  Returns the comparison result reported by the
+  // cursor, or 0 when the comparison itself fails; in that case the error
+  // is available through status().
+  int Compare(ReplayIteratorImpl* other) {
+    // Start from "equal" so a failed compare never returns stack garbage.
+    int cmp = 0;
+    assert(Valid());
+    // assert(other->Valid());
+    int ret = cursor_->compare(cursor_, other->cursor_, &cmp);
+    status_ = WiredTigerErrorToStatus(ret);
+    return (ret == 0 ? cmp : 0);
+  }
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const { return Slice((const char *)key_.data, key_.size); }
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: !AtEnd() && !AtStart()
+  virtual Slice value() const { return Slice((const char *)value_.data, value_.size); }
+
+  // If an error has occurred, return it.  Else return an ok status.
+  virtual Status status() const { return status_; }
+
+  // must be released by giving it back to the DB
+  virtual ~ReplayIteratorImpl() {
+    int ret = Close();
+    assert(ret == 0);
+    (void)ret;  // only inspected by the assert
+  }
+
+  // Format the LSN of the current record with WT_TIMESTAMP_FORMAT.
+  // REQUIRES: Valid()
+  std::string GetTimestamp() {
+    char lsn[256];
+    assert(Valid());
+    snprintf(lsn, sizeof(lsn), WT_TIMESTAMP_FORMAT,
+        lsn_.file, lsn_.offset);
+    return (std::string(lsn));
+  }
+
+  // Close the log cursor (if open) and mark the iterator invalid.
+  // Returns the cursor close result (0 when there was nothing to close);
+  // status() is overwritten with that result.
+  int Close() {
+    int ret = 0;
+    if (cursor_ != NULL)
+      ret = cursor_->close(cursor_);
+    status_ = WiredTigerErrorToStatus(ret);
+    valid_ = false;
+    cursor_ = NULL;
+    return (ret);
+  }
+
+ private:
+  // Shared constructor work: give every record field a defined value, then
+  // open the "log:" cursor.  cursor_ is left NULL when the open fails so
+  // that all later calls take their "no cursor" path.
+  void OpenLogCursor() {
+    valid_ = false;
+    key_.data = NULL;
+    key_.size = 0;
+    value_.data = NULL;
+    value_.size = 0;
+    lsn_.file = 0;
+    lsn_.offset = 0;
+    txnid = 0;
+    fileid = opcount = optype = rectype = 0;
+
+    WT_SESSION *session = context_->GetSession();
+    int ret = session->open_cursor(
+        session, "log:", NULL, NULL, &cursor_);
+    status_ = WiredTigerErrorToStatus(ret);
+    if (ret != 0)
+      cursor_ = NULL;
+  }
+
+  void SeekTo(WT_LSN *lsn);
+  // No copying allowed
+  ReplayIteratorImpl(const ReplayIterator&) { }
+  void operator=(const ReplayIterator&) { }
+  OperationContext *context_;
+  Status status_;
+  WT_CURSOR *cursor_;
+  // Fields of the most recently read log record.
+  WT_ITEM key_, value_;
+  WT_LSN lsn_;
+  bool valid_;
+  uint64_t txnid;
+  uint32_t fileid, opcount, optype, rectype;
+};
+
+// Report whether the iterator is positioned on a record.  An iterator that
+// is invalid only because it ran off the end of the log (cursor still open,
+// status NotFound) first retries Next() in case new records have appeared.
+bool
+ReplayIteratorImpl::Valid() {
+  const bool retry_at_end =
+      !valid_ && cursor_ != NULL && status_.IsNotFound();
+
+  if (retry_at_end)
+    Next();
+  return valid_;
+}
+
+// Advance to the next modification record in the log.
+// On success valid_ is set and the record fields are refreshed.  On any
+// failure valid_ is cleared; running off the end (WT_NOTFOUND) keeps the
+// cursor open, every other error also closes it.
+void
+ReplayIteratorImpl::Next() {
+  if (cursor_ == NULL)
+    return;
+
+  int rc;
+  for (;;) {
+    if ((rc = cursor_->next(cursor_)) != 0)
+      break;
+    rc = cursor_->get_key(cursor_,
+        &lsn_.file, &lsn_.offset, &opcount);
+    if (rc != 0)
+      break;
+    rc = cursor_->get_value(cursor_,
+        &txnid, &rectype, &optype, &fileid, &key_, &value_);
+    if (rc != 0)
+      break;
+    // Only modification operations are of interest; keep walking past
+    // any other type of record.
+    if (WT_VALID_OPERATION(fileid, optype)) {
+      valid_ = true;
+      break;
+    }
+  }
+
+  status_ = WiredTigerErrorToStatus(rc);
+  if (rc == 0)
+    return;
+
+  valid_ = false;
+  if (rc != WT_NOTFOUND) {
+    int close_rc = Close();
+    assert(close_rc == 0);
+    (void)close_rc;  // only inspected by the assert
+  }
+}
+
+// Position on the last modification record in the log.
+// Walks forward from the current cursor position to the end of the log,
+// remembering the LSN of the last modification record seen, then seeks
+// back to it.  When no such record was seen, or the walk ends on anything
+// other than WT_NOTFOUND, the iterator is invalidated and the cursor closed.
+void
+ReplayIteratorImpl::SeekToLast() {
+  WT_LSN newest;
+
+  // file number 0 doubles as "nothing found yet"
+  newest.file = 0;
+  if (cursor_ == NULL)
+    return;
+
+  int rc;
+  while ((rc = cursor_->next(cursor_)) == 0) {
+    rc = cursor_->get_key(cursor_,
+        &lsn_.file, &lsn_.offset, &opcount);
+    if (rc != 0)
+      break;
+    rc = cursor_->get_value(cursor_,
+        &txnid, &rectype, &optype, &fileid, &key_, &value_);
+    if (rc != 0)
+      break;
+    // Remember modification operations only; any other type of record
+    // is stepped over.
+    if (WT_VALID_OPERATION(fileid, optype)) {
+      valid_ = true;
+      newest = lsn_;
+    }
+  }
+
+  // Clean end of log with at least one candidate: go back to it.
+  if (rc == WT_NOTFOUND && newest.file != 0) {
+    SeekTo(&newest);
+    return;
+  }
+
+  valid_ = false;
+  rc = Close();
+  assert(rc == 0);
+}
+
+// Position the iterator according to a timestamp string.
+//   "all"  - rewind the cursor and position on the first modification record
+//   "now"  - position on the last modification record (see SeekToLast)
+//   other  - parsed with WT_TIMESTAMP_FORMAT into a file/offset LSN and
+//            handed to SeekTo(WT_LSN *)
+// A string that does not parse leaves the iterator invalid with an EINVAL
+// based status instead of seeking to an uninitialized LSN.  With no open
+// cursor the iterator is simply marked invalid and status() is untouched.
+void
+ReplayIteratorImpl::SeekTo(const std::string& timestamp) {
+  WT_LSN target_lsn;
+  int ret = 0;
+
+  if (timestamp == "all" && cursor_ != NULL) {
+    ret = cursor_->reset(cursor_);
+    status_ = WiredTigerErrorToStatus(ret);
+    if (ret == 0)
+      Next();
+    return;
+  }
+  if (timestamp == "now") {
+    SeekToLast();
+    return;
+  }
+  // Without a cursor there is nothing to seek on; keep whatever status
+  // explains why the cursor is missing.
+  if (cursor_ == NULL) {
+    valid_ = false;
+    return;
+  }
+  // Both LSN fields must convert, otherwise target_lsn holds garbage.
+  if (sscanf(timestamp.c_str(), WT_TIMESTAMP_FORMAT,
+      &target_lsn.file, &target_lsn.offset) != 2) {
+    status_ = WiredTigerErrorToStatus(EINVAL);
+    valid_ = false;
+    return;
+  }
+  SeekTo(&target_lsn);
+}
+
+// Set the cursor on the first modification record at or after the
+// given LSN.
+// The iterator is marked invalid up front and becomes valid again only
+// once the search and both record reads succeed; each step stores its
+// result in status_.  If the record found is not a modification
+// operation, Next() moves on to the first one that is.
+void
+ReplayIteratorImpl::SeekTo(WT_LSN *target_lsn) {
+  valid_ = false;
+  if (cursor_ == NULL)
+    return;
+
+  cursor_->set_key(cursor_,
+      target_lsn->file, target_lsn->offset, 0, 0);
+
+  int rc = cursor_->search(cursor_);
+  status_ = WiredTigerErrorToStatus(rc);
+  if (rc != 0)
+    return;
+
+  // The search succeeded: load the record the cursor landed on.
+  rc = cursor_->get_key(cursor_,
+      &lsn_.file, &lsn_.offset, &opcount);
+  status_ = WiredTigerErrorToStatus(rc);
+  if (rc != 0)
+    return;
+
+  rc = cursor_->get_value(cursor_,
+      &txnid, &rectype, &optype, &fileid, &key_, &value_);
+  status_ = WiredTigerErrorToStatus(rc);
+  if (rc != 0)
+    return;
+
+  valid_ = true;
+  // Only modification operations are of interest; step forward when the
+  // record found is of any other type.
+  if (!WT_VALID_OPERATION(fileid, optype))
+    Next();
+}
+
+// Create a live backup of a live LevelDB instance.
+// The backup is stored in a directory named "backup-<name>" under the top
+// level of the open LevelDB database.  The implementation is permitted, and
+// even encouraged, to improve the performance of this call through
+// hard-links.
+//
+// Any existing directory of that name is removed first.  Every file
+// reported by the WiredTiger "backup:" cursor is then copied into it.
+// Returns the status of the first failure, or OK.  The backup cursor is
+// closed on every path once it has been opened.
+//
+// NOTE(review): the directory and file names are interpolated into shell
+// command lines ("rm -rf", "cp") without quoting; callers must not pass
+// names containing shell metacharacters.
+Status
+DbImpl::LiveBackup(const Slice& name)
+{
+  OperationContext *context = GetContext();
+  WT_SESSION *session = context->GetSession();
+  WT_CURSOR *cursor;
+  int len, ret, t_ret;
+  const char *filename;
+  const char *home = conn_->get_home(conn_);
+  // A Slice is a pointer/length pair with no guaranteed terminator; copy
+  // it so that "%s" formatting stops at the end of the name.
+  const std::string backup_name(name.data(), name.size());
+  char backup[MAXPATHLEN], buf[MAXPATHLEN * 2];
+
+  // If we couldn't open the backup cursor, we're done.
+  ret = session->open_cursor(
+      session, "backup:", NULL, NULL, &cursor);
+  if (ret != 0)
+    return (WiredTigerErrorToStatus(ret));
+
+  // Build the backup path once.  A truncated path must never reach
+  // "rm -rf", so treat truncation as an error.
+  len = snprintf(backup, sizeof(backup), "%s/backup-%s", home,
+      backup_name.c_str());
+  if (len < 0 || (size_t)len >= sizeof(backup))
+    ret = ENAMETOOLONG;
+
+  // Remove any old directory and create the backup directory.
+  // WT single-threads hot backups.  If we get here we already have
+  // the backup cursor open and we do not have to worry about other
+  // threads trying to remove and recreate the same directory out
+  // from under us.
+  if (ret == 0) {
+    // backup is shorter than MAXPATHLEN, so this always fits in buf.
+    snprintf(buf, sizeof(buf), "rm -rf %s", backup);
+    ret = system(buf);
+  }
+  // mkdir reports failure as -1; the actual reason is in errno.
+  if (ret == 0 && mkdir(backup, 0777) != 0)
+    ret = errno;
+
+  // Copy all files returned by backup cursor.
+  while (ret == 0 &&
+      (ret = cursor->next(cursor)) == 0 &&
+      (ret = cursor->get_key(cursor, &filename)) == 0) {
+    len = snprintf(buf, sizeof(buf), "cp %s/%s %s/%s",
+        home, filename, backup, filename);
+    if (len < 0 || (size_t)len >= sizeof(buf)) {
+      ret = ENAMETOOLONG;
+      break;
+    }
+    ret = system(buf);
+  }
+  // Running off the end of the backup cursor is the normal way out.
+  if (ret == WT_NOTFOUND)
+    ret = 0;
+  // Always release the backup cursor; keep the first error seen.
+  if ((t_ret = cursor->close(cursor)) != 0 && ret == 0)
+    ret = t_ret;
+
+  return (WiredTigerErrorToStatus(ret));
+}
+
+// Return an opaque timestamp that identifies the current point in time of the
+// database.  This timestamp may be subsequently presented to the
+// NewReplayIterator method to create a ReplayIterator.
+// The timestamp is the formatted LSN of the last modification record in
+// the log, obtained through a temporary replay iterator.
+void
+DbImpl::GetReplayTimestamp(std::string* timestamp)
+{
+  ReplayIteratorImpl *last = new ReplayIteratorImpl(GetContext());
+
+  last->SeekToLast();
+  timestamp->assign(last->GetTimestamp());
+  ReleaseReplayIterator(last);
+}
+
+// Set the lower bound for manual garbage collection.  This method only takes
+// effect when Options.manual_garbage_collection is true.
+// NOTE: this implementation is an empty stub; the timestamp argument is
+// accepted and ignored.
+void
+DbImpl::AllowGarbageCollectBeforeTimestamp(const std::string& timestamp)
+{
+}
+
+// Validate the timestamp
+// Returns true when a temporary replay iterator can be positioned at the
+// given timestamp (the literal strings "all" and "now" are accepted too).
+bool
+DbImpl::ValidateTimestamp(const std::string& timestamp)
+{
+  ReplayIteratorImpl *probe = new ReplayIteratorImpl(GetContext());
+
+  // The SeekTo function will handle "all" or "now".
+  probe->SeekTo(timestamp);
+  const bool valid = probe->Valid();
+  ReleaseReplayIterator(probe);
+  return valid;
+}
+
+// Compare two timestamps and return -1, 0, 1 for lt, eq, gt
+// Each timestamp is resolved by seeking a temporary replay iterator to it.
+// When either iterator cannot be positioned the result is 0.
+int
+DbImpl::CompareTimestamps(const std::string& lhs, const std::string& rhs)
+{
+  OperationContext *context = GetContext();
+  ReplayIteratorImpl *left = new ReplayIteratorImpl(context);
+  ReplayIteratorImpl *right = new ReplayIteratorImpl(context);
+
+  // The SeekTo function will handle "all" or "now".
+  left->SeekTo(lhs);
+  right->SeekTo(rhs);
+
+  int result = 0;
+  const bool both_valid = left->Valid() && right->Valid();
+  if (both_valid)
+    result = left->Compare(right);
+
+  ReleaseReplayIterator(left);
+  ReleaseReplayIterator(right);
+  return result;
+}
+
+// Return a ReplayIterator that returns every write operation performed after
+// the timestamp.
+// The new iterator is stored in *iter even when positioning failed; the
+// returned Status is the iterator's own status.  Give the iterator back
+// through ReleaseReplayIterator.
+Status
+DbImpl::GetReplayIterator(const std::string& timestamp,
+    ReplayIterator** iter)
+{
+  ReplayIteratorImpl *created =
+      new ReplayIteratorImpl(GetContext(), timestamp);
+
+  *iter = created;
+  return (created->status());
+}
+
+// Release a previously allocated replay iterator.
+// iter is deleted as the ReplayIteratorImpl it was created as.
+void
+DbImpl::ReleaseReplayIterator(ReplayIterator* iter)
+{
+  ReplayIteratorImpl *impl = static_cast<ReplayIteratorImpl *>(iter);
+
+  delete impl;
+}
diff --git a/src/third_party/wiredtiger/api/leveldb/hyperleveldb/AUTHORS b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/AUTHORS
new file mode 100644
index 00000000000..bf024aba6a8
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/AUTHORS
@@ -0,0 +1,15 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
+
+# Initial version authors:
+Jeffrey Dean <jeff@google.com>
+Sanjay Ghemawat <sanjay@google.com>
+
+# Partial list of contributors:
+Kevin Regan <kevin.d.regan@gmail.com>
+Johan Bilien <jobi@litl.com>
+
+# HyperLevelDB authors:
+Robert Escriva <robert@hyperdex.org>
diff --git a/src/third_party/wiredtiger/api/leveldb/hyperleveldb/LICENSE b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/LICENSE
new file mode 100644
index 00000000000..262b0af095d
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+Copyright (c) 2013-2014 The HyperLevelDB Authors. All rights reserved. (HyperLevelDB changes)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/third_party/wiredtiger/api/leveldb/hyperleveldb/replay_iterator.h b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/replay_iterator.h
new file mode 100644
index 00000000000..397acdfd889
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/hyperleveldb/replay_iterator.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2013 The HyperLevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_REPLAY_ITERATOR_H_
+#define STORAGE_LEVELDB_INCLUDE_REPLAY_ITERATOR_H_
+
+#include "leveldb_wt_config.h"
+
+#include "slice.h"
+#include "status.h"
+
+namespace leveldb {
+
+// Abstract interface for iterating over a stream of write operations
+// (key/value pairs and deletions). All accessors are pure virtual.
+class ReplayIterator {
+ public:
+ ReplayIterator();
+
+ // An iterator is either positioned at a deleted key, present key/value pair,
+ // or not valid. This method returns true iff the iterator is valid.
+ virtual bool Valid() = 0;
+
+ // Moves to the next entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the last entry in the source.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+ // Position at the first key in the source that is at or past target for
+ // this pass. Note that this is unlike the Seek call, as the ReplayIterator
+ // is unsorted.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or past target.
+ virtual void SkipTo(const Slice& target) = 0;
+ // NOTE(review): undocumented in the original; presumably positions the
+ // iterator at the last entry in the source -- confirm with the implementation.
+ virtual void SkipToLast() = 0;
+
+ // Return true if the current entry points to a key-value pair. If this
+ // returns false, it means the current entry is a deleted entry.
+ virtual bool HasValue() = 0;
+
+ // Return the key for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual Slice key() const = 0;
+
+ // Return the value for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: !AtEnd() && !AtStart()
+ // NOTE(review): AtEnd()/AtStart() are not declared by this class; the
+ // requirement presumably means Valid() -- confirm.
+ virtual Slice value() const = 0;
+
+ // If an error has occurred, return it. Else return an ok status.
+ virtual Status status() const = 0;
+
+ protected:
+ // The destructor is protected, so callers cannot delete the iterator
+ // directly: it must be released by giving it back to the DB.
+ virtual ~ReplayIterator();
+
+ private:
+ // No copying allowed
+ ReplayIterator(const ReplayIterator&);
+ void operator=(const ReplayIterator&);
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_REPLAY_ITERATOR_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/AUTHORS b/src/third_party/wiredtiger/api/leveldb/leveldb/AUTHORS
new file mode 100644
index 00000000000..27a9407e52f
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/AUTHORS
@@ -0,0 +1,8 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
+
+# Initial version authors:
+Jeffrey Dean <jeff@google.com>
+Sanjay Ghemawat <sanjay@google.com>
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/LICENSE b/src/third_party/wiredtiger/api/leveldb/leveldb/LICENSE
new file mode 100644
index 00000000000..8e80208cd72
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/db/dbformat.h b/src/third_party/wiredtiger/api/leveldb/leveldb/db/dbformat.h
new file mode 100644
index 00000000000..2c8a9d5f5a7
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/db/dbformat.h
@@ -0,0 +1,233 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
+#define STORAGE_LEVELDB_DB_FORMAT_H_
+
+#include <stdio.h>
+#include "leveldb_wt.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Grouping of constants. We may want to make some of these
+// parameters set via options.
+namespace config {
+// NOTE(review): presumably the number of levels in the level structure the
+// triggers below refer to -- confirm against the users of this constant.
+static const int kNumLevels = 7;
+
+// Level-0 compaction is started when we hit this many files.
+static const int kL0_CompactionTrigger = 4;
+
+// Soft limit on number of level-0 files. We slow down writes at this point.
+static const int kL0_SlowdownWritesTrigger = 8;
+
+// Maximum number of level-0 files. We stop writes at this point.
+static const int kL0_StopWritesTrigger = 12;
+
+// Maximum level to which a new compacted memtable is pushed if it
+// does not create overlap. We try to push to level 2 to avoid the
+// relatively expensive level 0=>1 compactions and to avoid some
+// expensive manifest file operations. We do not push all the way to
+// the largest level since that can generate a lot of wasted disk
+// space if the same key space is being repeatedly overwritten.
+static const int kMaxMemCompactLevel = 2;
+
+} // namespace config
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+//
+// Only kTypeDeletion and kTypeValue exist in a plain build; the remaining
+// enumerators are compiled in when HAVE_ROCKSDB is defined.
+enum ValueType {
+ kTypeDeletion = 0x0,
+ kTypeValue = 0x1
+#ifdef HAVE_ROCKSDB
+ // The leading comma continues the enumerator list after kTypeValue, so the
+ // enum is well-formed both with and without HAVE_ROCKSDB.
+ ,kTypeMerge = 0x2,
+ // Following types are used only in write ahead logs. They are not used in
+ // memtables or sst files:
+ kTypeLogData = 0x3,
+ kTypeColumnFamilyDeletion = 0x4,
+ kTypeColumnFamilyValue = 0x5,
+ kTypeColumnFamilyMerge = 0x6,
+ kMaxValue = 0x7F
+#endif
+};
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+// NOTE(review): when HAVE_ROCKSDB is defined the enum contains values larger
+// than kTypeValue, so kTypeValue is then not the highest-numbered ValueType;
+// confirm this is intended for that configuration.
+static const ValueType kValueTypeForSeek = kTypeValue;
+
+// Sequence numbers are carried in a 64-bit unsigned integer.
+typedef uint64_t SequenceNumber;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+// The largest sequence number is therefore 2^56 - 1.
+static const SequenceNumber kMaxSequenceNumber =
+ ((0x1ull << 56) - 1);
+
+// The components of an internal key in decoded form: the user key, the
+// sequence number and the value type (ParseInternalKey() fills one in).
+struct ParsedInternalKey {
+ Slice user_key;
+ SequenceNumber sequence;
+ ValueType type;
+
+ ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
+ // Build a parsed key directly from its three components.
+ ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+ : user_key(u), sequence(seq), type(t) { }
+ // Defined out of line; not visible in this header.
+ std::string DebugString() const;
+};
+
+// Number of bytes occupied by the encoding of "key": the user key bytes
+// followed by a fixed 8-byte trailer.
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  const size_t kTrailerSize = 8;
+  const size_t user_bytes = key.user_key.size();
+  return user_bytes + kTrailerSize;
+}
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+ const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result);
+
+// Strip the trailing 8 bytes from an internal key and return the remaining
+// user key portion.  The result aliases the storage of "internal_key".
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  const size_t total = internal_key.size();
+  assert(total >= 8);
+  return Slice(internal_key.data(), total - 8);
+}
+
+// Decode the trailing fixed 64-bit field of an internal key and return its
+// low 8 bits as a ValueType.
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  const size_t len = internal_key.size();
+  assert(len >= 8);
+  const uint64_t tag = DecodeFixed64(internal_key.data() + len - 8);
+  const unsigned char type_byte = tag & 0xff;
+  return static_cast<ValueType>(type_byte);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+ // Comparator applied to the user-key portion; stored as a raw pointer.
+ // NOTE(review): ownership is not visible here -- presumably not owned.
+ const Comparator* user_comparator_;
+ public:
+ explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
+ // The following virtuals are defined out of line.
+ virtual const char* Name() const;
+ virtual int Compare(const Slice& a, const Slice& b) const;
+ virtual void FindShortestSeparator(
+ std::string* start,
+ const Slice& limit) const;
+ virtual void FindShortSuccessor(std::string* key) const;
+
+ // Accessor for the wrapped user-key comparator.
+ const Comparator* user_comparator() const { return user_comparator_; }
+
+ // Convenience overload: compares the encoded forms of two InternalKeys.
+ int Compare(const InternalKey& a, const InternalKey& b) const;
+};
+
+// Filter policy wrapper that converts from internal keys to user keys
+class InternalFilterPolicy : public FilterPolicy {
+ private:
+ // The wrapped user-level policy; the pointer is fixed at construction.
+ const FilterPolicy* const user_policy_;
+ public:
+ explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
+ // The following virtuals are defined out of line.
+ virtual const char* Name() const;
+ virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
+ virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
+};
+
+// Internal keys should be carried around in this wrapper rather than in
+// plain strings, so that they are never compared with ordinary string
+// comparison where an InternalKeyComparator is required.
+class InternalKey {
+ private:
+  // Encoded form of the key; empty means "invalid".
+  std::string rep_;
+
+ public:
+  // Leave rep_ as empty to indicate it is invalid
+  InternalKey() { }
+
+  // Build the encoded key from its three components.
+  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+    const ParsedInternalKey parsed(user_key, s, t);
+    AppendInternalKey(&rep_, parsed);
+  }
+
+  // Replace the contents with the already-encoded bytes in "s".
+  void DecodeFrom(const Slice& s) {
+    rep_.assign(s.data(), s.size());
+  }
+
+  // Return the encoded bytes.  REQUIRES: the key is not empty.
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  // Return the user key portion of the encoded key.
+  Slice user_key() const {
+    return ExtractUserKey(rep_);
+  }
+
+  // Discard the current contents and re-encode from "p".
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  // Reset to the empty (invalid) state.
+  void Clear() {
+    rep_.clear();
+  }
+
+  std::string DebugString() const;
+};
+
+// Compare two InternalKey objects by forwarding their encoded forms to the
+// Slice-based Compare().
+inline int InternalKeyComparator::Compare(
+    const InternalKey& a, const InternalKey& b) const {
+  const Slice lhs = a.Encode();
+  const Slice rhs = b.Encode();
+  return Compare(lhs, rhs);
+}
+
+// Split "internal_key" into user key, sequence number and value type.
+// Returns false when the input is shorter than 8 bytes (leaving *result
+// untouched) or when the decoded type byte is greater than kTypeValue (in
+// which case *result has still been filled in).
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t len = internal_key.size();
+  if (len < 8) {
+    return false;
+  }
+
+  const char* base = internal_key.data();
+  const uint64_t packed = DecodeFixed64(base + len - 8);
+  const unsigned char type_byte = packed & 0xff;
+
+  // The low 8 bits carry the type, the remaining bits the sequence number.
+  result->user_key = Slice(base, len - 8);
+  result->sequence = packed >> 8;
+  result->type = static_cast<ValueType>(type_byte);
+  return (type_byte <= static_cast<unsigned char>(kTypeValue));
+}
+
+// A helper class useful for DBImpl::Get()
+//
+// Holds one buffer (laid out as described below) and exposes three
+// overlapping views of it through memtable_key(), internal_key() and
+// user_key().
+class LookupKey {
+ public:
+ // Initialize *this for looking up user_key at a snapshot with
+ // the specified sequence number.
+ LookupKey(const Slice& user_key, SequenceNumber sequence);
+
+ ~LookupKey();
+
+ // Return a key suitable for lookup in a MemTable.
+ // Covers the whole buffer, [start_, end_).
+ Slice memtable_key() const { return Slice(start_, end_ - start_); }
+
+ // Return an internal key (suitable for passing to an internal iterator)
+ // Covers [kstart_, end_): the user key plus the trailing tag.
+ Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
+
+ // Return the user key
+ // Covers [kstart_, end_ - 8): the trailing 8-byte tag is excluded.
+ Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+
+ private:
+ // We construct a char array of the form:
+ // klength varint32 <-- start_
+ // userkey char[klength] <-- kstart_
+ // tag uint64
+ // <-- end_
+ // The array is a suitable MemTable key.
+ // The suffix starting with "userkey" can be used as an InternalKey.
+ const char* start_;
+ const char* kstart_;
+ const char* end_;
+ // Inline buffer; the destructor only frees start_ when it does not point
+ // at this array.
+ char space_[200]; // Avoid allocation for short keys
+
+ // No copying allowed
+ LookupKey(const LookupKey&);
+ void operator=(const LookupKey&);
+};
+
+// Free the key buffer, but only when it is not the inline space_ array.
+inline LookupKey::~LookupKey() {
+  const bool uses_inline_buffer = (start_ == space_);
+  if (!uses_inline_buffer) {
+    delete[] start_;
+  }
+}
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_FORMAT_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/db/skiplist.h b/src/third_party/wiredtiger/api/leveldb/leveldb/db/skiplist.h
new file mode 100644
index 00000000000..af85be6d016
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/db/skiplist.h
@@ -0,0 +1,379 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress. Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed. This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+class Arena;
+
+// A skip list of Key values ordered by a Comparator functor. Nodes are
+// allocated from the Arena passed to the constructor.
+template<typename Key, class Comparator>
+class SkipList {
+ private:
+ struct Node;
+
+ public:
+ // Create a new SkipList object that will use "cmp" for comparing keys,
+ // and will allocate memory using "*arena". Objects allocated in the arena
+ // must remain allocated for the lifetime of the skiplist object.
+ explicit SkipList(Comparator cmp, Arena* arena);
+
+ // Insert key into the list.
+ // REQUIRES: nothing that compares equal to key is currently in the list.
+ void Insert(const Key& key);
+
+ // Returns true iff an entry that compares equal to key is in the list.
+ bool Contains(const Key& key) const;
+
+ // Iteration over the contents of a skip list
+ class Iterator {
+ public:
+ // Initialize an iterator over the specified list.
+ // The returned iterator is not valid.
+ explicit Iterator(const SkipList* list);
+
+ // Returns true iff the iterator is positioned at a valid node.
+ bool Valid() const;
+
+ // Returns the key at the current position.
+ // REQUIRES: Valid()
+ const Key& key() const;
+
+ // Advances to the next position.
+ // REQUIRES: Valid()
+ void Next();
+
+ // Advances to the previous position.
+ // REQUIRES: Valid()
+ void Prev();
+
+ // Advance to the first entry with a key >= target
+ void Seek(const Key& target);
+
+ // Position at the first entry in list.
+ // Final state of iterator is Valid() iff list is not empty.
+ void SeekToFirst();
+
+ // Position at the last entry in list.
+ // Final state of iterator is Valid() iff list is not empty.
+ void SeekToLast();
+
+ private:
+ // The list being iterated and the current node (NULL when not valid).
+ const SkipList* list_;
+ Node* node_;
+ // Intentionally copyable
+ };
+
+ private:
+ // Upper bound on the height of any node; RandomHeight() never exceeds it.
+ enum { kMaxHeight = 12 };
+
+ // Immutable after construction
+ Comparator const compare_;
+ Arena* const arena_; // Arena used for allocations of nodes
+
+ // Sentinel node allocated with kMaxHeight links; it is never returned by
+ // the iterator as an entry.
+ Node* const head_;
+
+ // Modified only by Insert(). Read racily by readers, but stale
+ // values are ok.
+ // The height is stored as a pointer-sized integer inside the AtomicPointer.
+ port::AtomicPointer max_height_; // Height of the entire list
+
+ // Read max_height_ without a barrier and convert it back to an int.
+ inline int GetMaxHeight() const {
+ return static_cast<int>(
+ reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
+ }
+
+ // Read/written only by Insert().
+ Random rnd_;
+
+ // Allocate a node with room for "height" links from the arena.
+ Node* NewNode(const Key& key, int height);
+ // Pick a height in [1, kMaxHeight] for a new node.
+ int RandomHeight();
+ bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+ // Return true if key is greater than the data stored in "n"
+ bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+ // Return the earliest node that comes at or after key.
+ // Return NULL if there is no such node.
+ //
+ // If prev is non-NULL, fills prev[level] with pointer to previous
+ // node at "level" for every level in [0..max_height_-1].
+ Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+ // Return the latest node with a key < key.
+ // Return head_ if there is no such node.
+ Node* FindLessThan(const Key& key) const;
+
+ // Return the last node in the list.
+ // Return head_ if list is empty.
+ Node* FindLast() const;
+
+ // No copying allowed
+ SkipList(const SkipList&);
+ void operator=(const SkipList&);
+};
+
+// Implementation details follow
+//
+// A single skip-list entry: an immutable key plus an array of forward
+// links, one per level the node takes part in.
+template<typename Key, class Comparator>
+struct SkipList<Key,Comparator>::Node {
+ explicit Node(const Key& k) : key(k) { }
+
+ Key const key;
+
+ // Accessors/mutators for links. Wrapped in methods so we can
+ // add the appropriate barriers as necessary.
+ Node* Next(int n) {
+ assert(n >= 0);
+ // Use an 'acquire load' so that we observe a fully initialized
+ // version of the returned Node.
+ return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+ }
+ void SetNext(int n, Node* x) {
+ assert(n >= 0);
+ // Use a 'release store' so that anybody who reads through this
+ // pointer observes a fully initialized version of the inserted node.
+ next_[n].Release_Store(x);
+ }
+
+ // No-barrier variants that can be safely used in a few locations.
+ Node* NoBarrier_Next(int n) {
+ assert(n >= 0);
+ return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+ }
+ void NoBarrier_SetNext(int n, Node* x) {
+ assert(n >= 0);
+ next_[n].NoBarrier_Store(x);
+ }
+
+ private:
+ // Array of length equal to the node height. next_[0] is lowest level link.
+ // Declared with a single element; NewNode() allocates space for the other
+ // height - 1 links directly behind the struct.
+ port::AtomicPointer next_[1];
+};
+
+// Allocate arena memory for a node with "height" links and construct the
+// node in place.  Node already embeds one link, so only height - 1 extra
+// link slots are added to the allocation.
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
+  const size_t bytes =
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1);
+  char* storage = arena_->AllocateAligned(bytes);
+  return new (storage) Node(key);
+}
+
+// Bind the iterator to "list"; it starts out not positioned on any node.
+template<typename Key, class Comparator>
+inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list)
+    : list_(list),
+      node_(NULL) {
+}
+
+// True iff the iterator currently points at a node.
+template<typename Key, class Comparator>
+inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
+  const bool positioned = (node_ != NULL);
+  return positioned;
+}
+
+// Key stored in the current node.  REQUIRES: Valid()
+template<typename Key, class Comparator>
+inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
+  assert(Valid());
+  const Node* current = node_;
+  return current->key;
+}
+
+// Step to the successor on the lowest level.  REQUIRES: Valid()
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Next() {
+  assert(Valid());
+  Node* successor = node_->Next(0);
+  node_ = successor;
+}
+
+// Step to the previous entry.  There are no explicit "prev" links, so the
+// list is searched for the last node whose key is less than the current
+// one; reaching head_ means there is no such entry and the iterator
+// becomes invalid.  REQUIRES: Valid()
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Prev() {
+  assert(Valid());
+  Node* before = list_->FindLessThan(node_->key);
+  if (before == list_->head_) {
+    before = NULL;
+  }
+  node_ = before;
+}
+
+// Position at the first node whose key is >= target (NULL if none).
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
+  Node** const no_prev = NULL;  // predecessors are not needed here
+  node_ = list_->FindGreaterOrEqual(target, no_prev);
+}
+
+// Position at the node that follows head_ on the lowest level.
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
+  Node* sentinel = list_->head_;
+  node_ = sentinel->Next(0);
+}
+
+// Position at the last node of the list; FindLast() returning head_ means
+// the list is empty and the iterator becomes invalid.
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
+  Node* last = list_->FindLast();
+  if (last == list_->head_) {
+    last = NULL;
+  }
+  node_ = last;
+}
+
+// Choose the height for a new node.  Starting at 1, the height grows by one
+// with probability 1 in kBranching per step and never exceeds kMaxHeight.
+template<typename Key, class Comparator>
+int SkipList<Key,Comparator>::RandomHeight() {
+  static const unsigned int kBranching = 4;
+  int height = 1;
+  for (;;) {
+    if (height >= kMaxHeight) {
+      break;
+    }
+    // Only draw a random number while the height may still grow.
+    if ((rnd_.Next() % kBranching) != 0) {
+      break;
+    }
+    ++height;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight);
+  return height;
+}
+
+// True iff "key" sorts strictly after the key stored in "n".
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  if (n == NULL) {
+    // NULL n is considered infinite
+    return false;
+  }
+  return compare_(n->key, key) < 0;
+}
+
+// Walk from the top level downwards and return the first node whose key is
+// at or after "key" (NULL if there is none).  When "prev" is non-NULL,
+// prev[level] receives the last node visited on each level.
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
+    const {
+  Node* cursor = head_;
+  for (int level = GetMaxHeight() - 1; ; ) {
+    Node* candidate = cursor->Next(level);
+    if (KeyIsAfterNode(key, candidate)) {
+      // Keep searching in this list
+      cursor = candidate;
+      continue;
+    }
+    if (prev != NULL) {
+      prev[level] = cursor;
+    }
+    if (level == 0) {
+      return candidate;
+    }
+    // Switch to next list
+    level--;
+  }
+}
+
+// Return the last node whose key is strictly less than "key", or head_ when
+// no node qualifies.
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
+  Node* cursor = head_;
+  int level = GetMaxHeight() - 1;
+  for (;;) {
+    assert(cursor == head_ || compare_(cursor->key, key) < 0);
+    Node* candidate = cursor->Next(level);
+    const bool can_advance =
+        (candidate != NULL) && (compare_(candidate->key, key) < 0);
+    if (can_advance) {
+      cursor = candidate;
+      continue;
+    }
+    if (level == 0) {
+      return cursor;
+    }
+    // Switch to next list
+    level--;
+  }
+}
+
+// Return the final node of the list, or head_ when the list is empty.
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindLast() const {
+  Node* cursor = head_;
+  int level = GetMaxHeight() - 1;
+  for (;;) {
+    Node* candidate = cursor->Next(level);
+    if (candidate != NULL) {
+      cursor = candidate;
+      continue;
+    }
+    // End of this level reached.
+    if (level == 0) {
+      return cursor;
+    }
+    // Switch to next list
+    level--;
+  }
+}
+
+// Build an empty list: head_ is a sentinel node with kMaxHeight links, all
+// of which start out NULL; the recorded list height starts at 1.
+template<typename Key, class Comparator>
+SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
+    : compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, kMaxHeight)),
+      max_height_(reinterpret_cast<void*>(1)),
+      rnd_(0xdeadbeef) {
+  int level = 0;
+  while (level < kMaxHeight) {
+    head_->SetNext(level, NULL);
+    ++level;
+  }
+}
+
+// Insert "key" into the list. The predecessors on every level are located
+// first, a random height is chosen, and the new node is then linked in
+// level by level starting at level 0.
+// REQUIRES: nothing that compares equal to key is currently in the list.
+template<typename Key, class Comparator>
+void SkipList<Key,Comparator>::Insert(const Key& key) {
+ // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+ // here since Insert() is externally synchronized.
+ // prev[i] will hold the node after which the new node is linked on level i.
+ Node* prev[kMaxHeight];
+ Node* x = FindGreaterOrEqual(key, prev);
+
+ // Our data structure does not allow duplicate insertion
+ assert(x == NULL || !Equal(key, x->key));
+
+ int height = RandomHeight();
+ if (height > GetMaxHeight()) {
+ // Levels above the current list height have no predecessor yet, so the
+ // new node is linked directly after head_ on those levels.
+ for (int i = GetMaxHeight(); i < height; i++) {
+ prev[i] = head_;
+ }
+ //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+ // It is ok to mutate max_height_ without any synchronization
+ // with concurrent readers. A concurrent reader that observes
+ // the new value of max_height_ will see either the old value of
+ // new level pointers from head_ (NULL), or a new value set in
+ // the loop below. In the former case the reader will
+ // immediately drop to the next level since NULL sorts after all
+ // keys. In the latter case the reader will use the new node.
+ max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+ }
+
+ x = NewNode(key, height);
+ for (int i = 0; i < height; i++) {
+ // NoBarrier_SetNext() suffices since we will add a barrier when
+ // we publish a pointer to "x" in prev[i].
+ x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
+ prev[i]->SetNext(i, x);
+ }
+}
+
+// True iff the list holds an entry that compares equal to "key".
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::Contains(const Key& key) const {
+  Node* hit = FindGreaterOrEqual(key, NULL);
+  return (hit != NULL) && Equal(key, hit->key);
+}
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch.cc
new file mode 100644
index 00000000000..0a11cb10f33
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+// sequence: fixed64
+// count: fixed32
+// data: record[count]
+// record :=
+// kTypeValue varstring varstring |
+// kTypeDeletion varstring
+// varstring :=
+// len: varint32
+// data: uint8[len]
+
+#include "leveldb_wt.h"
+
+#include "db/write_batch_internal.h"
+
+namespace leveldb {
+
+// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+static const size_t kHeader = 12;
+
+// Construct an empty batch: Clear() leaves rep_ holding only the
+// kHeader-byte header.
+WriteBatch::WriteBatch() {
+ Clear();
+}
+
+// Empty body: nothing is released explicitly here.
+WriteBatch::~WriteBatch() { }
+
+// Out-of-line definition of the Handler destructor; the body is empty.
+WriteBatch::Handler::~Handler() { }
+
+// Reset the batch to an empty state: rep_ ends up holding exactly kHeader
+// zero bytes (sequence number 0, count 0).
+void WriteBatch::Clear() {
+  rep_.assign(kHeader, '\0');
+}
+
+// Replay every record of the batch through "handler".  Each record starts
+// with a one-byte tag followed by a length-prefixed key and, for puts, a
+// length-prefixed value.  Returns a Corruption status when the buffer is
+// too small, a record cannot be decoded, a tag is unknown, or the number of
+// records differs from the count stored in the header.
+Status WriteBatch::Iterate(Handler* handler) const {
+  Slice remaining(rep_);
+  if (remaining.size() < kHeader) {
+    return Status::Corruption("malformed WriteBatch (too small)");
+  }
+  remaining.remove_prefix(kHeader);
+
+  int seen = 0;
+  Slice key, value;
+  for (; !remaining.empty(); ) {
+    ++seen;
+    const char tag = remaining[0];
+    remaining.remove_prefix(1);
+
+    if (tag == kTypeValue) {
+      if (!GetLengthPrefixedSlice(&remaining, &key) ||
+          !GetLengthPrefixedSlice(&remaining, &value)) {
+        return Status::Corruption("bad WriteBatch Put");
+      }
+      handler->Put(key, value);
+    } else if (tag == kTypeDeletion) {
+      if (!GetLengthPrefixedSlice(&remaining, &key)) {
+        return Status::Corruption("bad WriteBatch Delete");
+      }
+      handler->Delete(key);
+    } else {
+      return Status::Corruption("unknown WriteBatch tag");
+    }
+  }
+
+  if (seen == WriteBatchInternal::Count(this)) {
+    return Status::OK();
+  }
+  return Status::Corruption("WriteBatch has wrong count");
+}
+
+// Read the entry count: a fixed 32-bit field at byte offset 8 of rep_.
+int WriteBatchInternal::Count(const WriteBatch* b) {
+  const char* count_field = b->rep_.data() + 8;
+  return DecodeFixed32(count_field);
+}
+
+// Store "n" as the entry count: a fixed 32-bit field at byte offset 8.
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+  char* count_field = &b->rep_[8];
+  EncodeFixed32(count_field, n);
+}
+
+// Record a put of key->value: bump the entry count, then append the
+// kTypeValue tag followed by the length-prefixed key and value.
+void WriteBatch::Put(const Slice& key, const Slice& value) {
+  const int updated = WriteBatchInternal::Count(this) + 1;
+  WriteBatchInternal::SetCount(this, updated);
+
+  const char tag = static_cast<char>(kTypeValue);
+  rep_.push_back(tag);
+  PutLengthPrefixedSlice(&rep_, key);
+  PutLengthPrefixedSlice(&rep_, value);
+}
+
+// Record a deletion of "key": bump the entry count, then append the
+// kTypeDeletion tag followed by the length-prefixed key.
+void WriteBatch::Delete(const Slice& key) {
+  const int updated = WriteBatchInternal::Count(this) + 1;
+  WriteBatchInternal::SetCount(this, updated);
+
+  const char tag = static_cast<char>(kTypeDeletion);
+  rep_.push_back(tag);
+  PutLengthPrefixedSlice(&rep_, key);
+}
+
+// Replace the whole representation of "b" with a copy of "contents".
+// REQUIRES: contents holds at least the kHeader-byte header.
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  const size_t len = contents.size();
+  assert(len >= kHeader);
+  b->rep_.assign(contents.data(), len);
+}
+
+// Append the records of "src" to "dst" and add src's entry count to dst's.
+// REQUIRES: both batches hold at least the kHeader-byte header.
+void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
+  // Check the preconditions before Count()/SetCount() touch the header
+  // bytes; previously the src check ran only after its header had been read.
+  assert(dst->rep_.size() >= kHeader);
+  assert(src->rep_.size() >= kHeader);
+  SetCount(dst, Count(dst) + Count(src));
+  // Skip src's header: only its records are copied.
+  dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);
+}
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch_internal.h b/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch_internal.h
new file mode 100644
index 00000000000..c8421cce124
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/db/write_batch_internal.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+
+#include "leveldb_wt.h"
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+// Several methods access batch->rep_ directly.
+class WriteBatchInternal {
+ public:
+#ifdef HAVE_ROCKSDB
+ // These overloads are only declared when HAVE_ROCKSDB is defined.
+ // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
+ static void Put(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static void Put(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static void Delete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static void Merge(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+#endif
+ // Return the number of entries in the batch.
+ static int Count(const WriteBatch* batch);
+
+ // Set the count for the number of entries in the batch.
+ static void SetCount(WriteBatch* batch, int n);
+
+ // Return a Slice over the batch's representation (rep_); no copy is made
+ // here.
+ static Slice Contents(const WriteBatch* batch) {
+ return Slice(batch->rep_);
+ }
+
+ // Return the size in bytes of the batch's representation.
+ static size_t ByteSize(const WriteBatch* batch) {
+ return batch->rep_.size();
+ }
+
+ // Replace the batch's representation with "contents".
+ static void SetContents(WriteBatch* batch, const Slice& contents);
+
+ // Append the entries of "src" to "dst".
+ static void Append(WriteBatch* dst, const WriteBatch* src);
+};
+
+} // namespace leveldb
+
+
+#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/cache.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/cache.h
new file mode 100644
index 00000000000..94be8e919a8
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/cache.h
@@ -0,0 +1,110 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values. It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads. It may automatically evict entries to make room
+// for new entries. Values have a specified charge against the cache
+// capacity. For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided. Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
+#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include <memory>
+#include <stdint.h>
+#include "slice.h"
+
+namespace leveldb {
+
+class Cache;
+
+// Create a new cache with a fixed size capacity. This implementation
+// of Cache uses a least-recently-used eviction policy.
+extern Cache* NewLRUCache(size_t capacity);
+#ifdef HAVE_ROCKSDB
+extern Cache* NewLRUCache(size_t capacity, int numSharedBits);
+extern Cache* NewLRUCache(size_t capacity, int numSharedBits,
+ int removeScanCountLimit);
+#endif
+
+class Cache {
+ public:
+ Cache() { }
+
+ // Destroys all existing entries by calling the "deleter"
+ // function that was passed to the constructor.
+ virtual ~Cache();
+
+ // Opaque handle to an entry stored in the cache.
+ struct Handle { };
+
+ // Insert a mapping from key->value into the cache and assign it
+ // the specified charge against the total cache capacity.
+ //
+ // Returns a handle that corresponds to the mapping. The caller
+ // must call this->Release(handle) when the returned mapping is no
+ // longer needed.
+ //
+ // When the inserted entry is no longer needed, the key and
+ // value will be passed to "deleter".
+ virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value)) = 0;
+
+ // If the cache has no mapping for "key", returns NULL.
+ //
+ // Else return a handle that corresponds to the mapping. The caller
+ // must call this->Release(handle) when the returned mapping is no
+ // longer needed.
+ virtual Handle* Lookup(const Slice& key) = 0;
+
+ // Release a mapping returned by a previous Insert() or Lookup().
+ // REQUIRES: handle must not have been released yet.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual void Release(Handle* handle) = 0;
+
+ // Return the value encapsulated in a handle returned by a
+ // successful Lookup().
+ // REQUIRES: handle must not have been released yet.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual void* Value(Handle* handle) = 0;
+
+ // If the cache contains entry for key, erase it. Note that the
+ // underlying entry will be kept around until all existing handles
+ // to it have been released.
+ virtual void Erase(const Slice& key) = 0;
+
+ // Return a new numeric id. May be used by multiple clients who are
+ // sharing the same cache to partition the key space. Typically the
+ // client will allocate a new id at startup and prepend the id to
+ // its cache keys.
+ virtual uint64_t NewId() = 0;
+
+ private:
+ void LRU_Remove(Handle* e);
+ void LRU_Append(Handle* e);
+ void Unref(Handle* e);
+
+ struct Rep;
+ Rep* rep_;
+
+ // No copying allowed
+ Cache(const Cache&);
+ void operator=(const Cache&);
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_CACHE_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/comparator.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/comparator.h
new file mode 100644
index 00000000000..78d83a4d08e
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/comparator.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include <stdint.h>
+#include <string>
+
+namespace leveldb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database. A Comparator implementation
+// must be thread-safe since leveldb may invoke its methods concurrently
+// from multiple threads.
+class Comparator {
+ public:
+ virtual ~Comparator();
+
+ // Three-way comparison. Returns value:
+ // < 0 iff "a" < "b",
+ // == 0 iff "a" == "b",
+ // > 0 iff "a" > "b"
+ virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+ // The name of the comparator. Used to check for comparator
+ // mismatches (i.e., a DB created with one comparator is
+ // accessed using a different comparator).
+ //
+ // The client of this package should switch to a new name whenever
+ // the comparator implementation changes in a way that will cause
+ // the relative ordering of any two keys to change.
+ //
+ // Names starting with "leveldb." are reserved and should not be used
+ // by any clients of this package.
+ virtual const char* Name() const = 0;
+
+ // Advanced functions: these are used to reduce the space requirements
+ // for internal data structures like index blocks.
+
+ // If *start < limit, changes *start to a short string in [start,limit).
+ // Simple comparator implementations may return with *start unchanged,
+ // i.e., an implementation of this method that does nothing is correct.
+ virtual void FindShortestSeparator(
+ std::string* start,
+ const Slice& limit) const = 0;
+
+ // Changes *key to a short string >= *key.
+ // Simple comparator implementations may return with *key unchanged,
+ // i.e., an implementation of this method that does nothing is correct.
+ virtual void FindShortSuccessor(std::string* key) const = 0;
+
+#ifdef HAVE_HYPERLEVELDB
+ // If unsure, return 0;
+ virtual uint64_t KeyNum(const Slice& key) const;
+#endif
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering. The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/db.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/db.h
new file mode 100644
index 00000000000..df8fcbbe9f8
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/db.h
@@ -0,0 +1,350 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
+#define STORAGE_LEVELDB_INCLUDE_DB_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include <memory>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include "iterator.h"
+#include "options.h"
+#include "write_batch.h"
+#ifdef HAVE_HYPERLEVELDB
+#include "replay_iterator.h"
+#endif
+
+namespace leveldb {
+
+// Update Makefile if you change these
+static const int kMajorVersion = 1;
+static const int kMinorVersion = 17;
+
+struct ReadOptions;
+struct WriteOptions;
+class WriteBatch;
+
+#ifdef HAVE_ROCKSDB
+struct FlushOptions;
+class ColumnFamilyHandle {
+ public:
+ virtual ~ColumnFamilyHandle() {}
+};
+extern const std::string kDefaultColumnFamilyName;
+
+struct ColumnFamilyDescriptor {
+ std::string name;
+ ColumnFamilyOptions options;
+ ColumnFamilyDescriptor()
+ : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
+ ColumnFamilyDescriptor(const std::string& _name,
+ const ColumnFamilyOptions& _options)
+ : name(_name), options(_options) {}
+};
+#endif
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+class Snapshot {
+ protected:
+ virtual ~Snapshot();
+};
+
+// A range of keys
+struct Range {
+ Slice start; // Included in the range
+ Slice limit; // Not included in the range
+
+ Range() { }
+ Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+#if HAVE_BASHOLEVELDB
+// Abstract holder for a DB value.
+// This allows callers to manage their own value buffers and have
+// DB values copied directly into those buffers.
+class Value {
+ public:
+ virtual Value& assign(const char* data, size_t size) = 0;
+
+ protected:
+ virtual ~Value();
+};
+#endif
+
+// A DB is a persistent ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+class DB {
+ public:
+ // Open the database with the specified "name".
+ // Stores a pointer to a heap-allocated database in *dbptr and returns
+ // OK on success.
+ // Stores NULL in *dbptr and returns a non-OK status on error.
+ // Caller should delete *dbptr when it is no longer needed.
+ static Status Open(const Options& options,
+ const std::string& name,
+ DB** dbptr);
+
+#ifdef HAVE_ROCKSDB
+ // Open DB with column families.
+ // db_options specify database specific options
+ // column_families is the vector of all column families in the database,
+ // containing column family name and options. You need to open ALL column
+ // families in the database. To get the list of column families, you can use
+ // ListColumnFamilies(). Also, you can open only a subset of column families
+ // for read-only access.
+ // The default column family name is 'default' and it's stored
+ // in rocksdb::kDefaultColumnFamilyName.
+ // If everything is OK, handles will on return be the same size
+ // as column_families --- handles[i] will be a handle that you
+ // will use to operate on column family column_families[i]
+ static Status Open(const Options& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+ // ListColumnFamilies will open the DB specified by argument name
+ // and return the list of all column families in that DB
+ // through column_families argument. The ordering of
+ // column families in column_families is unspecified.
+ static Status ListColumnFamilies(const Options& db_options,
+ const std::string& name,
+ std::vector<std::string>* column_families);
+
+ // Create a column_family and return the handle of column family
+ // through the argument handle.
+ virtual Status CreateColumnFamily(const Options& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) = 0;
+
+ // Drop a column family specified by column_family handle. This call
+ // only records a drop record in the manifest and prevents the column
+ // family from flushing and compacting.
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) = 0;
+
+ // Set the database entry for "key" to "value".
+ // Returns OK on success, and a non-OK status on error.
+ // Note: consider setting options.sync = true.
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+
+ // Remove the database entry (if any) for "key". Returns OK on
+ // success, and a non-OK status on error. It is not an error if "key"
+ // did not exist in the database.
+ // Note: consider setting options.sync = true.
+ virtual Status Delete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) = 0;
+
+ // Merge the database entry for "key" with "value". Returns OK on success,
+ // and a non-OK status on error. The semantics of this operation is
+ // determined by the user provided merge_operator when opening DB.
+ // Note: consider setting options.sync = true.
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) = 0;
+
+ // May return some other Status on an error.
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value) = 0;
+
+ // If keys[i] does not exist in the database, then the i'th returned
+ // status will be one for which Status::IsNotFound() is true, and
+ // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+ // the i'th returned status will have Status::ok() true, and (*values)[i]
+ // will store the value associated with keys[i].
+ //
+ // (*values) will always be resized to be the same size as (keys).
+ // Similarly, the number of returned statuses will be the number of keys.
+ // Note: keys will not be "de-duplicated". Duplicate keys will return
+ // duplicate values in order.
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+ // If the key definitely does not exist in the database, then this method
+ // returns false, else true. If the caller wants to obtain value when the key
+ // is found in memory, a bool for 'value_found' must be passed. 'value_found'
+ // will be true on return if value has been set properly.
+ // This check is potentially lighter-weight than invoking DB::Get(). One way
+ // to make this lighter weight is to avoid doing any IOs.
+ // Default implementation here returns true and sets 'value_found' to false
+ virtual bool KeyMayExist(const ReadOptions&,
+ ColumnFamilyHandle*, const Slice&,
+ std::string*, bool* value_found = NULL) {
+ if (value_found != NULL) {
+ *value_found = false;
+ }
+ return true;
+ }
+
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) = 0;
+
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) = 0;
+
+ // Flush all mem-table data.
+ virtual Status Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family) = 0;
+#endif
+
+ DB() { }
+ virtual ~DB();
+
+ // Set the database entry for "key" to "value". Returns OK on success,
+ // and a non-OK status on error.
+ // Note: consider setting options.sync = true.
+ virtual Status Put(const WriteOptions& options,
+ const Slice& key,
+ const Slice& value) = 0;
+
+ // Remove the database entry (if any) for "key". Returns OK on
+ // success, and a non-OK status on error. It is not an error if "key"
+ // did not exist in the database.
+ // Note: consider setting options.sync = true.
+ virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
+
+ // Apply the specified updates to the database.
+ // Returns OK on success, non-OK on failure.
+ // Note: consider setting options.sync = true.
+ virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+ // If the database contains an entry for "key" store the
+ // corresponding value in *value and return OK.
+ //
+ // If there is no entry for "key" leave *value unchanged and return
+ // a status for which Status::IsNotFound() returns true.
+ //
+ // May return some other Status on an error.
+ virtual Status Get(const ReadOptions& options,
+ const Slice& key, std::string* value) = 0;
+#if HAVE_BASHOLEVELDB
+ virtual Status Get(const ReadOptions& options,
+ const Slice& key, Value* value) = 0;
+#endif
+
+ // Return a heap-allocated iterator over the contents of the database.
+ // The result of NewIterator() is initially invalid (caller must
+ // call one of the Seek methods on the iterator before using it).
+ //
+ // Caller should delete the iterator when it is no longer needed.
+ // The returned iterator should be deleted before this db is deleted.
+ virtual Iterator* NewIterator(const ReadOptions& options) = 0;
+
+ // Return a handle to the current DB state. Iterators created with
+ // this handle will all observe a stable snapshot of the current DB
+ // state. The caller must call ReleaseSnapshot(result) when the
+ // snapshot is no longer needed.
+ virtual const Snapshot* GetSnapshot() = 0;
+
+ // Release a previously acquired snapshot. The caller must not
+ // use "snapshot" after this call.
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+ // DB implementations can export properties about their state
+ // via this method. If "property" is a valid property understood by this
+ // DB implementation, fills "*value" with its current value and returns
+ // true. Otherwise returns false.
+ //
+ //
+ // Valid property names include:
+ //
+ // "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
+ // where <N> is an ASCII representation of a level number (e.g. "0").
+ // "leveldb.stats" - returns a multi-line string that describes statistics
+ // about the internal operation of the DB.
+ // "leveldb.sstables" - returns a multi-line string that describes all
+ // of the sstables that make up the db contents.
+ virtual bool GetProperty(const Slice& property, std::string* value) = 0;
+
+ // For each i in [0,n-1], store in "sizes[i]", the approximate
+ // file system space used by keys in "[range[i].start .. range[i].limit)".
+ //
+ // Note that the returned sizes measure file system space usage, so
+ // if the user data compresses by a factor of ten, the returned
+ // sizes will be one-tenth the size of the corresponding user data size.
+ //
+ // The results may not include the sizes of recently written data.
+ virtual void GetApproximateSizes(const Range* range, int n,
+ uint64_t* sizes) = 0;
+
+ // Compact the underlying storage for the key range [*begin,*end].
+ // In particular, deleted and overwritten versions are discarded,
+ // and the data is rearranged to reduce the cost of operations
+ // needed to access the data. This operation should typically only
+ // be invoked by users who understand the underlying implementation.
+ //
+ // begin==NULL is treated as a key before all keys in the database.
+ // end==NULL is treated as a key after all keys in the database.
+ // Therefore the following call will compact the entire database:
+ // db->CompactRange(NULL, NULL);
+ virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
+
+ // Suspends the background compaction thread. This method
+ // returns once suspended.
+ virtual void SuspendCompactions() = 0;
+ // Resumes a suspended background compaction thread.
+ virtual void ResumeCompactions() = 0;
+
+#ifdef HAVE_HYPERLEVELDB
+ // Create a live backup of a live LevelDB instance.
+ // The backup is stored in a directory named "backup-<name>" under the top
+ // level of the open LevelDB database. The implementation is permitted, and
+ // even encouraged, to improve the performance of this call through
+ // hard-links.
+ virtual Status LiveBackup(const Slice& name) = 0;
+
+ // Return an opaque timestamp that identifies the current point in time of the
+ // database. This timestamp may be subsequently presented to the
+ // NewReplayIterator method to create a ReplayIterator.
+ virtual void GetReplayTimestamp(std::string* timestamp) = 0;
+
+ // Set the lower bound for manual garbage collection. This method only takes
+ // effect when Options.manual_garbage_collection is true.
+ virtual void AllowGarbageCollectBeforeTimestamp(const std::string& timestamp) = 0;
+
+ // Validate the timestamp
+ virtual bool ValidateTimestamp(const std::string& timestamp) = 0;
+
+ // Compare two timestamps and return -1, 0, 1 for lt, eq, gt
+ virtual int CompareTimestamps(const std::string& lhs, const std::string& rhs) = 0;
+
+ // Return a ReplayIterator that returns every write operation performed after
+ // the timestamp.
+ virtual Status GetReplayIterator(const std::string& timestamp,
+ ReplayIterator** iter) = 0;
+
+ // Release a previously allocated replay iterator.
+ virtual void ReleaseReplayIterator(ReplayIterator* iter) = 0;
+#endif
+ private:
+ // No copying allowed
+ DB(const DB&);
+ void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+
+}; // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_DB_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/env.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/env.h
new file mode 100644
index 00000000000..4ad67d36fea
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/env.h
@@ -0,0 +1,349 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the leveldb implementation to access
+// operating system functionality like the filesystem etc. Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
+#define STORAGE_LEVELDB_INCLUDE_ENV_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include <string>
+#include <vector>
+#include <stdarg.h>
+#include <stdint.h>
+#if HAVE_BASHOLEVELDB
+#include "perf_count.h"
+#endif
+#include "status.h"
+
+namespace leveldb {
+
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+
+class Env {
+ public:
+ Env() { }
+ virtual ~Env();
+
+ // Return a default environment suitable for the current operating
+ // system. Sophisticated users may wish to provide their own Env
+ // implementation instead of relying on this default environment.
+ //
+ // The result of Default() belongs to leveldb and must never be deleted.
+ static Env* Default();
+
+ // Create a brand new sequentially-readable file with the specified name.
+ // On success, stores a pointer to the new file in *result and returns OK.
+ // On failure stores NULL in *result and returns non-OK. If the file does
+ // not exist, returns a non-OK status.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewSequentialFile(const std::string& fname,
+ SequentialFile** result) = 0;
+
+ // Create a brand new random access read-only file with the
+ // specified name. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores NULL in *result and
+ // returns non-OK. If the file does not exist, returns a non-OK
+ // status.
+ //
+ // The returned file may be concurrently accessed by multiple threads.
+ virtual Status NewRandomAccessFile(const std::string& fname,
+ RandomAccessFile** result) = 0;
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores NULL in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewWritableFile(const std::string& fname,
+ WritableFile** result) = 0;
+
+ // Returns true iff the named file exists.
+ virtual bool FileExists(const std::string& fname) = 0;
+
+ // Store in *result the names of the children of the specified directory.
+ // The names are relative to "dir".
+ // Original contents of *result are dropped.
+ virtual Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) = 0;
+
+ // Delete the named file.
+ virtual Status DeleteFile(const std::string& fname) = 0;
+
+ // Create the specified directory.
+ virtual Status CreateDir(const std::string& dirname) = 0;
+
+ // Delete the specified directory.
+ virtual Status DeleteDir(const std::string& dirname) = 0;
+
+ // Store the size of fname in *file_size.
+ virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+ // Rename file src to target.
+ virtual Status RenameFile(const std::string& src,
+ const std::string& target) = 0;
+
+ // Lock the specified file. Used to prevent concurrent access to
+ // the same db by multiple processes. On failure, stores NULL in
+ // *lock and returns non-OK.
+ //
+ // On success, stores a pointer to the object that represents the
+ // acquired lock in *lock and returns OK. The caller should call
+ // UnlockFile(*lock) to release the lock. If the process exits,
+ // the lock will be automatically released.
+ //
+ // If somebody else already holds the lock, finishes immediately
+ // with a failure. I.e., this call does not wait for existing locks
+ // to go away.
+ //
+ // May create the named file if it does not already exist.
+ virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+ // Release the lock acquired by a previous successful call to LockFile.
+ // REQUIRES: lock was returned by a successful LockFile() call
+ // REQUIRES: lock has not already been unlocked.
+ virtual Status UnlockFile(FileLock* lock) = 0;
+
+ // Arrange to run "(*function)(arg)" once in a background thread.
+ //
+ // "function" may run in an unspecified thread. Multiple functions
+ // added to the same Env may run concurrently in different threads.
+ // I.e., the caller may not assume that background work items are
+ // serialized.
+ virtual void Schedule(
+ void (*function)(void* arg),
+ void* arg) = 0;
+
+ // Start a new thread, invoking "function(arg)" within the new thread.
+ // When "function(arg)" returns, the thread will be destroyed.
+ virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+ // *path is set to a temporary directory that can be used for testing. It may
+ // or may not have just been created. The directory may or may not differ
+ // between runs of the same process, but subsequent calls will return the
+ // same directory.
+ virtual Status GetTestDirectory(std::string* path) = 0;
+
+ // Create and return a log file for storing informational messages.
+ virtual Status NewLogger(const std::string& fname, Logger** result) = 0;
+
+ // Returns the number of micro-seconds since some fixed point in time. Only
+ // useful for computing deltas of time.
+ virtual uint64_t NowMicros() = 0;
+
+ // Sleep/delay the thread for the prescribed number of micro-seconds.
+ virtual void SleepForMicroseconds(int micros) = 0;
+
+#if HAVE_BASHOLEVELDB
+ // Riak specific: Where supported, give count of background jobs pending.
+ virtual int GetBackgroundBacklog() const {return(0);};
+
+ // Riak specific: Get object that is tracking various software counters
+ virtual PerformanceCounters * GetPerformanceCounters() {return(gPerfCounters);}
+#endif
+
+ private:
+ // No copying allowed
+ Env(const Env&);
+ void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+ SequentialFile() { }
+ virtual ~SequentialFile();
+
+ // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+ // written by this routine. Sets "*result" to the data that was
+ // read (including if fewer than "n" bytes were successfully read).
+ // May set "*result" to point at data in "scratch[0..n-1]", so
+ // "scratch[0..n-1]" must be live when "*result" is used.
+ // If an error was encountered, returns a non-OK status.
+ //
+ // REQUIRES: External synchronization
+ virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+ // Skip "n" bytes from the file. This is guaranteed to be no
+ // slower than reading the same data, but may be faster.
+ //
+ // If end of file is reached, skipping will stop at the end of the
+ // file, and Skip will return OK.
+ //
+ // REQUIRES: External synchronization
+ virtual Status Skip(uint64_t n) = 0;
+
+ private:
+ // No copying allowed
+ SequentialFile(const SequentialFile&);
+ void operator=(const SequentialFile&);
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+ RandomAccessFile() { }
+ virtual ~RandomAccessFile();
+
+ // Read up to "n" bytes from the file starting at "offset".
+ // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+ // to the data that was read (including if fewer than "n" bytes were
+ // successfully read). May set "*result" to point at data in
+ // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+ // "*result" is used. If an error was encountered, returns a non-OK
+ // status.
+ //
+ // Safe for concurrent use by multiple threads.
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const = 0;
+
+ private:
+ // No copying allowed
+ RandomAccessFile(const RandomAccessFile&);
+ void operator=(const RandomAccessFile&);
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+ WritableFile() { }
+ virtual ~WritableFile();
+
+ virtual Status Append(const Slice& data) = 0;
+ virtual Status Close() = 0;
+ virtual Status Flush() = 0;
+ virtual Status Sync() = 0;
+
+ private:
+ // No copying allowed
+ WritableFile(const WritableFile&);
+ void operator=(const WritableFile&);
+};
+
+// An interface for writing log messages.
+class Logger {
+ public:
+ Logger() { }
+ virtual ~Logger();
+
+ // Write an entry to the log file with the specified format.
+ virtual void Logv(const char* format, va_list ap) = 0;
+
+ private:
+ // No copying allowed
+ Logger(const Logger&);
+ void operator=(const Logger&);
+};
+
+
+// Identifies a locked file.
+class FileLock {
+ public:
+ FileLock() { }
+ virtual ~FileLock();
+ private:
+ // No copying allowed
+ FileLock(const FileLock&);
+ void operator=(const FileLock&);
+};
+
+// Log the specified data to *info_log if info_log is non-NULL.
+extern void Log(Logger* info_log, const char* format, ...)
+# if defined(__GNUC__) || defined(__clang__)
+ __attribute__((__format__ (__printf__, 2, 3)))
+# endif
+ ;
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+ const std::string& fname);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+ std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+ // Initialize an EnvWrapper that delegates all calls to *t
+ explicit EnvWrapper(Env* t) : target_(t) { }
+ virtual ~EnvWrapper();
+
+ // Return the target to which this Env forwards all calls
+ Env* target() const { return target_; }
+
+ // The following text is boilerplate that forwards all methods to target()
+ Status NewSequentialFile(const std::string& f, SequentialFile** r) {
+ return target_->NewSequentialFile(f, r);
+ }
+ Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
+ return target_->NewRandomAccessFile(f, r);
+ }
+ Status NewWritableFile(const std::string& f, WritableFile** r) {
+ return target_->NewWritableFile(f, r);
+ }
+ bool FileExists(const std::string& f) { return target_->FileExists(f); }
+ Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+ return target_->GetChildren(dir, r);
+ }
+ Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+ Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+ Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+ Status GetFileSize(const std::string& f, uint64_t* s) {
+ return target_->GetFileSize(f, s);
+ }
+ Status RenameFile(const std::string& s, const std::string& t) {
+ return target_->RenameFile(s, t);
+ }
+ Status LockFile(const std::string& f, FileLock** l) {
+ return target_->LockFile(f, l);
+ }
+ Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+ void Schedule(void (*f)(void*), void* a) {
+ return target_->Schedule(f, a);
+ }
+ void StartThread(void (*f)(void*), void* a) {
+ return target_->StartThread(f, a);
+ }
+ virtual Status GetTestDirectory(std::string* path) {
+ return target_->GetTestDirectory(path);
+ }
+ virtual Status NewLogger(const std::string& fname, Logger** result) {
+ return target_->NewLogger(fname, result);
+ }
+ uint64_t NowMicros() {
+ return target_->NowMicros();
+ }
+ void SleepForMicroseconds(int micros) {
+ target_->SleepForMicroseconds(micros);
+ }
+ private:
+ Env* target_;
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/filter_policy.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/filter_policy.h
new file mode 100644
index 00000000000..e434ef4b241
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/filter_policy.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys. These filters are stored in leveldb and are consulted
+// automatically by leveldb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks from a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#ifndef STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
+#define STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include <string>
+
+namespace leveldb {
+
+class Slice;
+
+class FilterPolicy { // abstract interface: Name, CreateFilter and KeyMayMatch are pure virtual
+ public:
+ virtual ~FilterPolicy(); // declared only; no inline body in this header
+
+ // Return the name of this policy. Note that if the filter encoding
+ // changes in an incompatible way, the name returned by this method
+ // must be changed. Otherwise, old incompatible filters may be
+ // passed to methods of this type.
+ virtual const char* Name() const = 0;
+
+ // keys[0,n-1] contains a list of keys (potentially with duplicates)
+ // that are ordered according to the user-supplied comparator.
+ // Append a filter that summarizes keys[0,n-1] to *dst.
+ //
+ // Warning: do not change the initial contents of *dst. Instead,
+ // append the newly constructed filter to *dst.
+ virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
+ const = 0;
+
+ // "filter" contains the data appended by a preceding call to
+ // CreateFilter() on this class. This method must return true if
+ // the key was in the list of keys passed to CreateFilter().
+ // This method may return true or false if the key was not on the
+ // list, but it should aim to return false with a high probability.
+ virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key. A good value for bits_per_key
+// is 10, which yields a filter with ~ 1% false positive rate.
+//
+// Callers must delete the result after any database that is using the
+// result has been closed.
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys. For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
+#if HAVE_BASHOLEVELDB
+extern const FilterPolicy* NewBloomFilterPolicy2(int bits_per_key);
+#endif
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/iterator.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/iterator.h
new file mode 100644
index 00000000000..2d97d180b17
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/iterator.h
@@ -0,0 +1,105 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface. Multiple implementations
+// are provided by this library. In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
+#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include "slice.h"
+#include "status.h"
+
+namespace leveldb {
+
+class Iterator {
+ public:
+ Iterator();
+ virtual ~Iterator();
+
+ // An iterator is either positioned at a key/value pair, or
+ // not valid. This method returns true iff the iterator is valid.
+ virtual bool Valid() const = 0;
+
+ // Position at the first key in the source. The iterator is Valid()
+ // after this call iff the source is not empty.
+ virtual void SeekToFirst() = 0;
+
+ // Position at the last key in the source. The iterator is
+ // Valid() after this call iff the source is not empty.
+ virtual void SeekToLast() = 0;
+
+ // Position at the first key in the source that is at or past target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or past target.
+ virtual void Seek(const Slice& target) = 0;
+
+ // Moves to the next entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the last entry in the source.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+ // Moves to the previous entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the first entry in the source.
+ // REQUIRES: Valid()
+ virtual void Prev() = 0;
+
+ // Return the key for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual Slice key() const = 0;
+
+ // Return the value for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual Slice value() const = 0;
+
+ // If an error has occurred, return it. Else return an ok status.
+ virtual Status status() const = 0;
+
+ // Clients are allowed to register function/arg1/arg2 triples that
+ // will be invoked when this iterator is destroyed.
+ //
+ // Note that unlike all of the preceding methods, this method is
+ // not abstract and therefore clients should not override it.
+ typedef void (*CleanupFunction)(void* arg1, void* arg2);
+ void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+ struct Cleanup { // one registered function/arg1/arg2 triple
+ CleanupFunction function;
+ void* arg1;
+ void* arg2;
+ Cleanup* next; // link to another Cleanup record
+ };
+ Cleanup cleanup_; // held by value inside the iterator, not through a pointer
+
+ // No copying allowed
+ Iterator(const Iterator&);
+ void operator=(const Iterator&);
+};
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/options.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/options.h
new file mode 100644
index 00000000000..9dcf73fc2a0
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/options.h
@@ -0,0 +1,258 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
+#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include <memory>
+#include <stddef.h>
+
+namespace leveldb {
+
+class Cache;
+class Comparator;
+class Env;
+class FilterPolicy;
+class Logger;
+class Snapshot;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs. Each block may be compressed before
+// being stored in a file. The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType {
+ // NOTE: do not change the values of existing entries, as these are
+ // part of the persistent format on disk.
+ kNoCompression = 0x0,
+ kSnappyCompression = 0x1
+#ifdef HAVE_ROCKSDB
+ , kZlibCompression = 0x2 // leading comma: this entry exists only when HAVE_ROCKSDB is defined
+#endif // HAVE_ROCKSDB
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options {
+ // -------------------
+ // Parameters that affect behavior
+
+ // Comparator used to define the order of keys in the table.
+ // Default: a comparator that uses lexicographic byte-wise ordering
+ //
+ // REQUIRES: The client must ensure that the comparator supplied
+ // here has the same name and orders keys *exactly* the same as the
+ // comparator provided to previous open calls on the same DB.
+ const Comparator* comparator;
+
+ // If true, the database will be created if it is missing.
+ // Default: false
+ bool create_if_missing;
+
+ // If true, an error is raised if the database already exists.
+ // Default: false
+ bool error_if_exists;
+
+ // If true, the implementation will do aggressive checking of the
+ // data it is processing and will stop early if it detects any
+ // errors. This may have unforeseen ramifications: for example, a
+ // corruption of one DB entry may cause a large number of entries to
+ // become unreadable or for the entire DB to become unopenable.
+ // Default: false
+ bool paranoid_checks;
+
+#ifdef HAVE_ROCKSDB
+ // By default, RocksDB uses only one background thread for flush and
+ // compaction. Calling this function will set it up such that total of
+ // `total_threads` is used. Good value for `total_threads` is the number of
+ // cores. You almost definitely want to call this function if your system is
+ // bottlenecked by RocksDB.
+ Options* IncreaseParallelism(int = 16) { return this; } // stub here: the unnamed argument is ignored; returns this
+ Options* OptimizeLevelStyleCompaction() { return this; } // stub here: changes nothing; returns this
+#endif // HAVE_ROCKSDB
+
+#if HAVE_BASHOLEVELDB
+ // Riak specific: this variable replaces paranoid_checks at
+ // one place in the code. This variable alone controls whether or not
+ // compaction read operations check CRC values. Riak needs
+ // the compaction CRC check, but not other paranoid_checks ... so
+ // this independent control.
+ // Default: true
+ bool verify_compactions;
+#endif // HAVE_BASHOLEVELDB
+
+ // Use the specified object to interact with the environment,
+ // e.g. to read/write files, schedule background work, etc.
+ // Default: Env::Default()
+ Env* env;
+
+ // Any internal progress/error information generated by the db will
+ // be written to info_log if it is non-NULL, or to a file stored
+ // in the same directory as the DB contents if info_log is NULL.
+ // Default: NULL
+ Logger* info_log;
+
+ // -------------------
+ // Parameters that affect performance
+
+ // Amount of data to build up in memory (backed by an unsorted log
+ // on disk) before converting to a sorted on-disk file.
+ //
+ // Larger values increase performance, especially during bulk loads.
+ // Up to two write buffers may be held in memory at the same time,
+ // so you may wish to adjust this parameter to control memory usage.
+ // Also, a larger write buffer will result in a longer recovery time
+ // the next time the database is opened.
+ //
+ // Default: 4MB
+ size_t write_buffer_size;
+
+ // Number of open files that can be used by the DB. You may need to
+ // increase this if your database has a large working set (budget
+ // one open file per 2MB of working set).
+ //
+ // Default: 1000
+ int max_open_files;
+
+ // Control over blocks (user data is stored in a set of blocks, and
+ // a block is the unit of reading from disk).
+
+ // If non-NULL, use the specified cache for blocks.
+ // If NULL, leveldb will automatically create and use an 8MB internal cache.
+ // Default: NULL
+ Cache* block_cache;
+
+ // Approximate size of user data packed per block. Note that the
+ // block size specified here corresponds to uncompressed data. The
+ // actual size of the unit read from disk may be smaller if
+ // compression is enabled. This parameter can be changed dynamically.
+ //
+ // Default: 4K
+ size_t block_size;
+
+ // Number of keys between restart points for delta encoding of keys.
+ // This parameter can be changed dynamically. Most clients should
+ // leave this parameter alone.
+ //
+ // Default: 16
+ int block_restart_interval;
+
+ // Compress blocks using the specified compression algorithm. This
+ // parameter can be changed dynamically.
+ //
+ // Default: kSnappyCompression, which gives lightweight but fast
+ // compression.
+ //
+ // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+ // ~200-500MB/s compression
+ // ~400-800MB/s decompression
+ // Note that these speeds are significantly faster than most
+ // persistent storage speeds, and therefore it is typically never
+ // worth switching to kNoCompression. Even if the input data is
+ // incompressible, the kSnappyCompression implementation will
+ // efficiently detect that and will switch to uncompressed mode.
+ CompressionType compression;
+
+ // If non-NULL, use the specified filter policy to reduce disk reads.
+ // Many applications will benefit from passing the result of
+ // NewBloomFilterPolicy() here.
+ //
+ // Default: NULL
+ const FilterPolicy* filter_policy;
+
+#ifdef HAVE_HYPERLEVELDB
+ // Is the database used with the Replay mechanism? If yes, the lower bound on
+ // values to compact is (somewhat) left up to the application; if no, then
+ // LevelDB functions as usual, and uses snapshots to determine the lower
+ // bound. HyperLevelDB will always maintain the integrity of snapshots, so
+ // the application merely has the option to hold data as if it's holding a
+ // snapshot. This just prevents compaction from grabbing data before the app
+ // can get a snapshot.
+ //
+ // Default: false/no.
+ bool manual_garbage_collection;
+#endif // HAVE_HYPERLEVELDB
+
+ // Create an Options object with default values for all fields.
+ Options(); // declared only; no inline body in this header
+};
+
+#ifdef HAVE_ROCKSDB
+struct ColumnFamilyOptions : public Options { // adds no members of its own
+ ColumnFamilyOptions() : Options() {} // only runs the Options() constructor
+};
+
+struct DBOptions : public Options { // adds no members of its own
+ DBOptions() : Options() {} // only runs the Options() constructor
+};
+#endif
+
+// Options that control read operations
+struct ReadOptions {
+ // If true, all data read from underlying storage will be
+ // verified against corresponding checksums.
+ // Default: false
+ bool verify_checksums;
+
+ // Should the data read for this iteration be cached in memory?
+ // Callers may wish to set this field to false for bulk scans.
+ // Default: true
+ bool fill_cache;
+
+ // If "snapshot" is non-NULL, read as of the supplied snapshot
+ // (which must belong to the DB that is being read and which must
+ // not have been released). If "snapshot" is NULL, use an implicit
+ // snapshot of the state at the beginning of this read operation.
+ // Default: NULL
+ const Snapshot* snapshot;
+
+ ReadOptions() // initializes every field to the default documented above
+ : verify_checksums(false),
+ fill_cache(true),
+ snapshot(NULL) {
+ }
+};
+
+// Options that control write operations
+struct WriteOptions {
+ // If true, the write will be flushed from the operating system
+ // buffer cache (by calling WritableFile::Sync()) before the write
+ // is considered complete. If this flag is true, writes will be
+ // slower.
+ //
+ // If this flag is false, and the machine crashes, some recent
+ // writes may be lost. Note that if it is just the process that
+ // crashes (i.e., the machine does not reboot), no writes will be
+ // lost even if sync==false.
+ //
+ // In other words, a DB write with sync==false has similar
+ // crash semantics as the "write()" system call. A DB write
+ // with sync==true has similar crash semantics to a "write()"
+ // system call followed by "fsync()".
+ //
+ // Default: false
+ bool sync;
+
+ WriteOptions() // sets sync to its documented default
+ : sync(false) {
+ }
+};
+
+#ifdef HAVE_ROCKSDB
+// Options that control flush operations
+struct FlushOptions {
+ // If true, the flush will wait until the flush is done.
+ // Default: true
+ bool wait;
+
+ FlushOptions() : wait(true) {} // sets wait to its documented default
+};
+#endif
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/slice.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/slice.h
new file mode 100644
index 00000000000..1eb66dd825f
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/slice.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size. The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+//
+// Multiple threads can invoke const methods on a Slice without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Slice must use
+// external synchronization.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
+#define STORAGE_LEVELDB_INCLUDE_SLICE_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <string>
+
+namespace leveldb {
+
+class Slice {
+ public:
+ // Create an empty slice.
+ Slice() : data_(""), size_(0) { } // data_ points at a string literal, not NULL
+
+ // Create a slice that refers to d[0,n-1].
+ Slice(const char* d, size_t n) : data_(d), size_(n) { }
+
+ // Create a slice that refers to the contents of "s"
+ Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
+
+ // Create a slice that refers to s[0,strlen(s)-1]
+ Slice(const char* s) : data_(s), size_(strlen(s)) { } // strlen() is applied, so s must be a NUL-terminated string
+
+ // Return a pointer to the beginning of the referenced data
+ const char* data() const { return data_; }
+
+ // Return the length (in bytes) of the referenced data
+ size_t size() const { return size_; }
+
+ // Return true iff the length of the referenced data is zero
+ bool empty() const { return size_ == 0; }
+
+ // Return the nth byte in the referenced data.
+ // REQUIRES: n < size()
+ char operator[](size_t n) const {
+ assert(n < size());
+ return data_[n];
+ }
+
+ // Change this slice to refer to an empty array
+ void clear() { data_ = ""; size_ = 0; }
+
+ // Drop the first "n" bytes from this slice.
+ void remove_prefix(size_t n) {
+ assert(n <= size()); // only enforced when assertions are enabled
+ data_ += n;
+ size_ -= n;
+ }
+
+ // Return a string that contains the copy of the referenced data.
+ std::string ToString() const { return std::string(data_, size_); }
+
+ // Three-way comparison. Returns value:
+ // < 0 iff "*this" < "b",
+ // == 0 iff "*this" == "b",
+ // > 0 iff "*this" > "b"
+ int compare(const Slice& b) const;
+
+ // Return true iff "x" is a prefix of "*this"
+ bool starts_with(const Slice& x) const {
+ return ((size_ >= x.size_) &&
+ (memcmp(data_, x.data_, x.size_) == 0));
+ }
+
+// The LevelDB JNI layer peeks in here
+// private: (left commented out, so data_ and size_ below are public)
+ const char* data_;
+ size_t size_;
+
+ // Intentionally copyable
+};
+
+#ifdef HAVE_ROCKSDB
+// A set of Slices that are virtually concatenated together. 'parts' points
+// to an array of Slices. The number of elements in the array is 'num_parts'.
+struct SliceParts {
+ SliceParts(const Slice* _parts, int _num_parts) :
+ parts(_parts), num_parts(_num_parts) { } // stores both arguments as given; nothing is copied
+
+ const Slice* parts; // pointer kept as passed in; this struct never frees it
+ int num_parts;
+};
+#endif
+
+inline bool operator==(const Slice& x, const Slice& y) { // equal iff same length and same bytes
+ return ((x.size() == y.size()) &&
+ (memcmp(x.data(), y.data(), x.size()) == 0)); // reached only when the sizes already match
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) {
+ return !(x == y); // defined as the negation of operator== on Slice
+}
+
+inline int Slice::compare(const Slice& b) const {
+ const size_t min_len = (size_ < b.size_) ? size_ : b.size_; // length of the common prefix to compare
+ int r = memcmp(data_, b.data_, min_len);
+ if (r == 0) { // common prefix is identical: the shorter slice orders first
+ if (size_ < b.size_) r = -1;
+ else if (size_ > b.size_) r = +1;
+ }
+ return r; // memcmp's result when the prefixes differ, so only the sign is meaningful
+}
+
+} // namespace leveldb
+
+
+#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/status.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/status.h
new file mode 100644
index 00000000000..3c21f64462b
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/status.h
@@ -0,0 +1,111 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation. It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
+#define STORAGE_LEVELDB_INCLUDE_STATUS_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include <string>
+#include "slice.h"
+
+namespace leveldb {
+
+class Status {
+ public:
+ // Create a success status.
+ Status() : state_(NULL) { }
+ ~Status() { delete[] state_; } // state_ is NULL or a new[] array (see the private section)
+
+ // Copy the specified status.
+ Status(const Status& s);
+ void operator=(const Status& s); // returns void, so assignments cannot be chained
+
+ // Return a success status.
+ static Status OK() { return Status(); }
+
+ // Return error status of an appropriate type.
+ static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kNotFound, msg, msg2);
+ }
+ static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kCorruption, msg, msg2);
+ }
+ static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kNotSupported, msg, msg2);
+ }
+ static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kInvalidArgument, msg, msg2);
+ }
+ static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, msg, msg2);
+ }
+
+ // Returns true iff the status indicates success.
+ bool ok() const { return (state_ == NULL); }
+
+ // Returns true iff the status indicates a NotFound error.
+ bool IsNotFound() const { return code() == kNotFound; }
+
+ // Returns true iff the status indicates a Corruption error.
+ bool IsCorruption() const { return code() == kCorruption; }
+
+ // Returns true iff the status indicates an IOError.
+ bool IsIOError() const { return code() == kIOError; }
+
+ // Return a string representation of this status suitable for printing.
+ // Returns the string "OK" for success.
+ std::string ToString() const;
+
+ private:
+ // OK status has a NULL state_. Otherwise, state_ is a new[] array
+ // of the following form:
+ // state_[0..3] == length of message
+ // state_[4] == code
+ // state_[5..] == message
+ const char* state_;
+
+ enum Code {
+ kOk = 0,
+ kNotFound = 1,
+ kCorruption = 2,
+ kNotSupported = 3,
+ kInvalidArgument = 4,
+ kIOError = 5
+ };
+
+ Code code() const {
+ return (state_ == NULL) ? kOk : static_cast<Code>(state_[4]); // byte 4 holds the code, per the layout above
+ }
+
+ Status(Code code, const Slice& msg, const Slice& msg2); // private: used by the static factories above
+ static const char* CopyState(const char* s);
+};
+
+inline Status::Status(const Status& s) {
+ state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); // OK stays NULL; otherwise take CopyState()'s result
+}
+inline void Status::operator=(const Status& s) {
+ // The following condition catches both aliasing (when this == &s),
+ // and the common case where both s and *this are ok.
+ if (state_ != s.state_) {
+ delete[] state_; // release the old state before replacing it
+ state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
+ }
+}
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/write_batch.h b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/write_batch.h
new file mode 100644
index 00000000000..293b41ad818
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/include/leveldb/write_batch.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch. For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+// batch.Put("key", "v1");
+// batch.Delete("key");
+// batch.Put("key", "v2");
+// batch.Put("key", "v3");
+//
+// Multiple threads can invoke const methods on a WriteBatch without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same WriteBatch must use
+// external synchronization.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
+#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
+
+#include "leveldb_wt_config.h"
+#if defined(HAVE_ROCKSDB) && !defined(leveldb)
+#define leveldb rocksdb
+#endif
+
+#include <string>
+#include "status.h"
+
+namespace leveldb {
+
+class Slice;
+#if HAVE_ROCKSDB
+class ColumnFamilyHandle;
+struct SliceParts;
+#endif
+
+class WriteBatch {
+ public:
+#ifdef HAVE_ROCKSDB
+ explicit WriteBatch(size_t reserved_bytes = 0);
+#else
+ WriteBatch();
+#endif
+ ~WriteBatch();
+
+ // Store the mapping "key->value" in the database.
+ void Put(const Slice& key, const Slice& value);
+
+ // If the database contains a mapping for "key", erase it. Else do nothing.
+ void Delete(const Slice& key);
+
+ // Clear all updates buffered in this batch.
+ void Clear();
+
+#ifdef HAVE_ROCKSDB
+ void Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value);
+
+ // Variant of Put() that gathers output like writev(2). The key and value
+ // that will be written to the database are concatenations of arrays of
+ // slices.
+ void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value);
+
+ void Delete(ColumnFamilyHandle* column_family, const Slice& key);
+#endif
+
+ // Support for iterating over the contents of a batch.
+ class Handler {
+ public:
+ virtual ~Handler();
+#ifdef HAVE_ROCKSDB
+ // Default implementation just calls Put without a column family for
+ // backwards compatibility. If the column family is not the default (0),
+ // Put is not called and an InvalidArgument status is returned.
+ virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) {
+ if (column_family_id == 0) {
+ // Put() historically doesn't return status. We didn't want to be
+ // backwards incompatible so we didn't change the return status
+ // (this is a public API). We do an ordinary Put() and return Status::OK()
+ Put(key, value);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and PutCF not implemented");
+ }
+ // Merge and LogData are not pure virtual. Otherwise, we would break
+ // existing clients of Handler on a source code level. The default
+ // implementation of Merge simply throws a runtime exception.
+ virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) {
+ if (column_family_id == 0) { // same dispatch as PutCF: only id 0 is forwarded
+ Merge(key, value);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and MergeCF not implemented");
+ }
+ virtual void Merge(const Slice& key, const Slice& value);
+ // The default implementation of LogData does nothing.
+ virtual void LogData(const Slice& blob);
+ virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+ if (column_family_id == 0) { // same dispatch as PutCF: only id 0 is forwarded
+ Delete(key);
+ return Status::OK();
+ }
+ return Status::InvalidArgument(
+ "non-default column family and DeleteCF not implemented");
+ }
+ // Continue is called by WriteBatch::Iterate. If it returns false,
+ // iteration is halted. Otherwise, it continues iterating. The default
+ // implementation always returns true.
+ virtual bool Continue();
+#endif
+ virtual void Put(const Slice& key, const Slice& value) = 0;
+ virtual void Delete(const Slice& key) = 0;
+ };
+ Status Iterate(Handler* handler) const;
+
+#ifdef HAVE_ROCKSDB
+ // Retrieve data size of the batch.
+ size_t GetDataSize() const { return rep_.size(); }
+
+ // Returns the number of updates in the batch
+ int Count() const;
+#endif
+
+ private:
+ friend class WriteBatchInternal;
+
+ std::string rep_; // See comment in write_batch.cc for the format of rep_
+
+ // Intentionally copyable
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/port/port.h b/src/third_party/wiredtiger/api/leveldb/leveldb/port/port.h
new file mode 100644
index 00000000000..1f83635a82c
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/port/port.h
@@ -0,0 +1,38 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _PORT_H_
+#define _PORT_H_ 1
+/* Stub portability header for imported LevelDB code. */
+
+#include "wiredtiger.h"
+
+namespace port {
+ const int kLittleEndian = 1; // fixed constant: this stub does no byte-order detection
+} // namespace port
+
+#endif
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/arena.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/arena.h
new file mode 100644
index 00000000000..8f7dde226c4
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/arena.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_
+#define STORAGE_LEVELDB_UTIL_ARENA_H_
+
+#include <cstddef>
+#include <vector>
+#include <assert.h>
+#include <stdint.h>
+
+namespace leveldb {
+
+class Arena {  // Allocator that hands out memory carved from the blocks tracked in blocks_
+ public:
+ Arena();
+ ~Arena();
+ // Allocate() is defined inline after the class and asserts bytes > 0.
+ // Return a pointer to a newly allocated memory block of "bytes" bytes.
+ char* Allocate(size_t bytes);
+
+ // Allocate memory with the normal alignment guarantees provided by malloc
+ char* AllocateAligned(size_t bytes);
+
+ // Returns an estimate of the total memory usage of data allocated
+ // by the arena (including space allocated but not yet used for user
+ // allocations).
+ size_t MemoryUsage() const {
+ return blocks_memory_ + blocks_.capacity() * sizeof(char*);  // block bytes plus the pointer slots reserved by blocks_
+ }
+
+ private:
+ char* AllocateFallback(size_t bytes);  // Called by Allocate() when the request exceeds alloc_bytes_remaining_
+ char* AllocateNewBlock(size_t block_bytes);
+
+ // Allocation state
+ char* alloc_ptr_;  // Address returned by the next fast-path Allocate()
+ size_t alloc_bytes_remaining_;  // Bytes still available starting at alloc_ptr_
+
+ // Array of new[] allocated memory blocks
+ std::vector<char*> blocks_;
+
+ // Bytes of memory in blocks allocated so far
+ size_t blocks_memory_;
+
+ // No copying allowed
+ Arena(const Arena&);  // declared private and not defined in this header
+ void operator=(const Arena&);
+};
+
+// Fast-path allocation of "bytes" bytes. The request is served from the
+// space tracked by alloc_ptr_/alloc_bytes_remaining_ when it fits there;
+// otherwise it is handed to AllocateFallback().
+inline char* Arena::Allocate(size_t bytes) {
+  // Zero-byte requests have no sensible return value and internal callers
+  // never need them, so they are rejected outright.
+  assert(bytes > 0);
+  if (bytes > alloc_bytes_remaining_) {
+    return AllocateFallback(bytes);
+  }
+  char* const piece = alloc_ptr_;
+  alloc_bytes_remaining_ -= bytes;
+  alloc_ptr_ = piece + bytes;
+  return piece;
+}
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_UTIL_ARENA_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.cc
new file mode 100644
index 00000000000..ad1f457a16a
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.cc
@@ -0,0 +1,163 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Writes "v" at "dst" in varint32 form: seven payload bits per byte, least
+// significant group first, with the high bit set on every byte except the
+// last. Returns a pointer just past the final byte written; at most five
+// bytes are produced.
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on unsigned bytes so the continuation bit is handled cleanly.
+  unsigned char* out = reinterpret_cast<unsigned char*>(dst);
+  static const uint32_t kContinuation = 0x80;
+  static const uint32_t kPayloadMask = 0x7f;
+  while (v >= kContinuation) {
+    *(out++) = static_cast<unsigned char>((v & kPayloadMask) | kContinuation);
+    v >>= 7;
+  }
+  *(out++) = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(out);
+}
+
+// General varint32 decoder. Consumes bytes from [p, limit) until a byte
+// without the continuation bit is seen, stores the decoded number in
+// *value and returns the position just past it. Returns NULL when the
+// input ends first or no terminating byte shows up within five bytes.
+const char* GetVarint32PtrFallback(const char* p,
+                                   const char* limit,
+                                   uint32_t* value) {
+  uint32_t decoded = 0;
+  uint32_t shift = 0;
+  while (shift <= 28 && p < limit) {
+    const uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    ++p;
+    if ((byte & 128) == 0) {
+      // Final byte: merge it in and report the result.
+      *value = decoded | (byte << shift);
+      return reinterpret_cast<const char*>(p);
+    }
+    // More bytes are present
+    decoded |= ((byte & 127) << shift);
+    shift += 7;
+  }
+  return NULL;
+}
+
+// Decodes a varint64 from [p, limit). On success the number is stored in
+// *value and the position just past the encoding is returned. Returns NULL
+// when the input ends first or no terminating byte shows up within ten
+// bytes.
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
+  uint64_t decoded = 0;
+  uint32_t shift = 0;
+  while (shift <= 63 && p < limit) {
+    const uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    ++p;
+    if ((byte & 128) == 0) {
+      // Final byte: merge it in and report the result.
+      *value = decoded | (byte << shift);
+      return reinterpret_cast<const char*>(p);
+    }
+    // More bytes are present
+    decoded |= ((byte & 127) << shift);
+    shift += 7;
+  }
+  return NULL;
+}
+
+#ifdef HAVE_ROCKSDB
+void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
+ uint32_t bits, uint64_t value) {  // Writes the low "bits" bits of value into dst starting at bit position "offset"
+ assert((offset + bits + 7)/8 <= dstlen);
+ assert(bits <= 64);
+ // View the buffer as unsigned bytes for the mask arithmetic below.
+ unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+ // Split the bit position into a byte index and a bit index within that byte.
+ size_t byteOffset = offset / 8;
+ size_t bitOffset = offset % 8;
+
+ // This prevents unused variable warnings when compiling.
+#ifndef NDEBUG
+ // Store truncated value.
+ uint64_t origValue = (bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value;
+ uint32_t origBits = bits;
+#endif
+ // Each pass fills the rest of the current byte, or fewer bits if the value ends first.
+ while (bits > 0) {
+ size_t bitsToGet = std::min<size_t>(bits, 8 - bitOffset);
+ unsigned char mask = ((1 << bitsToGet) - 1);
+ // Clear the target bits, then add in the next bitsToGet bits of value.
+ ptr[byteOffset] = (ptr[byteOffset] & ~(mask << bitOffset)) +
+ ((value & mask) << bitOffset);
+ // Drop the bits just written; every later byte is filled from bit 0.
+ value >>= bitsToGet;
+ byteOffset += 1;
+ bitOffset = 0;
+ bits -= bitsToGet;
+ }
+ // Debug-only round-trip check; origValue/origBits exist only when NDEBUG is undefined.
+ assert(origValue == BitStreamGetInt(dst, dstlen, offset, origBits));
+}
+
+// Extracts a "bits"-wide unsigned integer whose least significant bit sits
+// at bit position "offset" of the buffer "src" (bit 0 is the low bit of
+// src[0]). Both requirements are checked with assert() only.
+// REQUIRES: (offset+bits+7)/8 <= srclen
+// REQUIRES: bits <= 64
+uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
+                         uint32_t bits) {
+  assert((offset + bits + 7)/8 <= srclen);
+  assert(bits <= 64);
+
+  const unsigned char* bytes = reinterpret_cast<const unsigned char*>(src);
+  uint64_t value = 0;
+  size_t skip = offset % 8;   // low bits to ignore in the current byte
+  size_t produced = 0;        // bits already placed into "value"
+
+  for (size_t index = offset / 8; bits > 0; ++index) {
+    const size_t take = std::min<size_t>(bits, 8 - skip);
+    const unsigned char mask = ((1 << take) - 1);
+    const uint64_t piece = (uint64_t)((bytes[index] >> skip) & mask);
+
+    value += piece << produced;
+    produced += take;
+    bits -= take;
+    skip = 0;                 // later bytes are consumed from bit 0
+  }
+
+  return value;
+}
+
+// Convenience overload that writes "bits" bits of "value" into the string
+// "*dst" starting at bit position "offset". Only the bytes overlapping the
+// target bit range are staged in a small stack buffer, patched by the
+// raw-buffer overload, and copied back.
+// REQUIRES: (offset+bits+7)/8 <= dst->size()
+// REQUIRES: bits <= 64
+void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
+                     uint64_t value) {
+  assert((offset + bits + 7)/8 <= dst->size());
+
+  const size_t kTmpBufLen = sizeof(value) + 1;
+  char tmpBuf[kTmpBufLen];
+
+  // Number of bytes of dst overlapped by the bit range. Rounding up (and
+  // looping to an exclusive bound) keeps the copies inside the range even
+  // when it ends exactly on a byte boundary.
+  const size_t kUsedBytes = (offset%8 + bits + 7)/8;
+
+  // Copy relevant parts of dst to tmpBuf
+  for (size_t idx = 0; idx < kUsedBytes; ++idx) {
+    tmpBuf[idx] = (*dst)[offset/8 + idx];
+  }
+
+  BitStreamPutInt(tmpBuf, kTmpBufLen, offset%8, bits, value);
+
+  // Copy tmpBuf back to dst
+  for (size_t idx = 0; idx < kUsedBytes; ++idx) {
+    (*dst)[offset/8 + idx] = tmpBuf[idx];
+  }
+
+  // Verify the round trip only after every byte has been copied back: a
+  // check made while the copy is still in progress reads a half-updated
+  // string and fails for values that span more than one byte.
+  assert(((bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value) ==
+         BitStreamGetInt(dst, offset, bits));
+}
+#endif
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.h
new file mode 100644
index 00000000000..ed56ef4ea2d
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/coding.h
@@ -0,0 +1,311 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Endian-neutral encoding:
+// * Fixed-length numbers are encoded with least-significant byte first
+// * In addition we support variable length "varint" encoding
+// * Strings are encoded prefixed by their length in varint format
+
+#ifndef STORAGE_LEVELDB_UTIL_CODING_H_
+#define STORAGE_LEVELDB_UTIL_CODING_H_
+
+#include <algorithm>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include "leveldb_wt.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+// The maximum length of a varint in bytes for 32 and 64 bits respectively.
+const unsigned int kMaxVarint32Length = 5;
+const unsigned int kMaxVarint64Length = 10;
+
+// Standard Put... routines append to a string
+extern void PutFixed32(std::string* dst, uint32_t value);
+extern void PutFixed64(std::string* dst, uint64_t value);
+extern void PutVarint32(std::string* dst, uint32_t value);
+extern void PutVarint64(std::string* dst, uint64_t value);
+extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+
+// Standard Get... routines parse a value from the beginning of a Slice
+// and advance the slice past the parsed value.
+extern bool GetVarint32(Slice* input, uint32_t* value);
+extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+
+#ifdef HAVE_ROCKSDB
+extern void PutLengthPrefixedSliceParts(std::string* dst,
+ const SliceParts& slice_parts);
+extern bool GetFixed64(Slice* input, uint64_t* value);
+// This function assumes data is well-formed.
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+extern Slice GetSliceUntil(Slice* slice, char delimiter);
+#endif
+
+// Pointer-based variants of GetVarint... These either store a value
+// in *v and return a pointer just past the parsed value, or return
+// NULL on error. These routines only look at bytes in the range
+// [p..limit-1]
+extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
+
+// Returns the length of the varint32 or varint64 encoding of "v"
+extern int VarintLength(uint64_t v);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+extern void EncodeFixed32(char* dst, uint32_t value);
+extern void EncodeFixed64(char* dst, uint64_t value);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
+// REQUIRES: dst has enough space for the value being written
+extern char* EncodeVarint32(char* dst, uint32_t value);
+extern char* EncodeVarint64(char* dst, uint64_t value);
+
+// Lower-level versions of Get... that read directly from a character buffer
+// without any bounds checking.
+
+// Reads a 32-bit value stored least-significant byte first at "ptr".
+// No bounds checking is performed.
+inline uint32_t DecodeFixed32(const char* ptr) {
+  if (!port::kLittleEndian) {
+    // Assemble the value one byte at a time, least significant first.
+    const unsigned char b0 = static_cast<unsigned char>(ptr[0]);
+    const unsigned char b1 = static_cast<unsigned char>(ptr[1]);
+    const unsigned char b2 = static_cast<unsigned char>(ptr[2]);
+    const unsigned char b3 = static_cast<unsigned char>(ptr[3]);
+    uint32_t value = static_cast<uint32_t>(b0);
+    value |= static_cast<uint32_t>(b1) << 8;
+    value |= static_cast<uint32_t>(b2) << 16;
+    value |= static_cast<uint32_t>(b3) << 24;
+    return value;
+  }
+  // Host order already matches the encoding: load the raw bytes.
+  uint32_t raw;
+  memcpy(&raw, ptr, sizeof(raw));  // gcc optimizes this to a plain load
+  return raw;
+}
+
+// Reads a 64-bit value stored least-significant byte first at "ptr".
+// No bounds checking is performed.
+inline uint64_t DecodeFixed64(const char* ptr) {
+  if (!port::kLittleEndian) {
+    // Combine two 32-bit halves; the low half comes first in memory.
+    const uint64_t low_half = DecodeFixed32(ptr);
+    const uint64_t high_half = DecodeFixed32(ptr + 4);
+    return (high_half << 32) | low_half;
+  }
+  // Host order already matches the encoding: load the raw bytes.
+  uint64_t raw;
+  memcpy(&raw, ptr, sizeof(raw));  // gcc optimizes this to a plain load
+  return raw;
+}
+
+// Internal routine for use by fallback path of GetVarint32Ptr
+extern const char* GetVarint32PtrFallback(const char* p,
+ const char* limit,
+ uint32_t* value);
+// Decodes a varint32 from [p, limit). A value that fits in a single byte
+// is handled inline; everything else, including empty input, is passed to
+// GetVarint32PtrFallback().
+inline const char* GetVarint32Ptr(const char* p,
+                                  const char* limit,
+                                  uint32_t* value) {
+  const bool single_byte =
+      p < limit && (*(reinterpret_cast<const unsigned char*>(p)) & 128) == 0;
+  if (!single_byte) {
+    return GetVarint32PtrFallback(p, limit, value);
+  }
+  *value = *(reinterpret_cast<const unsigned char*>(p));
+  return p + 1;
+}
+
+// Writes an unsigned integer with bits number of bits with its least
+// significant bit at offset.
+// Bits are numbered from 0 to 7 in the first byte, 8 to 15 in the second and
+// so on.
+// value is truncated to the bits number of least significant bits.
+// REQUIRES: (offset+bits+7)/8 <= dstlen
+// REQUIRES: bits <= 64
+extern void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
+ uint32_t bits, uint64_t value);
+
+// Reads an unsigned integer with bits number of bits with its least
+// significant bit at offset.
+// Bits are numbered in the same way as BitStreamPutInt().
+// REQUIRES: (offset+bits+7)/8 <= srclen
+// REQUIRES: bits <= 64
+extern uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset,
+ uint32_t bits);
+
+// Convenience functions
+extern void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
+ uint64_t value);
+extern uint64_t BitStreamGetInt(const std::string* src, size_t offset,
+ uint32_t bits);
+extern uint64_t BitStreamGetInt(const Slice* src, size_t offset,
+ uint32_t bits);
+
+// -- Implementation of the functions declared above
+// Stores "value" at "buf" least-significant byte first.
+// REQUIRES: buf has room for four bytes.
+inline void EncodeFixed32(char* buf, uint32_t value) {
+  // Select the path with port::kLittleEndian, as DecodeFixed32() does. A
+  // preprocessor test on __BYTE_ORDER would pick the raw-copy path whenever
+  // the macros are undefined, because undefined identifiers evaluate to 0
+  // inside #if.
+  if (port::kLittleEndian) {
+    memcpy(buf, &value, sizeof(value));
+  } else {
+    buf[0] = value & 0xff;
+    buf[1] = (value >> 8) & 0xff;
+    buf[2] = (value >> 16) & 0xff;
+    buf[3] = (value >> 24) & 0xff;
+  }
+}
+
+// Stores "value" at "buf" least-significant byte first.
+// REQUIRES: buf has room for eight bytes.
+inline void EncodeFixed64(char* buf, uint64_t value) {
+  // Select the path with port::kLittleEndian, as DecodeFixed64() does. A
+  // preprocessor test on __BYTE_ORDER would pick the raw-copy path whenever
+  // the macros are undefined, because undefined identifiers evaluate to 0
+  // inside #if.
+  if (port::kLittleEndian) {
+    memcpy(buf, &value, sizeof(value));
+  } else {
+    buf[0] = value & 0xff;
+    buf[1] = (value >> 8) & 0xff;
+    buf[2] = (value >> 16) & 0xff;
+    buf[3] = (value >> 24) & 0xff;
+    buf[4] = (value >> 32) & 0xff;
+    buf[5] = (value >> 40) & 0xff;
+    buf[6] = (value >> 48) & 0xff;
+    buf[7] = (value >> 56) & 0xff;
+  }
+}
+
+// Appends the fixed-width four-byte encoding of "value" to "*dst".
+inline void PutFixed32(std::string* dst, uint32_t value) {
+  char encoded[sizeof(uint32_t)];
+  EncodeFixed32(encoded, value);
+  dst->append(encoded, sizeof(encoded));
+}
+
+// Appends the fixed-width eight-byte encoding of "value" to "*dst".
+inline void PutFixed64(std::string* dst, uint64_t value) {
+  char encoded[sizeof(uint64_t)];
+  EncodeFixed64(encoded, value);
+  dst->append(encoded, sizeof(encoded));
+}
+
+// Appends the varint32 encoding of "v" to "*dst".
+inline void PutVarint32(std::string* dst, uint32_t v) {
+  char scratch[5];
+  const char* const end = EncodeVarint32(scratch, v);
+  dst->append(scratch, end - scratch);
+}
+
+// Writes "v" at "dst" in varint64 form (seven payload bits per byte, low
+// group first, high bit set on all but the last byte) and returns a
+// pointer just past the final byte written.
+inline char* EncodeVarint64(char* dst, uint64_t v) {
+  static const unsigned int kContinuation = 128;
+  unsigned char* out = reinterpret_cast<unsigned char*>(dst);
+  for (; v >= kContinuation; v >>= 7) {
+    *(out++) = (v & (kContinuation - 1)) | kContinuation;
+  }
+  *(out++) = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(out);
+}
+
+// Appends the varint64 encoding of "v" to "*dst".
+inline void PutVarint64(std::string* dst, uint64_t v) {
+  char scratch[10];
+  const char* const end = EncodeVarint64(scratch, v);
+  dst->append(scratch, end - scratch);
+}
+
+// Appends "value" to "*dst" as a varint32 length followed by the bytes.
+inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
+  // Length prefix first ...
+  PutVarint32(dst, value.size());
+  // ... then the payload itself.
+  const char* const payload = value.data();
+  dst->append(payload, value.size());
+}
+
+#ifdef HAVE_ROCKSDB
+// Appends the concatenation of all parts in "slice_parts" to "*dst",
+// preceded by the combined length as a varint32.
+inline void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts) {
+  // First pass: add up the sizes for the length prefix.
+  uint32_t combined_len = 0;
+  int part = 0;
+  while (part < slice_parts.num_parts) {
+    combined_len += slice_parts.parts[part].size();
+    ++part;
+  }
+  PutVarint32(dst, combined_len);
+
+  // Second pass: append the bytes of every part in order.
+  part = 0;
+  while (part < slice_parts.num_parts) {
+    dst->append(slice_parts.parts[part].data(), slice_parts.parts[part].size());
+    ++part;
+  }
+}
+#endif
+
+// Number of bytes the varint encoding of "v" occupies (one byte per
+// started group of seven bits, minimum one).
+inline int VarintLength(uint64_t v) {
+  int length = 1;
+  for (; v >= 128; v >>= 7) {
+    ++length;
+  }
+  return length;
+}
+
+#ifdef HAVE_ROCKSDB
+// Decodes a fixed 64-bit value from the front of "*input" into "*value"
+// and removes those eight bytes from the slice. Returns false, leaving
+// both arguments untouched, when fewer than eight bytes are available.
+inline bool GetFixed64(Slice* input, uint64_t* value) {
+  const bool has_enough = !(input->size() < sizeof(uint64_t));
+  if (has_enough) {
+    *value = DecodeFixed64(input->data());
+    input->remove_prefix(sizeof(uint64_t));
+  }
+  return has_enough;
+}
+#endif
+
+// Parses a varint32 from the front of "*input". On success the number is
+// stored in "*value", "*input" is advanced past it and true is returned;
+// on failure "*input" is left unchanged and false is returned.
+inline bool GetVarint32(Slice* input, uint32_t* value) {
+  const char* const begin = input->data();
+  const char* const limit = begin + input->size();
+  const char* const next = GetVarint32Ptr(begin, limit, value);
+  if (next != NULL) {
+    *input = Slice(next, limit - next);
+    return true;
+  }
+  return false;
+}
+
+// Parses a varint64 from the front of "*input". On success the number is
+// stored in "*value", "*input" is advanced past it and true is returned;
+// on failure "*input" is left unchanged and false is returned.
+inline bool GetVarint64(Slice* input, uint64_t* value) {
+  const char* const begin = input->data();
+  const char* const limit = begin + input->size();
+  const char* const next = GetVarint64Ptr(begin, limit, value);
+  if (next != NULL) {
+    *input = Slice(next, limit - next);
+    return true;
+  }
+  return false;
+}
+
+// Parses a varint32 length followed by that many bytes from the front of
+// "*input". On success "*result" refers to the payload and "*input" is
+// advanced past it. Returns false when the length cannot be decoded or
+// the remaining input is shorter than the decoded length.
+inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
+  uint32_t payload_len = 0;
+  if (!GetVarint32(input, &payload_len) || input->size() < payload_len) {
+    return false;
+  }
+  *result = Slice(input->data(), payload_len);
+  input->remove_prefix(payload_len);
+  return true;
+}
+
+#ifdef HAVE_ROCKSDB
+// Returns the payload of the length-prefixed slice stored at "data".
+// The result of the varint decoder is not checked, so "data" must be
+// well-formed.
+inline Slice GetLengthPrefixedSlice(const char* data) {
+  uint32_t payload_len = 0;
+  // +5: we assume "data" is not corrupted
+  const char* const limit = data + 5;
+  const char* const payload = GetVarint32Ptr(data, limit, &payload_len);
+  return Slice(payload, payload_len);
+}
+
+// Splits "*slice" at the first occurrence of "delimiter": returns the part
+// before it and advances "*slice" past the delimiter. When the delimiter
+// does not occur, the whole slice is returned and "*slice" becomes empty.
+inline Slice GetSliceUntil(Slice* slice, char delimiter) {
+  uint32_t len = 0;
+  while (len < slice->size() && slice->data()[len] != delimiter) {
+    ++len;
+  }
+
+  Slice head(slice->data(), len);
+  // Drop the delimiter as well, but only if one was actually found.
+  if (len < slice->size()) {
+    slice->remove_prefix(len + 1);
+  } else {
+    slice->remove_prefix(len + 0);
+  }
+  return head;
+}
+#endif
+
+// Convenience overload: reads "bits" bits at bit position "offset" from the
+// bytes of the string "*src".
+inline uint64_t BitStreamGetInt(const std::string* src, size_t offset,
+                                uint32_t bits) {
+  const char* const bytes = src->data();
+  return BitStreamGetInt(bytes, src->size(), offset, bits);
+}
+
+// Convenience overload: reads "bits" bits at bit position "offset" from the
+// bytes referenced by the slice "*src".
+inline uint64_t BitStreamGetInt(const Slice* src, size_t offset,
+                                uint32_t bits) {
+  const char* const bytes = src->data();
+  return BitStreamGetInt(bytes, src->size(), offset, bits);
+}
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_UTIL_CODING_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/comparator.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/comparator.cc
new file mode 100644
index 00000000000..57c89628af9
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/comparator.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <stdint.h>
+#include "leveldb_wt.h"
+#include "port/port.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+Comparator::~Comparator() { }  // Out-of-line destructor definition; the body is empty.
+// The KeyNum definition below is compiled only when HAVE_HYPERLEVELDB is defined.
+#ifdef HAVE_HYPERLEVELDB
+uint64_t Comparator::KeyNum(const Slice& key) const {  // "key" is ignored by this implementation
+ return 0;
+}
+#endif
+
+namespace {
+// Comparator that orders keys with Slice::compare() and shortens separator
+// and successor keys one byte at a time.
+class BytewiseComparatorImpl : public Comparator {
+ public:
+  BytewiseComparatorImpl() { }
+
+  // Identifier reported for this ordering.
+  virtual const char* Name() const {
+    return "leveldb.BytewiseComparator";
+  }
+
+  // Three-way comparison delegated to Slice::compare().
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return a.compare(b);
+  }
+
+  // If "*start" and "limit" first differ at a byte of "*start" that can be
+  // incremented while staying below the matching byte of "limit", bump that
+  // byte and cut "*start" right after it. Otherwise (including when one
+  // string is a prefix of the other) "*start" is left unchanged.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    // Find length of common prefix
+    const size_t shorter = std::min(start->size(), limit.size());
+    size_t split = 0;
+    for (; split < shorter; split++) {
+      if ((*start)[split] != limit[split]) {
+        break;
+      }
+    }
+    if (split >= shorter) {
+      // Do not shorten if one string is a prefix of the other
+      return;
+    }
+
+    const uint8_t first_diff = static_cast<uint8_t>((*start)[split]);
+    if (first_diff < static_cast<uint8_t>(0xff) &&
+        first_diff + 1 < static_cast<uint8_t>(limit[split])) {
+      (*start)[split]++;
+      start->resize(split + 1);
+      assert(Compare(*start, limit) < 0);
+    }
+  }
+
+  // Increments the first byte of "*key" that is not 0xff and truncates the
+  // key right after it. A key made only of 0xff bytes is left alone.
+  virtual void FindShortSuccessor(std::string* key) const {
+    const size_t length = key->size();
+    size_t pos = 0;
+    // Skip the leading bytes that cannot be incremented.
+    while (pos < length) {
+      const uint8_t candidate = (*key)[pos];
+      if (candidate != static_cast<uint8_t>(0xff)) {
+        break;
+      }
+      pos++;
+    }
+    if (pos >= length) {
+      // *key is a run of 0xffs. Leave it alone.
+      return;
+    }
+    const uint8_t byte = (*key)[pos];
+    (*key)[pos] = byte + 1;
+    key->resize(pos + 1);
+  }
+};
+} // namespace
+
+static const Comparator* bytewise = new BytewiseComparatorImpl;  // single instance created at static initialization; this file never deletes it
+// Returns the file-level bytewise comparator instance; every call yields the same pointer.
+const Comparator* BytewiseComparator() {
+ return bytewise;
+}
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/env.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/env.cc
new file mode 100644
index 00000000000..00a04f0dc3e
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/env.cc
@@ -0,0 +1,96 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb_wt.h"
+
+namespace leveldb {
+
+Env::~Env() {  // empty out-of-line destructor definition
+}
+
+SequentialFile::~SequentialFile() {  // empty out-of-line destructor definition
+}
+
+RandomAccessFile::~RandomAccessFile() {  // empty out-of-line destructor definition
+}
+
+WritableFile::~WritableFile() {  // empty out-of-line destructor definition
+}
+
+Logger::~Logger() {  // empty out-of-line destructor definition
+}
+
+FileLock::~FileLock() {  // empty out-of-line destructor definition
+}
+
+// Forwards "format" and its variadic arguments to info_log->Logv().
+// Does nothing when "info_log" is NULL.
+void Log(Logger* info_log, const char* format, ...) {
+  if (info_log == NULL) {
+    return;
+  }
+  va_list args;
+  va_start(args, format);
+  info_log->Logv(format, args);
+  va_end(args);
+}
+
+// Creates "fname" through env->NewWritableFile(), appends "data", syncs
+// when "should_sync" is set, and closes the file. If any step after the
+// creation fails, the file is removed with env->DeleteFile() and the
+// failing status is returned.
+static Status DoWriteStringToFile(Env* env, const Slice& data,
+                                  const std::string& fname,
+                                  bool should_sync) {
+  WritableFile* out;
+  Status status = env->NewWritableFile(fname, &out);
+  if (!status.ok()) {
+    return status;
+  }
+
+  status = out->Append(data);
+  if (status.ok()) {
+    if (should_sync) {
+      status = out->Sync();
+    }
+  }
+  if (status.ok()) {
+    status = out->Close();
+  }
+  delete out;  // Will auto-close if we did not close above
+
+  if (!status.ok()) {
+    // Cleanup of the partial file; the DeleteFile result is not checked.
+    env->DeleteFile(fname);
+  }
+  return status;
+}
+
+// Writes "data" to the file "fname" without syncing it.
+Status WriteStringToFile(Env* env, const Slice& data,
+                         const std::string& fname) {
+  const bool should_sync = false;
+  return DoWriteStringToFile(env, data, fname, should_sync);
+}
+
+// Writes "data" to the file "fname" and syncs it before closing.
+Status WriteStringToFileSync(Env* env, const Slice& data,
+                             const std::string& fname) {
+  const bool should_sync = true;
+  return DoWriteStringToFile(env, data, fname, should_sync);
+}
+
+// Replaces "*data" with the contents of "fname", read sequentially through
+// "env" in chunks of up to 8192 bytes until an empty chunk is returned.
+// On a read error the failing status is returned and "*data" keeps the
+// bytes read before the failure.
+Status ReadFileToString(Env* env, const std::string& fname, std::string* data) {
+  data->clear();
+  SequentialFile* in;
+  Status status = env->NewSequentialFile(fname, &in);
+  if (!status.ok()) {
+    return status;
+  }
+
+  static const int kBufferSize = 8192;
+  char* const scratch = new char[kBufferSize];
+  bool more = true;
+  while (more) {
+    Slice chunk;
+    status = in->Read(kBufferSize, &chunk, scratch);
+    more = status.ok();
+    if (more) {
+      data->append(chunk.data(), chunk.size());
+      // An empty chunk marks the end of the input.
+      more = !chunk.empty();
+    }
+  }
+  delete[] scratch;
+  delete in;
+  return status;
+}
+
+EnvWrapper::~EnvWrapper() {  // empty out-of-line destructor definition
+}
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/env_posix.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/env_posix.cc
new file mode 100644
index 00000000000..084ae160807
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/env_posix.cc
@@ -0,0 +1,625 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <deque>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#if defined(LEVELDB_PLATFORM_ANDROID)
+#include <sys/stat.h>
+#endif
+#include "leveldb_wt.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/posix_logger.h"
+
+namespace leveldb {
+
+namespace {
+
+// Builds a Status::IOError that pairs "context" with the strerror() text
+// for "err_number".
+static Status IOError(const std::string& context, int err_number) {
+  const char* const reason = strerror(err_number);
+  return Status::IOError(context, reason);
+}
+
+// SequentialFile implementation on top of a stdio FILE*. The stream is
+// closed by the destructor.
+class PosixSequentialFile: public SequentialFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+
+ public:
+  PosixSequentialFile(const std::string& fname, FILE* f)
+      : filename_(fname), file_(f) { }
+  virtual ~PosixSequentialFile() { fclose(file_); }
+
+  // Reads up to "n" bytes into "scratch" and points "*result" at what was
+  // read. A short read at end-of-file is a success; a short read for any
+  // other reason yields an IOError built from errno.
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+#ifdef HAVE_FREAD_UNLOCKED
+    const size_t got = fread_unlocked(scratch, 1, n, file_);
+#else
+    const size_t got = fread(scratch, 1, n, file_);
+#endif
+    *result = Slice(scratch, got);
+    if (got < n && !feof(file_)) {
+      // A partial read with an error: return a non-ok status
+      return IOError(filename_, errno);
+    }
+    // Full read, or a partial read that hit the end of the file.
+    return Status();
+  }
+
+  // Moves the read position forward by "n" bytes with fseek().
+  virtual Status Skip(uint64_t n) {
+    const int rc = fseek(file_, n, SEEK_CUR);
+    if (rc == 0) {
+      return Status::OK();
+    }
+    return IOError(filename_, errno);
+  }
+};
+
+// pread() based random-access
+// RandomAccessFile implementation that issues one pread() per request.
+// The descriptor is closed by the destructor.
+class PosixRandomAccessFile: public RandomAccessFile {
+ private:
+  std::string filename_;
+  int fd_;
+
+ public:
+  PosixRandomAccessFile(const std::string& fname, int fd)
+      : filename_(fname), fd_(fd) { }
+  virtual ~PosixRandomAccessFile() { close(fd_); }
+
+  // Reads up to "n" bytes at "offset" into "scratch" and points "*result"
+  // at the bytes obtained. When pread() fails, "*result" is empty and an
+  // IOError built from errno is returned.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    const ssize_t got = pread(fd_, scratch, n, static_cast<off_t>(offset));
+    if (got < 0) {
+      // An error: return a non-ok status
+      *result = Slice(scratch, 0);
+      return IOError(filename_, errno);
+    }
+    *result = Slice(scratch, got);
+    return Status();
+  }
+};
+
+// mmap() based random-access
+// Read-only RandomAccessFile backed by a memory mapping of the file. The
+// mapping is released with munmap() when the object is destroyed.
+class PosixMmapReadableFile: public RandomAccessFile {
+ private:
+  std::string filename_;
+  void* mmapped_region_;
+  size_t length_;
+
+ public:
+  // base[0,length-1] contains the mmapped contents of the file.
+  PosixMmapReadableFile(const std::string& fname, void* base, size_t length)
+      : filename_(fname), mmapped_region_(base), length_(length) { }
+  virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }
+
+  // Points "*result" at bytes [offset, offset+n) of the mapping; "scratch"
+  // is not used. A range that does not lie entirely inside the mapping
+  // yields an empty result and an IOError built from EINVAL.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    // Compare against the remaining length instead of computing
+    // offset + n: that sum can wrap around for huge arguments and let an
+    // out-of-range request through.
+    if (offset > length_ || n > length_ - offset) {
+      *result = Slice();
+      s = IOError(filename_, EINVAL);
+    } else {
+      *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
+    }
+    return s;
+  }
+};
+
+// We preallocate up to an extra megabyte and use memcpy to append new
+// data to the file. This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
+class PosixMmapFile : public WritableFile {
+ private:
+ std::string filename_;
+ int fd_;  // Descriptor of the file being written; set to -1 by Close()
+ size_t page_size_;  // Asserted to be a power of two in the constructor
+ size_t map_size_; // How much extra memory to map at a time
+ char* base_; // The mapped region
+ char* limit_; // Limit of the mapped region
+ char* dst_; // Where to write next (in range [base_,limit_])
+ char* last_sync_; // Where have we synced up to
+ uint64_t file_offset_; // Offset of base_ in file
+
+ // Have we done an munmap of unsynced data?
+ bool pending_sync_;
+
+ // Roundup x to a multiple of y
+ static size_t Roundup(size_t x, size_t y) {
+ return ((x + y - 1) / y) * y;
+ }
+ // Rounds s down to a multiple of page_size_ by clearing its low bits.
+ size_t TruncateToPageBoundary(size_t s) {
+ s -= (s & (page_size_ - 1));
+ assert((s % page_size_) == 0);
+ return s;
+ }
+ // Unmaps the current region (if any), advances file_offset_ past it and doubles map_size_ while it is below 1MB. Returns false if munmap() fails.
+ bool UnmapCurrentRegion() {
+ bool result = true;
+ if (base_ != NULL) {
+ if (last_sync_ < limit_) {
+ // Defer syncing this data until next Sync() call, if any
+ pending_sync_ = true;
+ }
+ if (munmap(base_, limit_ - base_) != 0) {
+ result = false;
+ }
+ file_offset_ += limit_ - base_;
+ base_ = NULL;
+ limit_ = NULL;
+ last_sync_ = NULL;
+ dst_ = NULL;
+
+ // Increase the amount we map the next time, but capped at 1MB
+ if (map_size_ < (1<<20)) {
+ map_size_ *= 2;
+ }
+ }
+ return result;
+ }
+ // Extends the file to file_offset_ + map_size_ and maps that new range. Returns false if ftruncate() or mmap() fails.
+ bool MapNewRegion() {
+ assert(base_ == NULL);
+ if (ftruncate(fd_, file_offset_ + map_size_) < 0) {
+ return false;
+ }
+ void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_, file_offset_);
+ if (ptr == MAP_FAILED) {
+ return false;
+ }
+ base_ = reinterpret_cast<char*>(ptr);
+ limit_ = base_ + map_size_;
+ dst_ = base_;
+ last_sync_ = base_;
+ return true;
+ }
+
+ public:
+ PosixMmapFile(const std::string& fname, int fd, size_t page_size)
+ : filename_(fname),
+ fd_(fd),
+ page_size_(page_size),
+ map_size_(Roundup(65536, page_size)),
+ base_(NULL),
+ limit_(NULL),
+ dst_(NULL),
+ last_sync_(NULL),
+ file_offset_(0),
+ pending_sync_(false) {
+ assert((page_size & (page_size - 1)) == 0);
+ }
+
+ // Closes the file unless Close() has already reset fd_ to -1.
+ ~PosixMmapFile() {
+ if (fd_ >= 0) {
+ PosixMmapFile::Close();
+ }
+ }
+ // Copies data into the mapping, switching to a newly mapped region whenever the current one is full.
+ virtual Status Append(const Slice& data) {
+ const char* src = data.data();
+ size_t left = data.size();
+ while (left > 0) {
+ assert(base_ <= dst_);
+ assert(dst_ <= limit_);
+ size_t avail = limit_ - dst_;
+ if (avail == 0) {  // avail is not recomputed after remapping: this pass copies 0 bytes and the next pass sees the new region
+ if (!UnmapCurrentRegion() ||
+ !MapNewRegion()) {
+ return IOError(filename_, errno);
+ }
+ }
+
+ size_t n = (left <= avail) ? left : avail;
+ memcpy(dst_, src, n);
+ dst_ += n;
+ src += n;
+ left -= n;
+ }
+ return Status::OK();
+ }
+ // Unmaps the current region, trims the unwritten tail of the file and closes fd_; the first error encountered is returned.
+ virtual Status Close() {
+ Status s;
+ size_t unused = limit_ - dst_;  // captured first because UnmapCurrentRegion() resets limit_ and dst_
+ if (!UnmapCurrentRegion()) {
+ s = IOError(filename_, errno);
+ } else if (unused > 0) {
+ // Trim the extra space at the end of the file
+ if (ftruncate(fd_, file_offset_ - unused) < 0) {
+ s = IOError(filename_, errno);
+ }
+ }
+
+ if (close(fd_) < 0) {
+ if (s.ok()) {
+ s = IOError(filename_, errno);
+ }
+ }
+
+ fd_ = -1;
+ base_ = NULL;
+ limit_ = NULL;
+ return s;
+ }
+ // No work is done here; always returns Status::OK().
+ virtual Status Flush() {
+ return Status::OK();
+ }
+ // Syncs data from regions unmapped earlier (fdatasync/fsync), then msync()s the pages of the current region written since the last sync.
+ virtual Status Sync() {
+ Status s;
+
+ if (pending_sync_) {
+ // Some unmapped data was not synced
+ pending_sync_ = false;
+#ifdef HAVE_FDATASYNC
+ if (fdatasync(fd_) < 0) {
+#else
+ if (fsync(fd_) < 0) {
+#endif
+ s = IOError(filename_, errno);
+ }
+ }
+
+ if (dst_ > last_sync_) {
+ // Find the beginnings of the pages that contain the first and last
+ // bytes to be synced.
+ size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
+ size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
+ last_sync_ = dst_;
+ if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
+ s = IOError(filename_, errno);
+ }
+ }
+
+ return s;
+ }
+ // Compiled only when HAVE_HYPERLEVELDB is defined: WriteAt always reports NotSupported.
+#ifdef HAVE_HYPERLEVELDB
+ virtual Status WriteAt(uint64_t offset, const Slice& data) { return Status::NotSupported("sorry!"); }
+#endif
+};
+
+// Requests (lock == true) or releases (lock == false) a write lock on the
+// whole file "fd" through fcntl(F_SETLK). errno is cleared before the
+// call; the fcntl() result is returned unchanged.
+static int LockOrUnlock(int fd, bool lock) {
+  struct flock request;
+  memset(&request, 0, sizeof(request));
+  request.l_whence = SEEK_SET;
+  request.l_start = 0;
+  request.l_len = 0;        // Lock/unlock entire file
+  if (lock) {
+    request.l_type = F_WRLCK;
+  } else {
+    request.l_type = F_UNLCK;
+  }
+  errno = 0;
+  return fcntl(fd, F_SETLK, &request);
+}
+
+class PosixFileLock : public FileLock {  // FileLock handle that carries the descriptor of the locked file
+ public:
+ int fd_;  // Descriptor that PosixEnv::LockFile() opened and locked
+};
+
+class PosixEnv : public Env {
+ public:
+ PosixEnv();
+ virtual ~PosixEnv() {
+ fprintf(stderr, "Destroying Env::Default()\n");
+ exit(1);
+ }
+
+ virtual Status NewSequentialFile(const std::string& fname,
+ SequentialFile** result) {
+ FILE* f = fopen(fname.c_str(), "r");
+ if (f == NULL) {
+ *result = NULL;
+ return IOError(fname, errno);
+ } else {
+ *result = new PosixSequentialFile(fname, f);
+ return Status::OK();
+ }
+ }
+
+ virtual Status NewRandomAccessFile(const std::string& fname,
+ RandomAccessFile** result) {
+ *result = NULL;
+ Status s;
+ int fd = open(fname.c_str(), O_RDONLY);
+ if (fd < 0) {
+ s = IOError(fname, errno);
+ } else if (sizeof(void*) >= 8) {
+ // Use mmap when virtual address-space is plentiful.
+ uint64_t size;
+ s = GetFileSize(fname, &size);
+ if (s.ok()) {
+ void* base = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
+ if (base != MAP_FAILED) {
+ *result = new PosixMmapReadableFile(fname, base, size);
+ } else {
+ s = IOError(fname, errno);
+ }
+ }
+ close(fd);
+ } else {
+ *result = new PosixRandomAccessFile(fname, fd);
+ }
+ return s;
+ }
+
+ virtual Status NewWritableFile(const std::string& fname,
+ WritableFile** result) {
+ Status s;
+ const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+ if (fd < 0) {
+ *result = NULL;
+ s = IOError(fname, errno);
+ } else {
+ *result = new PosixMmapFile(fname, fd, page_size_);
+ }
+ return s;
+ }
+
+ virtual bool FileExists(const std::string& fname) {
+ return access(fname.c_str(), F_OK) == 0;
+ }
+
+ virtual Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) {
+ result->clear();
+ DIR* d = opendir(dir.c_str());
+ if (d == NULL) {
+ return IOError(dir, errno);
+ }
+ struct dirent* entry;
+ while ((entry = readdir(d)) != NULL) {
+ result->push_back(entry->d_name);
+ }
+ closedir(d);
+ return Status::OK();
+ }
+
+ virtual Status DeleteFile(const std::string& fname) {
+ Status result;
+ if (unlink(fname.c_str()) != 0) {
+ result = IOError(fname, errno);
+ }
+ return result;
+ };
+
+ virtual Status CreateDir(const std::string& name) {
+ Status result;
+ if (mkdir(name.c_str(), 0755) != 0) {
+ result = IOError(name, errno);
+ }
+ return result;
+ };
+
+ virtual Status DeleteDir(const std::string& name) {
+ Status result;
+ if (rmdir(name.c_str()) != 0) {
+ result = IOError(name, errno);
+ }
+ return result;
+ };
+
+ virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+ Status s;
+ struct stat sbuf;
+ if (stat(fname.c_str(), &sbuf) != 0) {
+ *size = 0;
+ s = IOError(fname, errno);
+ } else {
+ *size = sbuf.st_size;
+ }
+ return s;
+ }
+
+ virtual Status RenameFile(const std::string& src, const std::string& target) {
+ Status result;
+ if (rename(src.c_str(), target.c_str()) != 0) {
+ result = IOError(src, errno);
+ }
+ return result;
+ }
+
+ // Open (creating if needed) `fname` and lock it via LockOrUnlock(). On
+ // success *lock owns a new PosixFileLock holding the descriptor; on any
+ // failure *lock stays NULL and an IOError is returned.
+ virtual Status LockFile(const std::string& fname, FileLock** lock) {
+   *lock = NULL;
+   const int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+   if (fd < 0) {
+     return IOError(fname, errno);
+   }
+   if (LockOrUnlock(fd, true) == -1) {
+     // Capture errno in the status before close() can overwrite it.
+     Status failure = IOError("lock " + fname, errno);
+     close(fd);
+     return failure;
+   }
+   PosixFileLock* held = new PosixFileLock;
+   held->fd_ = fd;
+   *lock = held;
+   return Status();
+ }
+
+ // Release a lock obtained from LockFile(). The descriptor is closed and
+ // the lock object deleted whether or not the unlock call succeeds.
+ virtual Status UnlockFile(FileLock* lock) {
+   PosixFileLock* held = reinterpret_cast<PosixFileLock*>(lock);
+   Status outcome;
+   const int rc = LockOrUnlock(held->fd_, false);
+   if (rc == -1) {
+     outcome = IOError("unlock", errno);
+   }
+   close(held->fd_);
+   delete held;
+   return outcome;
+ }
+
+ virtual void Schedule(void (*function)(void*), void* arg);
+
+ virtual void StartThread(void (*function)(void* arg), void* arg);
+
+ // Choose a scratch directory for tests: $TEST_TMPDIR when set and
+ // non-empty, otherwise /tmp/leveldbtest-<euid>. The directory is created
+ // on a best-effort basis and OK is always returned.
+ virtual Status GetTestDirectory(std::string* result) {
+   const char* override_dir = getenv("TEST_TMPDIR");
+   if (override_dir == NULL || override_dir[0] == '\0') {
+     char path[100];
+     snprintf(path, sizeof(path), "/tmp/leveldbtest-%d", int(geteuid()));
+     *result = path;
+   } else {
+     *result = override_dir;
+   }
+   // Directory may already exist, so the CreateDir status is ignored.
+   CreateDir(*result);
+   return Status::OK();
+ }
+
+ // Produce a 64-bit identifier for the calling thread by copying as many
+ // bytes of its pthread_t as fit into a zero-initialized uint64_t.
+ static uint64_t gettid() {
+   const pthread_t self = pthread_self();
+   const size_t nbytes = std::min(sizeof(uint64_t), sizeof(self));
+   uint64_t id = 0;
+   memcpy(&id, &self, nbytes);
+   return id;
+ }
+
+ // Open `fname` for writing (truncating it) and wrap the stream in a
+ // PosixLogger. On fopen() failure *result is NULL and an IOError is
+ // returned.
+ virtual Status NewLogger(const std::string& fname, Logger** result) {
+   FILE* stream = fopen(fname.c_str(), "w");
+   if (stream != NULL) {
+     *result = new PosixLogger(stream, &PosixEnv::gettid);
+     return Status::OK();
+   }
+   *result = NULL;
+   return IOError(fname, errno);
+ }
+
+ // Current gettimeofday() time expressed as microseconds.
+ virtual uint64_t NowMicros() {
+   struct timeval now;
+   gettimeofday(&now, NULL);
+   const uint64_t whole_seconds = static_cast<uint64_t>(now.tv_sec);
+   return whole_seconds * 1000000 + now.tv_usec;
+ }
+
+ // Suspend the calling thread via usleep() for `micros` microseconds.
+ virtual void SleepForMicroseconds(int micros) {
+ usleep(micros);
+ }
+
+#ifdef HAVE_HYPERLEVELDB
+ // HyperLevelDB hook: file copying is not implemented by this Env.
+ virtual Status CopyFile(const std::string&, const std::string&) {
+   return Status::NotSupported("sorry!");
+ }
+ // HyperLevelDB hook: file linking is not implemented by this Env.
+ virtual Status LinkFile(const std::string&, const std::string&) {
+   return Status::NotSupported("sorry!");
+ }
+#endif
+
+ private:
+ // Check the return code of a pthread call: a non-zero `result` is
+ // reported on stderr (tagged with `label`) and terminates the process.
+ void PthreadCall(const char* label, int result) {
+   if (result == 0) {
+     return;
+   }
+   fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+   exit(1);
+ }
+
+ // BGThread() is the body of the background thread
+ void BGThread();
+ // pthread entry point: `arg` is the PosixEnv whose BGThread() loop runs
+ // on the new thread.
+ static void* BGThreadWrapper(void* arg) {
+   PosixEnv* env = reinterpret_cast<PosixEnv*>(arg);
+   env->BGThread();
+   return NULL;
+ }
+
+ size_t page_size_;
+ pthread_mutex_t mu_;
+ pthread_cond_t bgsignal_;
+ pthread_t bgthread_;
+ bool started_bgthread_;
+
+ // Entry per Schedule() call
+ struct BGItem { void* arg; void (*function)(void*); };
+ typedef std::deque<BGItem> BGQueue;
+ BGQueue queue_;
+};
+
+// Record the system page size and initialize the mutex and condition
+// variable used by the background work queue. The background thread is not
+// started here; Schedule() starts it on first use. A failing pthread
+// initializer terminates the process via PthreadCall().
+PosixEnv::PosixEnv() : page_size_(getpagesize()),
+ started_bgthread_(false) {
+ PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL));
+ PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL));
+}
+
+// Queue `function(arg)` for execution on the background thread, creating
+// that thread on the first call. The whole operation runs with mu_ held.
+void PosixEnv::Schedule(void (*function)(void*), void* arg) {
+ PthreadCall("lock", pthread_mutex_lock(&mu_));
+
+ // Start background thread if necessary
+ if (!started_bgthread_) {
+ started_bgthread_ = true;
+ PthreadCall(
+ "create thread",
+ pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this));
+ }
+
+ // If the queue is currently empty, the background thread may currently be
+ // waiting. Signalling before the push is fine: mu_ is still held, so the
+ // waiter cannot re-check the queue until we unlock below.
+ if (queue_.empty()) {
+ PthreadCall("signal", pthread_cond_signal(&bgsignal_));
+ }
+
+ // Append to the FIFO work queue (BGThread() pops from the front)
+ queue_.push_back(BGItem());
+ queue_.back().function = function;
+ queue_.back().arg = arg;
+
+ PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+// Body of the background thread: loop forever, taking the oldest queued
+// item and running it. The loop has no exit condition.
+void PosixEnv::BGThread() {
+ while (true) {
+ // Wait until there is an item that is ready to run
+ PthreadCall("lock", pthread_mutex_lock(&mu_));
+ // Loop around the wait so a wakeup with an empty queue just waits again.
+ while (queue_.empty()) {
+ PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
+ }
+
+ // Copy the work item out and dequeue it while still holding mu_.
+ void (*function)(void*) = queue_.front().function;
+ void* arg = queue_.front().arg;
+ queue_.pop_front();
+
+ // Run the callback without the lock so Schedule() is not blocked by it.
+ PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+ (*function)(arg);
+ }
+}
+
+namespace {
+// Heap-allocated payload handed to StartThreadWrapper: the user's thread
+// function and the argument to pass to it.
+struct StartThreadState {
+ void (*user_function)(void*);
+ void* arg;
+};
+}
+// pthread entry point used by PosixEnv::StartThread: runs the user function
+// stored in the StartThreadState passed as `arg`, then frees that state.
+static void* StartThreadWrapper(void* arg) {
+  StartThreadState* launch = reinterpret_cast<StartThreadState*>(arg);
+  (*launch->user_function)(launch->arg);
+  delete launch;
+  return NULL;
+}
+
+// Run `function(arg)` on a newly created pthread. The launch state is
+// heap-allocated here and released by StartThreadWrapper once the user
+// function returns. The thread handle is not retained.
+void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
+  StartThreadState* launch = new StartThreadState;
+  launch->arg = arg;
+  launch->user_function = function;
+
+  pthread_t handle;
+  const int rc = pthread_create(&handle, NULL, &StartThreadWrapper, launch);
+  PthreadCall("start thread", rc);
+}
+
+} // namespace
+
+// Process-wide default environment, created on first use through
+// pthread_once() and never deleted.
+static pthread_once_t once = PTHREAD_ONCE_INIT;
+static Env* default_env;
+static void InitDefaultEnv() { default_env = new PosixEnv; }
+
+// Return the shared PosixEnv instance; pthread_once() ensures
+// InitDefaultEnv() runs only once.
+Env* Env::Default() {
+ pthread_once(&once, InitDefaultEnv);
+ return default_env;
+}
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.cc
new file mode 100644
index 00000000000..96526e76123
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/logging.h"
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "leveldb_wt.h"
+
+namespace leveldb {
+
+// Append the decimal representation of `num` to *str.
+void AppendNumberTo(std::string* str, uint64_t num) {
+  char digits[30];
+  const unsigned long long widened = (unsigned long long) num;
+  snprintf(digits, sizeof(digits), "%llu", widened);
+  str->append(digits);
+}
+
+// Append `value` to *str, copying bytes in the range ' '..'~' verbatim and
+// writing every other byte as a two-digit \xNN hex escape.
+void AppendEscapedStringTo(std::string* str, const Slice& value) {
+  for (size_t pos = 0; pos < value.size(); pos++) {
+    const char ch = value[pos];
+    const bool printable = (ch >= ' ' && ch <= '~');
+    if (printable) {
+      str->push_back(ch);
+      continue;
+    }
+    char escaped[10];
+    snprintf(escaped, sizeof(escaped), "\\x%02x",
+             static_cast<unsigned int>(ch) & 0xff);
+    str->append(escaped);
+  }
+}
+
+// Return the decimal representation of `num` as a new string.
+std::string NumberToString(uint64_t num) {
+  std::string text;
+  AppendNumberTo(&text, num);
+  return text;
+}
+
+// Return a copy of `value` with bytes outside ' '..'~' hex-escaped.
+std::string EscapeString(const Slice& value) {
+  std::string text;
+  AppendEscapedStringTo(&text, value);
+  return text;
+}
+
+// If *in begins with `c`, drop that first byte and return true; otherwise
+// leave *in untouched and return false.
+bool ConsumeChar(Slice* in, char c) {
+  if (in->empty() || (*in)[0] != c) {
+    return false;
+  }
+  in->remove_prefix(1);
+  return true;
+}
+
+// Parse the leading run of decimal digits in *in into *val, consuming the
+// digits. Returns false when there are no digits, or when the value would
+// overflow uint64_t (in which case *val is not written and the offending
+// digit is left unconsumed).
+bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
+  static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0);
+  uint64_t acc = 0;
+  int ndigits = 0;
+  for (; !in->empty(); in->remove_prefix(1)) {
+    const char ch = (*in)[0];
+    if (ch < '0' || ch > '9') {
+      break;
+    }
+    ++ndigits;
+    const int delta = (ch - '0');
+    // Reject before multiplying so the accumulator can never wrap.
+    const bool past_limit = acc > kMaxUint64/10;
+    const bool at_limit = (acc == kMaxUint64/10);
+    if (past_limit || (at_limit && (uint64_t)delta > kMaxUint64%10)) {
+      // Overflow
+      return false;
+    }
+    acc = (acc * 10) + delta;
+  }
+  *val = acc;
+  return (ndigits > 0);
+}
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.h
new file mode 100644
index 00000000000..b0c5da813e8
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/logging.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Must not be included from any .h files to avoid polluting the namespace
+// with macros.
+
+#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_
+#define STORAGE_LEVELDB_UTIL_LOGGING_H_
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string>
+#include "port/port.h"
+
+namespace leveldb {
+
+class Slice;
+class WritableFile;
+
+// Append a human-readable printout of "num" to *str
+extern void AppendNumberTo(std::string* str, uint64_t num);
+
+// Append a human-readable printout of "value" to *str.
+// Escapes any non-printable characters found in "value".
+extern void AppendEscapedStringTo(std::string* str, const Slice& value);
+
+// Return a human-readable printout of "num"
+extern std::string NumberToString(uint64_t num);
+
+// Return a human-readable version of "value".
+// Escapes any non-printable characters found in "value".
+extern std::string EscapeString(const Slice& value);
+
+// If *in starts with "c", advances *in past the first character and
+// returns true. Otherwise, returns false.
+extern bool ConsumeChar(Slice* in, char c);
+
+// Parse a human-readable number from "*in" into "*val". On success,
+// advances "*in" past the consumed number and sets "*val" to the
+// numeric value. Otherwise, returns false and leaves *in in an
+// unspecified state.
+extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/options.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/options.cc
new file mode 100644
index 00000000000..a8c79233bb5
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/options.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb_wt.h"
+
+namespace leveldb {
+
+// Default options: bytewise key ordering, the default Env, no logger, no
+// block cache and no filter policy. The database is neither created nor
+// rejected if it already exists, and paranoid checks are off.
+Options::Options()
+ : comparator(BytewiseComparator()),
+ create_if_missing(false),
+ error_if_exists(false),
+ paranoid_checks(false),
+ env(Env::Default()),
+ info_log(NULL),
+ write_buffer_size(4<<20),  // 4 MiB
+ max_open_files(1000),
+ block_cache(NULL),
+ block_size(4096),
+ block_restart_interval(16),
+ compression(kSnappyCompression),
+ filter_policy(NULL) {
+}
+
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/posix_logger.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/posix_logger.h
new file mode 100644
index 00000000000..f15de45e05e
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/posix_logger.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#ifndef STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_
+#define STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_
+
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include "leveldb_wt.h"
+
+namespace leveldb {
+
+// Logger that writes timestamped, thread-tagged lines to a stdio stream.
+// The stream passed to the constructor is closed by the destructor.
+class PosixLogger : public Logger {
+ private:
+ FILE* file_;
+ uint64_t (*gettid_)(); // Return the thread id for the current thread
+ public:
+ // `f` is the destination stream; `gettid` supplies the id printed in each
+ // line's header.
+ PosixLogger(FILE* f, uint64_t (*gettid)()) : file_(f), gettid_(gettid) { }
+ virtual ~PosixLogger() {
+ fclose(file_);
+ }
+ // Format one log line ("YYYY/MM/DD-HH:MM:SS.uuuuuu <tid-hex> <message>"),
+ // append a newline if the message lacks one, then write and flush it.
+ virtual void Logv(const char* format, va_list ap) {
+ const uint64_t thread_id = (*gettid_)();
+
+ // We try twice: the first time with a fixed-size stack allocated buffer,
+ // and the second time with a much larger dynamically allocated buffer.
+ char buffer[500];
+ for (int iter = 0; iter < 2; iter++) {
+ char* base;
+ int bufsize;
+ if (iter == 0) {
+ bufsize = sizeof(buffer);
+ base = buffer;
+ } else {
+ bufsize = 30000;
+ base = new char[bufsize];
+ }
+ char* p = base;
+ char* limit = base + bufsize;
+
+ // Header: local wall-clock time with microseconds, then the thread id.
+ struct timeval now_tv;
+ gettimeofday(&now_tv, NULL);
+ const time_t seconds = now_tv.tv_sec;
+ struct tm t;
+ localtime_r(&seconds, &t);
+ p += snprintf(p, limit - p,
+ "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+ t.tm_year + 1900,
+ t.tm_mon + 1,
+ t.tm_mday,
+ t.tm_hour,
+ t.tm_min,
+ t.tm_sec,
+ static_cast<int>(now_tv.tv_usec),
+ static_cast<long long unsigned int>(thread_id));
+
+ // Print the message
+ if (p < limit) {
+ // Format from a copy so `ap` is still usable on the second pass.
+ va_list backup_ap;
+ va_copy(backup_ap, ap);
+ p += vsnprintf(p, limit - p, format, backup_ap);
+ va_end(backup_ap);
+ }
+
+ // Truncate to available space if necessary
+ if (p >= limit) {
+ if (iter == 0) {
+ continue; // Try again with larger buffer
+ } else {
+ // Leave room for the trailing newline added below.
+ p = limit - 1;
+ }
+ }
+
+ // Add newline if necessary
+ if (p == base || p[-1] != '\n') {
+ *p++ = '\n';
+ }
+
+ assert(p <= limit);
+ fwrite(base, 1, p - base, file_);
+ fflush(file_);
+ // Only the second-pass buffer is heap-allocated.
+ if (base != buffer) {
+ delete[] base;
+ }
+ break;
+ }
+ }
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_UTIL_POSIX_LOGGER_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/random.h b/src/third_party/wiredtiger/api/leveldb/leveldb/util/random.h
new file mode 100644
index 00000000000..66e0c94e7cb
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/random.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_
+#define STORAGE_LEVELDB_UTIL_RANDOM_H_
+
+#include <stdint.h>
+
+namespace leveldb {
+
+// A very simple random number generator. Not especially good at
+// generating truly random bits, but good enough for our needs in this
+// package.
+class Random {
+ private:
+  // Current generator state; kept within [1, 2^31-2].
+  uint32_t seed_;
+ public:
+  // Seed the generator with the low 31 bits of `s`.
+  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) {
+    // Avoid bad seeds: 0 and 2^31-1 are fixed points of the recurrence in
+    // Next(), so the generator would return the same value forever.
+    if (seed_ == 0 || seed_ == 2147483647L) {
+      seed_ = 1;
+    }
+  }
+
+  // Advance the generator and return the new state, a value in [1, 2^31-2].
+  uint32_t Next() {
+    static const uint32_t M = 2147483647L;   // 2^31-1
+    static const uint64_t A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
+    // We are computing
+    //       seed_ = (seed_ * A) % M,    where M = 2^31-1
+    //
+    // seed_ must not be zero or M, or else all subsequent computed values
+    // will be zero or M respectively.  For all other values, seed_ will end
+    // up cycling through every number in [1,M-1]
+    uint64_t product = seed_ * A;
+
+    // Compute (product % M) using the fact that ((x << 31) % M) == x.
+    seed_ = static_cast<uint32_t>((product >> 31) + (product & M));
+    // The first reduction may overflow by 1 bit, so we may need to
+    // repeat.  mod == M is not possible; using > allows the faster
+    // sign-bit-based test.
+    if (seed_ > M) {
+      seed_ -= M;
+    }
+    return seed_;
+  }
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(int n) { return Next() % n; }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(int n) { return (Next() % n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+
+  // Shuffle the array into random order
+  void Shuffle(int *array, int n) {
+    if (n > 1) {
+      int i;
+      for (i=0; i<n-1; i++) {
+        // Scale Next() into [0, n-i-1] to pick a partner at or after i.
+        int j = i + Next() / (2147483647 / (n-i) + 1);
+        int t = array[j];
+        array[j] = array[i];
+        array[i] = t;
+      }
+    }
+  }
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_UTIL_RANDOM_H_
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb/util/status.cc b/src/third_party/wiredtiger/api/leveldb/leveldb/util/status.cc
new file mode 100644
index 00000000000..e8edd9dbb11
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb/util/status.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "leveldb_wt.h"
+
+namespace leveldb {
+
+// Duplicate an encoded status buffer. The buffer begins with a 4-byte
+// message length; the copy spans that length plus the 5 header bytes. The
+// caller owns the returned array.
+const char* Status::CopyState(const char* state) {
+  uint32_t payload_len;
+  memcpy(&payload_len, state, sizeof(payload_len));
+  const uint32_t total = payload_len + 5;
+  char* duplicate = new char[total];
+  memcpy(duplicate, state, total);
+  return duplicate;
+}
+
+// Build a non-OK status. The encoded state is: 4-byte message length,
+// 1-byte code, then `msg`, followed by ": " and `msg2` when msg2 is
+// non-empty.
+Status::Status(Code code_arg, const Slice& msg, const Slice& msg2) {
+  assert(code_arg != kOk);
+  const uint32_t len1 = msg.size();
+  const uint32_t len2 = msg2.size();
+  uint32_t size = len1;
+  if (len2) {
+    size += 2 + len2;
+  }
+  char* encoded = new char[size + 5];
+  memcpy(encoded, &size, sizeof(size));
+  encoded[4] = static_cast<char>(code_arg);
+  char* text = encoded + 5;
+  memcpy(text, msg.data(), len1);
+  if (len2) {
+    text[len1] = ':';
+    text[len1 + 1] = ' ';
+    memcpy(text + len1 + 2, msg2.data(), len2);
+  }
+  state_ = encoded;
+}
+
+// Render the status as text: "OK" when there is no state, otherwise a
+// code-specific prefix followed by the stored message.
+std::string Status::ToString() const {
+  if (state_ == NULL) {
+    return "OK";
+  }
+
+  char unknown[30];
+  const char* prefix;
+  switch (code()) {
+    case kOk:
+      prefix = "OK";
+      break;
+    case kNotFound:
+      prefix = "NotFound: ";
+      break;
+    case kCorruption:
+      prefix = "Corruption: ";
+      break;
+    case kNotSupported:
+      prefix = "Not implemented: ";
+      break;
+    case kInvalidArgument:
+      prefix = "Invalid argument: ";
+      break;
+    case kIOError:
+      prefix = "IO error: ";
+      break;
+    default:
+      snprintf(unknown, sizeof(unknown), "Unknown code(%d): ",
+               static_cast<int>(code()));
+      prefix = unknown;
+      break;
+  }
+
+  // The message length lives in the first four bytes of the state; the
+  // message text starts after the 5-byte header.
+  uint32_t msg_len;
+  memcpy(&msg_len, state_, sizeof(msg_len));
+  std::string text(prefix);
+  text.append(state_ + 5, msg_len);
+  return text;
+}
+
+} // namespace leveldb
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb_test.cc b/src/third_party/wiredtiger/api/leveldb/leveldb_test.cc
new file mode 100644
index 00000000000..25cfe0c379e
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb_test.cc
@@ -0,0 +1,141 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <iostream>
+#include "leveldb_wt.h"
+
+using namespace std;
+
+// Smoke test for the LevelDB-compatible API: writes a few keys, optionally
+// exercises the HyperLevelDB backup/replay extensions, then reads the
+// database (and the backup) back through iterators. Every returned status
+// is asserted so a failure stops at the call that caused it instead of
+// surfacing later as a bad pointer dereference.
+extern "C" int main() {
+  leveldb::DB* db;
+  leveldb::Options options;
+  options.create_if_missing = true;
+  leveldb::Status s = leveldb::DB::Open(options, "WTLDB_HOME", &db);
+  assert(s.ok());
+
+  s = db->Put(leveldb::WriteOptions(), "key", "value");
+  assert(s.ok());
+  s = db->Put(leveldb::WriteOptions(), "key2", "value2");
+  assert(s.ok());
+  s = db->Put(leveldb::WriteOptions(), "key3", "value3");
+  assert(s.ok());
+  s = db->Put(leveldb::WriteOptions(), "key4", "value4");
+  assert(s.ok());
+
+#ifdef HAVE_HYPERLEVELDB
+  leveldb::ReplayIterator* replay_start;
+  leveldb::ReplayIterator* replay_ts;
+  leveldb::ReplayIterator* replay_now;
+  leveldb::ReplayIterator* replay_last;
+  std::string timestamp;
+  std::string timestamp_last;
+
+  cout << "Perform Live Backup" << endl;
+  s = db->LiveBackup("test");
+  assert(s.ok());
+
+  // Test out a bunch of the ReplayIterator methods.
+  db->GetReplayTimestamp(&timestamp);
+  cout << "timestamp 1 " << timestamp << endl << "Put key5" << endl;
+  s = db->Put(leveldb::WriteOptions(), "key5", "value5");
+  assert(s.ok());
+  db->GetReplayTimestamp(&timestamp_last);
+  // Verify a bunch of timestamp comparisons
+  cout << "timestamp 2 " << timestamp_last << endl;
+  cout << "CompareTimestamps tests" << endl;
+  assert(db->CompareTimestamps(timestamp, timestamp_last) < 0);
+  assert(db->CompareTimestamps("all", timestamp_last) < 0);
+  assert(db->CompareTimestamps(timestamp, "now") < 0);
+  assert(db->CompareTimestamps("now", timestamp_last) == 0);
+  assert(db->CompareTimestamps(timestamp_last, "now") == 0);
+  assert(db->CompareTimestamps("now", timestamp) > 0);
+  assert(db->CompareTimestamps("now", "all") > 0);
+
+  s = db->GetReplayIterator("all", &replay_start);
+  assert(s.ok());
+  assert(replay_start->Valid());
+  cout << "Replay at all(start):" << endl;
+  cout << replay_start->key().ToString() << ": " << replay_start->value().ToString() << endl;
+  s = db->GetReplayIterator(timestamp, &replay_ts);
+  assert(s.ok());
+  assert(replay_ts->Valid());
+  cout << "Replay at timestamp " << timestamp << ":" << endl;
+  cout << replay_ts->key().ToString() << ": " << replay_ts->value().ToString() << endl;
+  s = db->GetReplayIterator("now", &replay_now);
+  assert(s.ok());
+  assert(replay_now->Valid());
+  cout << "Replay at now(end):" << endl;
+  cout << replay_now->key().ToString() << ": " << replay_now->value().ToString() << endl;
+  s = db->GetReplayIterator(timestamp_last, &replay_last);
+  assert(s.ok());
+  assert(replay_last->Valid());
+  cout << "Replay at last timestamp " << timestamp_last << ":" << endl;
+  cout << replay_last->key().ToString() << ": " << replay_last->value().ToString() << endl;
+  assert(replay_now->key().ToString() == replay_last->key().ToString());
+  cout << "Replay walk from all/start:" << endl;
+  while (replay_start->Valid()) {
+    cout << replay_start->key().ToString() << ": " << replay_start->value().ToString() << endl;
+    replay_start->Next();
+  }
+  // We reached the end of the log, so the iterator should no longer be
+  // valid. But if we write something, the iterator should find it and
+  // become valid again.
+  assert(!replay_start->Valid());
+  s = db->Put(leveldb::WriteOptions(), "key6", "value6");
+  assert(s.ok());
+  assert(replay_start->Valid());
+  db->ReleaseReplayIterator(replay_start);
+  db->ReleaseReplayIterator(replay_ts);
+  db->ReleaseReplayIterator(replay_now);
+  db->ReleaseReplayIterator(replay_last);
+#endif
+
+  // Read through the main database
+  cout << "Read main database:" << endl;
+  leveldb::ReadOptions read_options;
+  read_options.snapshot = db->GetSnapshot();
+  leveldb::Iterator* iter = db->NewIterator(read_options);
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    cout << iter->key().ToString() << ": " << iter->value().ToString() << endl;
+  }
+
+  delete iter;
+  db->ReleaseSnapshot(read_options.snapshot);
+
+  delete db;
+
+#ifdef HAVE_HYPERLEVELDB
+  // Read through the backup database
+  leveldb::DB* db_bkup;
+  options.create_if_missing = false;
+  s = leveldb::DB::Open(options, "WTLDB_HOME/backup-test", &db_bkup);
+  // db_bkup is only meaningful when Open succeeded.
+  assert(s.ok());
+  read_options.snapshot = db_bkup->GetSnapshot();
+  iter = db_bkup->NewIterator(read_options);
+  cout << "Read Backup database:" << endl;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    cout << iter->key().ToString() << ": " << iter->value().ToString() << endl;
+  }
+
+  delete iter;
+  db_bkup->ReleaseSnapshot(read_options.snapshot);
+
+  delete db_bkup;
+#endif
+
+  return (0);
+}
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb_wt.cc b/src/third_party/wiredtiger/api/leveldb/leveldb_wt.cc
new file mode 100644
index 00000000000..8fc7d1ca092
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb_wt.cc
@@ -0,0 +1,810 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "leveldb_wt.h"
+#include <errno.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sstream>
+
+#if HAVE_BASHOLEVELDB
+namespace leveldb {
+Value::~Value() {}
+
+// Value implementation (Basho LevelDB builds only) that writes into a
+// caller-owned std::string held by reference.
+class StringValue : public Value {
+ public:
+ // `val` must outlive this object: only a reference to it is stored.
+ explicit StringValue(std::string& val) : value_(val) {}
+ ~StringValue() {}
+
+ // Overwrite the referenced string with `size` bytes starting at `data`.
+ StringValue& assign(const char* data, size_t size) {
+ value_.assign(data, size);
+ return *this;
+ }
+
+ private:
+ std::string& value_;
+};
+}
+#endif
+
+// Drop the WT_URI object from the database stored under `name`. A directory
+// with no "WiredTiger" file is treated as already destroyed. `options` is
+// not consulted. The connection is always closed; a close error is reported
+// only when nothing failed earlier.
+Status leveldb::DestroyDB(const std::string& name, const Options& options) {
+  /* If the database doesn't exist, there is nothing to destroy. */
+  const std::string marker = name + "/WiredTiger";
+  if (access(marker.c_str(), F_OK) != 0)
+    return Status::OK();
+
+  WT_CONNECTION *conn;
+  int ret = ::wiredtiger_open(name.c_str(), NULL, NULL, &conn);
+  if (ret != 0)
+    return WiredTigerErrorToStatus(ret, NULL);
+
+  WT_SESSION *session;
+  ret = conn->open_session(conn, NULL, NULL, &session);
+  if (ret == 0)
+    ret = session->drop(session, WT_URI, "force");
+
+  const int close_ret = conn->close(conn, NULL);
+  if (close_ret != 0 && ret == 0)
+    ret = close_ret;
+  return WiredTigerErrorToStatus(ret, NULL);
+}
+
+// Repair is not implemented by this layer: always returns NotSupported and
+// ignores both arguments.
+Status leveldb::RepairDB(const std::string& dbname, const Options& options) {
+ return Status::NotSupported("sorry!");
+}
+
+/* Destructors required for interfaces. */
+leveldb::DB::~DB() {}
+Snapshot::~Snapshot() {}
+
+// Translate a WiredTiger/errno return code into a LevelDB Status. When
+// `msg` is NULL the text comes from wiredtiger_strerror(). Every failure
+// other than WT_NOTFOUND is also printed to stdout. Codes with no explicit
+// mapping are reported as Corruption.
+Status WiredTigerErrorToStatus(int wiredTigerError, const char *msg) {
+  const int err = wiredTigerError;
+  if (err == 0)
+    return Status::OK();
+
+  if (msg == NULL)
+    msg = wiredtiger_strerror(err);
+
+  if (err == WT_NOTFOUND)
+    return Status::NotFound(Slice(msg));
+
+  printf("Failing status: %d -> %s\n", err, msg);
+
+  if (err == WT_ERROR || err == WT_PANIC)
+    return Status::Corruption(Slice(msg));
+  if (err == ENOTSUP)
+    return Status::NotSupported(Slice(msg));
+  if (err == EINVAL)
+    return Status::InvalidArgument(Slice(msg));
+
+  const bool io_failure =
+      err == EPERM || err == ENOENT || err == EIO ||
+      err == EBADF || err == EEXIST || err == ENOSPC;
+  if (io_failure)
+    return Status::IOError(Slice(msg));
+  if (err == WT_ROLLBACK)
+    return Status::IOError("ROLLBACK"); // TODO: Is this the best translation?
+
+  return Status::Corruption(Slice(msg));
+}
+
+/* Iterators, from leveldb/table/iterator.cc */
+// Start with an empty cleanup list: the embedded head node has no function
+// and no successor.
+Iterator::Iterator() {
+  cleanup_.next = NULL;
+  cleanup_.function = NULL;
+}
+
+// Run every registered cleanup callback: first the embedded head entry,
+// then each heap-allocated node, which is freed after it runs.
+Iterator::~Iterator() {
+  if (cleanup_.function == NULL) {
+    return;
+  }
+  (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+  Cleanup* node = cleanup_.next;
+  while (node != NULL) {
+    (*node->function)(node->arg1, node->arg2);
+    Cleanup* following = node->next;
+    delete node;
+    node = following;
+  }
+}
+
+// Register `func(arg1, arg2)` to be invoked when the iterator is destroyed.
+// The first registration uses the embedded head entry; later ones are
+// heap-allocated and linked in directly after the head.
+void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
+  assert(func != NULL);
+  Cleanup* slot = &cleanup_;
+  if (cleanup_.function != NULL) {
+    slot = new Cleanup;
+    slot->next = cleanup_.next;
+    cleanup_.next = slot;
+  }
+  slot->arg1 = arg1;
+  slot->arg2 = arg2;
+  slot->function = func;
+}
+
+namespace {
+// Iterator over nothing: never valid, and seeks do nothing. It only
+// carries the Status given at construction.
+class EmptyIterator : public Iterator {
+ public:
+ EmptyIterator(const Status& s) : status_(s) { }
+ virtual bool Valid() const { return false; }
+ virtual void Seek(const Slice& target) { }
+ virtual void SeekToFirst() { }
+ virtual void SeekToLast() { }
+ // Moving or dereferencing an iterator that is never valid is a caller
+ // bug, so these assert.
+ virtual void Next() { assert(false); }
+ virtual void Prev() { assert(false); }
+ Slice key() const { assert(false); return Slice(); }
+ Slice value() const { assert(false); return Slice(); }
+ virtual Status status() const { return status_; }
+ private:
+ Status status_;
+};
+} // namespace
+
+// Return a heap-allocated iterator that is never valid and reports OK.
+Iterator* NewEmptyIterator() {
+ return new EmptyIterator(Status::OK());
+}
+
+// Return a heap-allocated iterator that is never valid and reports `status`.
+Iterator* NewErrorIterator(const Status& status) {
+ return new EmptyIterator(status);
+}
+
+namespace {
+// Placeholder FilterPolicy: it only remembers the requested bits-per-key,
+// which wtleveldb_create() reads to configure bloom filters. It builds no
+// filter itself.
+class FilterPolicyImpl : public FilterPolicy {
+public:
+ FilterPolicyImpl(int bits_per_key) : bits_per_key_(bits_per_key) {}
+ ~FilterPolicyImpl() {}
+ virtual const char *Name() const { return "FilterPolicyImpl"; }
+ // No filter data is produced; *dst is left untouched.
+ virtual void CreateFilter(const Slice *keys, int n, std::string *dst) const {}
+ // Always answers "maybe", so no key is ever excluded here.
+ virtual bool KeyMayMatch(const Slice &key, const Slice &filter) const { return true; }
+
+ // Public so wtleveldb_create() can read it through a cast.
+ int bits_per_key_;
+};
+};
+
+namespace leveldb {
+FilterPolicy::~FilterPolicy() {}
+
+// Return a heap-allocated FilterPolicyImpl recording `bits_per_key`.
+const FilterPolicy *NewBloomFilterPolicy(int bits_per_key) {
+ return new FilterPolicyImpl(bits_per_key);
+}
+#if HAVE_BASHOLEVELDB
+const FilterPolicy *NewBloomFilterPolicy2(int bits_per_key) {
+ return NewBloomFilterPolicy(bits_per_key);
+}
+#endif
+
+Cache::~Cache() {}
+// Return a heap-allocated CacheImpl constructed with `capacity`.
+Cache *NewLRUCache(size_t capacity) {
+ return new CacheImpl(capacity);
+}
+}
+
+// Create the WiredTiger object `uri` using a table configuration derived
+// from the LevelDB `options` (page sizes, block compressor, LSM chunk size
+// and bloom filter settings).
+//
+// Returns 0 on success or the first non-zero WiredTiger error. The session
+// opened here is always closed before returning, including when
+// session->create fails.
+int
+wtleveldb_create(
+    WT_CONNECTION *conn, const Options &options, std::string const &uri)
+{
+  int ret;
+  std::stringstream s_table;
+  s_table << WT_TABLE_CONFIG;
+  s_table << "internal_page_max=" << options.block_size << ",";
+  s_table << "leaf_page_max=" << options.block_size << ",";
+  s_table << "leaf_item_max=" << options.block_size / 4 << ",";
+  if (options.compression == leveldb::kSnappyCompression)
+    s_table << "block_compressor=snappy,";
+#ifdef HAVE_ROCKSDB
+  if (options.compression == leveldb::kZlibCompression)
+    s_table << "block_compressor=zlib,";
+#endif
+  s_table << "lsm=(";
+  s_table << "chunk_size=" << options.write_buffer_size << ",";
+  if (options.filter_policy) {
+    int bits = ((FilterPolicyImpl *)options.filter_policy)->bits_per_key_;
+    s_table << "bloom_bit_count=" << bits << ",";
+    // Approximate the optimal number of hashes
+    // NOTE(review): for very small `bits` this truncates to 0 or 1; confirm
+    // that WiredTiger accepts such a bloom_hash_count.
+    s_table << "bloom_hash_count=" << (int)(0.6 * bits) << ",";
+  }
+  s_table << "),";
+  std::string table_config = s_table.str();
+
+  WT_SESSION *session;
+  if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+    return (ret);
+
+  ret = session->create(session, uri.c_str(), table_config.c_str());
+
+  // Close the session on both paths so a failed create does not leak it.
+  // The create error, if any, takes precedence over a close error.
+  int t_ret = session->close(session, NULL);
+  if (ret == 0)
+    ret = t_ret;
+
+  return (ret);
+}
+
+// Open the database stored in directory `name`, mapping the LevelDB
+// `options` onto a wiredtiger_open configuration string.
+//
+// On success *dbptr receives a heap-allocated DbImpl and OK is returned. On
+// any failure *dbptr is NULL and a non-OK status is returned.
+Status
+leveldb::DB::Open(const Options &options, const std::string &name, leveldb::DB **dbptr)
+{
+  // Give callers a well-defined pointer on every error path.
+  *dbptr = NULL;
+
+  // Build the wiredtiger_open config.
+  std::stringstream s_conn;
+  s_conn << WT_CONN_CONFIG;
+  if (options.create_if_missing) {
+    // Best effort: the directory may already exist.
+    (void)mkdir(name.c_str(), 0777);
+    s_conn << "create,";
+  }
+  if (options.error_if_exists)
+    s_conn << "exclusive,";
+#ifndef HAVE_BUILTIN_EXTENSION_SNAPPY
+  if (options.compression == kSnappyCompression)
+    s_conn << "extensions=[libwiredtiger_snappy.so],";
+#endif
+#ifdef HAVE_ROCKSDB
+#ifndef HAVE_BUILTIN_ZLIB
+  if (options.compression == kZlibCompression)
+    s_conn << "extensions=[libwiredtiger_zlib.so],";
+#endif
+#endif
+  // Cache size: twice the write buffer, plus 4MB per allowed open file,
+  // plus the block cache capacity (or 100MB when none was supplied).
+  size_t cache_size = 2 * options.write_buffer_size;
+  cache_size += (size_t)options.max_open_files * (4 << 20);
+  if (options.block_cache)
+    cache_size += ((CacheImpl *)options.block_cache)->capacity_;
+  else
+    cache_size += 100 << 20;
+  s_conn << "cache_size=" << cache_size << ",";
+  std::string conn_config = s_conn.str();
+
+  WT_CONNECTION *conn;
+  printf("Open: home %s config %s\r\n",name.c_str(),conn_config.c_str());
+  int ret = ::wiredtiger_open(name.c_str(), NULL, conn_config.c_str(), &conn);
+  if (ret == ENOENT)
+    return Status::NotFound(Slice("Database does not exist."));
+  else if (ret == EEXIST)
+    // NOTE(review): NotFound is a surprising code for "already exists";
+    // kept unchanged because callers may test for it.
+    return Status::NotFound(Slice("Database already exists."));
+  else if (ret != 0)
+    return WiredTigerErrorToStatus(ret, NULL);
+
+  if (options.create_if_missing)
+    ret = wtleveldb_create(conn, options, WT_URI);
+
+  if (ret != 0) {
+    conn->close(conn, NULL);
+    return WiredTigerErrorToStatus(ret, NULL);
+  }
+  *dbptr = new DbImpl(conn);
+  return Status::OK();
+}
+
+// Set the database entry for "key" to "value". Returns OK on success,
+// and a non-OK status on error.
+// Note: consider setting options.sync = true.
+Status
+DbImpl::Put(const WriteOptions& options,
+            const Slice& key, const Slice& value)
+{
+  // Writes go through the calling thread's cached cursor.
+  WT_CURSOR *c = GetContext()->GetCursor();
+
+  WT_ITEM k;
+  k.data = key.data();
+  k.size = key.size();
+  c->set_key(c, &k);
+
+  WT_ITEM v;
+  v.data = value.data();
+  v.size = value.size();
+  c->set_value(c, &v);
+
+  // The WiredTiger return code is translated into a leveldb Status.
+  return WiredTigerErrorToStatus(c->insert(c), NULL);
+}
+
+// Remove the database entry (if any) for "key". Returns OK on
+// success, and a non-OK status on error. It is not an error if "key"
+// did not exist in the database.
+// Note: consider setting options.sync = true.
+Status
+DbImpl::Delete(const WriteOptions& options, const Slice& key)
+{
+  WT_CURSOR *c = GetContext()->GetCursor();
+
+  WT_ITEM k;
+  k.data = key.data();
+  k.size = key.size();
+  c->set_key(c, &k);
+
+  int ret = c->remove(c);
+
+  // Removing a key that is not there is not an error for leveldb callers.
+  if (ret == WT_NOTFOUND)
+    return WiredTigerErrorToStatus(0, NULL);
+
+  // After a successful remove, reset the cursor so it doesn't keep any pages
+  // pinned. A reset failure is unexpected: it is flagged in debug builds but
+  // never reported to the caller.
+  if (ret == 0) {
+    int t_ret = c->reset(c);
+    assert(t_ret == 0);
+  }
+  return WiredTigerErrorToStatus(ret, NULL);
+}
+
+// Batch callback: insert one key/value pair through the context's cursor.
+// Only the first WiredTiger error seen by this handler is retained.
+void
+WriteBatchHandler::Put(const Slice& key, const Slice& value) {
+  WT_CURSOR *c = context_->GetCursor();
+
+  WT_ITEM k;
+  k.data = key.data();
+  k.size = key.size();
+  c->set_key(c, &k);
+
+  WT_ITEM v;
+  v.data = value.data();
+  v.size = value.size();
+  c->set_value(c, &v);
+
+  int ret = c->insert(c);
+  // While no error has been recorded, the latest result becomes the status
+  // (storing 0 over 0 is a no-op).
+  if (status_ == 0)
+    status_ = ret;
+}
+
+// Batch callback: remove one key through the context's cursor. A missing key
+// is not an error; only the first real WiredTiger error is retained.
+void WriteBatchHandler::Delete(const Slice& key) {
+  WT_CURSOR *c = context_->GetCursor();
+
+  WT_ITEM k;
+  k.data = key.data();
+  k.size = key.size();
+  c->set_key(c, &k);
+
+  int ret = c->remove(c);
+  if (ret == WT_NOTFOUND)
+    ret = 0;
+  if (status_ == 0)
+    status_ = ret;
+}
+
+// Apply the specified updates to the database.
+// Returns OK on success, non-OK on failure.
+// Note: consider setting options.sync = true.
+//
+// The batch is replayed through a WriteBatchHandler bound to the calling
+// thread's operation context. When need_txn is set the replay is wrapped in
+// a WiredTiger transaction, and the whole batch is re-run from the start
+// each time the handler reports WT_ROLLBACK.
+Status
+DbImpl::Write(const WriteOptions& options, WriteBatch* updates)
+{
+ const char *errmsg = NULL;
+ Status status = Status::OK();
+ OperationContext *context = GetContext();
+ WT_SESSION *session = context->GetSession();
+ int ret = 0, t_ret;
+
+ // With HAVE_ROCKSDB a transaction is only used for batches holding more
+ // than one update; otherwise every batch is transactional.
+#ifdef HAVE_ROCKSDB
+ int need_txn = (updates->Count() > 1);
+#else
+ int need_txn = 1;
+#endif
+
+ // Retry loop: left via "break" once the batch either succeeded or failed
+ // with something other than WT_ROLLBACK, or via "goto err" when the
+ // transaction control calls themselves fail.
+ for (;;) {
+ if (need_txn && (ret = session->begin_transaction(session, NULL)) != 0) {
+ errmsg = "Begin transaction failed in Write batch";
+ goto err;
+ }
+
+ // A fresh handler per attempt, so its recorded status starts at 0.
+ WriteBatchHandler handler(this, context);
+ // The "#if 0" branch is compiled out; the try/catch variant is what is built.
+#if 0
+ status = updates->Iterate(&handler);
+#else
+ try {
+ status = updates->Iterate(&handler);
+ } catch(...) {
+ // Do not leave a transaction open if Iterate() throws; the exception
+ // itself is passed on to the caller unchanged.
+ if (need_txn)
+ (void)session->rollback_transaction(session, NULL);
+ throw;
+ }
+#endif
+ // From here on "ret" carries the first WiredTiger error the handler saw.
+ if (!status.ok() || (ret = handler.GetWiredTigerStatus()) != WT_ROLLBACK)
+ break;
+ // Roll back the transaction on deadlock so we can try again
+ if (need_txn && (ret = session->rollback_transaction(session, NULL)) != 0) {
+ errmsg = "Rollback transaction failed in Write batch";
+ goto err;
+ }
+ }
+
+ // Commit only when both the leveldb status and the WiredTiger status are
+ // clean; otherwise roll back, keeping an earlier error in "ret" if any.
+ if (need_txn && status.ok() && ret == 0) {
+ ret = session->commit_transaction(session, NULL);
+ } else if (need_txn) {
+ t_ret = session->rollback_transaction(session, NULL);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+err:
+ // A non-OK status from Iterate() takes precedence over "ret".
+ if (status.ok() && ret != 0)
+ status = WiredTigerErrorToStatus(ret, errmsg);
+ return status;
+}
+
+// If the database contains an entry for "key" store the
+// corresponding value in *value and return OK.
+//
+// If there is no entry for "key" leave *value unchanged and return
+// a status for which Status::IsNotFound() returns true.
+//
+// May return some other Status on an error.
+Status
+DbImpl::Get(const ReadOptions& options,
+            const Slice& key, std::string* value)
+{
+  // Reads honour options.snapshot: GetContext(options) picks the snapshot's
+  // context when one is supplied.
+  WT_CURSOR *c = GetContext(options)->GetCursor();
+
+  WT_ITEM item;
+  item.data = key.data();
+  item.size = key.size();
+  c->set_key(c, &item);
+
+  int ret = c->search(c);
+  if (ret == WT_NOTFOUND)
+    return WiredTigerErrorToStatus(ret, "DB::Get key not found");
+  if (ret != 0)
+    return WiredTigerErrorToStatus(ret, NULL);
+
+  ret = c->get_value(c, &item);
+  if (ret != 0)
+    return WiredTigerErrorToStatus(ret, NULL);
+
+  // Copy the value out before the reset: item points at cursor-owned memory.
+  *value = std::string((const char *)item.data, item.size);
+  return WiredTigerErrorToStatus(c->reset(c), NULL);
+}
+
+#if HAVE_BASHOLEVELDB
+// If the database contains an entry for "key" store the
+// corresponding value in *value and return OK.
+//
+// If there is no entry for "key" leave *value unchanged and return
+// a status for which Status::IsNotFound() returns true.
+//
+// May return some other Status on an error.
+// Basho variant of Get: identical lookup, but the result is delivered through
+// the Value interface instead of a std::string.
+Status
+DbImpl::Get(const ReadOptions& options,
+            const Slice& key, Value* value)
+{
+  const char *errmsg = NULL;
+
+  WT_CURSOR *cursor = GetContext(options)->GetCursor();
+  WT_ITEM item;
+  item.data = key.data();
+  item.size = key.size();
+  cursor->set_key(cursor, &item);
+  int ret = cursor->search(cursor);
+  if (ret == 0) {
+    ret = cursor->get_value(cursor, &item);
+    if (ret == 0) {
+      // This call makes a copy, reset the cursor afterwards.
+      value->assign((const char *)item.data, item.size);
+      ret = cursor->reset(cursor);
+    }
+  } else if (ret == WT_NOTFOUND)
+    errmsg = "DB::Get key not found";
+  // (The unused "err:" label that used to sit here was removed: nothing jumps
+  // to it and it only produced an unused-label warning.)
+  return WiredTigerErrorToStatus(ret, errmsg);
+}
+#endif
+
+// Return a heap-allocated iterator over the contents of the database.
+// The result of NewIterator() is initially invalid (caller must
+// call one of the Seek methods on the iterator before using it).
+//
+// Caller should delete the iterator when it is no longer needed.
+// The returned iterator should be deleted before this db is deleted.
+Iterator *
+DbImpl::NewIterator(const ReadOptions& options)
+{
+  OperationContext *context = GetContext(options);
+  WT_SESSION *session = context->GetSession();
+  WT_CURSOR *c = NULL;
+
+  // Give the iterator a cursor of its own instead of taking the context's
+  // cached one. Handing over the cached cursor left the context with a NULL
+  // cursor, so any Put/Get/Delete (or a second NewIterator) issued on the
+  // same context while the iterator was alive dereferenced NULL.
+  // The iterator's destructor closes a cursor it cannot stash back.
+  int ret = session->open_cursor(session, WT_URI, NULL, NULL, &c);
+  if (ret != 0) {
+    // Could not open a dedicated cursor: fall back to lending out the
+    // context's cursor, which is what this function always did before.
+    c = context->GetCursor();
+    context->SetCursor(NULL);
+  }
+  return new IteratorImpl(this, c);
+}
+
+// A snapshot owns a private operation context obtained from the database
+// (db->NewContext()); it starts out with an OK status.
+SnapshotImpl::SnapshotImpl(DbImpl *db)
+  : Snapshot(),
+    db_(db),
+    context_(db->NewContext()),
+    status_(Status::OK())
+{
+}
+
+// Return a handle to the current DB state. Iterators created with
+// this handle will all observe a stable snapshot of the current DB
+// state. The caller must call ReleaseSnapshot(result) when the
+// snapshot is no longer needed.
+const Snapshot *
+DbImpl::GetSnapshot()
+{
+  SnapshotImpl *si = new SnapshotImpl(this);
+  WT_SESSION *session = si->GetContext()->GetSession();
+
+  // The snapshot is a transaction opened on the snapshot's private session.
+  int ret = session->begin_transaction(session, NULL);
+  assert(ret == 0);
+  // In builds where assert() is compiled out, keep the failure visible by
+  // recording it in the snapshot's status instead of dropping it silently.
+  if (ret != 0)
+    si->status_ = WiredTigerErrorToStatus(
+        ret, "Begin transaction failed in GetSnapshot");
+  return si;
+}
+
+// Release a previously acquired snapshot. The caller must not
+// use "snapshot" after this call.
+void
+DbImpl::ReleaseSnapshot(const Snapshot* snapshot)
+{
+  // Releasing a NULL snapshot is a no-op.
+  if (snapshot == NULL)
+    return;
+
+  SnapshotImpl *impl =
+      static_cast<SnapshotImpl *>(const_cast<Snapshot *>(snapshot));
+
+  // The snapshot's transaction is not committed explicitly: closing the
+  // session (which has to happen anyway) rolls it back.
+  int ret = impl->GetContext()->Close();
+  assert(ret == 0);
+  delete impl;
+}
+
+// DB implementations can export properties about their state
+// via this method. If "property" is a valid property understood by this
+// DB implementation, fills "*value" with its current value and returns
+// true. Otherwise returns false.
+//
+//
+// Valid property names include:
+//
+// "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
+// where <N> is an ASCII representation of a level number (e.g. "0").
+// "leveldb.stats" - returns a multi-line string that describes statistics
+// about the internal operation of the DB.
+// "leveldb.sstables" - returns a multi-line string that describes all
+// of the sstables that make up the db contents.
+bool
+DbImpl::GetProperty(const Slice& property, std::string* value)
+{
+  // No properties are exported by this implementation: every name is
+  // rejected and *value is left untouched.
+  (void)property;
+  (void)value;
+  return false;
+}
+
+// For each i in [0,n-1], store in "sizes[i]", the approximate
+// file system space used by keys in "[range[i].start .. range[i].limit)".
+//
+// Note that the returned sizes measure file system space usage, so
+// if the user data compresses by a factor of ten, the returned
+// sizes will be one-tenth the size of the corresponding user data size.
+//
+// The results may not include the sizes of recently written data.
+void
+DbImpl::GetApproximateSizes(const Range* range, int n,
+                            uint64_t* sizes)
+{
+  // XXX Not supported: the ranges are ignored and every one of the n
+  // requested slots is filled with the placeholder size 1.
+  uint64_t *out = sizes;
+  int remaining = n;
+  while (remaining-- > 0)
+    *out++ = 1;
+}
+
+// Compact the underlying storage for the key range [*begin,*end].
+// In particular, deleted and overwritten versions are discarded,
+// and the data is rearranged to reduce the cost of operations
+// needed to access the data. This operation should typically only
+// be invoked by users who understand the underlying implementation.
+//
+// begin==NULL is treated as a key before all keys in the database.
+// end==NULL is treated as a key after all keys in the database.
+// Therefore the following call will compact the entire database:
+// db->CompactRange(NULL, NULL);
+void
+DbImpl::CompactRange(const Slice* begin, const Slice* end)
+{
+  // Compaction works on the session, not on a cursor, but the context always
+  // opens a cursor along with its session - so the session is reached through
+  // that cursor. The begin/end bounds are not used: the whole table is
+  // compacted.
+  WT_SESSION *session = GetContext()->GetCursor()->session;
+  int ret = session->compact(session, WT_URI, NULL);
+  assert(ret == 0);
+}
+
+// Suspends the background compaction thread. This method
+// returns once suspended.
+// In this implementation the call does nothing.
+void
+DbImpl::SuspendCompactions()
+{
+ /* Not supported */
+}
+
+// Resumes a suspended background compaction thread.
+// In this implementation the call does nothing.
+void
+DbImpl::ResumeCompactions()
+{
+ /* Not supported */
+}
+
+// Give the iterator's cursor back to the calling thread's context when that
+// is possible, otherwise close it.
+IteratorImpl::~IteratorImpl()
+{
+  if (cursor_ == NULL)
+    return;
+
+  OperationContext *ctx = db_->GetContext();
+
+  /*
+   * The cursor can only be stashed when it belongs to this thread's session
+   * and the corresponding slot in the context is currently empty.
+   */
+  if (cursor_->session == ctx->GetSession()) {
+#ifdef HAVE_ROCKSDB
+    const bool slot_free = (ctx->GetCursor(id_) == NULL);
+    if (slot_free)
+      ctx->SetCursor(id_, cursor_);
+#else
+    const bool slot_free = (ctx->GetCursor() == NULL);
+    if (slot_free)
+      ctx->SetCursor(cursor_);
+#endif
+    if (slot_free) {
+      cursor_ = NULL;
+      return;
+    }
+  }
+
+  int ret = cursor_->close(cursor_);
+  assert(ret == 0);
+}
+
+// Position at the first key in the source. The iterator is Valid()
+// after this call iff the source is not empty.
+void
+IteratorImpl::SeekToFirst()
+{
+  int ret;
+  WT_ITEM item;
+
+  // Refuse to move once this iterator has recorded an error. The check must
+  // look at status_: "Status()" constructs a brand-new, always-OK
+  // leveldb::Status, so the old guard could never fire.
+  if (!status_.ok())
+    return;
+
+  // A reset cursor has no position, so the following next() lands on the
+  // first record.
+  if ((ret = cursor_->reset(cursor_)) != 0) {
+    SetError(ret);
+    return;
+  }
+  ret = cursor_->next(cursor_);
+  if (ret == WT_NOTFOUND) {
+    // Empty source: not an error, the iterator is simply not valid.
+    valid_ = false;
+    return;
+  } else if (ret != 0) {
+    SetError(ret);
+    return;
+  }
+  // key_/value_ are Slices over the memory returned by the cursor.
+  if ((ret = cursor_->get_key(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  key_ = Slice((const char *)item.data, item.size);
+  if ((ret = cursor_->get_value(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  value_ = Slice((const char *)item.data, item.size);
+  valid_ = true;
+}
+
+// Position at the last key in the source. The iterator is
+// Valid() after this call iff the source is not empty.
+void
+IteratorImpl::SeekToLast()
+{
+  int ret;
+  WT_ITEM item;
+
+  // Refuse to move once this iterator has recorded an error. The check must
+  // look at status_: "Status()" constructs a brand-new, always-OK
+  // leveldb::Status, so the old guard could never fire.
+  if (!status_.ok())
+    return;
+
+  // A reset cursor has no position, so the following prev() lands on the
+  // last record.
+  if ((ret = cursor_->reset(cursor_)) != 0) {
+    SetError(ret);
+    return;
+  }
+  ret = cursor_->prev(cursor_);
+  if (ret == WT_NOTFOUND) {
+    // Empty source: not an error, the iterator is simply not valid.
+    valid_ = false;
+    return;
+  } else if (ret != 0) {
+    SetError(ret);
+    return;
+  }
+  // key_/value_ are Slices over the memory returned by the cursor.
+  if ((ret = cursor_->get_key(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  key_ = Slice((const char *)item.data, item.size);
+  if ((ret = cursor_->get_value(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  value_ = Slice((const char *)item.data, item.size);
+  valid_ = true;
+}
+
+// Position at the first key in the source that is at or past target.
+// The iterator is Valid() after this call iff the source contains
+// an entry that comes at or past target.
+void
+IteratorImpl::Seek(const Slice& target)
+{
+  WT_ITEM item;
+
+  // Refuse to move once this iterator has recorded an error. The check must
+  // look at status_: "Status()" constructs a brand-new, always-OK
+  // leveldb::Status, so the old guard could never fire.
+  if (!status_.ok())
+    return;
+
+  item.data = target.data();
+  item.size = target.size();
+  cursor_->set_key(cursor_, &item);
+  int cmp, ret = cursor_->search_near(cursor_, &cmp);
+  // search_near may land on the neighbouring smaller key (cmp < 0); step
+  // forward once to reach the first key at or past the target.
+  if (ret == 0 && cmp < 0)
+    ret = cursor_->next(cursor_);
+  if (ret != 0) {
+    // Running off the end is not an error, just an invalid position.
+    if (ret != WT_NOTFOUND)
+      SetError(ret);
+    valid_ = false;
+    return;
+  }
+  // key_/value_ are Slices over the memory returned by the cursor.
+  if ((ret = cursor_->get_key(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  key_ = Slice((const char *)item.data, item.size);
+  if ((ret = cursor_->get_value(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  value_ = Slice((const char *)item.data, item.size);
+  valid_ = true;
+}
+
+// Moves to the next entry in the source. After this call, Valid() is
+// true iff the iterator was not positioned at the last entry in the source.
+// REQUIRES: Valid()
+void
+IteratorImpl::Next()
+{
+  int ret;
+  WT_ITEM item;
+
+  // Do nothing on an iterator that has recorded an error or is not
+  // positioned. The check must look at status_: "Status()" constructs a
+  // brand-new, always-OK leveldb::Status, so that half of the old guard could
+  // never fire.
+  if (!status_.ok() || !valid_)
+    return;
+
+  ret = cursor_->next(cursor_);
+  if (ret != 0) {
+    // Running off the end is not an error, just an invalid position.
+    if (ret != WT_NOTFOUND)
+      SetError(ret);
+    valid_ = false;
+    return;
+  }
+  // key_/value_ are Slices over the memory returned by the cursor.
+  if ((ret = cursor_->get_key(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  key_ = Slice((const char *)item.data, item.size);
+  if ((ret = cursor_->get_value(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  value_ = Slice((const char *)item.data, item.size);
+  valid_ = true;
+}
+
+// Moves to the previous entry in the source. After this call, Valid() is
+// true iff the iterator was not positioned at the first entry in source.
+// REQUIRES: Valid()
+void
+IteratorImpl::Prev()
+{
+  WT_ITEM item;
+
+  // Do nothing on an iterator that has recorded an error or is not
+  // positioned. The check must look at status_: "Status()" constructs a
+  // brand-new, always-OK leveldb::Status, so that half of the old guard could
+  // never fire.
+  if (!status_.ok() || !valid_)
+    return;
+
+  int ret = cursor_->prev(cursor_);
+  if (ret != 0) {
+    // Running off the front is not an error, just an invalid position.
+    if (ret != WT_NOTFOUND)
+      SetError(ret);
+    valid_ = false;
+    return;
+  }
+  // key_/value_ are Slices over the memory returned by the cursor.
+  if ((ret = cursor_->get_key(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  key_ = Slice((const char *)item.data, item.size);
+  if ((ret = cursor_->get_value(cursor_, &item)) != 0) {
+    SetError(ret);
+    return;
+  }
+  value_ = Slice((const char *)item.data, item.size);
+  valid_ = true;
+}
diff --git a/src/third_party/wiredtiger/api/leveldb/leveldb_wt.h b/src/third_party/wiredtiger/api/leveldb/leveldb_wt.h
new file mode 100644
index 00000000000..683482ad23c
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/leveldb_wt.h
@@ -0,0 +1,460 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _INCLUDE_LEVELDB_WT_H
+#define _INCLUDE_LEVELDB_WT_H 1
+
+#include "leveldb_wt_config.h"
+
+#include "leveldb/cache.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/options.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+#include "leveldb/write_batch.h"
+#if HAVE_BASHO_LEVELDB
+#include "basho/perf_count.h"
+#endif
+
+#include "wiredtiger.h"
+
+#define WT_URI "table:data"
+#define WT_CONN_CONFIG \
+ "log=(enabled),checkpoint=(wait=180),checkpoint_sync=false," \
+ "session_max=8192,mmap=false," \
+ "transaction_sync=(enabled=true,method=none),"
+// Note: LSM doesn't split, build full pages from the start
+#define WT_TABLE_CONFIG "type=lsm,split_pct=100,leaf_item_max=1KB," \
+ "lsm=(chunk_size=100MB,bloom_config=(leaf_page_max=8MB)),"
+#define WT_TIMESTAMP_FORMAT "%d.%llu"
+// We're also only interested in operations to the user file. Skip over
+// any changes to the metadata.
+// !!! Currently WT guarantees that the metadata file is always at
+// fileid 0 and the implementation here only uses one table. This will
+// break down if either of those assumptions changes.
+#define WT_VALID_OPERATION(fileid, optype) \
+ ((fileid) != 0 && \
+ ((optype) == WT_LOGOP_COL_PUT || \
+ (optype) == WT_LOGOP_COL_REMOVE || \
+ (optype) == WT_LOGOP_ROW_PUT || \
+ (optype) == WT_LOGOP_ROW_REMOVE))
+
+using leveldb::Cache;
+using leveldb::FilterPolicy;
+using leveldb::Iterator;
+using leveldb::Options;
+using leveldb::ReadOptions;
+using leveldb::WriteBatch;
+using leveldb::WriteOptions;
+using leveldb::Range;
+using leveldb::Slice;
+using leveldb::Snapshot;
+using leveldb::Status;
+#if HAVE_BASHOLEVELDB
+using leveldb::Value;
+#endif
+#if HAVE_ROCKSDB
+using leveldb::FlushOptions;
+using leveldb::ColumnFamilyHandle;
+#endif
+
+extern Status WiredTigerErrorToStatus(int wiredTigerError, const char *msg = "");
+
+/* POSIX thread-local storage */
+// Thin wrapper around a pthread key holding one T* per thread. The key is
+// created with cleanup() as its destructor function, which deletes the
+// stored T.
+template <class T>
+class ThreadLocal {
+public:
+  ThreadLocal() {
+    int rc = pthread_key_create(&key_, cleanup);
+    assert(rc == 0);
+  }
+
+  ~ThreadLocal() {
+    int rc = pthread_key_delete(key_);
+    assert(rc == 0);
+  }
+
+  // Value stored by the calling thread, as returned by pthread_getspecific().
+  T *Get() {
+    void *raw = pthread_getspecific(key_);
+    return (T *)raw;
+  }
+
+  // Store "value" for the calling thread.
+  void Set(T *value) {
+    int rc = pthread_setspecific(key_, value);
+    assert(rc == 0);
+  }
+
+  // Destructor function registered with the key.
+  static void cleanup(void *val) {
+    T *typed = (T *)val;
+    delete typed;
+  }
+
+private:
+  pthread_key_t key_;
+};
+
+/* WiredTiger implementations. */
+class DbImpl;
+
+/*
+ * Context for operations (including snapshots, write batches, transactions).
+ *
+ * Bundles one WiredTiger session (opened with snapshot isolation) with a
+ * cursor on WT_URI opened in that session.
+ */
+class OperationContext {
+public:
+  // Both handles start out NULL so that a failed open never leaves them
+  // holding indeterminate values in builds where assert() is compiled out.
+  OperationContext(WT_CONNECTION *conn) : session_(NULL), cursor_(NULL) {
+    int ret = conn->open_session(conn, NULL, "isolation=snapshot", &session_);
+    assert(ret == 0);
+    if (ret != 0) {
+      // No session, hence nothing to open a cursor on.
+      session_ = NULL;
+      return;
+    }
+    ret = session_->open_cursor(
+        session_, WT_URI, NULL, NULL, &cursor_);
+    assert(ret == 0);
+    if (ret != 0)
+      cursor_ = NULL;
+  }
+
+  // The session is only closed here when WANT_SHUTDOWN_RACES is defined;
+  // otherwise it is left open when the context is destroyed.
+  ~OperationContext() {
+#ifdef WANT_SHUTDOWN_RACES
+    int ret = Close();
+    assert(ret == 0);
+#endif
+  }
+
+  // Close the session (safe to call more than once) and return the
+  // WiredTiger error code, 0 on success. Closing a session also closes the
+  // cursors opened in it, so the cached cursor pointers are dropped too
+  // rather than being left dangling.
+  int Close() {
+    int ret = 0;
+    if (session_ != NULL)
+      ret = session_->close(session_, NULL);
+    session_ = NULL;
+    cursor_ = NULL;
+#ifdef HAVE_ROCKSDB
+    cursors_.clear();
+#endif
+    return (ret);
+  }
+
+  // Cached cursor on WT_URI; NULL while lent out via SetCursor(NULL).
+  WT_CURSOR *GetCursor() { return cursor_; }
+  void SetCursor(WT_CURSOR *c) { cursor_ = c; }
+#ifdef HAVE_ROCKSDB
+  // Per-index cursor slots; an index never stored reads back as NULL.
+  WT_CURSOR *GetCursor(u_int i) {
+    return (i < cursors_.size()) ? cursors_[i] : NULL;
+  }
+  void SetCursor(u_int i, WT_CURSOR *c) {
+    if (i >= cursors_.size())
+      cursors_.resize(i + 1);
+    cursors_[i] = c;
+  }
+#endif
+  WT_SESSION *GetSession() { return session_; }
+
+private:
+  WT_SESSION *session_;
+  WT_CURSOR *cursor_;
+#ifdef HAVE_ROCKSDB
+  std::vector<WT_CURSOR *> cursors_;
+#endif
+};
+
+// Cache stand-in: it only remembers the requested capacity (read by
+// DB::Open when sizing the WiredTiger cache). All cache operations are
+// stubs that store nothing and return 0/NULL.
+class CacheImpl : public Cache {
+public:
+  CacheImpl(size_t capacity) : Cache(), capacity_(capacity) {}
+  virtual ~CacheImpl() {}
+
+  virtual Handle* Insert(const Slice&, void*, size_t,
+                         void (*)(const Slice&, void*)) {
+    return NULL;
+  }
+  virtual Handle* Lookup(const Slice&) {
+    return NULL;
+  }
+  virtual void Release(Handle*) {
+  }
+  virtual void* Value(Handle*) {
+    return NULL;
+  }
+  virtual void Erase(const Slice&) {
+  }
+  virtual uint64_t NewId() {
+    return 0;
+  }
+
+  size_t capacity_;
+};
+
+#ifdef HAVE_ROCKSDB
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has non-trivial destructor, which gets called when client
+// is done using the column family
+// Handle for one column family: records the owning database, the numeric id
+// and the name, from which the WiredTiger table URI is derived.
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+  ColumnFamilyHandleImpl(DbImpl* db, std::string const &name, uint32_t id)
+    : db_(db), id_(id), name_(name) {}
+
+  ColumnFamilyHandleImpl(const ColumnFamilyHandleImpl &copyfrom)
+    : db_(copyfrom.db_), id_(copyfrom.id_), name_(copyfrom.name_) {}
+
+  virtual ~ColumnFamilyHandleImpl() {}
+
+  virtual uint32_t GetID() const {
+    return id_;
+  }
+
+  std::string const &GetName() const {
+    return name_;
+  }
+
+  // URI of the backing table: "table:" followed by the family name.
+  std::string const GetURI() const {
+    return "table:" + name_;
+  }
+
+ private:
+  DbImpl* db_;
+  uint32_t id_;
+  std::string const name_;
+};
+#endif
+
+// leveldb Iterator backed by a WiredTiger cursor. The current key and value
+// are kept as Slices set by the positioning methods; the first error
+// reported through SetError() is exposed via status().
+class IteratorImpl : public Iterator {
+public:
+  // valid_ is initialised explicitly: a new iterator is not positioned, and
+  // leaving the flag uninitialised made Valid() read an indeterminate value
+  // until the first Seek call.
+  IteratorImpl(DbImpl *db, WT_CURSOR *cursor, uint32_t id=0) :
+    db_(db), cursor_(cursor), valid_(false), id_(id) {}
+  virtual ~IteratorImpl();
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid. This method returns true iff the iterator is valid.
+  virtual bool Valid() const { return valid_; }
+
+  virtual void SeekToFirst();
+
+  virtual void SeekToLast();
+
+  virtual void Seek(const Slice& target);
+
+  virtual void Next();
+
+  virtual void Prev();
+
+  virtual Slice key() const {
+    return key_;
+  }
+
+  virtual Slice value() const {
+    return value_;
+  }
+
+  virtual Status status() const {
+    return status_;
+  }
+
+private:
+  DbImpl *db_;
+  WT_CURSOR *cursor_;
+  Slice key_, value_;
+  Status status_;
+  bool valid_;
+  uint32_t id_;
+
+  // Record a WiredTiger failure: the iterator becomes invalid and status()
+  // reports the translated error.
+  void SetError(int wiredTigerError) {
+    valid_ = false;
+    status_ = WiredTigerErrorToStatus(wiredTigerError, NULL);
+  }
+
+  // No copying allowed
+  IteratorImpl(const IteratorImpl&);
+  void operator=(const IteratorImpl&);
+};
+
+// Snapshot handle: owns a private OperationContext (deleted with the
+// snapshot) plus a status. DbImpl and IteratorImpl are friends and reach the
+// context through the protected accessors.
+class SnapshotImpl : public Snapshot {
+  friend class IteratorImpl;
+  friend class DbImpl;
+
+public:
+  SnapshotImpl(DbImpl *db);
+  virtual ~SnapshotImpl() {
+    delete context_;
+  }
+
+protected:
+  OperationContext *GetContext() const {
+    return context_;
+  }
+  Status GetStatus() const {
+    return status_;
+  }
+  Status SetupTransaction();
+
+private:
+  DbImpl *db_;
+  OperationContext *context_;
+  Status status_;
+};
+
+// leveldb::DB implementation on top of one WiredTiger connection.
+// Each calling thread works through its own OperationContext, which is kept
+// in thread-local storage and created on first use (see GetContext()).
+class DbImpl : public leveldb::DB {
+friend class IteratorImpl;
+friend class SnapshotImpl;
+public:
+ DbImpl(WT_CONNECTION *conn) :
+ DB(), conn_(conn), context_(new ThreadLocal<OperationContext>) {}
+ // Deletes the thread-local holder, then closes the connection.
+ // NOTE(review): contexts still registered by other threads are not deleted
+ // here; presumably their sessions are released by conn_->close - confirm.
+ virtual ~DbImpl() {
+ delete context_;
+ int ret = conn_->close(conn_, NULL);
+ assert(ret == 0);
+ }
+
+ virtual Status Put(const WriteOptions& options,
+ const Slice& key,
+ const Slice& value);
+
+ virtual Status Delete(const WriteOptions& options, const Slice& key);
+
+ virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+
+ virtual Status Get(const ReadOptions& options,
+ const Slice& key, std::string* value);
+
+#if HAVE_BASHOLEVELDB
+ virtual Status Get(const ReadOptions& options,
+ const Slice& key, Value* value);
+#endif
+
+#ifdef HAVE_HYPERLEVELDB
+ virtual Status LiveBackup(const Slice& name);
+ virtual void GetReplayTimestamp(std::string* timestamp);
+ virtual void AllowGarbageCollectBeforeTimestamp(const std::string& timestamp);
+ virtual bool ValidateTimestamp(const std::string& timestamp);
+ virtual int CompareTimestamps(const std::string& lhs, const std::string& rhs);
+ virtual Status GetReplayIterator(const std::string& timestamp,
+ leveldb::ReplayIterator** iter);
+ virtual void ReleaseReplayIterator(leveldb::ReplayIterator* iter);
+#endif
+
+#ifdef HAVE_ROCKSDB
+ virtual Status CreateColumnFamily(const Options& options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle);
+
+ // Drop a column family specified by column_family handle. This call
+ // only records a drop record in the manifest and prevents the column
+ // family from flushing and compacting.
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+ // Set the database entry for "key" to "value".
+ // Returns OK on success, and a non-OK status on error.
+ // Note: consider setting options.sync = true.
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value);
+
+ // Remove the database entry (if any) for "key". Returns OK on
+ // success, and a non-OK status on error. It is not an error if "key"
+ // did not exist in the database.
+ // Note: consider setting options.sync = true.
+ virtual Status Delete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key);
+
+ // Merge the database entry for "key" with "value". Returns OK on success,
+ // and a non-OK status on error. The semantics of this operation is
+ // determined by the user provided merge_operator when opening DB.
+ // Note: consider setting options.sync = true.
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value);
+
+ // May return some other Status on an error.
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value);
+
+ // If keys[i] does not exist in the database, then the i'th returned
+ // status will be one for which Status::IsNotFound() is true, and
+ // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+ // the i'th returned status will have Status::ok() true, and (*values)[i]
+ // will store the value associated with keys[i].
+ //
+ // (*values) will always be resized to be the same size as (keys).
+ // Similarly, the number of returned statuses will be the number of keys.
+ // Note: keys will not be "de-duplicated". Duplicate keys will return
+ // duplicate values in order.
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values);
+
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family);
+
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value);
+
+ // Flush all mem-table data.
+ virtual Status Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family);
+
+ // Look up a column family by id; NULL when the id is out of range.
+ ColumnFamilyHandleImpl *GetCF(uint32_t id) {
+ return (id < columns_.size()) ? static_cast<ColumnFamilyHandleImpl *>(columns_[id]) : NULL;
+ }
+ // Replace the table of column family handles with a copy of "cols".
+ void SetColumns(std::vector<ColumnFamilyHandle *> &cols) {
+ columns_ = cols;
+ }
+#endif
+
+ virtual Iterator* NewIterator(const ReadOptions& options);
+
+ virtual const Snapshot* GetSnapshot();
+
+ virtual void ReleaseSnapshot(const Snapshot* snapshot);
+
+ virtual bool GetProperty(const Slice& property, std::string* value);
+
+ virtual void GetApproximateSizes(const Range* range, int n,
+ uint64_t* sizes);
+
+ virtual void CompactRange(const Slice* begin, const Slice* end);
+
+ virtual void SuspendCompactions();
+
+ virtual void ResumeCompactions();
+
+ // Return the calling thread's context, creating and registering a new one
+ // the first time a thread asks for it.
+ OperationContext *GetContext() {
+ OperationContext *ctx = context_->Get();
+ if (ctx == NULL) {
+ ctx = NewContext();
+ context_->Set(ctx);
+ }
+ return (ctx);
+ }
+
+private:
+ WT_CONNECTION *conn_;
+ // Holder for the per-thread OperationContext pointers.
+ ThreadLocal<OperationContext> *context_;
+#ifdef HAVE_ROCKSDB
+ std::vector<ColumnFamilyHandle*> columns_;
+#endif
+
+ // Allocate a context (session plus cursor) on this database's connection.
+ OperationContext *NewContext() {
+ return new OperationContext(conn_);
+ }
+
+ // Context selection for reads: the snapshot's private context when the
+ // read options carry a snapshot, the thread-local context otherwise.
+ OperationContext *GetContext(const ReadOptions &options) {
+ if (options.snapshot == NULL)
+ return GetContext();
+ else {
+ const SnapshotImpl *si =
+ static_cast<const SnapshotImpl *>(options.snapshot);
+ assert(si->GetStatus().ok());
+ return si->GetContext();
+ }
+ }
+
+ // No copying allowed
+ DbImpl(const DbImpl&);
+ void operator=(const DbImpl&);
+};
+
+// Implementation of WriteBatch::Handler: replays the entries of a batch
+// through the cursor of the given operation context and remembers the first
+// WiredTiger error in status_ (0 means none so far).
+class WriteBatchHandler : public WriteBatch::Handler {
+public:
+  WriteBatchHandler(DbImpl *db, OperationContext *context)
+    : db_(db), context_(context), status_(0) {}
+
+  virtual ~WriteBatchHandler() {}
+
+  // WiredTiger error code recorded while replaying, 0 if there was none.
+  int GetWiredTigerStatus() {
+    return status_;
+  }
+
+  virtual void Put(const Slice& key, const Slice& value);
+
+  virtual void Delete(const Slice& key);
+
+#ifdef HAVE_ROCKSDB
+  // Implementations are in rocksdb_wt.cc
+  virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                       const Slice& value);
+  virtual Status DeleteCF(uint32_t column_family_id, const Slice& key);
+#endif
+
+private:
+  DbImpl *db_;
+  OperationContext *context_;
+  int status_;
+};
+
+#endif
diff --git a/src/third_party/wiredtiger/api/leveldb/rocks_wt.cc b/src/third_party/wiredtiger/api/leveldb/rocks_wt.cc
new file mode 100644
index 00000000000..6ccab2c1e78
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/rocks_wt.cc
@@ -0,0 +1,315 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "leveldb_wt.h"
+#include <errno.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sstream>
+
+using leveldb::Cache;
+using leveldb::DB;
+using leveldb::FlushOptions;
+using leveldb::FilterPolicy;
+using leveldb::Iterator;
+using leveldb::Options;
+using leveldb::ReadOptions;
+using leveldb::WriteBatch;
+using leveldb::WriteOptions;
+using leveldb::Range;
+using leveldb::Slice;
+using leveldb::Snapshot;
+using leveldb::Status;
+
+/*
+ * wtrocks_get_cursor --
+ *     Get a WiredTiger cursor on a column family's table.
+ *
+ * A cursor cached in the operation context under the column family's ID is
+ * reused when there is one; otherwise a new cursor is opened on the column
+ * family's URI.  When "acquire" is zero the cursor stays in (or is added to)
+ * the context's cache.  When "acquire" is non-zero the caller takes the
+ * cursor and the cache slot is left empty.
+ *
+ * Returns 0 and sets *cursorp on success, EINVAL when no column family
+ * handle was supplied, or the error from WT_SESSION::open_cursor.
+ */
+static int
+wtrocks_get_cursor(OperationContext *context, ColumnFamilyHandle *cfhp, WT_CURSOR **cursorp, int acquire=0)
+{
+    ColumnFamilyHandleImpl *cf =
+        static_cast<ColumnFamilyHandleImpl *>(cfhp);
+    if (cf == NULL) {
+        fprintf(stderr, "Missing column!\n");
+        assert(0);
+        /*
+         * assert() compiles away under NDEBUG; report the error rather
+         * than dereferencing the NULL handle below.
+         */
+        return (EINVAL);
+    }
+    WT_CURSOR *c = context->GetCursor(cf->GetID());
+    if (c == NULL) {
+        WT_SESSION *session = context->GetSession();
+        int ret;
+        if ((ret = session->open_cursor(
+            session, cf->GetURI().c_str(), NULL, NULL, &c)) != 0) {
+            fprintf(stderr, "Failed to open cursor on %s: %s\n", cf->GetURI().c_str(), wiredtiger_strerror(ret));
+            return (ret);
+        }
+        /* Cache the new cursor unless the caller is taking it. */
+        if (!acquire)
+            context->SetCursor(cf->GetID(), c);
+    } else if (acquire)
+        /* The caller takes the cached cursor: empty the cache slot. */
+        context->SetCursor(cf->GetID(), NULL);
+    *cursorp = c;
+    return (0);
+}
+
+/*
+ * DB::ListColumnFamilies --
+ *     Open the database, scan the WiredTiger metadata for "table:" entries
+ * and return their names (without the prefix) in *column_families.  The
+ * database opened here is deleted before returning.
+ */
+Status
+DB::ListColumnFamilies(
+    Options const &options, std::string const &name,
+    std::vector<std::string> *column_families)
+{
+    const size_t prefix_len = strlen("table:");
+    std::vector<std::string> names;
+
+    DB *opened;
+    Status status = DB::Open(options, name, &opened);
+    if (!status.ok())
+        return status;
+
+    DbImpl *db = static_cast<DbImpl *>(opened);
+    WT_SESSION *session = db->GetContext()->GetSession();
+    WT_CURSOR *cursor;
+    int ret = session->open_cursor(session, "metadata:", NULL, NULL, &cursor);
+    if (ret == 0) {
+        /* Position on the first table entry */
+        int exact;
+        cursor->set_key(cursor, "table:");
+        ret = cursor->search_near(cursor, &exact);
+        if (ret == 0 && exact < 0)
+            ret = cursor->next(cursor);
+
+        /* Add entries while we are getting "table" URIs. */
+        while (ret == 0) {
+            const char *uri;
+            if ((ret = cursor->get_key(cursor, &uri)) != 0)
+                break;
+            if (strncmp(uri, "table:", prefix_len) != 0)
+                break;
+            printf("List column families: [%d] = %s\n", (int)names.size(), uri);
+            names.push_back(std::string(uri + prefix_len));
+            ret = cursor->next(cursor);
+        }
+    }
+
+    delete db;
+
+    /*
+     * WT_NOTFOUND is not an error: it just means we got to the end of the
+     * list of tables.
+     */
+    if (ret == 0 || ret == WT_NOTFOUND) {
+        *column_families = names;
+        ret = 0;
+    }
+    return WiredTigerErrorToStatus(ret);
+}
+
+/*
+ * DB::Open --
+ *     Open the database, then create one handle per requested column
+ * family.  Handle IDs are the positions in column_families.  The handles are
+ * returned through *handles and also registered with the database.
+ */
+Status
+DB::Open(Options const &options, std::string const &name, const std::vector<ColumnFamilyDescriptor> &column_families, std::vector<ColumnFamilyHandle*> *handles, DB**dbptr)
+{
+    Status opened = Open(options, name, dbptr);
+    if (!opened.ok())
+        return opened;
+
+    DbImpl *impl = static_cast<DbImpl *>(*dbptr);
+    const size_t count = column_families.size();
+    std::vector<ColumnFamilyHandle*> created;
+    created.reserve(count);
+    for (size_t idx = 0; idx < count; ++idx) {
+        const std::string &cfname = column_families[idx].name;
+        printf("Open column families: [%d] = %s\n", (int)idx, cfname.c_str());
+        created.push_back(new ColumnFamilyHandleImpl(impl, cfname, (int)idx));
+    }
+    *handles = created;
+    impl->SetColumns(*handles);
+    return Status::OK();
+}
+
+// Default Merge handler: deliberately does nothing with the key and value.
+void
+WriteBatch::Handler::Merge(const Slice& key, const Slice& value)
+{
+}
+
+// Default LogData handler: the blob is ignored.
+void
+WriteBatch::Handler::LogData(const Slice& blob)
+{
+}
+
+/*
+ * WriteBatchHandler::PutCF --
+ *     Insert one key/value pair from a batch into the table of the column
+ * family with the given ID, using a cursor from the handler's context.
+ */
+Status
+WriteBatchHandler::PutCF(
+    uint32_t column_family_id, const Slice& key, const Slice& value)
+{
+    WT_CURSOR *c;
+    int ret;
+    if ((ret = wtrocks_get_cursor(
+        context_, db_->GetCF(column_family_id), &c)) != 0)
+        return WiredTigerErrorToStatus(ret);
+
+    WT_ITEM k;
+    k.data = key.data();
+    k.size = key.size();
+    c->set_key(c, &k);
+
+    WT_ITEM v;
+    v.data = value.data();
+    v.size = value.size();
+    c->set_value(c, &v);
+
+    return WiredTigerErrorToStatus(c->insert(c));
+}
+
+/*
+ * WriteBatchHandler::DeleteCF --
+ *     Remove one key from the table of the column family with the given ID.
+ * Removing a key that does not exist is treated as success.
+ */
+Status
+WriteBatchHandler::DeleteCF(uint32_t column_family_id, const Slice& key)
+{
+    WT_CURSOR *c;
+    int ret;
+    if ((ret = wtrocks_get_cursor(
+        context_, db_->GetCF(column_family_id), &c)) != 0)
+        return WiredTigerErrorToStatus(ret);
+
+    WT_ITEM k;
+    k.data = key.data();
+    k.size = key.size();
+    c->set_key(c, &k);
+
+    ret = c->remove(c);
+    if (ret == WT_NOTFOUND)
+        ret = 0;
+    else if (ret == 0) {
+        /* Only a successful remove is followed by a cursor reset. */
+        int t_ret = c->reset(c);
+        assert(t_ret == 0);
+        (void)t_ret;
+    }
+    return WiredTigerErrorToStatus(ret);
+}
+
+// Merge is not implemented: every call reports ENOTSUP converted to a Status.
+Status
+DbImpl::Merge(WriteOptions const&, ColumnFamilyHandle*, Slice const&, Slice const&)
+{
+ return WiredTigerErrorToStatus(ENOTSUP);
+}
+
+/*
+ * DbImpl::CreateColumnFamily --
+ *     Create the table "table:<name>" and return a new handle for it through
+ * *cfhp.  The handle's ID is the number of column families already registered
+ * and the handle is appended to the registered list.
+ */
+Status
+DbImpl::CreateColumnFamily(Options const &options, std::string const &name, ColumnFamilyHandle **cfhp)
+{
+    extern int wtleveldb_create(WT_CONNECTION *,
+        const Options &, std::string const &uri);
+
+    const std::string uri = "table:" + name;
+    const int ret = wtleveldb_create(conn_, options, uri);
+    if (ret != 0)
+        return WiredTigerErrorToStatus(ret);
+
+    const int id = (int)columns_.size();
+    ColumnFamilyHandle *created = new ColumnFamilyHandleImpl(this, name, id);
+    *cfhp = created;
+    printf("Create column family: [%d] = %s\n", id, name.c_str());
+    columns_.push_back(created);
+    return Status::OK();
+}
+
+/*
+ * DbImpl::DropColumnFamily --
+ *     Drop the WiredTiger object named by the column family's URI.
+ * NOTE(review): the handle is not removed from columns_ and cached cursors
+ * are not closed here -- confirm whether callers take care of that.
+ */
+Status
+DbImpl::DropColumnFamily(ColumnFamilyHandle *cfhp)
+{
+    WT_SESSION *session = GetContext()->GetSession();
+    ColumnFamilyHandleImpl *cf =
+        static_cast<ColumnFamilyHandleImpl *>(cfhp);
+    return WiredTigerErrorToStatus(
+        session->drop(session, cf->GetURI().c_str(), NULL));
+}
+
+/*
+ * DbImpl::Delete --
+ *     Remove "key" from the given column family.
+ *
+ * Deleting a key that does not exist is not an error: WT_NOTFOUND from the
+ * remove is mapped to success, matching WriteBatchHandler::DeleteCF.
+ * Returns the Status for the cursor lookup or remove error otherwise.
+ */
+Status
+DbImpl::Delete(WriteOptions const &write_options, ColumnFamilyHandle *cfhp, Slice const &key)
+{
+    WT_CURSOR *cursor;
+    int ret = wtrocks_get_cursor(GetContext(), cfhp, &cursor);
+    if (ret != 0)
+        return WiredTigerErrorToStatus(ret);
+    WT_ITEM item;
+    item.data = key.data();
+    item.size = key.size();
+    cursor->set_key(cursor, &item);
+    ret = cursor->remove(cursor);
+    if (ret == WT_NOTFOUND)
+        ret = 0;
+    // Reset the WiredTiger cursor so it doesn't keep any pages pinned.
+    // Track failures in debug builds since we don't expect failure, but
+    // don't pass failures on - it's not necessary for correct operation.
+    int t_ret = cursor->reset(cursor);
+    assert(t_ret == 0);
+    (void)t_ret;    // Only read by the assert; silence NDEBUG builds.
+    return WiredTigerErrorToStatus(ret);
+}
+
+/*
+ * DbImpl::Flush --
+ *     Checkpoint only the column family's table, by passing its URI as the
+ * checkpoint "target".  The flush options are ignored.
+ */
+Status
+DbImpl::Flush(FlushOptions const&, ColumnFamilyHandle* cfhp)
+{
+    WT_SESSION *session = GetContext()->GetSession();
+    ColumnFamilyHandleImpl *cf =
+        static_cast<ColumnFamilyHandleImpl *>(cfhp);
+    const std::string config = "target=(\"" + cf->GetURI() + "\")";
+    const int ret = session->checkpoint(session, config.c_str());
+    return WiredTigerErrorToStatus(ret);
+}
+
+/*
+ * DbImpl::Get --
+ *     Look up "key" in the given column family and copy its value into
+ * *value.  The context comes from the read options (snapshot or current).
+ * A missing key is reported with the message "DB::Get key not found".
+ */
+Status
+DbImpl::Get(ReadOptions const &options, ColumnFamilyHandle *cfhp, Slice const &key, std::string *value)
+{
+    OperationContext *context = GetContext(options);
+
+    WT_CURSOR *cursor;
+    int ret = wtrocks_get_cursor(context, cfhp, &cursor);
+    if (ret != 0)
+        return WiredTigerErrorToStatus(ret);
+
+    WT_ITEM item;
+    item.data = key.data();
+    item.size = key.size();
+    cursor->set_key(cursor, &item);
+
+    ret = cursor->search(cursor);
+    if (ret == 0)
+        ret = cursor->get_value(cursor, &item);
+    if (ret == 0) {
+        /* Copy the value out before the reset releases the position. */
+        value->assign((const char *)item.data, item.size);
+        ret = cursor->reset(cursor);
+    }
+
+    const char *errmsg =
+        (ret == WT_NOTFOUND) ? "DB::Get key not found" : NULL;
+    return WiredTigerErrorToStatus(ret, errmsg);
+}
+
+// No per-column-family properties are provided: always returns false and
+// leaves the output string untouched.
+bool
+DbImpl::GetProperty(ColumnFamilyHandle*, Slice const&, std::string*)
+{
+ return false;
+}
+
+/*
+ * DbImpl::MultiGet --
+ *     Not supported.  Callers index the returned statuses (and *values) by
+ * key position, so one ENOTSUP status is returned per requested key and
+ * *values, when supplied, is resized to the number of keys.
+ */
+std::vector<Status>
+DbImpl::MultiGet(ReadOptions const&, std::vector<ColumnFamilyHandle*> const&, std::vector<Slice> const &keys, std::vector<std::string, std::allocator<std::string> > *values)
+{
+    if (values != NULL)
+        values->resize(keys.size());
+    return std::vector<Status>(
+        keys.size(), WiredTigerErrorToStatus(ENOTSUP));
+}
+
+/*
+ * DbImpl::NewIterator --
+ *     Create an iterator over a column family.  The cursor is requested in
+ * "acquire" mode, so it is handed to the iterator rather than left in the
+ * context's cursor cache.
+ * NOTE(review): a cursor failure is only caught by the assert; in NDEBUG
+ * builds the iterator would be built on an unset cursor.
+ */
+Iterator *
+DbImpl::NewIterator(ReadOptions const &options, ColumnFamilyHandle *cfhp)
+{
+    WT_CURSOR *cursor;
+    OperationContext *context = GetContext(options);
+    int ret = wtrocks_get_cursor(context, cfhp, &cursor, 1);
+    assert(ret == 0);
+    (void)ret;
+
+    ColumnFamilyHandleImpl *cf =
+        static_cast<ColumnFamilyHandleImpl *>(cfhp);
+    return new IteratorImpl(this, cursor, cf->GetID());
+}
+
+/*
+ * DbImpl::Put --
+ *     Insert a key/value pair into the given column family using a cursor
+ * from the current operation context.  The write options are not consulted.
+ */
+Status
+DbImpl::Put(WriteOptions const &options, ColumnFamilyHandle *cfhp, Slice const &key, Slice const &value)
+{
+    WT_CURSOR *c;
+    int ret;
+    if ((ret = wtrocks_get_cursor(GetContext(), cfhp, &c)) != 0)
+        return WiredTigerErrorToStatus(ret);
+
+    WT_ITEM k;
+    k.data = key.data();
+    k.size = key.size();
+    c->set_key(c, &k);
+
+    WT_ITEM v;
+    v.data = value.data();
+    v.size = value.size();
+    c->set_value(c, &v);
+
+    ret = c->insert(c);
+    return WiredTigerErrorToStatus(ret, NULL);
+}
diff --git a/src/third_party/wiredtiger/api/leveldb/rocksdb/LICENSE b/src/third_party/wiredtiger/api/leveldb/rocksdb/LICENSE
new file mode 100644
index 00000000000..b1329018690
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/rocksdb/LICENSE
@@ -0,0 +1,35 @@
+BSD License
+
+For rocksdb software
+
+Copyright (c) 2014, Facebook, Inc.
+All rights reserved.
+---------------------------------------------------------------------
+
+Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/third_party/wiredtiger/api/leveldb/rocksdb/PATENTS b/src/third_party/wiredtiger/api/leveldb/rocksdb/PATENTS
new file mode 100644
index 00000000000..6bafb4a342f
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/rocksdb/PATENTS
@@ -0,0 +1,23 @@
+Additional Grant of Patent Rights
+
+"Software" means the rocksdb software distributed by Facebook, Inc.
+
+Facebook hereby grants you a perpetual, worldwide, royalty-free,
+non-exclusive, irrevocable (subject to the termination provision below)
+license under any rights in any patent claims owned by Facebook, to make,
+have made, use, sell, offer to sell, import, and otherwise transfer the
+Software. For avoidance of doubt, no license is granted under Facebook's
+rights in any patent claims that are infringed by (i) modifications to the
+Software made by you or a third party, or (ii) the Software in combination
+with any software or other technology provided by you or a third party.
+
+The license granted hereunder will terminate, automatically and without
+notice, for anyone that makes any claim (including by filing any lawsuit,
+assertion or other action) alleging (a) direct, indirect, or contributory
+infringement or inducement to infringe any patent: (i) by Facebook or any
+of its subsidiaries or affiliates, whether or not such claim is related
+to the Software, (ii) by any party if such claim arises in whole or in
+part from any software, product or service of Facebook or any of its
+subsidiaries or affiliates, whether or not such claim is related to the
+Software, or (iii) by any party relating to the Software; or (b) that
+any right in any patent claim of Facebook is invalid or unenforceable.
diff --git a/src/third_party/wiredtiger/api/leveldb/rocksdb/write_batch.cc b/src/third_party/wiredtiger/api/leveldb/rocksdb/write_batch.cc
new file mode 100644
index 00000000000..aa7a3d239f9
--- /dev/null
+++ b/src/third_party/wiredtiger/api/leveldb/rocksdb/write_batch.cc
@@ -0,0 +1,275 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+// sequence: fixed64
+// count: fixed32
+// data: record[count]
+// record :=
+// kTypeValue varstring varstring
+// kTypeMerge varstring varstring
+// kTypeDeletion varstring
+// kTypeColumnFamilyValue varint32 varstring varstring
+// kTypeColumnFamilyMerge varint32 varstring varstring
+// kTypeColumnFamilyDeletion varint32 varstring
+// kTypeLogData varstring
+// varstring :=
+// len: varint32
+// data: uint8[len]
+
+#include "leveldb_wt.h"
+
+#include "db/write_batch_internal.h"
+
+#include <stdexcept>
+
+namespace rocksdb {
+
+// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+static const size_t kHeader = 12;
+
+// Reserve room for at least the header (or the requested number of bytes,
+// if larger) and start with an empty batch.
+WriteBatch::WriteBatch(size_t reserved_bytes) {
+  size_t capacity = kHeader;
+  if (reserved_bytes > kHeader) {
+    capacity = reserved_bytes;
+  }
+  rep_.reserve(capacity);
+  Clear();
+}
+
+// Nothing to release explicitly.
+WriteBatch::~WriteBatch() { }
+
+// Out-of-line destructor for the handler interface; no work to do.
+WriteBatch::Handler::~Handler() { }
+
+// Default Put: always throws std::runtime_error.
+void WriteBatch::Handler::Put(const Slice& key, const Slice& value) {
+ // you need to either implement Put or PutCF
+ throw std::runtime_error("Handler::Put not implemented!");
+}
+
+// Compiled only when NOT_YET is defined. rocks_wt.cc supplies an empty
+// Handler::Merge of its own.
+#ifdef NOT_YET
+void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) {
+ throw std::runtime_error("Handler::Merge not implemented!");
+}
+#endif
+
+// Default Delete: always throws std::runtime_error.
+void WriteBatch::Handler::Delete(const Slice& key) {
+ // you need to either implement Delete or DeleteCF
+ throw std::runtime_error("Handler::Delete not implemented!");
+}
+
+// Compiled only when NOT_YET is defined. rocks_wt.cc supplies an empty
+// Handler::LogData of its own.
+#ifdef NOT_YET
+void WriteBatch::Handler::LogData(const Slice& blob) {
+ // If the user has not specified something to do with blobs, then we ignore
+ // them.
+}
+#endif
+
+// Polled by WriteBatch::Iterate before each record; the default never asks
+// the iteration to stop.
+bool WriteBatch::Handler::Continue() {
+ return true;
+}
+
+// Drop every record and leave only a zero-filled header (sequence and count
+// both zero).
+void WriteBatch::Clear() {
+  rep_.assign(kHeader, '\0');
+}
+
+// Number of records in the batch, as stored in the header's count field.
+int WriteBatch::Count() const {
+ return WriteBatchInternal::Count(this);
+}
+
+// Replay the batch: decode each record after the kHeader-byte header and
+// pass it to the matching handler callback (PutCF, DeleteCF, MergeCF or
+// LogData).
+//
+// The loop ends when the input is exhausted, a callback returns a non-OK
+// status, or handler->Continue() returns false. Returns:
+//  - Status::Corruption when the header is short, a record cannot be
+//    decoded, or the tag is unknown;
+//  - the first non-OK status returned by a callback;
+//  - Status::Corruption("WriteBatch has wrong count") when the number of
+//    put/delete/merge records seen differs from the header count;
+//  - Status::OK() otherwise.
+// NOTE: stopping early through Continue() leaves "found" below the header
+// count, so the wrong-count status is what gets returned in that case.
+Status WriteBatch::Iterate(Handler* handler) const {
+ Slice input(rep_);
+ if (input.size() < kHeader) {
+ return Status::Corruption("malformed WriteBatch (too small)");
+ }
+
+ // Skip the sequence number and count; only records follow.
+ input.remove_prefix(kHeader);
+ Slice key, value, blob;
+ // Counts put/delete/merge records; log-data records are not counted.
+ int found = 0;
+ Status s;
+ while (s.ok() && !input.empty() && handler->Continue()) {
+ char tag = input[0];
+ input.remove_prefix(1);
+ // Records without a column-family prefix keep this default of 0.
+ uint32_t column_family = 0; // default
+ switch (tag) {
+ case kTypeColumnFamilyValue:
+ if (!GetVarint32(&input, &column_family)) {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ // intentional fallthrough
+ case kTypeValue:
+ if (GetLengthPrefixedSlice(&input, &key) &&
+ GetLengthPrefixedSlice(&input, &value)) {
+ s = handler->PutCF(column_family, key, value);
+ found++;
+ } else {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ if (!GetVarint32(&input, &column_family)) {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ // intentional fallthrough
+ case kTypeDeletion:
+ if (GetLengthPrefixedSlice(&input, &key)) {
+ s = handler->DeleteCF(column_family, key);
+ found++;
+ } else {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ if (!GetVarint32(&input, &column_family)) {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ // intentional fallthrough
+ case kTypeMerge:
+ if (GetLengthPrefixedSlice(&input, &key) &&
+ GetLengthPrefixedSlice(&input, &value)) {
+ s = handler->MergeCF(column_family, key, value);
+ found++;
+ } else {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ break;
+ case kTypeLogData:
+ if (GetLengthPrefixedSlice(&input, &blob)) {
+ handler->LogData(blob);
+ } else {
+ return Status::Corruption("bad WriteBatch Blob");
+ }
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ }
+ // A callback failure takes precedence over the count check.
+ if (!s.ok()) {
+ return s;
+ }
+ if (found != WriteBatchInternal::Count(this)) {
+ return Status::Corruption("WriteBatch has wrong count");
+ } else {
+ return Status::OK();
+ }
+}
+
+// Decode the fixed32 record count stored at byte offset 8 of the header,
+// right after the 8-byte sequence number.
+int WriteBatchInternal::Count(const WriteBatch* b) {
+ return DecodeFixed32(b->rep_.data() + 8);
+}
+
+// Overwrite the fixed32 record count at byte offset 8 of the header.
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+ EncodeFixed32(&b->rep_[8], n);
+}
+
+// Compiled only when NOT_YET is defined: accessors for the fixed64 sequence
+// number held in the first 8 bytes of the header.
+#ifdef NOT_YET
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+ return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+ EncodeFixed64(&b->rep_[0], seq);
+}
+#endif
+
+// Append a put record and bump the header count. Column family 0 uses the
+// short kTypeValue tag; any other ID is written as kTypeColumnFamilyValue
+// followed by the varint32 ID. Key and value follow as length-prefixed
+// slices.
+void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+                             const Slice& key, const Slice& value) {
+  const int updated_count = WriteBatchInternal::Count(b) + 1;
+  WriteBatchInternal::SetCount(b, updated_count);
+
+  const bool is_default = (column_family_id == 0);
+  if (is_default) {
+    b->rep_.push_back(static_cast<char>(kTypeValue));
+  }
+  if (!is_default) {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_, value);
+}
+
+namespace {
+// Map a column family handle to the numeric ID written into batch records.
+// A NULL handle maps to ID 0, which the record writers treat as the default
+// column family.
+inline uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+  if (column_family == NULL) {
+    return 0;
+  }
+  // static_cast, as the same downcast is written in rocks_wt.cc: it applies
+  // any base-to-derived pointer adjustment, which reinterpret_cast would not.
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(column_family);
+  return cfh->GetID();
+}
+}  // namespace
+
+// Queue a put of key/value in the given column family (NULL selects ID 0).
+void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) {
+  const uint32_t cf_id = GetColumnFamilyID(column_family);
+  WriteBatchInternal::Put(this, cf_id, key, value);
+}
+
+// SliceParts flavour of Put: same record layout as the Slice overload, with
+// key and value written from their parts as length-prefixed data.
+void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+                             const SliceParts& key, const SliceParts& value) {
+  const int updated_count = WriteBatchInternal::Count(b) + 1;
+  WriteBatchInternal::SetCount(b, updated_count);
+
+  const bool is_default = (column_family_id == 0);
+  if (is_default) {
+    b->rep_.push_back(static_cast<char>(kTypeValue));
+  }
+  if (!is_default) {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+  PutLengthPrefixedSliceParts(&b->rep_, value);
+}
+
+// Queue a put whose key and value are given as SliceParts.
+void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+                     const SliceParts& value) {
+  const uint32_t cf_id = GetColumnFamilyID(column_family);
+  WriteBatchInternal::Put(this, cf_id, key, value);
+}
+
+// Append a deletion record and bump the header count. Column family 0 uses
+// kTypeDeletion; any other ID is written as kTypeColumnFamilyDeletion
+// followed by the varint32 ID. Only the key follows the tag.
+void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+                                const Slice& key) {
+  const int updated_count = WriteBatchInternal::Count(b) + 1;
+  WriteBatchInternal::SetCount(b, updated_count);
+
+  const bool is_default = (column_family_id == 0);
+  if (is_default) {
+    b->rep_.push_back(static_cast<char>(kTypeDeletion));
+  }
+  if (!is_default) {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+
+  PutLengthPrefixedSlice(&b->rep_, key);
+}
+
+// Queue a deletion of key in the given column family (NULL selects ID 0).
+void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
+  const uint32_t cf_id = GetColumnFamilyID(column_family);
+  WriteBatchInternal::Delete(this, cf_id, key);
+}
+
+// Compiled only when NOT_YET is defined: the writers for merge and log-data
+// records. Iterate() still decodes both record types.
+#ifdef NOT_YET
+// Append a merge record: same layout as a put, tagged kTypeMerge or
+// kTypeColumnFamilyMerge.
+void WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeMerge));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+}
+
+void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key, value);
+}
+
+// Append a log-data record; the header count is not changed.
+void WriteBatch::PutLogData(const Slice& blob) {
+ rep_.push_back(static_cast<char>(kTypeLogData));
+ PutLengthPrefixedSlice(&rep_, blob);
+}
+#endif
+
+// Replace the whole serialized batch, header included, with a copy of
+// "contents". The size is only checked by the assert.
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+ assert(contents.size() >= kHeader);
+ b->rep_.assign(contents.data(), contents.size());
+}
+
+// Append every record of src to dst: add the two header counts, then copy
+// src's bytes that follow its header.
+void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
+  const int combined = Count(dst) + Count(src);
+  SetCount(dst, combined);
+
+  assert(src->rep_.size() >= kHeader);
+  const char* records = src->rep_.data() + kHeader;
+  const size_t records_len = src->rep_.size() - kHeader;
+  dst->rep_.append(records, records_len);
+}
+
+} // namespace rocksdb
diff --git a/src/third_party/wiredtiger/autogen.sh b/src/third_party/wiredtiger/autogen.sh
new file mode 100755
index 00000000000..0fb46feb9da
--- /dev/null
+++ b/src/third_party/wiredtiger/autogen.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+# Helper script with a familiar name to run auto* on a development tree.
+sh "$(dirname "$0")/build_posix/reconf"
diff --git a/src/third_party/wiredtiger/bench/wtperf/Makefile.am b/src/third_party/wiredtiger/bench/wtperf/Makefile.am
new file mode 100644
index 00000000000..0630a27f640
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/Makefile.am
@@ -0,0 +1,16 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+LDADD = $(top_builddir)/libwiredtiger.la -lm
+
+noinst_PROGRAMS = wtperf
+wtperf_LDFLAGS = -static
+wtperf_SOURCES =\
+ config.c misc.c track.c wtperf.c wtperf.h wtperf_opt.i
+
+TESTS = smoke.sh
+AM_TESTS_ENVIRONMENT = rm -rf WT_TEST ; mkdir WT_TEST ;
+# automake 1.11 compatibility
+TESTS_ENVIRONMENT = $(AM_TESTS_ENVIRONMENT)
+
+clean-local:
+ rm -rf WT_TEST *.core
diff --git a/src/third_party/wiredtiger/bench/wtperf/README b/src/third_party/wiredtiger/bench/wtperf/README
new file mode 100644
index 00000000000..db15c892e15
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/README
@@ -0,0 +1,3 @@
+This is a benchmark program for testing the performance of WiredTiger.
+
+Scripts for running the benchmark can be found in the runners directory.
diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c
new file mode 100644
index 00000000000..31b20621eea
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/config.c
@@ -0,0 +1,736 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wtperf.h"
+
+/* All options changeable on command line using -o or -O are listed here. */
+static CONFIG_OPT config_opts[] = {
+#define OPT_DEFINE_DESC
+#include "wtperf_opt.i"
+#undef OPT_DEFINE_DESC
+};
+
+static int config_opt(CONFIG *, WT_CONFIG_ITEM *, WT_CONFIG_ITEM *);
+static void config_opt_usage(void);
+
+/*
+ * STRING_MATCH --
+ * Evaluate to true if the NUL-terminated string "str" is exactly the "len"
+ * bytes at "bytes": the first "len" bytes compare equal and "str" ends at
+ * offset "len". "bytes" need not be NUL-terminated.
+ */
+#undef STRING_MATCH
+#define STRING_MATCH(str, bytes, len) \
+ (strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0')
+
+
+/*
+ * config_assign --
+ *     Assign the src config to the dest, any storage allocated in dest is
+ * freed as a result.
+ *
+ * The structure is copied bytewise, then every heap pointer in dest is
+ * detached from src before anything is allocated. An allocation failure
+ * therefore leaves dest holding only its own copies (or NULL), and a later
+ * config_free(dest) cannot free memory that still belongs to src. The uris
+ * array, base_uri, the workload array and all string options are deep
+ * copied; the thread arrays are not copied and are left NULL.
+ *
+ * Returns 0 on success or the result of enomem() on allocation failure.
+ */
+int
+config_assign(CONFIG *dest, const CONFIG *src)
+{
+    size_t i, nopts;
+    char *newstr, **pstr;
+    const char *srcstr;
+
+    nopts = sizeof(config_opts) / sizeof(config_opts[0]);
+
+    config_free(dest);
+    memcpy(dest, src, sizeof(CONFIG));
+
+    /* The memcpy aliased src's allocations: detach them all first. */
+    dest->uris = NULL;
+    dest->ckptthreads = NULL;
+    dest->popthreads = NULL;
+    dest->workers = NULL;
+    dest->base_uri = NULL;
+    dest->workload = NULL;
+    for (i = 0; i < nopts; i++)
+        if (config_opts[i].type == STRING_TYPE ||
+            config_opts[i].type == CONFIG_STRING_TYPE) {
+            pstr = (char **)
+                ((unsigned char *)dest + config_opts[i].offset);
+            *pstr = NULL;
+        }
+
+    if (src->uris != NULL) {
+        dest->uris = calloc(src->table_count, sizeof(char *));
+        if (dest->uris == NULL)
+            return (enomem(dest));
+        for (i = 0; i < src->table_count; i++)
+            if ((dest->uris[i] = strdup(src->uris[i])) == NULL)
+                return (enomem(dest));
+    }
+
+    if (src->base_uri != NULL &&
+        (dest->base_uri = strdup(src->base_uri)) == NULL)
+        return (enomem(dest));
+    if (src->workload != NULL) {
+        dest->workload = calloc(WORKLOAD_MAX, sizeof(WORKLOAD));
+        if (dest->workload == NULL)
+            return (enomem(dest));
+        memcpy(dest->workload,
+            src->workload, WORKLOAD_MAX * sizeof(WORKLOAD));
+    }
+
+    /* Give dest a private copy of every string option set in src. */
+    for (i = 0; i < nopts; i++)
+        if (config_opts[i].type == STRING_TYPE ||
+            config_opts[i].type == CONFIG_STRING_TYPE) {
+            srcstr = *(char * const *)
+                ((const unsigned char *)src + config_opts[i].offset);
+            if (srcstr == NULL)
+                continue;
+            if ((newstr = strdup(srcstr)) == NULL)
+                return (enomem(dest));
+            pstr = (char **)
+                ((unsigned char *)dest + config_opts[i].offset);
+            *pstr = newstr;
+        }
+    return (0);
+}
+
+/*
+ * config_free --
+ *     Free any storage allocated in the config struct.
+ *
+ * Every pointer is reset to NULL once freed, as was already done for the
+ * string options, so the structure holds no dangling pointers and calling
+ * this function again on the same structure is harmless.
+ */
+void
+config_free(CONFIG *cfg)
+{
+    size_t i;
+    char **pstr;
+
+    for (i = 0; i < sizeof(config_opts) / sizeof(config_opts[0]); i++)
+        if (config_opts[i].type == STRING_TYPE ||
+            config_opts[i].type == CONFIG_STRING_TYPE) {
+            pstr = (char **)
+                ((unsigned char *)cfg + config_opts[i].offset);
+            if (*pstr != NULL) {
+                free(*pstr);
+                *pstr = NULL;
+            }
+        }
+    if (cfg->uris != NULL) {
+        for (i = 0; i < cfg->table_count; i++)
+            free(cfg->uris[i]);
+        free(cfg->uris);
+        cfg->uris = NULL;
+    }
+
+    free(cfg->ckptthreads);
+    cfg->ckptthreads = NULL;
+    free(cfg->popthreads);
+    cfg->popthreads = NULL;
+    free(cfg->base_uri);
+    cfg->base_uri = NULL;
+    free(cfg->workers);
+    cfg->workers = NULL;
+    free(cfg->workload);
+    cfg->workload = NULL;
+}
+
+/*
+ * config_compress --
+ *     Translate the "compression" option into the extension and table
+ * configuration strings for that compressor. "none" clears both. Any
+ * unrecognized name is reported on stderr and EINVAL is returned.
+ */
+int
+config_compress(CONFIG *cfg)
+{
+    const char *name;
+
+    name = cfg->compression;
+    if (strcmp(name, "none") == 0) {
+        cfg->compress_ext = NULL;
+        cfg->compress_table = NULL;
+        return (0);
+    }
+    if (strcmp(name, "bzip") == 0) {
+        cfg->compress_ext = BZIP_EXT;
+        cfg->compress_table = BZIP_BLK;
+        return (0);
+    }
+    if (strcmp(name, "snappy") == 0) {
+        cfg->compress_ext = SNAPPY_EXT;
+        cfg->compress_table = SNAPPY_BLK;
+        return (0);
+    }
+    if (strcmp(name, "zlib") == 0) {
+        cfg->compress_ext = ZLIB_EXT;
+        cfg->compress_table = ZLIB_BLK;
+        return (0);
+    }
+
+    fprintf(stderr,
+        "invalid compression configuration: %s\n", name);
+    return (EINVAL);
+}
+
+/*
+ * config_threads --
+ *     Parse the thread configuration.
+ *
+ * Allocates cfg->workload (WORKLOAD_MAX slots) and fills one slot per
+ * parenthesized group in the configuration string, adding each group's
+ * thread count to cfg->workers_cnt. A group must have a positive "count"
+ * and at least one non-zero insert, read or update weight.
+ *
+ * Returns 0 on success, the result of enomem() if the workload array cannot
+ * be allocated, or EINVAL for any parse or validation failure. Both config
+ * parsers are closed on every return path.
+ */
+static int
+config_threads(CONFIG *cfg, const char *config, size_t len)
+{
+    WORKLOAD *workp;
+    WT_CONFIG_ITEM groupk, groupv, k, v;
+    WT_CONFIG_PARSER *group, *scan;
+    int ret;
+
+    group = scan = NULL;
+    /* Allocate the workload array. */
+    if ((cfg->workload = calloc(WORKLOAD_MAX, sizeof(WORKLOAD))) == NULL)
+        return (enomem(cfg));
+    cfg->workload_cnt = 0;
+
+    /*
+     * The thread configuration may be in multiple groups, that is, we have
+     * to handle configurations like:
+     *    threads=((count=2,reads=1),(count=8,inserts=2,updates=1))
+     *
+     * Start a scan on the original string, then do scans on each string
+     * returned from the original string.
+     */
+    if ((ret =
+        wiredtiger_config_parser_open(NULL, config, len, &group)) != 0)
+        goto err;
+    while ((ret = group->next(group, &groupk, &groupv)) == 0) {
+        if ((ret = wiredtiger_config_parser_open(
+            NULL, groupk.str, groupk.len, &scan)) != 0)
+            goto err;
+
+        /* Move to the next workload slot. */
+        if (cfg->workload_cnt == WORKLOAD_MAX) {
+            fprintf(stderr,
+                "too many workloads configured, only %d workloads "
+                "supported\n",
+                WORKLOAD_MAX);
+            /*
+             * Close both parsers before returning; this path
+             * does not go through the common error exit.
+             */
+            (void)scan->close(scan);
+            (void)group->close(group);
+            return (EINVAL);
+        }
+        workp = &cfg->workload[cfg->workload_cnt++];
+
+        while ((ret = scan->next(scan, &k, &v)) == 0) {
+            if (STRING_MATCH("count", k.str, k.len)) {
+                if ((workp->threads = v.val) <= 0)
+                    goto err;
+                continue;
+            }
+            if (STRING_MATCH("throttle", k.str, k.len)) {
+                if ((workp->throttle = v.val) < 0)
+                    goto err;
+                continue;
+            }
+            if (STRING_MATCH("insert", k.str, k.len) ||
+                STRING_MATCH("inserts", k.str, k.len)) {
+                if ((workp->insert = v.val) < 0)
+                    goto err;
+                continue;
+            }
+            if (STRING_MATCH("ops_per_txn", k.str, k.len)) {
+                if ((workp->ops_per_txn = v.val) < 0)
+                    goto err;
+                continue;
+            }
+            if (STRING_MATCH("read", k.str, k.len) ||
+                STRING_MATCH("reads", k.str, k.len)) {
+                if ((workp->read = v.val) < 0)
+                    goto err;
+                continue;
+            }
+            if (STRING_MATCH("update", k.str, k.len) ||
+                STRING_MATCH("updates", k.str, k.len)) {
+                if ((workp->update = v.val) < 0)
+                    goto err;
+                continue;
+            }
+            /* Unknown key inside a thread group. */
+            goto err;
+        }
+        /* WT_NOTFOUND only means the group has no more keys. */
+        if (ret == WT_NOTFOUND)
+            ret = 0;
+        if (ret != 0 )
+            goto err;
+        ret = scan->close(scan);
+        scan = NULL;
+        if (ret != 0)
+            goto err;
+
+        /* A group that performs no operations is invalid. */
+        if (workp->insert == 0 &&
+            workp->read == 0 && workp->update == 0)
+            goto err;
+        cfg->workers_cnt += (u_int)workp->threads;
+    }
+
+    ret = group->close(group);
+    group = NULL;
+    if (ret != 0)
+        goto err;
+
+    return (0);
+
+err:    if (group != NULL)
+        (void)group->close(group);
+    if (scan != NULL)
+        (void)scan->close(scan);
+
+    fprintf(stderr,
+        "invalid thread configuration or scan error: %.*s\n",
+        (int)len, config);
+    return (EINVAL);
+}
+
+/*
+ * config_opt --
+ *	Check a single key=value returned by the config parser against our table
+ * of valid keys, along with the expected type. If everything is okay, set the
+ * value.
+ *
+ *	The destination field is found by adding the matching table entry's
+ * offset to the address of the CONFIG structure. Returns 0 on success, EINVAL
+ * for an unknown key, a value of the wrong type or a number out of range, and
+ * the result of enomem() when an allocation fails. A struct-typed "threads"
+ * value is handed to config_threads() and its result is returned.
+ */
+static int
+config_opt(CONFIG *cfg, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v)
+{
+	CONFIG_OPT *popt;
+	char *newstr, **strp;
+	size_t i, newlen, nopt;
+	void *valueloc;
+
+	/* Look the key up by exact length and content: k->str isn't terminated. */
+	popt = NULL;
+	nopt = sizeof(config_opts)/sizeof(config_opts[0]);
+	for (i = 0; i < nopt; i++)
+		if (strlen(config_opts[i].name) == k->len &&
+		    strncmp(config_opts[i].name, k->str, k->len) == 0) {
+			popt = &config_opts[i];
+			break;
+		}
+	if (popt == NULL) {
+		fprintf(stderr, "wtperf: Error: "
+		    "unknown option \'%.*s\'\n", (int)k->len, k->str);
+		fprintf(stderr, "Options:\n");
+		for (i = 0; i < nopt; i++)
+			fprintf(stderr, "\t%s\n", config_opts[i].name);
+		return (EINVAL);
+	}
+	valueloc = ((unsigned char *)cfg + popt->offset);
+	switch (popt->type) {
+	case BOOL_TYPE:
+		if (v->type != WT_CONFIG_ITEM_BOOL) {
+			fprintf(stderr, "wtperf: Error: "
+			    "bad bool value for \'%.*s=%.*s\'\n",
+			    (int)k->len, k->str, (int)v->len, v->str);
+			return (EINVAL);
+		}
+		*(int *)valueloc = (int)v->val;
+		break;
+	case INT_TYPE:
+		if (v->type != WT_CONFIG_ITEM_NUM) {
+			fprintf(stderr, "wtperf: Error: "
+			    "bad int value for \'%.*s=%.*s\'\n",
+			    (int)k->len, k->str, (int)v->len, v->str);
+			return (EINVAL);
+		}
+		/* Reject values on either side of the int range. */
+		if (v->val < INT_MIN || v->val > INT_MAX) {
+			fprintf(stderr, "wtperf: Error: "
+			    "int value out of range for \'%.*s=%.*s\'\n",
+			    (int)k->len, k->str, (int)v->len, v->str);
+			return (EINVAL);
+		}
+		*(int *)valueloc = (int)v->val;
+		break;
+	case UINT32_TYPE:
+		if (v->type != WT_CONFIG_ITEM_NUM) {
+			fprintf(stderr, "wtperf: Error: "
+			    "bad uint32 value for \'%.*s=%.*s\'\n",
+			    (int)k->len, k->str, (int)v->len, v->str);
+			return (EINVAL);
+		}
+		if (v->val < 0 || v->val > UINT_MAX) {
+			fprintf(stderr, "wtperf: Error: "
+			    "uint32 value out of range for \'%.*s=%.*s\'\n",
+			    (int)k->len, k->str, (int)v->len, v->str);
+			return (EINVAL);
+		}
+		*(uint32_t *)valueloc = (uint32_t)v->val;
+		break;
+	case CONFIG_STRING_TYPE:
+		/*
+		 * Configuration strings accumulate: a repeated option is
+		 * appended to the existing value, separated by a comma.
+		 */
+		if (v->type != WT_CONFIG_ITEM_STRING) {
+			fprintf(stderr, "wtperf: Error: "
+			    "bad string value for \'%.*s=%.*s\'\n",
+			    (int)k->len, k->str, (int)v->len, v->str);
+			return (EINVAL);
+		}
+		strp = (char **)valueloc;
+		if (*strp == NULL) {
+			/* calloc zero-fills, so the copy is NUL-terminated. */
+			newlen = v->len + 1;
+			if ((newstr = calloc(newlen, sizeof(char))) == NULL)
+				return (enomem(cfg));
+			strncpy(newstr, v->str, v->len);
+		} else {
+			/* Old value, a comma, the new value and the NUL. */
+			newlen = strlen(*strp) + 1 + v->len + 1;
+			if ((newstr = calloc(newlen, sizeof(char))) == NULL)
+				return (enomem(cfg));
+			/*
+			 * v->str is not NUL-terminated at v->len: bound the
+			 * copy with a precision, not a minimum field width.
+			 */
+			snprintf(newstr, newlen,
+			    "%s,%.*s", *strp, (int)v->len, v->str);
+			/* Free the old value now we've copied it. */
+			free(*strp);
+		}
+		*strp = newstr;
+		break;
+	case STRING_TYPE:
+		/*
+		 * Thread configuration is the one case where the type isn't a
+		 * "string", it's a "struct".
+		 */
+		if (v->type == WT_CONFIG_ITEM_STRUCT &&
+		    STRING_MATCH("threads", k->str, k->len))
+			return (config_threads(cfg, v->str, v->len));
+
+		if (v->type != WT_CONFIG_ITEM_STRING) {
+			fprintf(stderr, "wtperf: Error: "
+			    "bad string value for \'%.*s=%.*s\'\n",
+			    (int)k->len, k->str, (int)v->len, v->str);
+			return (EINVAL);
+		}
+		/*
+		 * Allocate the replacement before releasing the old value so
+		 * an allocation failure doesn't leave a dangling pointer in
+		 * the configuration.
+		 */
+		if ((newstr = malloc(v->len + 1)) == NULL)
+			return (enomem(cfg));
+		strncpy(newstr, v->str, v->len);
+		newstr[v->len] = '\0';
+		strp = (char **)valueloc;
+		free(*strp);
+		*strp = newstr;
+		break;
+	}
+	return (0);
+}
+
+/*
+ * config_opt_file --
+ *	Parse a configuration file. We recognize comments '#' and continuation
+ * via lines ending in '\'.
+ *
+ *	The whole file is read into memory, split into lines, and every
+ * completed option line is passed to config_opt_line(). Returns 0 on success,
+ * the errno from a failed open or fstat, ENOMEM if the buffer can't be
+ * allocated, EINVAL for a short read, an over-long line or a dangling
+ * continuation, or the error returned by config_opt_line().
+ */
+int
+config_opt_file(CONFIG *cfg, const char *filename)
+{
+	struct stat sb;
+	ssize_t read_size;
+	size_t buf_size, linelen, optionpos;
+	int contline, fd, linenum, ret;
+	char option[1024];
+	char *comment, *file_buf, *line, *ltrim, *rtrim;
+
+	file_buf = NULL;
+
+	if ((fd = open(filename, O_RDONLY)) == -1) {
+		/* Capture errno first: fprintf is allowed to change it. */
+		ret = errno;
+		fprintf(stderr, "wtperf: %s: %s\n", filename, strerror(ret));
+		return (ret);
+	}
+	if (fstat(fd, &sb) != 0) {
+		ret = errno;
+		fprintf(stderr, "wtperf: stat of %s: %s\n",
+		    filename, strerror(ret));
+		goto err;
+	}
+	buf_size = (size_t)sb.st_size;
+	file_buf = calloc(buf_size + 2, 1);
+	if (file_buf == NULL) {
+		ret = ENOMEM;
+		goto err;
+	}
+	read_size = read(fd, file_buf, buf_size);
+	if (read_size == -1
+#ifndef _WIN32
+	    /* Windows automatically translates \r\n -> \n so counts will be off */
+	    || (size_t)read_size != buf_size
+#endif
+	    ) {
+		fprintf(stderr,
+		    "wtperf: read unexpected amount from config file\n");
+		ret = EINVAL;
+		goto err;
+	}
+	/* Make sure the buffer is terminated correctly. */
+	file_buf[read_size] = '\0';
+
+	ret = 0;
+	optionpos = 0;
+	linenum = 0;
+	/*
+	 * We should switch this from using strtok to generating a single
+	 * WiredTiger-compatible configuration string, and using the
+	 * WiredTiger configuration parser to parse it at once.
+	 *
+	 * NOTE(review): the delimiter set contains the backslash as well as
+	 * the newline, and strtok never returns delimiter characters, so the
+	 * trailing-newline and trailing-backslash tests below can't match a
+	 * token as written. strtok also skips empty tokens, so linenum
+	 * counts non-empty tokens rather than physical lines. Confirm the
+	 * intended continuation behavior before relying on it.
+	 */
+#define	WTPERF_CONFIG_DELIMS	"\n\\"
+	for (line = strtok(file_buf, WTPERF_CONFIG_DELIMS);
+	    line != NULL;
+	    line = strtok(NULL, WTPERF_CONFIG_DELIMS)) {
+		linenum++;
+		/*
+		 * Trim the line. The ctype functions need a value in the
+		 * unsigned char range, plain char may be negative.
+		 */
+		for (ltrim = line;
+		    *ltrim && isspace((unsigned char)*ltrim); ltrim++)
+			;
+		rtrim = &ltrim[strlen(ltrim)];
+		if (rtrim > ltrim && rtrim[-1] == '\n')
+			rtrim--;
+
+		contline = (rtrim > ltrim && rtrim[-1] == '\\');
+		if (contline)
+			rtrim--;
+
+		/* Anything from the first '#' on is a comment. */
+		comment = strchr(ltrim, '#');
+		if (comment != NULL && comment < rtrim)
+			rtrim = comment;
+		while (rtrim > ltrim && isspace((unsigned char)rtrim[-1]))
+			rtrim--;
+
+		linelen = (size_t)(rtrim - ltrim);
+		if (linelen == 0)
+			continue;
+
+		/* The joined option, plus its NUL, must fit the buffer. */
+		if (linelen + optionpos + 1 > sizeof(option)) {
+			fprintf(stderr, "wtperf: %s: %d: line overflow\n",
+			    filename, linenum);
+			ret = EINVAL;
+			break;
+		}
+		*rtrim = '\0';
+		strncpy(&option[optionpos], ltrim, linelen);
+		option[optionpos + linelen] = '\0';
+		if (contline)
+			optionpos += linelen;
+		else {
+			if ((ret = config_opt_line(cfg, option)) != 0) {
+				fprintf(stderr, "wtperf: %s: %d: parse error\n",
+				    filename, linenum);
+				break;
+			}
+			optionpos = 0;
+		}
+	}
+	/* A pending continuation at end-of-file is an error. */
+	if (ret == 0 && optionpos > 0) {
+		fprintf(stderr, "wtperf: %s: %d: last line continues\n",
+		    filename, linenum);
+		ret = EINVAL;
+		goto err;
+	}
+
+err:	if (fd != -1)
+		(void)close(fd);
+	if (file_buf != NULL)
+		free(file_buf);
+	return (ret);
+}
+
+/*
+ * config_opt_line --
+ *	Parse a single line of config options. Continued lines have already
+ * been joined.
+ *
+ *	Each key/value pair found by the WiredTiger configuration parser is
+ * applied with config_opt(). Returns 0 on success, otherwise the first error
+ * from opening the parser, scanning, config_opt() or closing the parser.
+ */
+int
+config_opt_line(CONFIG *cfg, const char *optstr)
+{
+	WT_CONFIG_ITEM k, v;
+	WT_CONFIG_PARSER *scan;
+	int ret, t_ret;
+
+	if ((ret = wiredtiger_config_parser_open(
+	    NULL, optstr, strlen(optstr), &scan)) != 0) {
+		lprintf(cfg, ret, 0, "Error in config_scan_begin");
+		return (ret);
+	}
+
+	while (ret == 0) {
+		if ((ret = scan->next(scan, &k, &v)) != 0) {
+			/* Any parse error has already been reported. */
+			if (ret == WT_NOTFOUND)
+				ret = 0;
+			break;
+		}
+		ret = config_opt(cfg, &k, &v);
+	}
+	if ((t_ret = scan->close(scan)) != 0) {
+		/*
+		 * Report the close failure itself: passing ret here would
+		 * log a zero (non-error) code whenever the scan succeeded.
+		 */
+		lprintf(cfg, t_ret, 0, "Error in config_scan_end");
+		if (ret == 0)
+			ret = t_ret;
+	}
+
+	return (ret);
+}
+
+/*
+ * config_opt_str --
+ *	Set one string option: build the text name="value" and hand it to
+ * config_opt_line(), returning that function's result (or the result of
+ * enomem() if the temporary buffer can't be allocated).
+ */
+int
+config_opt_str(CONFIG *cfg, const char *name, const char *value)
+{
+	size_t len;
+	int status;
+	char *line;
+
+	/* The name, '=', two quote characters, the value and the NUL. */
+	len = strlen(name) + strlen(value) + 4;
+	line = malloc(len);
+	if (line == NULL)
+		return (enomem(cfg));
+
+	sprintf(line, "%s=\"%s\"", name, value);
+	status = config_opt_line(cfg, line);
+
+	free(line);
+	return (status);
+}
+
+/*
+ * config_sanity --
+ *	Validate the relationships between configured values: the intervals
+ * against the run-time, and the table and database counts against the
+ * 1-99 range. Returns 0 if the configuration is usable, EINVAL otherwise.
+ */
+int
+config_sanity(CONFIG *cfg)
+{
+	/* The interval checks only apply when a run-time is configured. */
+	if (cfg->run_time > 0) {
+		/* The checkpoint interval only matters with checkpoint threads. */
+		if ((cfg->checkpoint_threads != 0 &&
+		    cfg->checkpoint_interval > cfg->run_time) ||
+		    cfg->report_interval > cfg->run_time ||
+		    cfg->sample_interval > cfg->run_time) {
+			fprintf(stderr,
+			    "interval value longer than the run-time\n");
+			return (EINVAL);
+		}
+	}
+
+	if (!(cfg->table_count >= 1 && cfg->table_count <= 99)) {
+		fprintf(stderr,
+		    "invalid table count, less than 1 or greater than 99\n");
+		return (EINVAL);
+	}
+
+	if (!(cfg->database_count >= 1 && cfg->database_count <= 99)) {
+		fprintf(stderr,
+		    "invalid database count, less than 1 or greater than 99\n");
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * config_print --
+ *	Write a human-readable summary of the configuration to standard
+ * output (used in verbose mode).
+ */
+void
+config_print(CONFIG *cfg)
+{
+	WORKLOAD *workp;
+	const char *table_state;
+	u_int i;
+
+	printf("Workload configuration:\n");
+	printf("\tHome: %s\n", cfg->home);
+	printf("\tTable name: %s\n", cfg->table_name);
+	printf("\tConnection configuration: %s\n", cfg->conn_config);
+	/* The session configuration is optional. */
+	if (cfg->sess_config != NULL)
+		printf("\tSession configuration: %s\n", cfg->sess_config);
+
+	if (cfg->create)
+		table_state = "Creating new";
+	else
+		table_state = "Using existing";
+	printf("\t%s table: %s\n", table_state, cfg->table_config);
+	printf("\tKey size: %" PRIu32 ", value size: %" PRIu32 "\n",
+	    cfg->key_sz, cfg->value_sz);
+	/* Populate details are only shown when a table is being created. */
+	if (cfg->create)
+		printf("\tPopulate threads: %" PRIu32 ", inserting %" PRIu32
+		    " rows\n",
+		    cfg->populate_threads, cfg->icount);
+
+	printf("\tWorkload seconds, operations: %" PRIu32 ", %" PRIu32 "\n",
+	    cfg->run_time, cfg->run_ops);
+	if (cfg->workload != NULL) {
+		printf("\tWorkload configuration(s):\n");
+		/* One line per configured workload group. */
+		for (i = 0; i < cfg->workload_cnt; ++i) {
+			workp = &cfg->workload[i];
+			printf("\t\t%" PRId64 " threads (inserts=%" PRId64
+			    ", reads=%" PRId64 ", updates=%" PRId64 ")\n",
+			    workp->threads,
+			    workp->insert, workp->read, workp->update);
+		}
+	}
+
+	printf("\tCheckpoint threads, interval: %" PRIu32 ", %" PRIu32 "\n",
+	    cfg->checkpoint_threads, cfg->checkpoint_interval);
+	printf("\tReporting interval: %" PRIu32 "\n", cfg->report_interval);
+	printf("\tSampling interval: %" PRIu32 "\n", cfg->sample_interval);
+
+	printf("\tVerbosity: %" PRIu32 "\n", cfg->verbose);
+}
+
+/*
+ * pretty_print --
+ *	Print text wrapped for an 80 character window: while more than 70
+ * characters remain, break at the last space at or before offset 70. Every
+ * output line is prefixed with the indent string, if one is given. Text with
+ * no space to break at is printed unwrapped.
+ */
+static void
+pretty_print(const char *p, const char *indent)
+{
+	const char *brk, *prefix;
+
+	prefix = indent == NULL ? "" : indent;
+
+	while (strlen(p) > 70) {
+		/* Search backward from offset 70 for a space. */
+		brk = p + 70;
+		while (brk > p && *brk != ' ')
+			--brk;
+		if (brk == p)			/* No spaces? */
+			break;
+		printf("%s%.*s\n", prefix, (int)(brk - p), p);
+		/* Continue after the space we broke at. */
+		p = brk + 1;
+	}
+	if (*p != '\0')
+		printf("%s%s\n", prefix, p);
+}
+
+/*
+ * config_opt_usage --
+ *	Print the list of settable options: for each entry in the option
+ * table, its name, type, default value and wrapped description.
+ */
+static void
+config_opt_usage(void)
+{
+	const CONFIG_OPT *opt;
+	size_t i, nopt;
+	const char *defaultval, *typestr;
+
+	pretty_print(
+	    "The following are options settable using -o or -O, "
+	    "showing the type and default value.\n", NULL);
+	pretty_print(
+	    "String values must be enclosed in \" quotes, "
+	    "boolean values must be either true or false.\n", NULL);
+
+	nopt = sizeof(config_opts)/sizeof(config_opts[0]);
+	for (i = 0; i < nopt; i++) {
+		opt = &config_opts[i];
+		defaultval = opt->defaultval;
+
+		if (opt->type == BOOL_TYPE) {
+			typestr = "boolean";
+			/* Boolean defaults are stored as "0" or non-"0". */
+			defaultval =
+			    strcmp(defaultval, "0") == 0 ? "false" : "true";
+		} else if (opt->type == INT_TYPE)
+			typestr = "int";
+		else if (opt->type == UINT32_TYPE)
+			typestr = "unsigned int";
+		else
+			/* Both string types are reported as "string". */
+			typestr = "string";
+
+		printf("%s (%s, default=%s)\n",
+		    opt->name, typestr, defaultval);
+		pretty_print(opt->description, "\t");
+	}
+}
+
+/*
+ * usage --
+ *	Print the wtperf command-line synopsis followed by the list of
+ * settable options; this is informational output, not an error.
+ */
+void
+usage(void)
+{
+	static const char * const text[] = {
+		"wtperf [-C config] "
+		    "[-H mount] [-h home] [-O file] [-o option] [-T config]\n",
+		"\t-C <string> additional connection configuration\n",
+		"\t (added to option conn_config)\n",
+		"\t-H <mount> configure Helium volume mount point\n",
+		"\t-h <string> Wired Tiger home must exist, default WT_TEST\n",
+		"\t-O <file> file contains options as listed below\n",
+		"\t-o option=val[,option=val,...] set options listed below\n",
+		"\t-T <string> additional table configuration\n",
+		"\t (added to option table_config)\n",
+		"\n"
+	};
+	size_t i;
+
+	/* Emit the fixed synopsis text, one entry at a time. */
+	for (i = 0; i < sizeof(text)/sizeof(text[0]); i++)
+		printf("%s", text[i]);
+	config_opt_usage();
+}
diff --git a/src/third_party/wiredtiger/bench/wtperf/config_opt.h b/src/third_party/wiredtiger/bench/wtperf/config_opt.h
new file mode 100644
index 00000000000..3de30e723b1
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/config_opt.h
@@ -0,0 +1,38 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+typedef enum {	/* Value kind of a wtperf option; selects how the value is checked and stored. */
+ BOOL_TYPE, CONFIG_STRING_TYPE, INT_TYPE, STRING_TYPE, UINT32_TYPE
+} CONFIG_OPT_TYPE;	/* Both *STRING_TYPE kinds are reported to users as "string". */
+
+typedef struct {	/* One entry in the table of wtperf options. */
+ const char *name;	/* Option key, matched against the parsed key */
+ const char *description;	/* Help text printed for the option */
+ const char *defaultval;	/* Default, as text; "0" is shown as false for booleans */
+ CONFIG_OPT_TYPE type;	/* Value kind expected for the option */
+ size_t offset;	/* Byte offset of the destination field within CONFIG */
+} CONFIG_OPT;
diff --git a/src/third_party/wiredtiger/bench/wtperf/doxy.c b/src/third_party/wiredtiger/bench/wtperf/doxy.c
new file mode 100644
index 00000000000..50cba9191e2
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/doxy.c
@@ -0,0 +1,110 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <string.h>
+#include <stdio.h>
+
+#include "config_opt.h"
+
+static const CONFIG_OPT config_opts[] = {	/* Option table; entries come from wtperf_opt.i. */
+#define OPT_DEFINE_DOXYGEN	/* presumably selects the doxygen form of each entry -- confirm in wtperf_opt.i */
+#include "wtperf_opt.i"
+#undef OPT_DEFINE_DOXYGEN
+};
+
+/*
+ * pretty_print --
+ *	Print text wrapped for an 80 character window. Lines are broken at
+ * the last space at or before offset 70, and each is preceded by the indent
+ * string when one is supplied; text without a usable space is printed as is.
+ */
+static void
+pretty_print(const char *p, const char *indent)
+{
+	const char *lead, *sp;
+
+	lead = indent;
+	if (lead == NULL)
+		lead = "";
+
+	while (strlen(p) > 70) {
+		/* Walk back from offset 70 to the nearest space. */
+		for (sp = p + 70; sp > p; --sp)
+			if (*sp == ' ')
+				break;
+		if (sp == p)			/* No spaces? */
+			break;
+		printf("%s%.*s\n", lead, (int)(sp - p), p);
+		p = sp + 1;
+	}
+	if (*p != '\0')
+		printf("%s%s\n", lead, p);
+}
+
+/*
+ * config_doxygen --
+ *	Output the configuration information for doxygen: one "@par" entry
+ * per option giving its name, type and default, followed by the wrapped
+ * description.
+ */
+static void
+config_doxygen(void)
+{
+	const CONFIG_OPT *opt;
+	size_t i, nopt;
+	const char *defaultval, *typestr;
+
+	nopt = sizeof(config_opts)/sizeof(config_opts[0]);
+	for (i = 0; i < nopt; i++) {
+		opt = &config_opts[i];
+		defaultval = opt->defaultval;
+
+		if (opt->type == BOOL_TYPE) {
+			typestr = "boolean";
+			/* Boolean defaults are stored as "0" or non-"0". */
+			defaultval =
+			    strcmp(defaultval, "0") == 0 ? "false" : "true";
+		} else if (opt->type == INT_TYPE)
+			typestr = "int";
+		else if (opt->type == UINT32_TYPE)
+			typestr = "unsigned int";
+		else
+			/* Both string types are reported as "string". */
+			typestr = "string";
+
+		printf("@par %s (%s, default=%s)\n",
+		    opt->name, typestr, defaultval);
+		pretty_print(opt->description, NULL);
+	}
+}
+
+/*
+ * main --
+ *	A standalone program to output the configuration options in a doxygen
+ * format. Takes no arguments and always exits with status 0.
+ */
+int
+main(void)
+{
+	config_doxygen();
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/bench/wtperf/misc.c b/src/third_party/wiredtiger/bench/wtperf/misc.c
new file mode 100644
index 00000000000..0a9a1de73e2
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/misc.c
@@ -0,0 +1,113 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wtperf.h"
+
+/*
+ * enomem --
+ *	Report an allocation failure, through the log mechanism if a log file
+ * is open and on stderr otherwise, and return ENOMEM.
+ */
+int
+enomem(const CONFIG *cfg)
+{
+	static const char msg[] = "Unable to allocate memory";
+
+	if (cfg->logf != NULL)
+		lprintf(cfg, ENOMEM, 0, "%s", msg);
+	else
+		fprintf(stderr, "%s\n", msg);
+	return (ENOMEM);
+}
+
+/*
+ * setup_log_file --
+ *	Set up the logging output mechanism: when verbosity is at least 1,
+ * open <monitor_dir>/<table_name>.stat for writing and make it line
+ * buffered. Returns 0 on success (or if logging isn't wanted), the result of
+ * enomem() if the path can't be allocated, or the errno from fopen.
+ */
+int
+setup_log_file(CONFIG *cfg)
+{
+	size_t len;
+	int ret;
+	char *fname;
+
+	if (cfg->verbose < 1)
+		return (0);
+
+	/* Directory, '/', table name, ".stat" and the NUL. */
+	len = strlen(cfg->monitor_dir) +
+	    strlen(cfg->table_name) + strlen(".stat") + 2;
+	fname = calloc(len, 1);
+	if (fname == NULL)
+		return (enomem(cfg));
+	sprintf(fname, "%s/%s.stat", cfg->monitor_dir, cfg->table_name);
+
+	if ((cfg->logf = fopen(fname, "w")) == NULL) {
+		/* Save errno before anything else can change it. */
+		ret = errno;
+		fprintf(stderr, "%s: %s\n", fname, strerror(ret));
+		free(fname);
+		return (ret);
+	}
+	free(fname);
+
+	/* Use line buffering for the log file. */
+	(void)setvbuf(cfg->logf, NULL, _IOLBF, 32);
+	return (0);
+}
+
+/*
+ * lprintf --
+ *	Log printf - output a log message.
+ *
+ *	With err == 0 the message is informational: it is written to the log
+ * file when level <= cfg->verbose, and also to stdout when level is strictly
+ * less than cfg->verbose. With err != 0 the message is always written to
+ * stderr, and to the log file if one is open, followed by the text from
+ * wiredtiger_strerror(err). The process aborts if err is WT_PANIC.
+ */
+void
+lprintf(const CONFIG *cfg, int err, uint32_t level, const char *fmt, ...)
+{
+	va_list ap;
+
+	if (err == 0 && level <= cfg->verbose) {
+		/*
+		 * The log file may not be open (the error path below and
+		 * enomem() both allow for that), don't write through a NULL
+		 * stream.
+		 */
+		if (cfg->logf != NULL) {
+			va_start(ap, fmt);
+			vfprintf(cfg->logf, fmt, ap);
+			va_end(ap);
+			fprintf(cfg->logf, "\n");
+		}
+
+		if (level < cfg->verbose) {
+			va_start(ap, fmt);
+			vprintf(fmt, ap);
+			va_end(ap);
+			printf("\n");
+		}
+	}
+	if (err == 0)
+		return;
+
+	/* We are dealing with an error. */
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	fprintf(stderr, " Error: %s\n", wiredtiger_strerror(err));
+	if (cfg->logf != NULL) {
+		/* A va_list can't be reused: restart it for the second write. */
+		va_start(ap, fmt);
+		vfprintf(cfg->logf, fmt, ap);
+		va_end(ap);
+		fprintf(cfg->logf, " Error: %s\n", wiredtiger_strerror(err));
+	}
+
+	/* Never attempt to continue if we got a panic from WiredTiger. */
+	if (err == WT_PANIC)
+		abort();
+}
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree.wtperf
new file mode 100644
index 00000000000..f8c270f3d1f
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree.wtperf
@@ -0,0 +1,8 @@
+# wtperf options file: evict btree configuration
+conn_config="cache_size=50M"
+table_config="type=file"
+icount=10000000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=16,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm.wtperf
new file mode 100644
index 00000000000..962f6a60201
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-lsm.wtperf
@@ -0,0 +1,9 @@
+# wtperf options file: evict lsm configuration
+conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6)"
+table_config="type=lsm,os_cache_dirty_max=16MB"
+compact=true
+icount=10000000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=16,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/fruit-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/fruit-lsm.wtperf
new file mode 100644
index 00000000000..a34c74d5e08
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/fruit-lsm.wtperf
@@ -0,0 +1,22 @@
+# wtperf options file: simulate riak and its test1 and test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600),log=(enabled=true),transaction_sync=(enabled=true,method=none),checkpoint=(wait=180),lsm_manager=(worker_thread_max=12)"
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)"
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024"
+compact=true
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K,leaf_item_max=4K,os_cache_dirty_max=16MB"
+icount=25000000
+key_sz=40
+value_sz=800
+#max_latency=2000
+pareto=true
+populate_threads=20
+report_interval=10
+random_value=true
+run_time=18000
+sample_interval=10
+table_count=8
+threads=((count=20,read=6,update=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/fruit-short.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/fruit-short.wtperf
new file mode 100644
index 00000000000..a2ea535b4c5
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/fruit-short.wtperf
@@ -0,0 +1,20 @@
+# wtperf options file: simulate riak and its test1 and test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compact=true
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K"
+icount=25000000
+key_sz=40
+value_sz=800
+max_latency=2000
+pareto=true
+populate_threads=20
+report_interval=10
+random_value=true
+run_time=1800
+sample_interval=10
+threads=((count=20,read=6,update=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/get_ckpt.py b/src/third_party/wiredtiger/bench/wtperf/runners/get_ckpt.py
new file mode 100755
index 00000000000..38688870e7f
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/get_ckpt.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+# Generate input data to GNUplot from checkpoint information in a wtperf run
+
+import sys
+
+time = 0 # seconds
+print "%d, %d" % (0, 0)
+
+for line in sys.stdin:
+ if line.strip().endswith('secs'):
+ time += int(line.split(' ')[7])
+ if line.startswith('Finished checkpoint'):
+ duration = (int(line.split(' ')[3]) + 500) / 1000 # convert ms to secs
+ print "%d, %d" % (time - duration, 1)
+ print "%d, %d" % (time, 0)
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/insert-rmw.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/insert-rmw.wtperf
new file mode 100644
index 00000000000..50db0baa0d1
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/insert-rmw.wtperf
@@ -0,0 +1,9 @@
+# wtperf options file: Test the insert-rmw functionality
+conn_config="cache_size=500MB"
+table_config="type=lsm"
+icount=500000
+insert_rmw=true
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=4,inserts=1),(count=2,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/large-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/large-lsm.wtperf
new file mode 100644
index 00000000000..4b41b3fc84a
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/large-lsm.wtperf
@@ -0,0 +1,11 @@
+# wtperf options file: large lsm configuration
+conn_config="cache_size=20G,mmap=false,lsm_manager=(worker_thread_max=8)"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+compact=true
+compression="snappy"
+icount=250000000
+report_interval=5
+run_time=1200
+populate_threads=1
+sample_interval=10
+threads=((count=16,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/long-txn-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/long-txn-btree.wtperf
new file mode 100644
index 00000000000..31f264b1536
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/long-txn-btree.wtperf
@@ -0,0 +1,9 @@
+# wtperf options file: run autocommit and long transactions in parallel.
+conn_config="cache_size=500MB"
+table_config="type=file"
+icount=5000000
+report_interval=5
+run_time=120
+populate_threads=1
+# Have a long running reader, throttled to a max of 5000 ops per second.
+threads=((count=4,reads=1,updates=1),(count=1,reads=1,ops_per_txn=100000,throttle=5000))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/long-txn-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/long-txn-lsm.wtperf
new file mode 100644
index 00000000000..cc5a6b77a17
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/long-txn-lsm.wtperf
@@ -0,0 +1,9 @@
+# wtperf options file: run autocommit and long transactions in parallel.
+conn_config="cache_size=500MB"
+table_config="lsm=(chunk_size=5MB),type=lsm,os_cache_dirty_max=16MB"
+icount=5000000
+report_interval=5
+run_time=120
+populate_threads=1
+# Have a long running reader, throttled to a max of 5000 ops per second.
+threads=((count=4,reads=1,updates=1),(count=1,reads=1,ops_per_txn=100000,throttle=5000))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/medium-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/medium-btree.wtperf
new file mode 100644
index 00000000000..92c63e73480
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/medium-btree.wtperf
@@ -0,0 +1,8 @@
+# wtperf options file: medium btree configuration
+conn_config="cache_size=1G"
+table_config="type=file"
+icount=50000000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=16,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm-async.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm-async.wtperf
new file mode 100644
index 00000000000..1d729bcee23
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm-async.wtperf
@@ -0,0 +1,9 @@
+# wtperf options file: medium lsm configuration using async operations
+conn_config="cache_size=1G,lsm_manager=(worker_thread_max=6)"
+async_threads=10
+table_config="lsm=(chunk_size=100MB),type=lsm,os_cache_dirty_max=16MB"
+icount=50000000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=5,reads=1,updates=2))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm-compact.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm-compact.wtperf
new file mode 100644
index 00000000000..0d4a6f965d1
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm-compact.wtperf
@@ -0,0 +1,9 @@
+# wtperf options file: medium lsm configuration
+conn_config="cache_size=1G,lsm_manager=(worker_thread_max=6)"
+table_config="lsm=(chunk_size=100MB,chunk_max=1TB),type=lsm,os_cache_dirty_max=16MB"
+icount=50000000
+populate_threads=1
+compact=true
+threads=((count=16,read=1))
+run_time=120
+report_interval=5
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm.wtperf
new file mode 100644
index 00000000000..3e676dc0b70
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/medium-lsm.wtperf
@@ -0,0 +1,8 @@
+# wtperf options file: medium lsm configuration
+conn_config="cache_size=1G,lsm_manager=(worker_thread_max=6)"
+table_config="lsm=(chunk_size=100MB),type=lsm,os_cache_dirty_max=16MB"
+icount=50000000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=16,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm-noprefix.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm-noprefix.wtperf
new file mode 100644
index 00000000000..413c16075d3
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm-noprefix.wtperf
@@ -0,0 +1,10 @@
+# wtperf options file: medium lsm configuration, with multiple tables.
+conn_config="cache_size=1G,lsm_manager=(worker_thread_max=8)"
+table_config="lsm=(chunk_size=100MB,chunk_max=1TB),type=lsm,prefix_compression=false,os_cache_dirty_max=16MB"
+icount=50000000
+populate_threads=1
+compact=true
+threads=((count=8,read=1),(count=8,update=1))
+run_time=180
+report_interval=5
+table_count=4
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm.wtperf
new file mode 100644
index 00000000000..99b7b49aebd
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/medium-multi-lsm.wtperf
@@ -0,0 +1,10 @@
+# wtperf options file: medium lsm configuration, with multiple tables.
+conn_config="cache_size=1G,lsm_manager=(worker_thread_max=8)"
+table_config="lsm=(chunk_size=100MB,chunk_max=1TB),type=lsm,os_cache_dirty_max=16MB"
+icount=50000000
+populate_threads=1
+compact=true
+threads=((count=8,read=1),(count=8,update=1))
+run_time=180
+report_interval=5
+table_count=4
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-long.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-long.wtperf
new file mode 100644
index 00000000000..c5341f96c38
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree-long.wtperf
@@ -0,0 +1,16 @@
+# A btree configuration with ten databases, each with a single reader and
+# single writer. Throttled to 4000 ops/second.
+# This configuration is set to run for 10 hours.
+conn_config="cache_size=100MB,log=(enabled=false)"
+database_count=10
+table_config="leaf_page_max=4k,internal_page_max=16k,leaf_item_max=1433,internal_item_max=3100,type=file"
+icount=50000
+populate_threads=1
+random_range=10000000
+checkpoint_interval=120
+checkpoint_threads=1
+report_interval=5
+run_time=36000
+threads=((count=1,throttle=1000,reads=1),(count=1,throttle=1000,inserts=1))
+value_sz=100
+warmup=10
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree.wtperf
new file mode 100644
index 00000000000..e316dda9e88
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/multi-btree.wtperf
@@ -0,0 +1,17 @@
+# wtperf options file: small btree multi-database configuration
+# Original cache was 500MB. Shared cache is 500MB * database_count.
+conn_config="shared_cache=(name=pool,size=2500MB,chunk=1M),log=(enabled=false)"
+#conn_config="cache_size=250MB,log=(enabled=false)"
+database_count=10
+table_config="leaf_page_max=4k,internal_page_max=16k,leaf_item_max=1433,internal_item_max=3100,type=file"
+# Likewise, divide original icount by database_count.
+icount=50000
+populate_threads=1
+random_range=100000000
+checkpoint_interval=20
+checkpoint_threads=1
+report_interval=5
+run_time=20
+threads=((count=2,reads=1),(count=2,inserts=1))
+value_sz=100
+warmup=10
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/parallel-pop-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/parallel-pop-btree.wtperf
new file mode 100644
index 00000000000..5ec5c84e551
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/parallel-pop-btree.wtperf
@@ -0,0 +1,9 @@
+# wtperf options file: Run populate thread multi-threaded and with groups
+# of operations in each transaction.
+conn_config="cache_size=200MB"
+table_config="type=file"
+transaction_config="isolation=snapshot"
+icount=10000000
+report_interval=5
+populate_ops_per_txn=100
+populate_threads=5
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/parallel-pop-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/parallel-pop-lsm.wtperf
new file mode 100644
index 00000000000..41309fd07de
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/parallel-pop-lsm.wtperf
@@ -0,0 +1,9 @@
+# wtperf options file: Run populate thread multi-threaded and with groups
+# of operations in each transaction.
+conn_config="cache_size=200MB"
+table_config="type=lsm,os_cache_dirty_max=16MB"
+transaction_config="isolation=snapshot"
+icount=10000000
+report_interval=5
+populate_ops_per_txn=100
+populate_threads=5
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/shared-cache-stress.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/shared-cache-stress.wtperf
new file mode 100644
index 00000000000..87d14f4f5c1
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/shared-cache-stress.wtperf
@@ -0,0 +1,12 @@
+# Stress out the shared cache.
+conn_config="statistics=(none),shared_cache=(name=wt-cache,size=536870912,reserve=10MB,chunk=20MB,)"
+table_config="allocation_size=4KB,key_gap=10,split_pct=75,internal_page_max=4KB,internal_key_truncate=false,prefix_compression=false,leaf_item_max=1433,type=file,internal_item_max=1433,exclusive=true,leaf_page_max=4KB,block_compressor=,"
+checkpoint_interval=100
+checkpoint_threads=1
+icount=50000
+random_range=500000
+report_interval=5
+run_time=600
+populate_threads=1
+threads=((count=1,inserts=1),(count=1,reads=1))
+database_count=25
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/small-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/small-btree.wtperf
new file mode 100644
index 00000000000..5defd413fa6
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/small-btree.wtperf
@@ -0,0 +1,8 @@
+# wtperf options file: small btree configuration
+conn_config="cache_size=500MB"
+table_config="type=file"
+icount=500000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=8,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/small-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/small-lsm.wtperf
new file mode 100644
index 00000000000..1b00d18d76b
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/small-lsm.wtperf
@@ -0,0 +1,8 @@
+# wtperf options file: small lsm configuration
+conn_config="cache_size=500MB"
+table_config="lsm=(chunk_size=5MB),type=lsm,os_cache_dirty_max=16MB"
+icount=500000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=8,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test1-1b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test1-1b-lsm.wtperf
new file mode 100644
index 00000000000..662a286970c
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test1-1b-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test1 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compact=true
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+icount=1000000000
+key_sz=40
+value_sz=1000
+max_latency=2000
+populate_threads=20
+report_interval=10
+random_value=true
+sample_interval=10
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test1-2b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test1-2b-lsm.wtperf
new file mode 100644
index 00000000000..2a3923e62ab
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test1-2b-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test1 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compact=true
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=20GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+icount=2000000000
+key_sz=40
+value_sz=1000
+max_latency=2000
+populate_threads=20
+report_interval=10
+random_value=true
+sample_interval=10
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test1-500m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test1-500m-lsm.wtperf
new file mode 100644
index 00000000000..91c7858a605
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test1-500m-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test1 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compact=true
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+icount=500000000
+key_sz=40
+value_sz=1000
+max_latency=2000
+populate_threads=20
+report_interval=10
+random_value=true
+sample_interval=10
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test1-50m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test1-50m-lsm.wtperf
new file mode 100644
index 00000000000..229455d62a1
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test1-50m-lsm.wtperf
@@ -0,0 +1,17 @@
+# wtperf options file: simulate riak and its test1 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30),lsm_manager=(worker_thread_max=6)"
+conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compact=true
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+icount=50000000
+key_sz=40
+value_sz=1000
+max_latency=2000
+populate_threads=10
+report_interval=10
+random_value=true
+sample_interval=10
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test2-1b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test2-1b-lsm.wtperf
new file mode 100644
index 00000000000..42452092dfc
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test2-1b-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compression="snappy"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+report_interval=10
+run_time=14400
+sample_interval=10
+threads=((count=20,reads=4,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test2-2b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test2-2b-lsm.wtperf
new file mode 100644
index 00000000000..7fc1155bcb2
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test2-2b-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compression="snappy"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=20GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+report_interval=10
+run_time=14400
+sample_interval=10
+threads=((count=20,reads=4,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test2-500m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test2-500m-lsm.wtperf
new file mode 100644
index 00000000000..c03f5041a4d
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test2-500m-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+create=false
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+report_interval=10
+run_time=14400
+sample_interval=10
+threads=((count=20,reads=4,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test2-50m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test2-50m-lsm.wtperf
new file mode 100644
index 00000000000..51a06d84222
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test2-50m-lsm.wtperf
@@ -0,0 +1,17 @@
+# wtperf options file: simulate riak and its test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)"
+conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+report_interval=10
+run_time=1440
+sample_interval=10
+threads=((count=10,reads=4,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test3-1b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test3-1b-lsm.wtperf
new file mode 100644
index 00000000000..113b79bc9f9
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test3-1b-lsm.wtperf
@@ -0,0 +1,20 @@
+# wtperf options file: simulate riak and its test3 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=8)"
+compression="snappy"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+pareto=true
+report_interval=10
+run_time=14400
+sample_interval=10
+#threads=((count=20,reads=1,updates=1))
+threads=((count=10,reads=1),(count=10,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test3-2b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test3-2b-lsm.wtperf
new file mode 100644
index 00000000000..574cf54b109
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test3-2b-lsm.wtperf
@@ -0,0 +1,20 @@
+# wtperf options file: simulate riak and its test3 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=8)"
+compression="snappy"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=20GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+pareto=true
+report_interval=10
+run_time=14400
+sample_interval=10
+#threads=((count=20,reads=1,updates=1))
+threads=((count=10,reads=1),(count=10,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test3-500m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test3-500m-lsm.wtperf
new file mode 100644
index 00000000000..307d92b6db6
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test3-500m-lsm.wtperf
@@ -0,0 +1,20 @@
+# wtperf options file: simulate riak and its test3 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=8)"
+create=false
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+pareto=true
+report_interval=10
+run_time=14400
+sample_interval=10
+#threads=((count=20,reads=1,updates=1))
+threads=((count=10,reads=1),(count=10,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test3-50m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test3-50m-lsm.wtperf
new file mode 100644
index 00000000000..278b1ce3872
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test3-50m-lsm.wtperf
@@ -0,0 +1,19 @@
+# wtperf options file: simulate riak and its test3 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)"
+conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+pareto=true
+report_interval=10
+run_time=1440
+sample_interval=10
+#threads=((count=10,reads=1,updates=1))
+threads=((count=5,reads=1),(count=5,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test4-1b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test4-1b-lsm.wtperf
new file mode 100644
index 00000000000..8ff3cadbe59
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test4-1b-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test4 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compression="snappy"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+report_interval=10
+run_time=14400
+sample_interval=10
+threads=((count=20,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test4-2b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test4-2b-lsm.wtperf
new file mode 100644
index 00000000000..d8f02872feb
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test4-2b-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test4 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compression="snappy"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=20GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+report_interval=10
+run_time=14400
+sample_interval=10
+threads=((count=20,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test4-500m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test4-500m-lsm.wtperf
new file mode 100644
index 00000000000..19e6f84c9dc
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test4-500m-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test4 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+create=false
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_max=5GB,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+report_interval=10
+run_time=14400
+sample_interval=10
+threads=((count=20,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test4-50m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test4-50m-lsm.wtperf
new file mode 100644
index 00000000000..f716a97943f
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/test4-50m-lsm.wtperf
@@ -0,0 +1,17 @@
+# wtperf options file: simulate riak and its test4 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)"
+conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+key_sz=40
+value_sz=1000
+max_latency=2000
+report_interval=10
+run_time=1440
+sample_interval=10
+threads=((count=10,reads=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/update-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/update-btree.wtperf
new file mode 100644
index 00000000000..efd6cc8b028
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/update-btree.wtperf
@@ -0,0 +1,8 @@
+# wtperf options file: btree with inserts/updates, in memory
+conn_config="cache_size=1G"
+table_config="type=file"
+icount=5000000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=2,inserts=1),(count=2,reads=1),(count=2,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/update-checkpoint-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/update-checkpoint-btree.wtperf
new file mode 100644
index 00000000000..a2e4caab159
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/update-checkpoint-btree.wtperf
@@ -0,0 +1,12 @@
+# wtperf options file: btree with inserts/updates and checkpointing, in memory
+# Note: The cache needs to be sized to approximate the amount of inserts
+# that will happen during the given run_time.
+conn_config="cache_size=25G"
+table_config="type=file"
+icount=5000000
+report_interval=5
+checkpoint_threads=1
+checkpoint_interval=10
+run_time=300
+populate_threads=1
+threads=((count=1,inserts=1),(count=2,reads=1),(count=2,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/update-checkpoint-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/update-checkpoint-lsm.wtperf
new file mode 100644
index 00000000000..ebd95db910f
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/update-checkpoint-lsm.wtperf
@@ -0,0 +1,12 @@
+# wtperf options file: lsm with inserts/updates and checkpointing, in memory
+# Note: The cache needs to be sized to approximate the amount of inserts
+# that will happen during the given run_time.
+conn_config="cache_size=25G"
+table_config="lsm=(chunk_size=20MB),type=lsm,os_cache_dirty_max=16MB"
+icount=5000000
+report_interval=5
+checkpoint_threads=1
+checkpoint_interval=10
+run_time=300
+populate_threads=1
+threads=((count=1,inserts=1),(count=2,reads=1),(count=2,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/update-large-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/update-large-lsm.wtperf
new file mode 100644
index 00000000000..c3e468ec540
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/update-large-lsm.wtperf
@@ -0,0 +1,9 @@
+# wtperf options file: lsm with inserts/updates, in memory
+conn_config="cache_size=2G,lsm_manager=(worker_thread_max=6)"
+table_config="lsm=(chunk_size=50MB),type=lsm,os_cache_dirty_max=16MB"
+icount=200000000
+report_interval=5
+run_time=1200
+populate_threads=10
+compact=true
+threads=((count=2,inserts=1),(count=10,reads=1),(count=2,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/update-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/update-lsm.wtperf
new file mode 100644
index 00000000000..52c4fb0192a
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/update-lsm.wtperf
@@ -0,0 +1,8 @@
+# wtperf options file: lsm with inserts/updates, in memory
+conn_config="cache_size=1G,lsm_manager=(worker_thread_max=6)"
+table_config="lsm=(chunk_size=20MB),type=lsm,os_cache_dirty_max=16MB"
+icount=5000000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=2,inserts=1),(count=2,reads=1),(count=2,updates=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/voxer-10k-short.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/voxer-10k-short.wtperf
new file mode 100644
index 00000000000..47228079db8
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/voxer-10k-short.wtperf
@@ -0,0 +1,19 @@
+# wtperf options file: simulate riak and a short form of its voxer config.
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compact=true
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+icount=15000
+key_sz=40
+value_sz=10000
+max_latency=2000
+populate_threads=1
+report_interval=5
+random_value=true
+run_time=300
+threads=((count=10,read=1),(count=10,update=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/voxer-10k.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/voxer-10k.wtperf
new file mode 100644
index 00000000000..9b4ed2acaee
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/voxer-10k.wtperf
@@ -0,0 +1,20 @@
+# wtperf options file: simulate riak and its test1 and test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compact=true
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+icount=15000
+key_sz=40
+value_sz=10000
+max_latency=2000
+populate_threads=1
+report_interval=10
+random_value=true
+run_time=18000
+sample_interval=10
+threads=((count=20,read=1,update=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/voxer-130k-short.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/voxer-130k-short.wtperf
new file mode 100644
index 00000000000..83f67062bf8
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/voxer-130k-short.wtperf
@@ -0,0 +1,19 @@
+# wtperf options file: simulate riak and a short form of its voxer config.
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compact=true
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+icount=15000
+key_sz=40
+value_sz=130000
+max_latency=2000
+populate_threads=1
+report_interval=5
+random_value=true
+run_time=300
+threads=((count=10,read=1),(count=10,update=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/voxer-130k.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/voxer-130k.wtperf
new file mode 100644
index 00000000000..a3439f0c575
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/voxer-130k.wtperf
@@ -0,0 +1,20 @@
+# wtperf options file: simulate riak and its test1 and test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,lsm_manager=(worker_thread_max=6)"
+compact=true
+compression="snappy"
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB),type=lsm,leaf_page_max=16K,os_cache_dirty_max=16MB"
+icount=15000
+key_sz=40
+value_sz=130000
+max_latency=2000
+populate_threads=1
+report_interval=10
+random_value=true
+run_time=18000
+sample_interval=10
+threads=((count=20,read=1,update=1))
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_ckpt.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_ckpt.sh
new file mode 100755
index 00000000000..c8993bfc1ba
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_ckpt.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+
+# A script for running the wtperf benchmark to analyze the performance
+# of checkpoint operations.
+
+# General configuration settings:
+BIN_DIR='.'
+ROOT_DIR=`/bin/pwd`
+SCRIPT_DIR=`dirname $0`
+RUNTIME=900
+REUSE=0
+VERBOSE=0
+WORKLOAD=0 # skip the populate phase.
+PERF_BASE="-M"
+OPTFILE=''
+DEBUG=
+GDB=${GDB:-gdb}
+
+USAGE="Usage: `basename $0` [-hdRWsv] [-b binary dir] [-r root dir] [-O optfile]"
+
+# Parse command line options.
+while getopts b:dhO:RWr:sv OPT; do
+ case "$OPT" in
+ b)
+ BIN_DIR=$OPTARG
+ ;;
+ d)
+ export TERM=dtterm
+ DEBUG="$GDB --args"
+ ;;
+ h)
+ echo $USAGE
+ exit 0
+ ;;
+ O)
+ OPTFILE=-O$OPTARG
+ PERF_BASE=""
+ ;;
+ R)
+ REUSE=1
+ ;;
+ r)
+ ROOT_DIR=$OPTARG
+ ;;
+ s)
+ RUNTIME=20
+ PERF_BASE="-S"
+ ;;
+ v)
+ VERBOSE=1 # -v turns on the progress messages guarded by VERBOSE
+ ;;
+ W)
+ WORKLOAD=1
+ REUSE=1 # skip the populate phase.
+ ;;
+ \?)
+ # getopts issues an error message
+ echo $USAGE >&2
+ exit 1
+ ;;
+ esac
+done
+
+# Configuration settings that may be altered by command line options
+WTPERF=${BIN_DIR}/wtperf
+if [ ! -x $WTPERF ]; then
+ echo "Could not find or execute $WTPERF"
+ exit 1
+fi
+
+DB_HOME="$ROOT_DIR/WT_TEST"
+OUT_DIR="$ROOT_DIR/results"
+SHARED_OPTS="${OPTFILE} ${PERF_BASE} -o read_threads=1,update_threads=1,report_interval=1,uri=\"table:test\" -o verbose=1 -h ${DB_HOME}"
+CREATE_OPTS="$SHARED_OPTS -o run_time=0"
+RUN_OPTS="$SHARED_OPTS -o run_time=$RUNTIME"
+if [ $WORKLOAD -eq 0 ]; then
+ RUN_OPTS="$RUN_OPTS -o create=false"
+else
+ RUN_OPTS="$RUN_OPTS -o icount=0"
+fi
+
+if [ $REUSE -eq 0 ]; then
+ if [ $VERBOSE -ne 0 ]; then
+ echo "Creating database and archiving it for reuse."
+ fi
+ rm -rf $DB_HOME && mkdir $DB_HOME
+ $DEBUG $WTPERF $CREATE_OPTS || exit 1
+
+ # Save the database so that it can be re-used by all runs.
+ # I'd rather not hard code WT_TEST, but need to get the path right.
+ rm -f $ROOT_DIR/WT_TEST.tgz
+ tar zcf $ROOT_DIR/WT_TEST.tgz -C $ROOT_DIR WT_TEST
+fi
+
+rm -rf $OUT_DIR && mkdir $OUT_DIR
+
+# Run the benchmarks..
+# for ckpt in "" "-c 120"; do
+for ckpt in "checkpoint_threads=1,checkpoint_interval=120"; do
+ # for opts in "" "-C eviction_dirty_target=20"; do
+ for opts in ""; do
+ if [ $VERBOSE -ne 0 ]; then
+ echo "Doing a run with:"
+ echo "\t$WTPERF $RUN_OPTS $ckpt $opts"
+ fi
+ res_name="run_${ckpt},${opts}"
+ res_name=`echo $res_name | tr '[:upper:][=\- ,]' '[:lower:]_'`
+ if [ $WORKLOAD -eq 0 ]; then
+ rm -rf $DB_HOME && tar zxf $ROOT_DIR/WT_TEST.tgz -C $ROOT_DIR
+ else
+ rm -rf $DB_HOME && mkdir $DB_HOME
+ fi
+ if [ "$DEBUG" = '' ]; then
+ $WTPERF $RUN_OPTS -o "$ckpt" -o "$opts" &
+ pid=$!
+ t=0
+ while kill -0 $pid 2> /dev/null; do
+ echo "Time $t"
+ pmp $pid
+ sleep 1
+ (( t++ ))
+ done > $OUT_DIR/${res_name}.trace
+ else
+ $DEBUG $WTPERF $RUN_OPTS -o "$ckpt" -o "$opts" # same -o options as the non-debug run
+ fi
+ cp $DB_HOME/test.stat "$OUT_DIR/${res_name}.res"
+ done
+done
+
+if [ $VERBOSE -ne 0 ]; then
+ echo "Post processing result files."
+fi
+for f in ${OUT_DIR}/*res; do
+ grep "^[0-9]* reads" ${f} | sed -e 's/ reads//' -e 's/ inserts//' -e 's/ updates in 1 secs//' > ${f}.out
+ ${SCRIPT_DIR}/get_ckpt.py < ${f} > ${f}.ckpt
+done
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh
new file mode 100755
index 00000000000..3296a4072b5
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_run.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+
+# wtperf_run.sh - run wtperf regression tests on the Jenkins platform.
+#
+# The Jenkins machines show variability so we run this script to run
+# each wtperf test several times. We throw away the min and max
+# number and average the remaining values. That is the number we
+# give to Jenkins for plotting. We write these values to a
+# test.average file in the current directory (which is
+# build_posix/bench/wtperf).
+#
+# This script should be invoked with the pathname of the wtperf test
+# config to run.
+#
+if test "$#" -ne "1"; then
+ echo "Must specify wtperf test to run"
+ exit 1
+fi
+wttest=$1
+home=./WT_TEST
+outfile=./wtperf.out
+rm -f $outfile
+runmax=5
+run=1
+
+avg=(0 0 0)
+max=(0 0 0)
+min=(0 0 0)
+sum=(0 0 0)
+# Load needs floating point and bc, handle separately.
+loadindex=4
+avg[$loadindex]=0
+max[$loadindex]=0
+min[$loadindex]=0
+sum[$loadindex]=0
+ops=(read insert update)
+outp=("Read count:" "Insert count:" "Update count:")
+outp[$loadindex]="Load time:"
+
+# getval min/max val cur
+# Returns the minimum or maximum of val and cur.
+# min == 0, max == 1.
+getval()
+{
+ max="$1"
+ val="$2"
+ cur="$3"
+ ret=$cur
+ echo "getval: max $max val $val cur $cur" >> $outfile
+ if test "$max" -eq "1"; then
+ if test "$val" -gt "$cur"; then
+ ret=$val
+ fi
+ elif test "$val" -lt "$cur"; then
+ ret=$val
+ fi
+ echo "$ret"
+}
+
+isstable()
+{
+ min="$1"
+ max="$2"
+ tmp=`echo "scale=3; $min * 1.03" | bc`
+ if (($(bc <<< "$tmp < $max") )); then
+ ret=0
+ else
+ ret=1
+ fi
+ echo "$ret"
+}
+
+getmin=0
+getmax=1
+while test "$run" -le "$runmax"; do
+ rm -rf $home
+ mkdir $home
+ LD_PRELOAD=/usr/lib64/libjemalloc.so.1 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib ./wtperf -O $wttest
+ if test "$?" -ne "0"; then
+ exit 1
+ fi
+ # Load is always using floating point, so handle separately
+ l=`grep "^Load time:" ./WT_TEST/test.stat`
+ if test "$?" -eq "0"; then
+ load=`echo $l | cut -d ' ' -f 3`
+ else
+ load=0
+ fi
+ cur[$loadindex]=$load
+ sum[$loadindex]=`echo "${sum[$loadindex]} + $load" | bc`
+ echo "cur ${cur[$loadindex]} sum ${sum[$loadindex]}" >> $outfile
+ for i in ${!ops[*]}; do
+ l=`grep "Executed.*${ops[$i]} operations" ./WT_TEST/test.stat`
+ if test "$?" -eq "0"; then
+ n=`echo $l | cut -d ' ' -f 2`
+ else
+ n=0
+ fi
+ cur[$i]=$n
+ sum[$i]=`expr $n + ${sum[$i]}`
+ done
+ #
+ # Keep running track of min and max for each operation type.
+ #
+ if test "$run" -eq "1"; then
+ for i in ${!cur[*]}; do
+ min[$i]=${cur[$i]}
+ max[$i]=${cur[$i]}
+ done
+ else
+ for i in ${!cur[*]}; do
+ if test "$i" -eq "$loadindex"; then
+ if (($(bc <<< "${cur[$i]} < ${min[$i]}") )); then
+ min[$i]=${cur[$i]}
+ fi
+ if (($(bc <<< "${cur[$i]} > ${max[$i]}") )); then
+ max[$i]=${cur[$i]}
+ fi
+ else
+ min[$i]=$(getval $getmin ${cur[$i]} ${min[$i]})
+ max[$i]=$(getval $getmax ${cur[$i]} ${max[$i]})
+ fi
+ done
+ fi
+ #
+ # After 3 runs see if this is a very stable test. If so, we
+ # can skip the last 2 runs and just use these values. We
+ # define "very stable" to be that the min and max are within
+ # 3% of each other.
+ if test "$run" -eq "3"; then
+ # Only if all values are stable, we can break.
+ unstable=0
+ for i in ${!min[*]}; do
+ stable=$(isstable ${min[$i]} ${max[$i]})
+ if test "$stable" -eq "0"; then
+ unstable=1
+ break
+ fi
+ done
+ if test "$unstable" -eq "0"; then
+ break
+ fi
+ fi
+ run=`expr $run + 1`
+done
+
+if test "$run" -le "$runmax"; then
+ numruns=`expr $run - 2`
+else
+ numruns=`expr $runmax - 2`
+fi
+#
+# The sum contains all runs. Subtract out the min/max values.
+# Average the remaining and write it out to the file.
+#
+for i in ${!min[*]}; do
+ if test "$i" -eq "$loadindex"; then
+ s=`echo "scale=3; ${sum[$i]} - ${min[$i]} - ${max[$i]}" | bc`
+ avg[$i]=`echo "scale=3; $s / $numruns" | bc`
+ else
+ s=`expr ${sum[$i]} - ${min[$i]} - ${max[$i]}`
+ avg[$i]=`expr $s / $numruns`
+ fi
+done
+for i in ${!outp[*]}; do
+ echo "${outp[$i]} ${avg[$i]}" >> $outfile
+done
+exit 0
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh
new file mode 100644
index 00000000000..db92cb95931
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh
@@ -0,0 +1,204 @@
+#!/bin/bash
+
+# wtperf_track.sh - track performance regression trends on the Jenkins platform.
+#
+# This is run after a performance build, with a count or time value.
+# wtperf_track.sh -t [ -p percent ] name time_value
+# wtperf_track.sh -c [ -p percent ] name count_value
+#
+# The values are kept in a .csv file (see $TRACK_FILE) stored under
+# the userContent directory (see $STATE_DIR), which is served as
+# visible files by Jenkins. The .csv contains fields: buildid,
+# curtime, value, loadavg, diskavg.
+#
+# Since our Jenkins hosts have some variability to their load level,
+# we allow some 'resilience' in the regression checking. To this end,
+# we calculate 3 values:
+#
+# v3: The 'best' of the most recent 3 values, where 'best' is
+# lowest for a time value and highest for a count.
+#
+# v20: The average of the best 10 of the last 20 values.
+#
+# v100: The average of the best 50 of the last 100 values.
+#
+# If v3 is more than p% worse than v20, or if v20 is more than p% worse
+# than v100, we issue a warning. The default value of p is 5, and
+# is set with the -p option.
+#
+# We expect that the Jenkins job is configured to capture the WARNING
+# output, in order to send mail or mark the build as unstable.
+
+JENKINS_HOME=${JENKINS_HOME:?"Must be run on the Jenkins platform"}
+JOB_NAME=${JOB_NAME:?"Must be run on the Jenkins platform"}
+BUILD_ID=${BUILD_ID:?"Must be run on the Jenkins platform"}
+
+STATE_DIR="/home/jenkins/wtperf_track"
+TYPE=unknown # set to 'time' or 'count' depending on -t/-c option
+NAME= # the command line name
+VALUE= # the command line value
+
+Usage()
+{
+ cat >&2 <<EOF
+Usage: wtperf_track.sh -t [ options ] name time_value
+ wtperf_track.sh -c [ options ] name count_value
+
+name is the name of this metric (e.g. "load_time")
+value is an integer or float value
+
+Options:
+ -t: value is a time value, lower numbers are better
+ -c: value is a count value, higher numbers are better
+ -p pct: require short term avgs to be within 'pct' percent of long term avgs
+
+One of the -t and -c options is required.
+EOF
+}
+
+# GetValues n filename
+# gets the last n values from a .csv file
+GetValues()
+{
+ cut -f 3 -d , "$2" | sed -e '1d' -e 's/"//g' | tail -"$1"
+}
+
+# MinValues n direction
+# Given a list of values on input,
+# removes all but the n largest (if direction is >0) or
+# the n smallest (if direction is <0)
+MinValues()
+{
+ rflag=-r
+ if [ "$2" -gt 0 ]; then
+ rflag=
+ fi
+ sort -g $rflag | tail -$1
+}
+
+# AvgValues
+# Given a list of values on input,
+# returns the average.
+AvgValues()
+{
+ VALUES=$(tr '\n' ' ' | sed -e 's/^ *//' -e 's/ *$//' -e 's/ */ /')
+ n=$(echo "$VALUES" | wc -w)
+ echo "$VALUES" | sed -e 's/ /+/g' -e 's/^/scale=3;(/' -e "s:\$:)/$n:"| bc
+}
+
+# CheckValues sval lval direction pct desc
+# Given two values 'sval', 'lval', make sure that sval is no more
+# than 'pct' percent less than lval (if direction is >0) or that
+# sval is no more than 'pct' percent greater than lval (if direction <0)
+# Returns 0 for normal, 1 for out of range
+CheckValues()
+{
+ sval="$1"
+ lval="$2"
+ direction="$3"
+ pct="$4"
+ desc="$5"
+ if [ "$direction" -gt 0 ]; then
+ expr="($sval * (1.00 + $pct / 100.00)) - $lval"
+ else
+ expr="($lval * (1.00 + $pct / 100.00)) - $sval"
+ fi
+ result=$(echo "scale=3; $expr" | bc)
+ # bc error?
+ if [ $? != 0 ]; then
+ return 1
+ fi
+ if [ "$(echo $result | grep '^-')" != '' ]; then
+ echo "$desc: WARNING: $type $sval not within $pct% of $lval (curval=$VALUE)" >&2
+ return 1
+ fi
+ return 0
+}
+
+GetCpuLoadAverage()
+{
+ uptime | sed -e 's/.*: *//' -e 's/ .*//'
+}
+
+GetDiskLoadAverage()
+{
+ DEVICE=$(df "$1" | grep /dev | head -1 | sed -e 's:.*\(/dev/[^ ]*\).*:\1/:')
+ case `uname -s` in
+ *Linux* )
+ # iostat -d $DEVICE | grep -v Device: | head -1 | sed -e 's/.* //'
+ echo '0.0'
+ ;;
+ * )
+ echo '0.0'
+ ;;
+ esac
+}
+
+direction=0
+pct=5
+while getopts tcp: OPT; do
+ case "$OPT" in
+ t)
+ direction=-1
+ type=time
+ ;;
+ c)
+ direction=1
+ type=count
+ ;;
+ p)
+ pct=$OPTARG
+ ;;
+ *)
+ # getopts issues an error message
+ Usage
+ exit 1
+ ;;
+ esac
+done
+
+shift $((OPTIND-1))
+if [ "$#" != 2 ]; then
+ echo "Missing name/value" >&2
+ Usage
+ exit 1
+fi
+NAME="$1"
+VALUE="$2"
+
+if [ "$direction" = 0 ]; then
+ echo "Missing -t or -c option" >&2
+ Usage
+ exit 1
+fi
+
+TRACK_FILE="${STATE_DIR}/${JOB_NAME}.${NAME}.csv"
+
+mkdir -p "${STATE_DIR}" || exit 1
+if [ ! -f "${TRACK_FILE}" ]; then
+ echo '"buildid","time","value","loadavg","diskavg"' \
+ > ${TRACK_FILE} || exit 1
+fi
+TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+LOAD_AVG=$(GetCpuLoadAverage)
+DISK_AVG=$(GetDiskLoadAverage .)
+echo "${BUILD_ID},${TIME},${VALUE},${LOAD_AVG},${DISK_AVG}" \
+ >> ${TRACK_FILE} || exit 1
+
+v3=$(GetValues 3 ${TRACK_FILE} | MinValues 1 $direction)
+v20=$(GetValues 20 ${TRACK_FILE} | MinValues 10 $direction | AvgValues)
+v100=$(GetValues 100 ${TRACK_FILE} | MinValues 50 $direction | AvgValues)
+
+v3=${v3:?"Internal error: v3 not set"}
+v20=${v20:?"Internal error: v20 not set"}
+v100=${v100:?"Internal error: v100 not set"}
+
+ecode=0
+prefix="$JOB_NAME: build #$BUILD_ID"
+echo "$prefix: curval=$VALUE v3=$v3 v20=$v20 v100=$v100 pct=$pct"
+CheckValues $v3 $v20 $direction $pct "$prefix: short term trend" || ecode=1
+CheckValues $v20 $v100 $direction $pct "$prefix: long term trend" || ecode=1
+# Rather than a failure exit for a trend out of range,
+# Jenkins can capture the WARNING output, send email and mark
+# a build as unstable.
+#exit $ecode
diff --git a/src/third_party/wiredtiger/bench/wtperf/smoke.sh b/src/third_party/wiredtiger/bench/wtperf/smoke.sh
new file mode 100755
index 00000000000..062277d90dc
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/smoke.sh
@@ -0,0 +1,4 @@
+#! /bin/sh
+
+# Smoke-test wtperf as part of running "make check".
+./wtperf -O `dirname $0`/runners/small-lsm.wtperf -o "run_time=20"
diff --git a/src/third_party/wiredtiger/bench/wtperf/track.c b/src/third_party/wiredtiger/bench/wtperf/track.c
new file mode 100644
index 00000000000..3919d0eb1ab
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/track.c
@@ -0,0 +1,324 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wtperf.h"
+
+/*
+ * Return total insert operations for the populate phase.
+ */
+uint64_t
+sum_pop_ops(CONFIG *cfg)
+{
+ CONFIG_THREAD *thread;
+ uint64_t total;
+ u_int i;
+
+ total = 0;
+
+ for (i = 0, thread = cfg->popthreads;
+ thread != NULL && i < cfg->populate_threads; ++i, ++thread)
+ total += thread->insert.ops;
+ return (total);
+}
+
+/*
+ * Return total checkpoint operations.
+ */
+uint64_t
+sum_ckpt_ops(CONFIG *cfg)
+{
+ CONFIG_THREAD *thread;
+ uint64_t total;
+ u_int i;
+
+ total = 0;
+
+ for (i = 0, thread = cfg->ckptthreads;
+ thread != NULL && i < cfg->checkpoint_threads; ++i, ++thread)
+ total += thread->ckpt.ops;
+ return (total);
+}
+
+/*
+ * Return total operations for the populate threads if any, else the workers.
+ */
+static uint64_t
+sum_ops(CONFIG *cfg, size_t field_offset)
+{
+ CONFIG_THREAD *thread;
+ uint64_t total;
+ int64_t i, th_cnt;
+
+ total = 0;
+ if (cfg->popthreads == NULL) {
+ thread = cfg->workers;
+ th_cnt = cfg->workers_cnt;
+ } else {
+ thread = cfg->popthreads;
+ th_cnt = cfg->populate_threads;
+ }
+ for (i = 0; thread != NULL && i < th_cnt; ++i, ++thread)
+ total += ((TRACK *)((uint8_t *)thread + field_offset))->ops;
+
+ return (total);
+}
+uint64_t
+sum_insert_ops(CONFIG *cfg)
+{
+ return (sum_ops(cfg, offsetof(CONFIG_THREAD, insert)));
+}
+uint64_t
+sum_read_ops(CONFIG *cfg)
+{
+ return (sum_ops(cfg, offsetof(CONFIG_THREAD, read)));
+}
+uint64_t
+sum_update_ops(CONFIG *cfg)
+{
+ return (sum_ops(cfg, offsetof(CONFIG_THREAD, update)));
+}
+
+/*
+ * latency_op --
+ * Get average, minimum and maximum latency for this period for a
+ * particular operation.
+ */
+static void
+latency_op(CONFIG *cfg,
+ size_t field_offset, uint32_t *avgp, uint32_t *minp, uint32_t *maxp)
+{
+ CONFIG_THREAD *thread;
+ TRACK *track;
+ uint64_t ops, latency, tmp;
+ int64_t i, th_cnt;
+ uint32_t max, min;
+
+ ops = latency = 0;
+ max = 0;
+ min = UINT32_MAX;
+
+ if (cfg->popthreads == NULL) {
+ thread = cfg->workers;
+ th_cnt = cfg->workers_cnt;
+ } else {
+ thread = cfg->popthreads;
+ th_cnt = cfg->populate_threads;
+ }
+ for (i = 0; thread != NULL && i < th_cnt; ++i, ++thread) {
+ track = (TRACK *)((uint8_t *)thread + field_offset);
+ tmp = track->latency_ops;
+ ops += tmp - track->last_latency_ops;
+ track->last_latency_ops = tmp;
+ tmp = track->latency;
+ latency += tmp - track->last_latency;
+ track->last_latency = tmp;
+
+ if (min > track->min_latency)
+ min = track->min_latency;
+ track->min_latency = UINT32_MAX;
+ if (max < track->max_latency)
+ max = track->max_latency;
+ track->max_latency = 0;
+ }
+
+ if (ops == 0)
+ *avgp = *minp = *maxp = 0;
+ else {
+ *minp = min;
+ *maxp = max;
+ *avgp = (uint32_t)(latency / ops);
+ }
+}
+void
+latency_read(CONFIG *cfg, uint32_t *avgp, uint32_t *minp, uint32_t *maxp)
+{
+ static uint32_t last_avg = 0, last_max = 0, last_min = 0;
+
+ latency_op(cfg, offsetof(CONFIG_THREAD, read), avgp, minp, maxp);
+
+ /*
+ * If nothing happened, graph the average, minimum and maximum as they
+ * were the last time, it keeps the graphs from having discontinuities.
+ */
+ if (*minp == 0) {
+ *avgp = last_avg;
+ *minp = last_min;
+ *maxp = last_max;
+ } else {
+ last_avg = *avgp;
+ last_min = *minp;
+ last_max = *maxp;
+ }
+}
+void
+latency_insert(CONFIG *cfg, uint32_t *avgp, uint32_t *minp, uint32_t *maxp)
+{
+ static uint32_t last_avg = 0, last_max = 0, last_min = 0;
+
+ latency_op(cfg, offsetof(CONFIG_THREAD, insert), avgp, minp, maxp);
+
+ /*
+ * If nothing happened, graph the average, minimum and maximum as they
+ * were the last time, it keeps the graphs from having discontinuities.
+ */
+ if (*minp == 0) {
+ *avgp = last_avg;
+ *minp = last_min;
+ *maxp = last_max;
+ } else {
+ last_avg = *avgp;
+ last_min = *minp;
+ last_max = *maxp;
+ }
+}
+void
+latency_update(CONFIG *cfg, uint32_t *avgp, uint32_t *minp, uint32_t *maxp)
+{
+ static uint32_t last_avg = 0, last_max = 0, last_min = 0;
+
+ latency_op(cfg, offsetof(CONFIG_THREAD, update), avgp, minp, maxp);
+
+ /*
+ * If nothing happened, graph the average, minimum and maximum as they
+ * were the last time, it keeps the graphs from having discontinuities.
+ */
+ if (*minp == 0) {
+ *avgp = last_avg;
+ *minp = last_min;
+ *maxp = last_max;
+ } else {
+ last_avg = *avgp;
+ last_min = *minp;
+ last_max = *maxp;
+ }
+}
+
+/*
+ * sum_latency --
+ * Sum latency for a set of threads.
+ */
+static void
+sum_latency(CONFIG *cfg, size_t field_offset, TRACK *total)
+{
+ CONFIG_THREAD *thread;
+ TRACK *trk;
+ int64_t i;
+ u_int j;
+
+ memset(total, 0, sizeof(*total));
+
+ for (i = 0, thread = cfg->workers;
+ thread != NULL && i < cfg->workers_cnt; ++i, ++thread) {
+ trk = (TRACK *)((uint8_t *)thread + field_offset);
+
+ for (j = 0; j < ELEMENTS(trk->us); ++j) {
+ total->ops += trk->us[j];
+ total->us[j] += trk->us[j];
+ }
+ for (j = 0; j < ELEMENTS(trk->ms); ++j) {
+ total->ops += trk->ms[j];
+ total->ms[j] += trk->ms[j];
+ }
+ for (j = 0; j < ELEMENTS(trk->sec); ++j) {
+ total->ops += trk->sec[j];
+ total->sec[j] += trk->sec[j];
+ }
+ }
+}
+static void
+sum_insert_latency(CONFIG *cfg, TRACK *total)
+{
+ sum_latency(cfg, offsetof(CONFIG_THREAD, insert), total);
+}
+static void
+sum_read_latency(CONFIG *cfg, TRACK *total)
+{
+ sum_latency(cfg, offsetof(CONFIG_THREAD, read), total);
+}
+static void
+sum_update_latency(CONFIG *cfg, TRACK *total)
+{
+ sum_latency(cfg, offsetof(CONFIG_THREAD, update), total);
+}
+
+static void
+latency_print_single(CONFIG *cfg, TRACK *total, const char *name)
+{
+ FILE *fp;
+ u_int i;
+ uint64_t cumops;
+ char path[1024];
+
+ snprintf(path, sizeof(path), "%s/latency.%s", cfg->monitor_dir, name);
+ if ((fp = fopen(path, "w")) == NULL) {
+ lprintf(cfg, errno, 0, "%s", path);
+ return;
+ }
+
+ fprintf(fp,
+ "#usecs,operations,cumulative-operations,total-operations\n");
+ cumops = 0;
+ for (i = 0; i < ELEMENTS(total->us); ++i) {
+ if (total->us[i] == 0)
+ continue;
+ cumops += total->us[i];
+ fprintf(fp,
+ "%u,%" PRIu32 ",%" PRIu64 ",%" PRIu64 "\n",
+ (i + 1), total->us[i], cumops, total->ops);
+ }
+ for (i = 1; i < ELEMENTS(total->ms); ++i) {
+ if (total->ms[i] == 0)
+ continue;
+ cumops += total->ms[i];
+ fprintf(fp,
+ "%llu,%" PRIu32 ",%" PRIu64 ",%" PRIu64 "\n",
+ ms_to_us(i + 1), total->ms[i], cumops, total->ops);
+ }
+ for (i = 1; i < ELEMENTS(total->sec); ++i) {
+ if (total->sec[i] == 0)
+ continue;
+ cumops += total->sec[i];
+ fprintf(fp,
+ "%llu,%" PRIu32 ",%" PRIu64 ",%" PRIu64 "\n",
+ sec_to_us(i + 1), total->sec[i], cumops, total->ops);
+ }
+
+ (void)fclose(fp);
+}
+
+void
+latency_print(CONFIG *cfg)
+{
+ TRACK total;
+
+ sum_insert_latency(cfg, &total);
+ latency_print_single(cfg, &total, "insert");
+ sum_read_latency(cfg, &total);
+ latency_print_single(cfg, &total, "read");
+ sum_update_latency(cfg, &total);
+ latency_print_single(cfg, &total, "update");
+}
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
new file mode 100644
index 00000000000..74d9152f326
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c
@@ -0,0 +1,2298 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wtperf.h"
+
+/* Default values. */
+static const CONFIG default_cfg = {
+ "WT_TEST", /* home */
+ "WT_TEST", /* monitor dir */
+ NULL, /* base_uri */
+ NULL, /* uris */
+ NULL, /* helium_mount */
+ NULL, /* conn */
+ NULL, /* logf */
+ NULL, /* async */
+ NULL, NULL, /* compressor ext, blk */
+ NULL, NULL, /* populate, checkpoint threads */
+
+ NULL, /* worker threads */
+ 0, /* worker thread count */
+ NULL, /* workloads */
+ 0, /* workload count */
+ 0, /* use_asyncops */
+ 0, /* checkpoint operations */
+ 0, /* insert operations */
+ 0, /* read operations */
+ 0, /* update operations */
+ 0, /* insert key */
+ 0, /* checkpoint in progress */
+ 0, /* thread error */
+ 0, /* notify threads to stop */
+ 0, /* in warmup phase */
+ 0, /* total seconds running */
+
+#define OPT_DEFINE_DEFAULT
+#include "wtperf_opt.i"
+#undef OPT_DEFINE_DEFAULT
+};
+
+static const char * const debug_cconfig = "";
+static const char * const debug_tconfig = "";
+
+static void *checkpoint_worker(void *);
+static int create_tables(CONFIG *);
+static int create_uris(CONFIG *);
+static int execute_populate(CONFIG *);
+static int execute_workload(CONFIG *);
+static int find_table_count(CONFIG *);
+static void *monitor(void *);
+static void *populate_thread(void *);
+static void randomize_value(CONFIG_THREAD *, char *);
+static int start_all_runs(CONFIG *);
+static int start_run(CONFIG *);
+static int start_threads(CONFIG *,
+ WORKLOAD *, CONFIG_THREAD *, u_int, void *(*)(void *));
+static int stop_threads(CONFIG *, u_int, CONFIG_THREAD *);
+static void *thread_run_wtperf(void *);
+static void *worker(void *);
+static void worker_throttle(int64_t, int64_t *, struct timespec *);
+static uint64_t wtperf_rand(CONFIG_THREAD *);
+static uint64_t wtperf_value_range(CONFIG *);
+
+#define HELIUM_NAME "dev1"
+#define HELIUM_PATH \
+ "../../ext/test/helium/.libs/libwiredtiger_helium.so"
+#define HELIUM_CONFIG ",type=helium"
+
+/* Retrieve an ID for the next insert operation. */
+static inline uint64_t
+get_next_incr(CONFIG *cfg)
+{
+ return (WT_ATOMIC_ADD8(cfg->insert_key, 1));
+}
+
+static inline void
+generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno)
+{
+ /*
+ * Don't change to snprintf, sprintf is faster in some tests.
+ */
+ sprintf(key_buf, "%0*" PRIu64, cfg->key_sz - 1, keyno);
+}
+
+static void
+randomize_value(CONFIG_THREAD *thread, char *value_buf)
+{
+ uint8_t *vb;
+ uint32_t i;
+
+ /*
+ * Each time we're called overwrite value_buf[0] and one other
+ * randomly chosen byte (other than the trailing NUL).
+ * Make sure we don't write a NUL: keep the value the same length.
+ */
+ i = __wt_random(thread->rnd) % (thread->cfg->value_sz - 1);
+ while (value_buf[i] == '\0' && i > 0)
+ --i;
+ if (i > 0) {
+ vb = (uint8_t *)value_buf;
+ vb[0] = (__wt_random(thread->rnd) % 255) + 1;
+ vb[i] = (__wt_random(thread->rnd) % 255) + 1;
+ }
+}
+
+static int
+cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags)
+{
+ CONFIG *cfg;
+ CONFIG_THREAD *thread;
+ TRACK *trk;
+ WT_ASYNC_OPTYPE type;
+ char *value;
+ uint32_t *tables;
+ int t_ret;
+
+ (void)cb;
+ (void)flags;
+
+ cfg = NULL; /* -Wconditional-uninitialized */
+ thread = NULL; /* -Wconditional-uninitialized */
+
+ type = op->get_type(op);
+ if (type != WT_AOP_COMPACT) {
+ thread = (CONFIG_THREAD *)op->app_private;
+ cfg = thread->cfg;
+ }
+
+ trk = NULL;
+ switch (type) {
+ case WT_AOP_COMPACT:
+ tables = (uint32_t *)op->app_private;
+ WT_ATOMIC_ADD4(*tables, (uint32_t)-1);
+ break;
+ case WT_AOP_INSERT:
+ trk = &thread->insert;
+ break;
+ case WT_AOP_SEARCH:
+ trk = &thread->read;
+ if (ret == 0 &&
+ (t_ret = op->get_value(op, &value)) != 0) {
+ ret = t_ret;
+ lprintf(cfg, ret, 0, "get_value in read.");
+ goto err;
+ }
+ break;
+ case WT_AOP_UPDATE:
+ trk = &thread->update;
+ break;
+ case WT_AOP_NONE:
+ case WT_AOP_REMOVE:
+ /* We never expect this type. */
+ lprintf(cfg, ret, 0, "No type in op %" PRIu64, op->get_id(op));
+ goto err;
+ }
+
+ /*
+ * Either we have success and we track it, or failure and panic.
+ *
+ * Reads and updates can fail with WT_NOTFOUND: we may be searching
+ * in a random range, or an insert op might have updated the
+ * last record in the table but not yet finished the actual insert.
+ */
+ if (type == WT_AOP_COMPACT)
+ return (0);
+ if (ret == 0 || (ret == WT_NOTFOUND && type != WT_AOP_INSERT)) {
+ if (!cfg->in_warmup)
+ (void)WT_ATOMIC_ADD8(trk->ops, 1);
+ return (0);
+ }
+err:
+ /* Panic if error */
+ lprintf(cfg, ret, 0, "Error in op %" PRIu64,
+ op->get_id(op));
+ cfg->error = cfg->stop = 1;
+ return (1);
+}
+
+static WT_ASYNC_CALLBACK cb = { cb_asyncop };
+
+/*
+ * track_operation --
+ * Update an operation's tracking structure with new latency information.
+ */
+static inline void
+track_operation(TRACK *trk, uint64_t usecs)
+{
+ uint64_t v;
+
+ /* average microseconds per call */
+ v = (uint64_t)usecs;
+
+ trk->latency += usecs; /* track total latency */
+
+ if (v > trk->max_latency) /* track max/min latency */
+ trk->max_latency = (uint32_t)v;
+ if (v < trk->min_latency)
+ trk->min_latency = (uint32_t)v;
+
+ /*
+ * Update a latency bucket.
+ * First buckets: usecs below 1000us, indexed directly by the usec value.
+ */
+ if (v < 1000)
+ ++trk->us[v];
+
+ /*
+ * Second buckets: milliseconds from 1ms to 1000ms, at 1ms each.
+ */
+ else if (v < ms_to_us(1000))
+ ++trk->ms[us_to_ms(v)];
+
+ /*
+ * Third buckets are seconds from 1s to 100s, at 1s each.
+ */
+ else if (v < sec_to_us(100))
+ ++trk->sec[us_to_sec(v)];
+
+ /* >=100 seconds, accumulate in the biggest bucket. */
+ else
+ ++trk->sec[ELEMENTS(trk->sec) - 1];
+}
+
+static const char *
+op_name(uint8_t *op)
+{
+ switch (*op) {
+ case WORKER_INSERT:
+ return ("insert");
+ case WORKER_INSERT_RMW:
+ return ("insert_rmw");
+ case WORKER_READ:
+ return ("read");
+ case WORKER_UPDATE:
+ return ("update");
+ default:
+ return ("unknown");
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * worker_async --
+ *	Workload thread for asynchronous operations: walk the workload's
+ *	operation schedule, queueing one async operation per entry, until the
+ *	run is stopped, then flush the outstanding operations.
+ *
+ *	arg is the thread's CONFIG_THREAD.  Always returns NULL; a failure is
+ *	reported by setting cfg->error and cfg->stop.
+ */
+static void *
+worker_async(void *arg)
+{
+	CONFIG *cfg;
+	CONFIG_THREAD *thread;
+	WT_ASYNC_OP *asyncop;
+	WT_CONNECTION *conn;
+	uint64_t next_val;
+	uint8_t *op, *op_end;
+	int ret;
+	char *key_buf, *value_buf;
+
+	thread = (CONFIG_THREAD *)arg;
+	cfg = thread->cfg;
+	conn = cfg->conn;
+
+	key_buf = thread->key_buf;
+	value_buf = thread->value_buf;
+
+	/* The schedule is a circular array of operation codes. */
+	op = thread->workload->ops;
+	op_end = op + sizeof(thread->workload->ops);
+
+	while (!cfg->stop) {
+		/* Choose the record number the next operation works on. */
+		switch (*op) {
+		case WORKER_INSERT:
+		case WORKER_INSERT_RMW:
+			if (cfg->random_range)
+				next_val = wtperf_rand(thread);
+			else
+				next_val = cfg->icount + get_next_incr(cfg);
+			break;
+		case WORKER_READ:
+		case WORKER_UPDATE:
+			next_val = wtperf_rand(thread);
+
+			/*
+			 * If the workload is started without a populate phase
+			 * we rely on at least one insert to get a valid item
+			 * id.
+			 */
+			if (wtperf_value_range(cfg) < next_val)
+				continue;
+			break;
+		default:
+			goto err;		/* can't happen */
+		}
+
+		generate_key(cfg, key_buf, next_val);
+
+		/*
+		 * Spread the data out around the multiple databases.
+		 * Sleep to allow workers a chance to run and process async ops.
+		 * Then retry to get an async op.
+		 */
+		while ((ret = conn->async_new_op(
+		    conn, cfg->uris[next_val % cfg->table_count],
+		    NULL, &cb, &asyncop)) == EBUSY)
+			(void)usleep(10000);
+		if (ret != 0) {
+			lprintf(cfg, ret, 0,
+			    "worker_async: WT_CONNECTION.async_new_op");
+			goto err;
+		}
+
+		asyncop->app_private = thread;
+		asyncop->set_key(asyncop, key_buf);
+		switch (*op) {
+		case WORKER_READ:
+			if ((ret = asyncop->search(asyncop)) == 0)
+				break;
+			goto op_err;
+		case WORKER_INSERT:
+			if (cfg->random_value)
+				randomize_value(thread, value_buf);
+			asyncop->set_value(asyncop, value_buf);
+			if ((ret = asyncop->insert(asyncop)) == 0)
+				break;
+			goto op_err;
+		case WORKER_UPDATE:
+			if (cfg->random_value)
+				randomize_value(thread, value_buf);
+			asyncop->set_value(asyncop, value_buf);
+			if ((ret = asyncop->update(asyncop)) == 0)
+				break;
+			goto op_err;
+		default:
+			/*
+			 * Operations without a case above, which includes
+			 * WORKER_INSERT_RMW, end up here with ret still zero
+			 * from the successful async_new_op call; they are
+			 * reported the same way as a failed operation.
+			 */
+op_err:			lprintf(cfg, ret, 0,
+			    "%s failed for: %s, range: %"PRIu64,
+			    op_name(op), key_buf, wtperf_value_range(cfg));
+			goto err;
+		}
+
+		/* Schedule the next operation */
+		if (++op == op_end)
+			op = thread->workload->ops;
+	}
+
+	if ((ret = conn->async_flush(conn)) != 0) {
+		lprintf(cfg, ret, 0,
+		    "worker_async: WT_CONNECTION.async_flush");
+		goto err;
+	}
+
+	/* Notify our caller we failed and shut the system down. */
+	if (0) {
+err:		cfg->error = cfg->stop = 1;
+	}
+	return (NULL);
+}
+
+/*
+ * worker --
+ *	Synchronous workload thread: open a session and one cursor per table,
+ *	then walk the workload's operation schedule performing reads, inserts
+ *	and updates until the run is stopped, gathering operation counts and
+ *	sampled latencies along the way.
+ *
+ *	arg is the thread's CONFIG_THREAD.  Always returns NULL; a failure is
+ *	reported by setting cfg->error and cfg->stop.
+ */
+static void *
+worker(void *arg)
+{
+	struct timespec start, stop, interval;
+	CONFIG *cfg;
+	CONFIG_THREAD *thread;
+	TRACK *trk;
+	WT_CONNECTION *conn;
+	WT_CURSOR **cursors, *cursor;
+	WT_SESSION *session;
+	int64_t ops, ops_per_txn, throttle_ops;
+	size_t i;
+	uint64_t next_val, usecs;
+	uint8_t *op, *op_end;
+	int measure_latency, ret;
+	char *value_buf, *key_buf, *value;
+
+	thread = (CONFIG_THREAD *)arg;
+	cfg = thread->cfg;
+	conn = cfg->conn;
+	cursors = NULL;
+	ops = 0;
+	ops_per_txn = thread->workload->ops_per_txn;
+	session = NULL;
+	trk = NULL;
+	throttle_ops = 0;
+
+	if ((ret = conn->open_session(
+	    conn, NULL, cfg->sess_config, &session)) != 0) {
+		lprintf(cfg, ret, 0, "worker: WT_CONNECTION.open_session");
+		goto err;
+	}
+	cursors = calloc(cfg->table_count, sizeof(WT_CURSOR *));
+	if (cursors == NULL) {
+		lprintf(cfg, ENOMEM, 0,
+		    "worker: couldn't allocate cursor array");
+		goto err;
+	}
+	for (i = 0; i < cfg->table_count; i++) {
+		if ((ret = session->open_cursor(session,
+		    cfg->uris[i], NULL, NULL, &cursors[i])) != 0) {
+			lprintf(cfg, ret, 0,
+			    "worker: WT_SESSION.open_cursor: %s",
+			    cfg->uris[i]);
+			goto err;
+		}
+	}
+	/* Setup the timer for throttling. */
+	if (thread->workload->throttle != 0 &&
+	    (ret = __wt_epoch(NULL, &interval)) != 0) {
+		lprintf(cfg, ret, 0, "Get time call failed");
+		goto err;
+	}
+
+	key_buf = thread->key_buf;
+	value_buf = thread->value_buf;
+
+	/* The schedule is a circular array of operation codes. */
+	op = thread->workload->ops;
+	op_end = op + sizeof(thread->workload->ops);
+
+	if (ops_per_txn != 0 &&
+	    (ret = session->begin_transaction(session, NULL)) != 0) {
+		lprintf(cfg, ret, 0, "First transaction begin failed");
+		goto err;
+	}
+
+	while (!cfg->stop) {
+		/*
+		 * Generate the next key and setup operation specific
+		 * statistics tracking objects.
+		 */
+		switch (*op) {
+		case WORKER_INSERT:
+		case WORKER_INSERT_RMW:
+			trk = &thread->insert;
+			if (cfg->random_range)
+				next_val = wtperf_rand(thread);
+			else
+				next_val = cfg->icount + get_next_incr(cfg);
+			break;
+		case WORKER_READ:
+		case WORKER_UPDATE:
+			trk = *op == WORKER_READ ?
+			    &thread->read : &thread->update;
+			next_val = wtperf_rand(thread);
+
+			/*
+			 * If the workload is started without a populate phase
+			 * we rely on at least one insert to get a valid item
+			 * id.
+			 */
+			if (wtperf_value_range(cfg) < next_val)
+				continue;
+			break;
+		default:
+			goto err;		/* can't happen */
+		}
+
+		generate_key(cfg, key_buf, next_val);
+
+		/*
+		 * Spread the data out around the multiple databases.
+		 */
+		cursor = cursors[next_val % cfg->table_count];
+
+		/*
+		 * Skip the first time we do an operation, when trk->ops
+		 * is 0, to avoid first time latency spikes.
+		 */
+		measure_latency =
+		    cfg->sample_interval != 0 && trk->ops != 0 && (
+		    trk->ops % cfg->sample_rate == 0);
+		if (measure_latency &&
+		    (ret = __wt_epoch(NULL, &start)) != 0) {
+			lprintf(cfg, ret, 0, "Get time call failed");
+			goto err;
+		}
+
+		cursor->set_key(cursor, key_buf);
+
+		switch (*op) {
+		case WORKER_READ:
+			/*
+			 * Reads can fail with WT_NOTFOUND: we may be searching
+			 * in a random range, or an insert thread might have
+			 * updated the last record in the table but not yet
+			 * finished the actual insert.  Count failed search in
+			 * a random range as a "read".
+			 */
+			ret = cursor->search(cursor);
+			if (ret == 0) {
+				if ((ret = cursor->get_value(
+				    cursor, &value)) != 0) {
+					lprintf(cfg, ret, 0,
+					    "get_value in read.");
+					goto err;
+				}
+			}
+			if (ret == 0 || ret == WT_NOTFOUND)
+				break;
+			goto op_err;
+		case WORKER_INSERT_RMW:
+			/* Only insert when the key is not already present. */
+			if ((ret = cursor->search(cursor)) != WT_NOTFOUND)
+				goto op_err;
+
+			/* The error return reset the cursor's key. */
+			cursor->set_key(cursor, key_buf);
+
+			/* FALLTHROUGH */
+		case WORKER_INSERT:
+			if (cfg->random_value)
+				randomize_value(thread, value_buf);
+			cursor->set_value(cursor, value_buf);
+			if ((ret = cursor->insert(cursor)) == 0)
+				break;
+			goto op_err;
+		case WORKER_UPDATE:
+			if ((ret = cursor->search(cursor)) == 0) {
+				if ((ret = cursor->get_value(
+				    cursor, &value)) != 0) {
+					lprintf(cfg, ret, 0,
+					    "get_value in update.");
+					goto err;
+				}
+				/*
+				 * Copy as much of the previous value as is
+				 * safe, and be sure to NUL-terminate.
+				 */
+				strncpy(value_buf, value, cfg->value_sz);
+				value_buf[cfg->value_sz - 1] = '\0';
+				/* Flip the first byte so the value changes. */
+				if (value_buf[0] == 'a')
+					value_buf[0] = 'b';
+				else
+					value_buf[0] = 'a';
+				if (cfg->random_value)
+					randomize_value(thread, value_buf);
+				cursor->set_value(cursor, value_buf);
+				if ((ret = cursor->update(cursor)) == 0)
+					break;
+				goto op_err;
+			}
+
+			/*
+			 * Reads can fail with WT_NOTFOUND: we may be searching
+			 * in a random range, or an insert thread might have
+			 * updated the last record in the table but not yet
+			 * finished the actual insert.  Count failed search in
+			 * a random range as a "read".
+			 */
+			if (ret == WT_NOTFOUND)
+				break;
+
+op_err:			lprintf(cfg, ret, 0,
+			    "%s failed for: %s, range: %"PRIu64,
+			    op_name(op), key_buf, wtperf_value_range(cfg));
+			goto err;
+		default:
+			goto err;		/* can't happen */
+		}
+
+		/* Release the cursor, if we have multiple tables. */
+		if (cfg->table_count > 1 && ret == 0 &&
+		    *op != WORKER_INSERT && *op != WORKER_INSERT_RMW) {
+			if ((ret = cursor->reset(cursor)) != 0) {
+				lprintf(cfg, ret, 0, "Cursor reset failed");
+				goto err;
+			}
+		}
+
+		/* Gather statistics */
+		if (!cfg->in_warmup) {
+			if (measure_latency) {
+				if ((ret = __wt_epoch(NULL, &stop)) != 0) {
+					lprintf(cfg, ret, 0,
+					    "Get time call failed");
+					goto err;
+				}
+				++trk->latency_ops;
+				usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+				track_operation(trk, usecs);
+			}
+			/* Increment operation count */
+			++trk->ops;
+		}
+
+		/* Commit our work if configured for explicit transactions */
+		if (ops_per_txn != 0 && ops++ % ops_per_txn == 0) {
+			if ((ret = session->commit_transaction(
+			    session, NULL)) != 0) {
+				lprintf(cfg, ret, 0,
+				    "Worker transaction commit failed");
+				goto err;
+			}
+			if ((ret = session->begin_transaction(
+			    session, NULL)) != 0) {
+				lprintf(cfg, ret, 0,
+				    "Worker transaction begin failed");
+				goto err;
+			}
+		}
+
+		/* Schedule the next operation */
+		if (++op == op_end)
+			op = thread->workload->ops;
+
+		/*
+		 * Check throttling periodically to avoid taking too
+		 * many time samples.
+		 */
+		if (thread->workload->throttle != 0 &&
+		    throttle_ops++ % THROTTLE_OPS == 0)
+			worker_throttle(thread->workload->throttle,
+			    &throttle_ops, &interval);
+	}
+
+	if ((ret = session->close(session, NULL)) != 0) {
+		lprintf(cfg, ret, 0, "Session close in worker failed");
+		goto err;
+	}
+
+	/* Notify our caller we failed and shut the system down. */
+	if (0) {
+err:		cfg->error = cfg->stop = 1;
+	}
+	/* Only the array is released here, not the cursors it points to. */
+	free(cursors);
+
+	return (NULL);
+}
+
+/*
+ * run_mix_schedule_op --
+ *	Replace read operations with another operation, in the configured
+ *	percentage.
+ *
+ *	workp is the workload whose ops array is modified, op is the operation
+ *	code to store and op_cnt is the number of read slots to replace.  A
+ *	non-positive op_cnt leaves the schedule untouched.
+ */
+static void
+run_mix_schedule_op(WORKLOAD *workp, int op, int64_t op_cnt)
+{
+	int64_t jump;
+	int pass;
+	uint8_t *p, *end;
+
+	/* Nothing to schedule, and avoid dividing by zero below. */
+	if (op_cnt <= 0)
+		return;
+
+	/* Jump around the array to roughly spread out the operations. */
+	jump = 100 / op_cnt;
+
+	/*
+	 * Find a read operation and replace it with another operation.  This
+	 * is roughly n-squared, but it's an N of 100, leave it.
+	 */
+	p = workp->ops;
+	end = workp->ops + sizeof(workp->ops);
+	while (op_cnt-- > 0) {
+		/*
+		 * Check for the end of the array before looking at the slot,
+		 * so the slot past the last element is never read or written.
+		 */
+		for (pass = 0;; ++p) {
+			if (p == end) {
+				/*
+				 * Passed a percentage of total operations and
+				 * should always be a read operation to replace,
+				 * but don't allow infinite loops.
+				 */
+				if (++pass > 1)
+					return;
+				p = workp->ops;
+			}
+			if (*p == WORKER_READ)
+				break;
+		}
+		*p = (uint8_t)op;
+
+		/* Wrap to the start if the jump would leave the array. */
+		if (end - p <= jump)
+			p = workp->ops;
+		else
+			p += jump;
+	}
+}
+
+/*
+ * run_mix_schedule --
+ *	Build the workload's operation schedule from its configured ratios
+ *	of inserts, reads and updates.  Returns 0 on success, EINVAL when no
+ *	operation is configured at all.
+ */
+static int
+run_mix_schedule(CONFIG *cfg, WORKLOAD *workp)
+{
+	int64_t pct, total;
+	int insert_op;
+
+	/* Inserts are read-modify-write inserts when so configured. */
+	insert_op = cfg->insert_rmw ? WORKER_INSERT_RMW : WORKER_INSERT;
+
+	/*
+	 * Handle the simple cases first: nothing configured is an error, and
+	 * a thread doing only inserts or only updates gets a uniform
+	 * schedule.  (Because the default operation for a job-mix is read,
+	 * the general code below works fine if only reads are specified.)
+	 */
+	if (workp->read == 0 && workp->update == 0) {
+		if (workp->insert == 0) {
+			lprintf(cfg, EINVAL, 0, "no operations scheduled");
+			return (EINVAL);
+		}
+		memset(workp->ops, insert_op, sizeof(workp->ops));
+		return (0);
+	}
+	if (workp->insert == 0 && workp->read == 0) {
+		memset(workp->ops, WORKER_UPDATE, sizeof(workp->ops));
+		return (0);
+	}
+
+	/*
+	 * The configuration is given as ratios of operations.  For something
+	 * like "reads=77,updates=23" we don't want 77 reads followed by 23
+	 * updates, we want the two kinds of operation spread uniformly across
+	 * the schedule.  Convert the ratios to percentages and lay the
+	 * operations out across the array.
+	 *
+	 * The conversion is lossy.  With a skewed configuration such as
+	 * "reads=1,inserts=2,updates=999999" some operations might never be
+	 * done, and because the base operation is a read, any fraction lost
+	 * in the conversion becomes reads, even where reads weren't
+	 * configured.  Reasonable ratios work fine.
+	 */
+	memset(workp->ops, WORKER_READ, sizeof(workp->ops));
+	total = workp->insert + workp->read + workp->update;
+
+	pct = (workp->insert * 100) / total;
+	if (pct != 0)
+		run_mix_schedule_op(workp, insert_op, pct);
+
+	pct = (workp->update * 100) / total;
+	if (pct != 0)
+		run_mix_schedule_op(workp, WORKER_UPDATE, pct);
+
+	return (0);
+}
+
+/*
+ * populate_thread --
+ *	Synchronous populate thread: open a session and one cursor per table,
+ *	then insert records, taking record numbers from the shared counter,
+ *	until the counter passes cfg->icount.
+ *
+ *	arg is the thread's CONFIG_THREAD.  Always returns NULL; a failure is
+ *	reported by setting cfg->error and cfg->stop.
+ */
+static void *
+populate_thread(void *arg)
+{
+	struct timespec start, stop;
+	CONFIG *cfg;
+	CONFIG_THREAD *thread;
+	TRACK *trk;
+	WT_CONNECTION *conn;
+	WT_CURSOR **cursors, *cursor;
+	WT_SESSION *session;
+	size_t i;
+	uint64_t op, usecs;
+	uint32_t opcount;
+	int intxn, measure_latency, ret, stress_checkpoint_due;
+	char *value_buf, *key_buf;
+	const char *cursor_config;
+
+	thread = (CONFIG_THREAD *)arg;
+	cfg = thread->cfg;
+	conn = cfg->conn;
+	session = NULL;
+	cursors = NULL;
+	ret = stress_checkpoint_due = 0;
+	trk = &thread->insert;
+
+	key_buf = thread->key_buf;
+	value_buf = thread->value_buf;
+
+	if ((ret = conn->open_session(
+	    conn, NULL, cfg->sess_config, &session)) != 0) {
+		lprintf(cfg, ret, 0, "populate: WT_CONNECTION.open_session");
+		goto err;
+	}
+
+	/* Do bulk loads if populate is single-threaded. */
+	cursor_config = cfg->populate_threads == 1 ? "bulk" : NULL;
+	/* Create the cursors. */
+	cursors = calloc(cfg->table_count, sizeof(WT_CURSOR *));
+	if (cursors == NULL) {
+		lprintf(cfg, ENOMEM, 0,
+		    "populate: couldn't allocate cursor array");
+		goto err;
+	}
+	for (i = 0; i < cfg->table_count; i++) {
+		if ((ret = session->open_cursor(
+		    session, cfg->uris[i], NULL,
+		    cursor_config, &cursors[i])) != 0) {
+			lprintf(cfg, ret, 0,
+			    "populate: WT_SESSION.open_cursor: %s",
+			    cfg->uris[i]);
+			goto err;
+		}
+	}
+
+	/* Populate the databases. */
+	for (intxn = 0, opcount = 0;;) {
+		op = get_next_incr(cfg);
+		if (op > cfg->icount)
+			break;
+
+		if (cfg->populate_ops_per_txn != 0 && !intxn) {
+			if ((ret = session->begin_transaction(
+			    session, cfg->transaction_config)) != 0) {
+				lprintf(cfg, ret, 0,
+				    "Failed starting transaction.");
+				goto err;
+			}
+			intxn = 1;
+		}
+		/*
+		 * Figure out which table this op belongs to.
+		 */
+		cursor = cursors[op % cfg->table_count];
+		generate_key(cfg, key_buf, op);
+		measure_latency =
+		    cfg->sample_interval != 0 && trk->ops != 0 && (
+		    trk->ops % cfg->sample_rate == 0);
+		if (measure_latency &&
+		    (ret = __wt_epoch(NULL, &start)) != 0) {
+			lprintf(cfg, ret, 0, "Get time call failed");
+			goto err;
+		}
+		cursor->set_key(cursor, key_buf);
+		if (cfg->random_value)
+			randomize_value(thread, value_buf);
+		cursor->set_value(cursor, value_buf);
+		if ((ret = cursor->insert(cursor)) != 0) {
+			lprintf(cfg, ret, 0, "Failed inserting");
+			goto err;
+		}
+		/*
+		 * Gather statistics.
+		 * We measure the latency of inserting a single key into the
+		 * one table the key belongs to.
+		 */
+		if (measure_latency) {
+			if ((ret = __wt_epoch(NULL, &stop)) != 0) {
+				lprintf(cfg, ret, 0, "Get time call failed");
+				goto err;
+			}
+			++trk->latency_ops;
+			usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+			track_operation(trk, usecs);
+		}
+		++thread->insert.ops;		/* Same as trk->ops */
+
+		if (cfg->checkpoint_stress_rate != 0 &&
+		    (op % cfg->checkpoint_stress_rate) == 0)
+			stress_checkpoint_due = 1;
+
+		if (cfg->populate_ops_per_txn != 0) {
+			if (++opcount < cfg->populate_ops_per_txn)
+				continue;
+			opcount = 0;
+
+			/* A failed commit is logged; the load carries on. */
+			if ((ret = session->commit_transaction(
+			    session, NULL)) != 0)
+				lprintf(cfg, ret, 0,
+				    "Fail committing, transaction was aborted");
+			intxn = 0;
+		}
+
+		/* A due checkpoint waits until no transaction is open. */
+		if (stress_checkpoint_due && intxn == 0) {
+			stress_checkpoint_due = 0;
+			if ((ret = session->checkpoint(session, NULL)) != 0) {
+				lprintf(cfg, ret, 0, "Checkpoint failed");
+				goto err;
+			}
+		}
+	}
+	/* Commit whatever is left in a partially filled transaction. */
+	if (intxn &&
+	    (ret = session->commit_transaction(session, NULL)) != 0)
+		lprintf(cfg, ret, 0,
+		    "Fail committing, transaction was aborted");
+
+	if ((ret = session->close(session, NULL)) != 0) {
+		lprintf(cfg, ret, 0, "Error closing session in populate");
+		goto err;
+	}
+
+	/* Notify our caller we failed and shut the system down. */
+	if (0) {
+err:		cfg->error = cfg->stop = 1;
+	}
+	/* Only the array is released here, not the cursors it points to. */
+	free(cursors);
+
+	return (NULL);
+}
+
+/*
+ * populate_async --
+ *	Asynchronous populate thread: queue one async insert per record,
+ *	taking record numbers from the shared counter, until the counter
+ *	passes cfg->icount, then flush the outstanding operations.
+ *
+ *	arg is the thread's CONFIG_THREAD.  Always returns NULL; a failure is
+ *	reported by setting cfg->error and cfg->stop.
+ */
+static void *
+populate_async(void *arg)
+{
+	struct timespec start, stop;
+	CONFIG *cfg;
+	CONFIG_THREAD *thread;
+	TRACK *trk;
+	WT_ASYNC_OP *asyncop;
+	WT_CONNECTION *conn;
+	WT_SESSION *session;
+	uint64_t op, usecs;
+	int measure_latency, ret;
+	char *value_buf, *key_buf;
+
+	thread = (CONFIG_THREAD *)arg;
+	cfg = thread->cfg;
+	conn = cfg->conn;
+	session = NULL;
+	ret = 0;
+	trk = &thread->insert;
+
+	key_buf = thread->key_buf;
+	value_buf = thread->value_buf;
+
+	if ((ret = conn->open_session(
+	    conn, NULL, cfg->sess_config, &session)) != 0) {
+		lprintf(cfg, ret, 0, "populate: WT_CONNECTION.open_session");
+		goto err;
+	}
+
+	/*
+	 * Measuring latency of one async op is not meaningful.  We
+	 * will measure the time it takes to do all of them, including
+	 * the time to process by workers.
+	 */
+	measure_latency =
+	    cfg->sample_interval != 0 && trk->ops != 0 && (
+	    trk->ops % cfg->sample_rate == 0);
+	if (measure_latency &&
+	    (ret = __wt_epoch(NULL, &start)) != 0) {
+		lprintf(cfg, ret, 0, "Get time call failed");
+		goto err;
+	}
+	/* Populate the databases. */
+	for (;;) {
+		op = get_next_incr(cfg);
+		if (op > cfg->icount)
+			break;
+		/*
+		 * Allocate an async op for whichever table; if none is
+		 * available, sleep and retry.
+		 */
+		while ((ret = conn->async_new_op(
+		    conn, cfg->uris[op % cfg->table_count],
+		    NULL, &cb, &asyncop)) == EBUSY)
+			(void)usleep(10000);
+		if (ret != 0) {
+			lprintf(cfg, ret, 0,
+			    "populate: WT_CONNECTION.async_new_op");
+			goto err;
+		}
+
+		asyncop->app_private = thread;
+		generate_key(cfg, key_buf, op);
+		asyncop->set_key(asyncop, key_buf);
+		if (cfg->random_value)
+			randomize_value(thread, value_buf);
+		asyncop->set_value(asyncop, value_buf);
+		if ((ret = asyncop->insert(asyncop)) != 0) {
+			lprintf(cfg, ret, 0, "Failed inserting");
+			goto err;
+		}
+	}
+	/*
+	 * Gather statistics.
+	 * The latency recorded is the time to queue and process all of this
+	 * thread's inserts.  Note that currently every populate thread will
+	 * call async_flush and those calls will convoy.  That is not the
+	 * most efficient way, but we want to flush before measuring latency.
+	 */
+	if ((ret = conn->async_flush(conn)) != 0) {
+		lprintf(cfg, ret, 0,
+		    "populate: WT_CONNECTION.async_flush");
+		goto err;
+	}
+	if (measure_latency) {
+		if ((ret = __wt_epoch(NULL, &stop)) != 0) {
+			lprintf(cfg, ret, 0,
+			    "Get time call failed");
+			goto err;
+		}
+		++trk->latency_ops;
+		usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+		track_operation(trk, usecs);
+	}
+	if ((ret = session->close(session, NULL)) != 0) {
+		lprintf(cfg, ret, 0, "Error closing session in populate");
+		goto err;
+	}
+
+	/* Notify our caller we failed and shut the system down. */
+	if (0) {
+err:		cfg->error = cfg->stop = 1;
+	}
+	return (NULL);
+}
+
+/*
+ * monitor --
+ *	Monitor thread: once per sample interval, append a line of throughput
+ *	and latency figures to the "monitor" file in the monitor directory,
+ *	and log a WT_PANIC message when the configured maximum latency or
+ *	minimum throughput is violated.
+ *
+ *	arg is the run's CONFIG.  Always returns NULL; a failure is reported
+ *	by setting cfg->error and cfg->stop.
+ */
+static void *
+monitor(void *arg)
+{
+	struct timespec now;
+	struct tm *tm, tm_buf;
+	CONFIG *cfg;
+	FILE *fp;
+	size_t path_len;
+	uint64_t min_thr, reads, inserts, updates;
+	uint64_t cur_reads, cur_inserts, cur_updates;
+	uint64_t last_reads, last_inserts, last_updates;
+	uint32_t read_avg, read_min, read_max;
+	uint32_t insert_avg, insert_min, insert_max;
+	uint32_t update_avg, update_min, update_max;
+	uint32_t latency_max;
+	u_int sec;
+	int ret;
+	char stamp[64], *path;
+
+	cfg = (CONFIG *)arg;
+	assert(cfg->sample_interval != 0);
+	fp = NULL;
+	path = NULL;
+
+	/* Limits to check each sample; zero disables a check. */
+	min_thr = (uint64_t)cfg->min_throughput;
+	latency_max = (uint32_t)ms_to_us(cfg->max_latency);
+
+	/* Open the logging file. */
+	path_len = strlen(cfg->monitor_dir) + 100;
+	path = malloc(path_len);
+	if (path == NULL) {
+		(void)enomem(cfg);
+		goto err;
+	}
+	snprintf(path, path_len, "%s/monitor", cfg->monitor_dir);
+	fp = fopen(path, "w");
+	if (fp == NULL) {
+		lprintf(cfg, errno, 0, "%s", path);
+		goto err;
+	}
+	/* Set line buffering for monitor file. */
+	(void)setvbuf(fp, NULL, _IOLBF, 0);
+
+	/* Write the column header line. */
+	fprintf(fp,
+	    "#time,"
+	    "totalsec,"
+	    "read ops per second,"
+	    "insert ops per second,"
+	    "update ops per second,"
+	    "checkpoints,"
+	    "read average latency(uS),"
+	    "read minimum latency(uS),"
+	    "read maximum latency(uS),"
+	    "insert average latency(uS),"
+	    "insert min latency(uS),"
+	    "insert maximum latency(uS),"
+	    "update average latency(uS),"
+	    "update min latency(uS),"
+	    "update maximum latency(uS)"
+	    "\n");
+
+	last_reads = last_inserts = last_updates = 0;
+	while (!cfg->stop) {
+		/* Sleep a second at a time so a stop is noticed quickly. */
+		for (sec = 0; sec < cfg->sample_interval; sec++) {
+			sleep(1);
+			if (cfg->stop)
+				break;
+		}
+		/* If the workers are done, don't bother with a final call. */
+		if (cfg->stop)
+			break;
+		/* Nothing is written during warmup. */
+		if (cfg->in_warmup)
+			continue;
+
+		ret = __wt_epoch(NULL, &now);
+		if (ret != 0) {
+			lprintf(cfg, ret, 0, "Get time call failed");
+			goto err;
+		}
+		tm = localtime_r(&now.tv_sec, &tm_buf);
+		(void)strftime(stamp, sizeof(stamp), "%b %d %H:%M:%S", tm);
+
+		/* Collect the current totals and latencies. */
+		reads = sum_read_ops(cfg);
+		inserts = sum_insert_ops(cfg);
+		updates = sum_update_ops(cfg);
+		latency_read(cfg, &read_avg, &read_min, &read_max);
+		latency_insert(cfg, &insert_avg, &insert_min, &insert_max);
+		latency_update(cfg, &update_avg, &update_min, &update_max);
+
+		/* Convert the change since the last sample to a rate. */
+		cur_reads = (reads - last_reads) / cfg->sample_interval;
+		cur_updates = (updates - last_updates) / cfg->sample_interval;
+		/*
+		 * For now the only item we need to worry about changing is
+		 * inserts when we transition from the populate phase to
+		 * workload phase.
+		 */
+		cur_inserts = inserts < last_inserts ? 0 :
+		    (inserts - last_inserts) / cfg->sample_interval;
+
+		(void)fprintf(fp,
+		    "%s,%" PRIu32
+		    ",%" PRIu64 ",%" PRIu64 ",%" PRIu64
+		    ",%c"
+		    ",%" PRIu32 ",%" PRIu32 ",%" PRIu32
+		    ",%" PRIu32 ",%" PRIu32 ",%" PRIu32
+		    ",%" PRIu32 ",%" PRIu32 ",%" PRIu32
+		    "\n",
+		    stamp, cfg->totalsec,
+		    cur_reads, cur_inserts, cur_updates,
+		    cfg->ckpt ? 'Y' : 'N',
+		    read_avg, read_min, read_max,
+		    insert_avg, insert_min, insert_max,
+		    update_avg, update_min, update_max);
+
+		/* Complain if any maximum latency is over the limit. */
+		if (latency_max != 0 &&
+		    (read_max > latency_max || insert_max > latency_max ||
+		    update_max > latency_max))
+			lprintf(cfg, WT_PANIC, 0,
+			    "max latency exceeded: threshold %" PRIu32
+			    " read max %" PRIu32 " insert max %" PRIu32
+			    " update max %" PRIu32, latency_max,
+			    read_max, insert_max, update_max);
+		/* Complain if any active operation type is too slow. */
+		if (min_thr != 0 &&
+		    ((cur_reads != 0 && cur_reads < min_thr) ||
+		    (cur_inserts != 0 && cur_inserts < min_thr) ||
+		    (cur_updates != 0 && cur_updates < min_thr)))
+			lprintf(cfg, WT_PANIC, 0,
+			    "minimum throughput not met: threshold %" PRIu64
+			    " reads %" PRIu64 " inserts %" PRIu64
+			    " updates %" PRIu64, min_thr, cur_reads,
+			    cur_inserts, cur_updates);
+
+		last_reads = reads;
+		last_inserts = inserts;
+		last_updates = updates;
+	}
+
+	/* Notify our caller we failed and shut the system down. */
+	if (0) {
+err:		cfg->error = cfg->stop = 1;
+	}
+
+	if (fp != NULL)
+		(void)fclose(fp);
+	free(path);
+
+	return (NULL);
+}
+
+/*
+ * checkpoint_worker --
+ *	Checkpoint thread: take a checkpoint every cfg->checkpoint_interval
+ *	seconds until the run is stopped, counting each one in the thread's
+ *	ckpt.ops.
+ *
+ *	arg is the thread's CONFIG_THREAD.  Always returns NULL; a failure is
+ *	reported by setting cfg->error and cfg->stop.
+ */
+static void *
+checkpoint_worker(void *arg)
+{
+	CONFIG *cfg;
+	CONFIG_THREAD *thread;
+	WT_CONNECTION *conn;
+	WT_SESSION *session;
+	struct timespec e, s;
+	uint32_t i;
+	int ret;
+
+	thread = (CONFIG_THREAD *)arg;
+	cfg = thread->cfg;
+	conn = cfg->conn;
+	session = NULL;
+
+	if ((ret = conn->open_session(
+	    conn, NULL, cfg->sess_config, &session)) != 0) {
+		lprintf(cfg, ret, 0,
+		    "open_session failed in checkpoint thread.");
+		goto err;
+	}
+
+	while (!cfg->stop) {
+		/* Break the sleep up, so we notice interrupts faster. */
+		for (i = 0; i < cfg->checkpoint_interval; i++) {
+			sleep(1);
+			if (cfg->stop)
+				break;
+		}
+		/* If the workers are done, don't bother with a final call. */
+		if (cfg->stop)
+			break;
+
+		/*
+		 * The start and end times are sampled but not otherwise used
+		 * in this function; a failure to get the time is still an
+		 * error.
+		 */
+		if ((ret = __wt_epoch(NULL, &s)) != 0) {
+			lprintf(cfg, ret, 0, "Get time failed in checkpoint.");
+			goto err;
+		}
+
+		/*
+		 * Flag the checkpoint as running for its duration only; clear
+		 * the flag before checking the result so a failed checkpoint
+		 * doesn't leave it set.
+		 */
+		cfg->ckpt = 1;
+		ret = session->checkpoint(session, NULL);
+		cfg->ckpt = 0;
+		if (ret != 0) {
+			lprintf(cfg, ret, 0, "Checkpoint failed.");
+			goto err;
+		}
+		++thread->ckpt.ops;
+
+		if ((ret = __wt_epoch(NULL, &e)) != 0) {
+			lprintf(cfg, ret, 0, "Get time failed in checkpoint.");
+			goto err;
+		}
+	}
+
+	if (session != NULL &&
+	    ((ret = session->close(session, NULL)) != 0)) {
+		lprintf(cfg, ret, 0,
+		    "Error closing session in checkpoint worker.");
+		goto err;
+	}
+
+	/* Notify our caller we failed and shut the system down. */
+	if (0) {
+err:		cfg->error = cfg->stop = 1;
+	}
+
+	return (NULL);
+}
+
+/*
+ * execute_populate --
+ *	Run the populate phase: start the populate threads, report progress
+ *	while they load cfg->icount items, stop them, report the load time and
+ *	rate, and optionally compact the tables.
+ *
+ *	Returns 0 on success and an error code otherwise.
+ */
+static int
+execute_populate(CONFIG *cfg)
+{
+	struct timespec start, stop;
+	CONFIG_THREAD *popth;
+	WT_ASYNC_OP *asyncop;
+	size_t i;
+	uint64_t last_ops, msecs, rate_msecs;
+	uint32_t interval, tables;
+	int elapsed, ret;
+	void *(*pfunc)(void *);
+
+	lprintf(cfg, 0, 1,
+	    "Starting %" PRIu32
+	    " populate thread(s) for %" PRIu32 " items",
+	    cfg->populate_threads, cfg->icount);
+
+	cfg->insert_key = 0;
+
+	if ((cfg->popthreads =
+	    calloc(cfg->populate_threads, sizeof(CONFIG_THREAD))) == NULL)
+		return (enomem(cfg));
+	if (cfg->use_asyncops > 0) {
+		lprintf(cfg, 0, 1, "Starting %" PRIu32 " async thread(s)",
+		    cfg->async_threads);
+		pfunc = populate_async;
+	} else
+		pfunc = populate_thread;
+	if ((ret = start_threads(cfg, NULL,
+	    cfg->popthreads, cfg->populate_threads, pfunc)) != 0)
+		return (ret);
+
+	if ((ret = __wt_epoch(NULL, &start)) != 0) {
+		lprintf(cfg, ret, 0, "Get time failed in populate.");
+		return (ret);
+	}
+	for (elapsed = 0, interval = 0, last_ops = 0;
+	    cfg->insert_key < cfg->icount && cfg->error == 0;) {
+		/*
+		 * Sleep for 100th of a second, report_interval is in second
+		 * granularity, each 100th increment of elapsed is a single
+		 * increment of interval.
+		 */
+		(void)usleep(10000);
+		if (cfg->report_interval == 0 || ++elapsed < 100)
+			continue;
+		elapsed = 0;
+		if (++interval < cfg->report_interval)
+			continue;
+		interval = 0;
+		cfg->totalsec += cfg->report_interval;
+		cfg->insert_ops = sum_pop_ops(cfg);
+		lprintf(cfg, 0, 1,
+		    "%" PRIu64 " populate inserts (%" PRIu64 " of %"
+		    PRIu32 ") in %" PRIu32 " secs (%" PRIu32 " total secs)",
+		    cfg->insert_ops - last_ops, cfg->insert_ops,
+		    cfg->icount, cfg->report_interval, cfg->totalsec);
+		last_ops = cfg->insert_ops;
+	}
+	if ((ret = __wt_epoch(NULL, &stop)) != 0) {
+		lprintf(cfg, ret, 0, "Get time failed in populate.");
+		return (ret);
+	}
+
+	/*
+	 * Move popthreads aside to narrow possible race with the monitor
+	 * thread. The latency tracking code also requires that popthreads be
+	 * NULL when the populate phase is finished, to know that the workload
+	 * phase has started.
+	 */
+	popth = cfg->popthreads;
+	cfg->popthreads = NULL;
+	ret = stop_threads(cfg, cfg->populate_threads, popth);
+	free(popth);
+	if (ret != 0)
+		return (ret);
+
+	/* Report if any worker threads didn't finish. */
+	if (cfg->error != 0) {
+		lprintf(cfg, WT_ERROR, 0,
+		    "Populate thread(s) exited without finishing.");
+		return (WT_ERROR);
+	}
+
+	lprintf(cfg, 0, 1, "Finished load of %" PRIu32 " items", cfg->icount);
+	msecs = ns_to_ms(WT_TIMEDIFF(stop, start));
+	/*
+	 * Operations per second is the item count scaled up from the elapsed
+	 * milliseconds.  Treat a load of under a millisecond as one
+	 * millisecond so the division is always defined.
+	 */
+	rate_msecs = msecs == 0 ? 1 : msecs;
+	lprintf(cfg, 0, 1,
+	    "Load time: %.2f\n" "load ops/sec: %" PRIu64,
+	    (double)msecs / (double)MSEC_PER_SEC,
+	    ((uint64_t)cfg->icount * (uint64_t)MSEC_PER_SEC) / rate_msecs);
+
+	/*
+	 * If configured, compact to allow LSM merging to complete.  We
+	 * set an unlimited timeout because if we close the connection
+	 * then any in-progress compact/merge is aborted.
+	 */
+	if (cfg->compact) {
+		assert(cfg->async_threads > 0);
+		lprintf(cfg, 0, 1, "Compact after populate");
+		if ((ret = __wt_epoch(NULL, &start)) != 0) {
+			lprintf(cfg, ret, 0, "Get time failed in populate.");
+			return (ret);
+		}
+		/*
+		 * The address of tables is handed to each compact operation
+		 * as its private data and tables is expected to be zero once
+		 * the operations have been flushed.
+		 */
+		tables = cfg->table_count;
+		for (i = 0; i < cfg->table_count; i++) {
+			/*
+			 * If no ops are available, retry.  Any other error,
+			 * return.
+			 */
+			while ((ret = cfg->conn->async_new_op(cfg->conn,
+			    cfg->uris[i], "timeout=0", &cb, &asyncop)) == EBUSY)
+				(void)usleep(10000);
+			if (ret != 0)
+				return (ret);
+
+			asyncop->app_private = &tables;
+			if ((ret = asyncop->compact(asyncop)) != 0) {
+				lprintf(cfg, ret, 0, "Async compact failed.");
+				return (ret);
+			}
+		}
+		if ((ret = cfg->conn->async_flush(cfg->conn)) != 0) {
+			lprintf(cfg, ret, 0, "Populate async flush failed.");
+			return (ret);
+		}
+		if ((ret = __wt_epoch(NULL, &stop)) != 0) {
+			lprintf(cfg, ret, 0, "Get time failed in populate.");
+			return (ret);
+		}
+		lprintf(cfg, 0, 1,
+		    "Compact completed in %" PRIu64 " seconds",
+		    (uint64_t)(ns_to_sec(WT_TIMEDIFF(stop, start))));
+		assert(tables == 0);
+	}
+	return (0);
+}
+
+/*
+ * close_reopen --
+ *	Close the connection and open it again, so the workload phase starts
+ *	from the on-disk files.  Returns 0 on success and an error code
+ *	otherwise.
+ */
+static int
+close_reopen(CONFIG *cfg)
+{
+	WT_CONNECTION *old_conn;
+	int rc;
+
+	/*
+	 * Reopen the connection.  We do this so that the workload phase always
+	 * starts with the on-disk files, and so that read-only workloads can
+	 * be identified.  This is particularly important for LSM, where the
+	 * merge algorithm is more aggressive for read-only trees.
+	 */
+	/* The handle is released no matter the return value from close(). */
+	old_conn = cfg->conn;
+	rc = old_conn->close(old_conn, NULL);
+	cfg->conn = NULL;
+	if (rc != 0) {
+		lprintf(cfg, rc, 0, "Closing the connection failed");
+		return (rc);
+	}
+
+	rc = wiredtiger_open(cfg->home, NULL, cfg->conn_config, &cfg->conn);
+	if (rc != 0) {
+		lprintf(cfg, rc, 0, "Re-opening the connection failed");
+		return (rc);
+	}
+
+	/*
+	 * If we started async threads only for the purposes of compact,
+	 * then turn it off before starting the workload so that those extra
+	 * threads looking for work that will never arrive don't affect
+	 * performance.
+	 */
+	if (!cfg->compact || cfg->use_asyncops != 0)
+		return (0);
+
+	rc = cfg->conn->reconfigure(cfg->conn, "async=(enabled=false)");
+	if (rc != 0) {
+		lprintf(cfg, rc, 0, "Reconfigure async off failed");
+		return (rc);
+	}
+	return (0);
+}
+
+/*
+ * execute_workload --
+ *	Run the workload phase: start the threads of every configured
+ *	workload, wait out the warmup period, then once a second check the
+ *	run-time and operation-count limits and periodically report
+ *	throughput, and finally stop the threads.
+ *
+ *	Returns 0 on success and an error code otherwise.
+ */
+static int
+execute_workload(CONFIG *cfg)
+{
+	CONFIG_THREAD *threads;
+	WORKLOAD *workp;
+	uint64_t last_ckpts, last_inserts, last_reads, last_updates;
+	uint32_t interval, run_ops, run_time;
+	u_int i;
+	int ret, t_ret;
+	void *(*pfunc)(void *);
+
+	cfg->insert_key = 0;
+	cfg->insert_ops = cfg->read_ops = cfg->update_ops = 0;
+
+	last_ckpts = last_inserts = last_reads = last_updates = 0;
+	ret = 0;
+
+	if (cfg->warmup != 0)
+		cfg->in_warmup = 1;
+
+	/* Allocate memory for the worker threads. */
+	if ((cfg->workers =
+	    calloc((size_t)cfg->workers_cnt, sizeof(CONFIG_THREAD))) == NULL) {
+		ret = enomem(cfg);
+		goto err;
+	}
+
+	if (cfg->use_asyncops > 0) {
+		lprintf(cfg, 0, 1, "Starting %" PRIu32 " async thread(s)",
+		    cfg->async_threads);
+		pfunc = worker_async;
+	} else
+		pfunc = worker;
+
+	/* Start each workload. */
+	for (threads = cfg->workers, i = 0,
+	    workp = cfg->workload; i < cfg->workload_cnt; ++i, ++workp) {
+		/* The workload number is unsigned, print it as such. */
+		lprintf(cfg, 0, 1,
+		    "Starting workload #%u: %" PRId64 " threads, inserts=%"
+		    PRId64 ", reads=%" PRId64 ", updates=%" PRId64,
+		    i + 1,
+		    workp->threads, workp->insert, workp->read, workp->update);
+
+		/* Figure out the workload's schedule. */
+		if ((ret = run_mix_schedule(cfg, workp)) != 0)
+			goto err;
+
+		/* Start the workload's threads. */
+		if ((ret = start_threads(
+		    cfg, workp, threads, (u_int)workp->threads, pfunc)) != 0)
+			goto err;
+		threads += workp->threads;
+	}
+
+	if (cfg->warmup != 0) {
+		lprintf(cfg, 0, 1,
+		    "Waiting for warmup duration of %" PRIu32, cfg->warmup);
+		sleep(cfg->warmup);
+		cfg->in_warmup = 0;
+	}
+
+	for (interval = cfg->report_interval, run_time = cfg->run_time,
+	    run_ops = cfg->run_ops; cfg->error == 0;) {
+		/*
+		 * Sleep for one second at a time.
+		 * If we are tracking run time, check to see if we're done, and
+		 * if we're only tracking run time, go back to sleep.
+		 */
+		sleep(1);
+		if (run_time != 0) {
+			if (--run_time == 0)
+				break;
+			if (!interval && !run_ops)
+				continue;
+		}
+
+		/* Sum the operations we've done. */
+		cfg->ckpt_ops = sum_ckpt_ops(cfg);
+		cfg->insert_ops = sum_insert_ops(cfg);
+		cfg->read_ops = sum_read_ops(cfg);
+		cfg->update_ops = sum_update_ops(cfg);
+
+		/* If we're checking total operations, see if we're done. */
+		if (run_ops != 0 && run_ops <=
+		    cfg->insert_ops + cfg->read_ops + cfg->update_ops)
+			break;
+
+		/* If writing out throughput information, see if it's time. */
+		if (interval == 0 || --interval > 0)
+			continue;
+		interval = cfg->report_interval;
+		cfg->totalsec += cfg->report_interval;
+
+		lprintf(cfg, 0, 1,
+		    "%" PRIu64 " reads, %" PRIu64 " inserts, %" PRIu64
+		    " updates, %" PRIu64 " checkpoints in %" PRIu32
+		    " secs (%" PRIu32 " total secs)",
+		    cfg->read_ops - last_reads,
+		    cfg->insert_ops - last_inserts,
+		    cfg->update_ops - last_updates,
+		    cfg->ckpt_ops - last_ckpts,
+		    cfg->report_interval, cfg->totalsec);
+		last_reads = cfg->read_ops;
+		last_inserts = cfg->insert_ops;
+		last_updates = cfg->update_ops;
+		last_ckpts = cfg->ckpt_ops;
+	}
+
+	/* Notify the worker threads they are done. */
+err:	cfg->stop = 1;
+
+	/* Keep the first error seen. */
+	if ((t_ret = stop_threads(
+	    cfg, (u_int)cfg->workers_cnt, cfg->workers)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Report if any worker threads didn't finish. */
+	if (cfg->error != 0) {
+		lprintf(cfg, WT_ERROR, 0,
+		    "Worker thread(s) exited without finishing.");
+		if (ret == 0)
+			ret = WT_ERROR;
+	}
+	return (ret);
+}
+
+/*
+ * find_table_count --
+ *	Ensure that icount matches the number of records in the existing
+ *	tables: read the last key of every table and set cfg->icount to the
+ *	largest one.
+ *
+ *	Returns 0 on success and an error code otherwise; cfg->icount is left
+ *	unchanged on failure.
+ */
+static int
+find_table_count(CONFIG *cfg)
+{
+	WT_CONNECTION *conn;
+	WT_CURSOR *cursor;
+	WT_SESSION *session;
+	uint32_t i, max_icount, table_icount;
+	int ret, t_ret;
+	char *key;
+
+	conn = cfg->conn;
+
+	max_icount = 0;
+	if ((ret = conn->open_session(
+	    conn, NULL, cfg->sess_config, &session)) != 0) {
+		lprintf(cfg, ret, 0,
+		    "find_table_count: open_session failed");
+		goto out;
+	}
+	for (i = 0; i < cfg->table_count; i++) {
+		if ((ret = session->open_cursor(session, cfg->uris[i],
+		    NULL, NULL, &cursor)) != 0) {
+			lprintf(cfg, ret, 0,
+			    "find_table_count: open_cursor failed");
+			goto err;
+		}
+		/* Stepping back from a new cursor lands on the last record. */
+		if ((ret = cursor->prev(cursor)) != 0) {
+			lprintf(cfg, ret, 0,
+			    "find_table_count: cursor prev failed");
+			goto err;
+		}
+		if ((ret = cursor->get_key(cursor, &key)) != 0) {
+			lprintf(cfg, ret, 0,
+			    "find_table_count: cursor get_key failed");
+			goto err;
+		}
+		/* The key is parsed as a decimal record number. */
+		table_icount = (uint32_t)atoi(key);
+		if (table_icount > max_icount)
+			max_icount = table_icount;
+
+		if ((ret = cursor->close(cursor)) != 0) {
+			lprintf(cfg, ret, 0,
+			    "find_table_count: cursor close failed");
+			goto err;
+		}
+	}
+err:	if ((t_ret = session->close(session, NULL)) != 0) {
+		if (ret == 0)
+			ret = t_ret;
+		lprintf(cfg, ret, 0,
+		    "find_table_count: session close failed");
+	}
+	/* Don't replace icount with a partial result. */
+	if (ret == 0)
+		cfg->icount = max_icount;
+out:	return (ret);
+}
+
+/*
+ * create_uris --
+ *	Build the array of table URIs. With a single table the URI is the
+ *	base URI; with more than one, each URI is the base URI followed by a
+ *	two-character table number.
+ *
+ *	Returns 0 on success or ENOMEM, in which case everything allocated
+ *	here is released and cfg->uris is left NULL.
+ */
+static int
+create_uris(CONFIG *cfg)
+{
+	size_t prefix_len;
+	uint32_t n;
+	char *name;
+
+	prefix_len = strlen(cfg->base_uri);
+
+	if ((cfg->uris = calloc(cfg->table_count, sizeof(char *))) == NULL)
+		return (ENOMEM);
+
+	for (n = 0; n < cfg->table_count; ++n) {
+		/* Room for the base name, two suffix characters and a NUL. */
+		name = cfg->uris[n] = calloc(prefix_len + 3, 1);
+		if (name == NULL)
+			goto nomem;
+		memcpy(name, cfg->base_uri, prefix_len);
+
+		/* If there is only one table, just use the base name. */
+		if (cfg->table_count <= 1)
+			continue;
+		name[prefix_len] = '0' + (n / 10);
+		name[prefix_len + 1] = '0' + (n % 10);
+	}
+	return (0);
+
+nomem:	/* Unallocated slots are still NULL from calloc, free is a no-op. */
+	for (n = 0; n < cfg->table_count; ++n)
+		free(cfg->uris[n]);
+	free(cfg->uris);
+	cfg->uris = NULL;
+	return (ENOMEM);
+}
+
+/*
+ * create_tables --
+ *	Create every table named in cfg->uris using cfg->table_config. Does
+ *	nothing when the run is configured not to create (cfg->create == 0).
+ *
+ *	Returns 0 on success, otherwise the first error from opening the
+ *	session, creating a table or closing the session.
+ */
+static int
+create_tables(CONFIG *cfg)
+{
+	WT_SESSION *session;
+	size_t i;
+	int ret, t_ret;
+
+	if (cfg->create == 0)
+		return (0);
+
+	if ((ret = cfg->conn->open_session(
+	    cfg->conn, NULL, cfg->sess_config, &session)) != 0) {
+		lprintf(cfg, ret, 0,
+		    "Error opening a session on %s", cfg->home);
+		return (ret);
+	}
+
+	for (i = 0; i < cfg->table_count; i++)
+		if ((ret = session->create(
+		    session, cfg->uris[i], cfg->table_config)) != 0) {
+			lprintf(cfg, ret, 0,
+			    "Error creating table %s", cfg->uris[i]);
+			break;
+		}
+
+	/*
+	 * Close the session on the failure path too, so a failed create does
+	 * not leak it; the create error takes precedence over a close error.
+	 */
+	if ((t_ret = session->close(session, NULL)) != 0) {
+		lprintf(cfg, t_ret, 0, "Error closing session");
+		if (ret == 0)
+			ret = t_ret;
+	}
+
+	return (ret);
+}
+
+/*
+ * start_all_runs --
+ *	Run the workload against cfg->database_count databases. A single
+ *	database runs directly in the calling thread; otherwise each database
+ *	gets its own copy of the configuration, a clean home directory
+ *	"<home>/D<NN>" and a thread running start_run.
+ *
+ *	Returns 0 on success or the first error encountered.
+ */
+static int
+start_all_runs(CONFIG *cfg)
+{
+	CONFIG *next_cfg, **configs;
+	pthread_t *threads;
+	size_t cmd_len, home_len, i, started;
+	int ret, t_ret;
+	char *cmd_buf, *new_home;
+
+	ret = 0;
+	started = 0;
+	configs = NULL;
+	threads = NULL;
+	cmd_buf = NULL;
+
+	if (cfg->database_count == 1)
+		return (start_run(cfg));
+
+	/* Allocate an array to hold our config struct copies. */
+	configs = calloc(cfg->database_count, sizeof(CONFIG *));
+	if (configs == NULL)
+		return (ENOMEM);
+
+	/* Allocate an array to hold our thread IDs. */
+	threads = calloc(cfg->database_count, sizeof(pthread_t));
+	if (threads == NULL) {
+		ret = ENOMEM;
+		goto err;
+	}
+
+	home_len = strlen(cfg->home);
+	cmd_len = (home_len * 2) + 30;	/* Add some slop. */
+	cmd_buf = calloc(cmd_len, 1);
+	if (cmd_buf == NULL) {
+		ret = ENOMEM;
+		goto err;
+	}
+	for (i = 0; i < cfg->database_count; i++) {
+		next_cfg = calloc(1, sizeof(CONFIG));
+		if (next_cfg == NULL) {
+			ret = ENOMEM;
+			goto err;
+		}
+		configs[i] = next_cfg;
+		if ((ret = config_assign(next_cfg, cfg)) != 0)
+			goto err;
+
+		/* Setup a unique home directory for each database. */
+		new_home = malloc(home_len + 5);
+		if (new_home == NULL) {
+			ret = ENOMEM;
+			goto err;
+		}
+		snprintf(new_home, home_len + 5, "%s/D%02d", cfg->home, (int)i);
+		next_cfg->home = new_home;
+
+		/* If the monitor dir is default, update it too. */
+		if (strcmp(cfg->monitor_dir, cfg->home) == 0)
+			next_cfg->monitor_dir = new_home;
+
+		/* Create clean home directories. */
+		snprintf(cmd_buf, cmd_len, "rm -rf %s && mkdir %s",
+		    next_cfg->home, next_cfg->home);
+		if ((ret = system(cmd_buf)) != 0) {
+			fprintf(stderr, "%s: failed\n", cmd_buf);
+			goto err;
+		}
+		if ((ret = pthread_create(
+		    &threads[i], NULL, thread_run_wtperf, next_cfg)) != 0) {
+			lprintf(cfg, ret, 0, "Error creating thread");
+			goto err;
+		}
+		++started;
+	}
+
+err:	/*
+	 * Wait for every thread that was actually started, on the error path
+	 * as well: the threads use the configuration copies freed below.
+	 */
+	for (i = 0; i < started; i++)
+		if ((t_ret = pthread_join(threads[i], NULL)) != 0) {
+			lprintf(cfg, t_ret, 0, "Error joining thread");
+			if (ret == 0)
+				ret = t_ret;
+		}
+
+	for (i = 0; i < cfg->database_count && configs[i] != NULL; i++) {
+		/*
+		 * Only free a home directory allocated above. If setup failed
+		 * before the per-database name was installed, the copy may
+		 * still reference the caller's string (assumption: depends on
+		 * what config_assign copies -- confirm).
+		 */
+		if (configs[i]->home != cfg->home)
+			free((char *)configs[i]->home);
+		config_free(configs[i]);
+		free(configs[i]);
+	}
+	free(configs);
+	free(threads);
+	free(cmd_buf);
+
+	return (ret);
+}
+
+/*
+ * thread_run_wtperf --
+ *	Thread entry point: run one wtperf instance for the configuration
+ *	passed as the argument, logging a failure. Always returns NULL.
+ */
+static void *
+thread_run_wtperf(void *arg)
+{
+	CONFIG *run_cfg;
+	int err;
+
+	run_cfg = (CONFIG *)arg;
+	err = start_run(run_cfg);
+	if (err != 0)
+		lprintf(run_cfg, err, 0, "Run failed for: %s.", run_cfg->home);
+	return (NULL);
+}
+
+/*
+ * log_op_summary --
+ *	Log the count, the share of all operations and the per-second rate
+ *	for one operation type.
+ */
+static void
+log_op_summary(
+    CONFIG *cfg, const char *name, uint64_t ops, uint64_t total_ops)
+{
+	/*
+	 * A run may be bounded by an operation count instead of a time
+	 * (run_time == 0), and may complete no operations at all: report
+	 * zero in those cases rather than dividing by zero.
+	 */
+	lprintf(cfg, 0, 1,
+	    "Executed %" PRIu64 " %s operations (%" PRIu64
+	    "%%) %" PRIu64 " ops/sec",
+	    ops, name,
+	    total_ops == 0 ? 0 : (ops * 100) / total_ops,
+	    cfg->run_time == 0 ? 0 : ops / cfg->run_time);
+}
+
+/*
+ * start_run --
+ *	Execute one complete wtperf run for a configuration: open the log
+ *	file and the connection, create the URIs and tables, optionally start
+ *	the monitor thread, populate, run the workload, then stop the threads,
+ *	close the connection and the log file.
+ *
+ *	Returns 0 on success; otherwise the first error seen, or EXIT_FAILURE
+ *	when a step failed without providing an error code.
+ */
+static int
+start_run(CONFIG *cfg)
+{
+	pthread_t monitor_thread;
+	uint64_t total_ops;
+	int monitor_created, ret, t_ret;
+	char helium_buf[256];
+
+	monitor_created = ret = 0;
+	/* [-Wconditional-uninitialized] */
+	memset(&monitor_thread, 0, sizeof(monitor_thread));
+
+	if ((ret = setup_log_file(cfg)) != 0)
+		goto err;
+
+	if ((ret = wiredtiger_open(	/* Open the real connection. */
+	    cfg->home, NULL, cfg->conn_config, &cfg->conn)) != 0) {
+		lprintf(cfg, ret, 0, "Error connecting to %s", cfg->home);
+		goto err;
+	}
+
+	/*
+	 * Configure optional Helium volume.
+	 * NOTE(review): a load failure is logged but the run continues and
+	 * the error code is overwritten below -- confirm this is intended.
+	 */
+	if (cfg->helium_mount != NULL) {
+		snprintf(helium_buf, sizeof(helium_buf),
+		    "entry=wiredtiger_extension_init,config=["
+		    "%s=[helium_devices=\"he://./%s\","
+		    "helium_o_volume_truncate=1]]",
+		    HELIUM_NAME, cfg->helium_mount);
+		if ((ret = cfg->conn->load_extension(
+		    cfg->conn, HELIUM_PATH, helium_buf)) != 0)
+			lprintf(cfg,
+			    ret, 0, "Error loading Helium: %s", helium_buf);
+	}
+
+	if ((ret = create_uris(cfg)) != 0)
+		goto err;
+	if ((ret = create_tables(cfg)) != 0)
+		goto err;
+
+	/* Start the monitor thread. */
+	if (cfg->sample_interval != 0) {
+		if ((ret = pthread_create(
+		    &monitor_thread, NULL, monitor, cfg)) != 0) {
+			lprintf(
+			    cfg, ret, 0, "Error creating monitor thread.");
+			goto err;
+		}
+		monitor_created = 1;
+	}
+
+	/* If creating, populate the table. */
+	if (cfg->create != 0 && execute_populate(cfg) != 0)
+		goto err;
+
+	/* Optional workload. */
+	if (cfg->workers_cnt != 0 &&
+	    (cfg->run_time != 0 || cfg->run_ops != 0)) {
+		/*
+		 * If we have a workload, close and reopen the connection so
+		 * that LSM can detect read-only workloads.
+		 */
+		if (close_reopen(cfg) != 0)
+			goto err;
+
+		/* Didn't create, set insert count. */
+		if (cfg->create == 0 && find_table_count(cfg) != 0)
+			goto err;
+		/* Start the checkpoint thread. */
+		if (cfg->checkpoint_threads != 0) {
+			lprintf(cfg, 0, 1,
+			    "Starting %" PRIu32 " checkpoint thread(s)",
+			    cfg->checkpoint_threads);
+			if ((cfg->ckptthreads =
+			    calloc(cfg->checkpoint_threads,
+			    sizeof(CONFIG_THREAD))) == NULL) {
+				ret = enomem(cfg);
+				goto err;
+			}
+			if (start_threads(cfg, NULL, cfg->ckptthreads,
+			    cfg->checkpoint_threads, checkpoint_worker) != 0)
+				goto err;
+		}
+		/* Execute the workload. */
+		if ((ret = execute_workload(cfg)) != 0)
+			goto err;
+
+		/* One final summation of the operations we've completed. */
+		cfg->read_ops = sum_read_ops(cfg);
+		cfg->insert_ops = sum_insert_ops(cfg);
+		cfg->update_ops = sum_update_ops(cfg);
+		cfg->ckpt_ops = sum_ckpt_ops(cfg);
+		total_ops = cfg->read_ops + cfg->insert_ops + cfg->update_ops;
+
+		log_op_summary(cfg, "read", cfg->read_ops, total_ops);
+		log_op_summary(cfg, "insert", cfg->insert_ops, total_ops);
+		log_op_summary(cfg, "update", cfg->update_ops, total_ops);
+		lprintf(cfg, 0, 1,
+		    "Executed %" PRIu64 " checkpoint operations",
+		    cfg->ckpt_ops);
+
+		latency_print(cfg);
+	}
+
+	if (0) {
+err:		if (ret == 0)
+			ret = EXIT_FAILURE;
+	}
+
+	/* Notify the worker threads they are done. */
+	cfg->stop = 1;
+
+	/*
+	 * Join every checkpoint thread that was started, not just the first,
+	 * so none is still running when the connection is closed below.
+	 */
+	if ((t_ret = stop_threads(
+	    cfg, (u_int)cfg->checkpoint_threads, cfg->ckptthreads)) != 0)
+		if (ret == 0)
+			ret = t_ret;
+
+	if (monitor_created != 0 &&
+	    (t_ret = pthread_join(monitor_thread, NULL)) != 0) {
+		lprintf(cfg, t_ret, 0, "Error joining monitor thread.");
+		if (ret == 0)
+			ret = t_ret;
+	}
+
+	if (cfg->conn != NULL &&
+	    (t_ret = cfg->conn->close(cfg->conn, NULL)) != 0) {
+		lprintf(cfg, t_ret, 0,
+		    "Error closing connection to %s", cfg->home);
+		if (ret == 0)
+			ret = t_ret;
+	}
+
+	if (ret == 0) {
+		if (cfg->run_time == 0 && cfg->run_ops == 0)
+			lprintf(cfg, 0, 1, "Run completed");
+		else
+			lprintf(cfg, 0, 1, "Run completed: %" PRIu32 " %s",
+			    cfg->run_time == 0 ? cfg->run_ops : cfg->run_time,
+			    cfg->run_time == 0 ? "operations" : "seconds");
+	}
+
+	if (cfg->logf != NULL) {
+		if ((t_ret = fflush(cfg->logf)) != 0 && ret == 0)
+			ret = t_ret;
+		if ((t_ret = fclose(cfg->logf)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+	return (ret);
+}
+
+extern int __wt_optind, __wt_optreset;
+extern char *__wt_optarg;
+
+/*
+ * append_config --
+ *	Append a command-line configuration string to the buffer referenced
+ *	by bufp, separated by a comma from any earlier content. The buffer is
+ *	allocated on first use.
+ *
+ *	Returns 0 on success or ENOMEM; on failure the existing buffer is
+ *	left intact so the caller can still free it.
+ */
+static int
+append_config(char **bufp, const char *arg)
+{
+	size_t len;
+	char *p;
+
+	if (*bufp == NULL)
+		return ((*bufp = strdup(arg)) == NULL ? ENOMEM : 0);
+
+	/* Existing content, a comma, the new content and the NUL. */
+	len = strlen(*bufp) + strlen(arg) + 2;
+	if ((p = realloc(*bufp, len)) == NULL)
+		return (ENOMEM);
+	strcat(p, ",");
+	strcat(p, arg);
+	*bufp = p;
+	return (0);
+}
+
+/*
+ * main --
+ *	wtperf entry point: load the default configuration, apply the
+ *	command-line options (-C, -H, -h, -m, -O, -o, -T), build the
+ *	connection and table configuration strings, check the configuration
+ *	and start the run(s).
+ *
+ *	Returns EXIT_SUCCESS if everything succeeded, EXIT_FAILURE otherwise.
+ */
+int
+main(int argc, char *argv[])
+{
+	CONFIG *cfg, _cfg;
+	size_t req_len;
+	int ch, monitor_set, ret;
+	const char *opts = "C:H:h:m:O:o:T:";
+	const char *config_opts;
+	char *cc_buf, *tc_buf, *user_cconfig, *user_tconfig;
+
+	monitor_set = ret = 0;
+	config_opts = NULL;
+	cc_buf = tc_buf = user_cconfig = user_tconfig = NULL;
+
+	/* Setup the default configuration values. */
+	cfg = &_cfg;
+	memset(cfg, 0, sizeof(*cfg));
+	/* Keep the error code so a failure here is not reported as success. */
+	if ((ret = config_assign(cfg, &default_cfg)) != 0)
+		goto err;
+
+	/* Do a basic validation of options, and home is needed before open. */
+	while ((ch = __wt_getopt("wtperf", argc, argv, opts)) != EOF)
+		switch (ch) {
+		case 'C':
+			if (append_config(&user_cconfig, __wt_optarg) != 0) {
+				ret = enomem(cfg);
+				goto err;
+			}
+			break;
+		case 'H':
+			cfg->helium_mount = __wt_optarg;
+			break;
+		case 'O':
+			config_opts = __wt_optarg;
+			break;
+		case 'T':
+			if (append_config(&user_tconfig, __wt_optarg) != 0) {
+				ret = enomem(cfg);
+				goto err;
+			}
+			break;
+		case 'h':
+			cfg->home = __wt_optarg;
+			break;
+		case 'm':
+			cfg->monitor_dir = __wt_optarg;
+			monitor_set = 1;
+			break;
+		case '?':
+			fprintf(stderr, "Invalid option\n");
+			usage();
+			goto einval;
+		}
+
+	/*
+	 * If the user did not specify a monitor directory then set the
+	 * monitor directory to the home dir.
+	 */
+	if (!monitor_set)
+		cfg->monitor_dir = cfg->home;
+
+	/* Parse configuration settings from configuration file. */
+	if (config_opts != NULL && config_opt_file(cfg, config_opts) != 0)
+		goto einval;
+
+	/* Parse options that override values set via a configuration file. */
+	__wt_optreset = __wt_optind = 1;
+	while ((ch = __wt_getopt("wtperf", argc, argv, opts)) != EOF)
+		switch (ch) {
+		case 'o':
+			/* Allow -o key=value */
+			if (config_opt_line(cfg, __wt_optarg) != 0)
+				goto einval;
+			break;
+		}
+
+	cfg->async_config = NULL;
+	/*
+	 * If the user specified async_threads we use async for all ops.
+	 * If the user wants compaction, then we also enable async for
+	 * the compact operation, but not for the workloads.
+	 */
+	if (cfg->async_threads > 0)
+		cfg->use_asyncops = 1;
+	if (cfg->compact && cfg->async_threads == 0)
+		cfg->async_threads = 2;
+	if (cfg->async_threads > 0) {
+		/*
+		 * The maximum number of async threads is two digits, so just
+		 * use that to compute the space we need.  Assume the default
+		 * of 1024 for the max ops.  Although we could bump that up
+		 * to 4096 if needed.
+		 */
+		req_len = strlen(",async=(enabled=true,threads=)") + 4;
+		if ((cfg->async_config = calloc(req_len, 1)) == NULL) {
+			ret = enomem(cfg);
+			goto err;
+		}
+		/* async_threads is a uint32_t: use the matching format. */
+		snprintf(cfg->async_config, req_len,
+		    ",async=(enabled=true,threads=%" PRIu32 ")",
+		    cfg->async_threads);
+	}
+	if ((ret = config_compress(cfg)) != 0)
+		goto err;
+
+	/* Build the URI from the table name. */
+	req_len = strlen("table:") +
+	    strlen(HELIUM_NAME) + strlen(cfg->table_name) + 2;
+	if ((cfg->base_uri = calloc(req_len, 1)) == NULL) {
+		ret = enomem(cfg);
+		goto err;
+	}
+	snprintf(cfg->base_uri, req_len, "table:%s%s%s",
+	    cfg->helium_mount == NULL ? "" : HELIUM_NAME,
+	    cfg->helium_mount == NULL ? "" : "/",
+	    cfg->table_name);
+
+	/* Make stdout line buffered, so verbose output appears quickly. */
+	(void)setvbuf(stdout, NULL, _IOLBF, 32);
+
+	/* Concatenate non-default configuration strings. */
+	if (cfg->verbose > 1 || user_cconfig != NULL ||
+	    cfg->compress_ext != NULL || cfg->async_config != NULL) {
+		/* The extra 3 bytes cover two separating commas and the NUL. */
+		req_len = strlen(cfg->conn_config) + strlen(debug_cconfig) + 3;
+		if (user_cconfig != NULL)
+			req_len += strlen(user_cconfig);
+		if (cfg->async_config != NULL)
+			req_len += strlen(cfg->async_config);
+		if (cfg->compress_ext != NULL)
+			req_len += strlen(cfg->compress_ext);
+		if ((cc_buf = calloc(req_len, 1)) == NULL) {
+			ret = enomem(cfg);
+			goto err;
+		}
+		/*
+		 * Order: base configuration, async, compression extension,
+		 * debug configuration (verbose only), user configuration.
+		 */
+		snprintf(cc_buf, req_len, "%s%s%s%s%s%s%s",
+		    cfg->conn_config,
+		    cfg->async_config ? cfg->async_config : "",
+		    cfg->compress_ext ? cfg->compress_ext : "",
+		    cfg->verbose > 1 ? ",": "",
+		    cfg->verbose > 1 ? debug_cconfig : "",
+		    user_cconfig ? ",": "",
+		    user_cconfig ? user_cconfig : "");
+		if ((ret = config_opt_str(cfg, "conn_config", cc_buf)) != 0)
+			goto err;
+	}
+	if (cfg->verbose > 1 || cfg->helium_mount != NULL ||
+	    user_tconfig != NULL || cfg->compress_table != NULL) {
+		/* The extra 3 bytes cover two separating commas and the NUL. */
+		req_len = strlen(cfg->table_config) + strlen(HELIUM_CONFIG) +
+		    strlen(debug_tconfig) + 3;
+		if (user_tconfig != NULL)
+			req_len += strlen(user_tconfig);
+		if (cfg->compress_table != NULL)
+			req_len += strlen(cfg->compress_table);
+		if ((tc_buf = calloc(req_len, 1)) == NULL) {
+			ret = enomem(cfg);
+			goto err;
+		}
+		/*
+		 * Order: base configuration, table compression, debug
+		 * configuration (verbose only), user configuration, Helium.
+		 */
+		snprintf(tc_buf, req_len, "%s%s%s%s%s%s%s",
+		    cfg->table_config,
+		    cfg->compress_table ? cfg->compress_table : "",
+		    cfg->verbose > 1 ? ",": "",
+		    cfg->verbose > 1 ? debug_tconfig : "",
+		    user_tconfig ? ",": "",
+		    user_tconfig ? user_tconfig : "",
+		    cfg->helium_mount == NULL ? "" : HELIUM_CONFIG);
+		if ((ret = config_opt_str(cfg, "table_config", tc_buf)) != 0)
+			goto err;
+	}
+
+	/* Sanity-check the configuration. */
+	if ((ret = config_sanity(cfg)) != 0)
+		goto err;
+
+	/* Display the configuration. */
+	if (cfg->verbose > 1)
+		config_print(cfg);
+
+	if ((ret = start_all_runs(cfg)) != 0)
+		goto err;
+
+	if (0) {
+einval:		ret = EINVAL;
+	}
+
+err:	config_free(cfg);
+	free(cc_buf);
+	free(tc_buf);
+	free(user_cconfig);
+	free(user_tconfig);
+
+	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+ * start_threads --
+ *	Initialize num thread structures starting at base, then create one
+ *	thread per structure running func with the structure as its argument.
+ *
+ *	Returns 0 on success, the enomem() result if a buffer allocation
+ *	fails, or the pthread_create error.
+ */
+static int
+start_threads(CONFIG *cfg,
+    WORKLOAD *workp, CONFIG_THREAD *base, u_int num, void *(*func)(void *))
+{
+	CONFIG_THREAD *t;
+	u_int n, skip;
+	int err;
+
+	/* First pass: set up every thread structure. */
+	for (n = 0; n < num; ++n) {
+		t = &base[n];
+		t->cfg = cfg;
+		t->workload = workp;
+
+		/*
+		 * Threads must not execute in lock-step: the first thread
+		 * gets a fresh RNG state, each later thread starts from its
+		 * predecessor's state, and every state is then advanced.
+		 */
+		if (n != 0) {
+			t->rnd[0] = base[n - 1].rnd[0];
+			t->rnd[1] = base[n - 1].rnd[1];
+		} else
+			__wt_random_init(t->rnd);
+		for (skip = 0; skip < 1000; ++skip)
+			(void)__wt_random(t->rnd);
+
+		/*
+		 * All threads get key and value buffers whether or not they
+		 * need them, the memory involved is small.  The buffers hold
+		 * strings and the sizes include the trailing NUL.
+		 */
+		t->key_buf = calloc(cfg->key_sz, 1);
+		if (t->key_buf == NULL)
+			return (enomem(cfg));
+		t->value_buf = calloc(cfg->value_sz, 1);
+		if (t->value_buf == NULL)
+			return (enomem(cfg));
+
+		/* Fill the value, optionally mixing in random content. */
+		memset(t->value_buf, 'a', cfg->value_sz - 1);
+		if (cfg->random_value)
+			randomize_value(t, t->value_buf);
+
+		/* Likewise, all threads are set up for latency tracking. */
+		t->update.min_latency = UINT32_MAX;
+		t->read.min_latency = UINT32_MAX;
+		t->insert.min_latency = UINT32_MAX;
+		t->ckpt.min_latency = UINT32_MAX;
+		t->update.max_latency = 0;
+		t->read.max_latency = 0;
+		t->insert.max_latency = 0;
+		t->ckpt.max_latency = 0;
+	}
+
+	/* Second pass: start the threads. */
+	for (n = 0; n < num; ++n) {
+		err = pthread_create(&base[n].handle, NULL, func, &base[n]);
+		if (err != 0) {
+			lprintf(cfg, err, 0, "Error creating thread");
+			return (err);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * stop_threads --
+ *	Join num threads starting at threads and release each thread's key
+ *	and value buffers.  Does nothing for an empty or NULL set.
+ *
+ *	Returns 0 on success or the first pthread_join error, in which case
+ *	the remaining threads are not joined.
+ */
+static int
+stop_threads(CONFIG *cfg, u_int num, CONFIG_THREAD *threads)
+{
+	CONFIG_THREAD *t, *end;
+	int err;
+
+	if (threads == NULL || num == 0)
+		return (0);
+
+	end = threads + num;
+	for (t = threads; t < end; ++t) {
+		err = pthread_join(t->handle, NULL);
+		if (err != 0) {
+			lprintf(cfg, err, 0, "Error joining thread");
+			return (err);
+		}
+
+		free(t->key_buf);
+		t->key_buf = NULL;
+		free(t->value_buf);
+		t->value_buf = NULL;
+	}
+
+	/*
+	 * The thread structures themselves are neither freed nor are the
+	 * references to them cleared: the monitor thread (among others) still
+	 * reads them.  This is a standalone program, so the leak is harmless
+	 * and keeps things simple.
+	 */
+	return (0);
+}
+
+/*
+ * worker_throttle --
+ *	Limit a worker to throttle operations per second.  Once *ops reaches
+ *	the limit, sleep for whatever remains of the current second, then
+ *	reset the operation count and restart the interval.
+ *
+ * TODO: Spread the stalls out, so we don't flood at the start of each
+ * second and then pause.  Doing this every 10th of a second is probably
+ * enough.
+ */
+static void
+worker_throttle(int64_t throttle, int64_t *ops, struct timespec *interval)
+{
+	struct timespec now;
+	uint64_t elapsed_us;
+
+	/* Still under the limit, nothing to do. */
+	if (*ops < throttle)
+		return;
+
+	/* A failure to read the clock is ignored, we don't really care. */
+	if (__wt_epoch(NULL, &now) != 0)
+		return;
+
+	/*
+	 * Enough operations have completed: if that took less than a second,
+	 * wait out the rest of the second before resetting the counters.
+	 */
+	elapsed_us = ns_to_us(WT_TIMEDIFF(now, *interval));
+	if (USEC_PER_SEC > elapsed_us)
+		(void)usleep((useconds_t)(USEC_PER_SEC - elapsed_us));
+
+	*interval = now;
+	*ops = 0;
+}
+
+/*
+ * wtperf_value_range --
+ *	Return the upper bound used when choosing random keys: the initial
+ *	record count plus random_range when one is configured, otherwise the
+ *	initial count plus the current insert key, less one more than the
+ *	worker count.
+ */
+static uint64_t
+wtperf_value_range(CONFIG *cfg)
+{
+	uint64_t span;
+
+	if (cfg->random_range)
+		span = cfg->icount + cfg->random_range;
+	else
+		span = cfg->icount +
+		    cfg->insert_key - (u_int)(cfg->workers_cnt + 1);
+	return (span);
+}
+
+/*
+ * wtperf_rand --
+ *	Return a random key number for the thread, between 1 and the current
+ *	value range inclusive, optionally skewed by a Pareto distribution.
+ */
+static uint64_t
+wtperf_rand(CONFIG_THREAD *thread)
+{
+ CONFIG *cfg;
+ double S1, S2, U;
+ uint64_t rval;
+
+ cfg = thread->cfg;
+
+ /*
+ * Use WiredTiger's random number routine: it's lock-free and fairly
+ * good.
+ */
+ rval = (uint64_t)__wt_random(thread->rnd);
+
+ /* Use Pareto distribution to give 80/20 hot/cold values. */
+ if (cfg->pareto) {
+#define PARETO_SHAPE 1.5
+ S1 = (-1 / PARETO_SHAPE);
+ S2 = wtperf_value_range(cfg) * 0.2 * (PARETO_SHAPE - 1);
+ /* Scale the random value against UINT32_MAX, subtracted from 1. */
+ U = 1 - (double)rval / (double)UINT32_MAX;
+ rval = (pow(U, S1) - 1) * S2;
+ /*
+ * This Pareto calculation chooses out of range values about
+ * 2% of the time, from my testing. That will lead to the
+ * first item in the table being "hot".
+ */
+ if (rval > wtperf_value_range(cfg))
+ rval = wtperf_value_range(cfg);
+ }
+ /*
+ * Wrap the key to within the expected range and avoid zero: we never
+ * insert that key.
+ */
+ rval = (rval % wtperf_value_range(cfg)) + 1;
+ return (rval);
+}
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
new file mode 100644
index 00000000000..cc70e76bd5d
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h
@@ -0,0 +1,247 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _WIN32
+#include <sys/time.h>
+#endif
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <assert.h>
+#include <ctype.h>
+#ifndef _WIN32
+#include <dirent.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#ifndef _WIN32
+#include <pthread.h>
+#endif
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <wt_internal.h>
+
+#ifdef _WIN32
+#include "windows_shim.h"
+#endif
+
+#include "config_opt.h"
+
+typedef struct __config CONFIG;
+typedef struct __config_thread CONFIG_THREAD;
+
+/*
+ * Configuration string fragments for the supported block compressors. For
+ * each compressor, the _BLK macro is the block_compressor setting and the
+ * _EXT macro is the extensions setting naming the shared library, located
+ * relative to EXTPATH.
+ */
+#define EXT_PFX ",extensions=("
+#define EXT_SFX ")"
+#define EXTPATH "../../ext/compressors/" /* Extensions path */
+#define BLKCMP_PFX ",block_compressor="
+
+#define BZIP_BLK BLKCMP_PFX "bzip2"
+#define BZIP_EXT \
+ EXT_PFX EXTPATH "bzip2/.libs/libwiredtiger_bzip2.so" EXT_SFX
+#define SNAPPY_BLK BLKCMP_PFX "snappy"
+#define SNAPPY_EXT \
+ EXT_PFX EXTPATH "snappy/.libs/libwiredtiger_snappy.so" EXT_SFX
+#define ZLIB_BLK BLKCMP_PFX "zlib"
+#define ZLIB_EXT \
+ EXT_PFX EXTPATH "zlib/.libs/libwiredtiger_zlib.so" EXT_SFX
+
+/*
+ * WORKLOAD --
+ *	Description of one workload: how many threads run it, the ratio of
+ *	insert, read and update operations, an optional throttle and
+ *	transaction size, and the schedule of operations (WORKER_* values).
+ */
+typedef struct {
+ int64_t threads; /* Thread count */
+ int64_t insert; /* Insert ratio */
+ int64_t read; /* Read ratio */
+ int64_t update; /* Update ratio */
+ int64_t throttle; /* Maximum operations/second */
+ /* Number of operations per transaction. Zero for autocommit */
+ int64_t ops_per_txn;
+
+#define WORKER_INSERT 1 /* Insert */
+#define WORKER_INSERT_RMW 2 /* Insert with read-modify-write */
+#define WORKER_READ 3 /* Read */
+#define WORKER_UPDATE 4 /* Update */
+ uint8_t ops[100]; /* Operation schedule */
+} WORKLOAD;
+
+/*
+ * Per-run configuration and state: locations, the database connection, the
+ * thread arrays, the operation counters and the flags shared between
+ * threads, followed by the user-settable options generated from wtperf_opt.i.
+ *
+ * NOTE: If you add any fields to this structure here, you must also add
+ * an initialization in wtperf.c in the default_cfg.
+ */
+struct __config { /* Configuration structure */
+ const char *home; /* WiredTiger home */
+ const char *monitor_dir; /* Monitor output dir */
+ char *base_uri; /* Object URI */
+ char **uris; /* URIs if multiple tables */
+ const char *helium_mount; /* Optional Helium mount point */
+
+ WT_CONNECTION *conn; /* Database connection */
+
+ FILE *logf; /* Logging handle */
+
+ char *async_config; /* Config string for async */
+
+ const char *compress_ext; /* Compression extension for conn */
+ const char *compress_table; /* Compression arg to table create */
+
+ CONFIG_THREAD *ckptthreads, *popthreads;
+
+#define WORKLOAD_MAX 50
+ CONFIG_THREAD *workers; /* Worker threads */
+ u_int workers_cnt;
+
+ WORKLOAD *workload; /* Workloads */
+ u_int workload_cnt;
+
+ uint32_t use_asyncops; /* Use async operations */
+ /* State tracking variables. */
+
+ uint64_t ckpt_ops; /* checkpoint operations */
+ uint64_t insert_ops; /* insert operations */
+ uint64_t read_ops; /* read operations */
+ uint64_t update_ops; /* update operations */
+
+ uint64_t insert_key; /* insert key */
+
+ volatile int ckpt; /* checkpoint in progress */
+ volatile int error; /* thread error */
+ volatile int stop; /* notify threads to stop */
+ volatile int in_warmup; /* Running warmup phase */
+
+ volatile uint32_t totalsec; /* total seconds running */
+
+ /* Fields changeable on command line are listed in wtperf_opt.i */
+#define OPT_DECLARE_STRUCT
+#include "wtperf_opt.i"
+#undef OPT_DECLARE_STRUCT
+};
+
+/* Number of elements in a fixed-size array. */
+#define ELEMENTS(a) (sizeof(a) / sizeof(a[0]))
+
+#define THROTTLE_OPS 100
+
+#define THOUSAND (1000ULL)
+#define MILLION (1000000ULL)
+#define BILLION (1000000000ULL)
+
+#define NSEC_PER_SEC BILLION
+#define USEC_PER_SEC MILLION
+#define MSEC_PER_SEC THOUSAND
+
+/*
+ * Time unit conversions between nanoseconds (ns), microseconds (us),
+ * milliseconds (ms) and seconds (sec). Conversions to a larger unit use
+ * integer division by an unsigned long long constant.
+ */
+#define ns_to_ms(v) ((v) / MILLION)
+#define ns_to_sec(v) ((v) / BILLION)
+#define ns_to_us(v) ((v) / THOUSAND)
+
+#define us_to_ms(v) ((v) / THOUSAND)
+#define us_to_ns(v) ((v) * THOUSAND)
+#define us_to_sec(v) ((v) / MILLION)
+
+#define ms_to_ns(v) ((v) * MILLION)
+#define ms_to_us(v) ((v) * THOUSAND)
+#define ms_to_sec(v) ((v) / THOUSAND)
+
+#define sec_to_ns(v) ((v) * BILLION)
+#define sec_to_us(v) ((v) * MILLION)
+#define sec_to_ms(v) ((v) * THOUSAND)
+
+/*
+ * TRACK --
+ *	Operation and latency statistics for one operation type of one
+ *	thread: running totals, the values last read by the monitor thread,
+ *	the minimum/maximum latency and the latency histogram buckets.
+ */
+typedef struct {
+ /*
+ * Threads maintain the total thread operation and total latency they've
+ * experienced; the monitor thread periodically copies these values into
+ * the last_XXX fields.
+ */
+ uint64_t ops; /* Total operations */
+ uint64_t latency_ops; /* Total ops sampled for latency */
+ uint64_t latency; /* Total latency */
+
+ uint64_t last_latency_ops; /* Last read by monitor thread */
+ uint64_t last_latency;
+
+ /*
+ * Minimum/maximum latency, shared with the monitor thread, that is, the
+ * monitor thread clears it so it's recalculated again for each period.
+ */
+ uint32_t min_latency; /* Minimum latency (uS) */
+ uint32_t max_latency; /* Maximum latency (uS) */
+
+ /*
+ * Latency buckets.
+ */
+ uint32_t us[1000]; /* < 1us ... 1000us */
+ uint32_t ms[1000]; /* < 1ms ... 1000ms */
+ uint32_t sec[100]; /* < 1s 2s ... 100s */
+} TRACK;
+
+/*
+ * Per-thread state: a reference to the owning configuration, the thread's
+ * random number state and handle, its key/value buffers, the workload it
+ * runs and one TRACK of statistics per operation type.
+ */
+struct __config_thread { /* Per-thread structure */
+ CONFIG *cfg; /* Enclosing configuration */
+
+ uint32_t rnd[2]; /* Random number generation state */
+
+ pthread_t handle; /* Handle */
+
+ char *key_buf, *value_buf; /* Key/value memory */
+
+ WORKLOAD *workload; /* Workload */
+
+ TRACK ckpt; /* Checkpoint operations */
+ TRACK insert; /* Insert operations */
+ TRACK read; /* Read operations */
+ TRACK update; /* Update operations */
+};
+
+/*
+ * Functions shared between the wtperf source files: configuration handling,
+ * latency reporting, operation totals, logging and usage output.
+ */
+int config_assign(CONFIG *, const CONFIG *);
+int config_compress(CONFIG *);
+void config_free(CONFIG *);
+int config_opt_file(CONFIG *, const char *);
+int config_opt_line(CONFIG *, const char *);
+int config_opt_str(CONFIG *, const char *, const char *);
+void config_print(CONFIG *);
+int config_sanity(CONFIG *);
+void latency_insert(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
+void latency_read(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
+void latency_update(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
+void latency_print(CONFIG *);
+int enomem(const CONFIG *);
+int setup_log_file(CONFIG *);
+uint64_t sum_ckpt_ops(CONFIG *);
+uint64_t sum_insert_ops(CONFIG *);
+uint64_t sum_pop_ops(CONFIG *);
+uint64_t sum_read_ops(CONFIG *);
+uint64_t sum_update_ops(CONFIG *);
+void usage(void);
+
+/* printf-style logging: the format string is argument 4, values follow. */
+void lprintf(const CONFIG *, int err, uint32_t, const char *, ...)
+#if defined(__GNUC__)
+__attribute__((format (printf, 4, 5)))
+#endif
+;
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
new file mode 100644
index 00000000000..3b4ddb6b3ad
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i
@@ -0,0 +1,172 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * wtperf_opt.i
+ * List of options for wtperf. This is included multiple times.
+ */
+
+/* OPT_DECLARE_STRUCT: each option expands to a structure field declaration. */
+#ifdef OPT_DECLARE_STRUCT
+#define DEF_OPT_AS_BOOL(name, initval, desc) int name;
+#define DEF_OPT_AS_CONFIG_STRING(name, initval, desc) const char *name;
+#define DEF_OPT_AS_STRING(name, initval, desc) const char *name;
+#define DEF_OPT_AS_UINT32(name, initval, desc) uint32_t name;
+#endif
+
+/*
+ * OPT_DEFINE_DESC: each option expands to a table initializer holding the
+ * option name, its description, its default value as a string, its type and
+ * the offset of its field in CONFIG.
+ */
+#ifdef OPT_DEFINE_DESC
+#define DEF_OPT_AS_BOOL(name, initval, desc) \
+ { #name, desc, #initval, BOOL_TYPE, offsetof(CONFIG, name) },
+#define DEF_OPT_AS_CONFIG_STRING(name, initval, desc) \
+ { #name, desc, initval, CONFIG_STRING_TYPE, \
+ offsetof(CONFIG, name) },
+#define DEF_OPT_AS_STRING(name, initval, desc) \
+ { #name, desc, initval, STRING_TYPE, offsetof(CONFIG, name) },
+#define DEF_OPT_AS_UINT32(name, initval, desc) \
+ { #name, desc, #initval, UINT32_TYPE, offsetof(CONFIG, name) },
+#endif
+
+/* OPT_DEFINE_DEFAULT: each option expands to its initial value and a comma. */
+#ifdef OPT_DEFINE_DEFAULT
+#define DEF_OPT_AS_BOOL(name, initval, desc) initval,
+#define DEF_OPT_AS_CONFIG_STRING(name, initval, desc) initval,
+#define DEF_OPT_AS_STRING(name, initval, desc) initval,
+#define DEF_OPT_AS_UINT32(name, initval, desc) initval,
+#endif
+
+/*
+ * OPT_DEFINE_DOXYGEN: the same initializer as OPT_DEFINE_DESC, with a zero
+ * in place of the field offset.
+ */
+#ifdef OPT_DEFINE_DOXYGEN
+#define DEF_OPT_AS_BOOL(name, initval, desc) \
+ { #name, desc, #initval, BOOL_TYPE, 0 },
+#define DEF_OPT_AS_CONFIG_STRING(name, initval, desc) \
+ { #name, desc, initval, CONFIG_STRING_TYPE, 0 },
+#define DEF_OPT_AS_STRING(name, initval, desc) \
+ { #name, desc, initval, STRING_TYPE, 0 },
+#define DEF_OPT_AS_UINT32(name, initval, desc) \
+ { #name, desc, #initval, UINT32_TYPE, 0 },
+#endif
+
+/*
+ * Each option listed here represents a CONFIG struct field that may be
+ * altered on command line via -o and -O. Each option appears here as:
+ * DEF_OPT_AS_BOOL(name, initval, desc)
+ * DEF_OPT_AS_CONFIG_STRING(name, initval, desc)
+ * DEF_OPT_AS_STRING(name, initval, desc)
+ * DEF_OPT_AS_UINT32(name, initval, desc)
+ *
+ * All four forms (DEF_OPT_AS_{BOOL|CONFIG_STRING|STRING|UINT32}) take these
+ * parameters:
+ * name: a C identifier, this identifier will be a field in CONFIG,
+ * and identifies the option for -o or -O.
+ * initval: a default initial value for the field.
+ * The default values are tiny, we want the basic run to be fast.
+ * desc: a description that will appear in the usage message.
+ *
+ * The difference between CONFIG_STRING and STRING is that CONFIG_STRING
+ * options are appended to existing content, whereas STRING options overwrite.
+ */
+DEF_OPT_AS_UINT32(async_threads, 0, "number of async worker threads")
+DEF_OPT_AS_UINT32(checkpoint_interval, 120,
+ "checkpoint every interval seconds during the workload phase.")
+DEF_OPT_AS_UINT32(checkpoint_stress_rate, 0,
+ "checkpoint every rate operations during the populate phase in the "
+ "populate thread(s), 0 to disable")
+DEF_OPT_AS_UINT32(checkpoint_threads, 0, "number of checkpoint threads")
+DEF_OPT_AS_CONFIG_STRING(conn_config, "create",
+ "connection configuration string")
+DEF_OPT_AS_BOOL(compact, 0, "post-populate compact for LSM merging activity")
+DEF_OPT_AS_STRING(compression, "none",
+ "compression extension. Allowed configuration values are: "
+ "'none', 'bzip', 'snappy', 'zlib'")
+DEF_OPT_AS_BOOL(create, 1,
+ "do population phase; false to use existing database")
+DEF_OPT_AS_UINT32(database_count, 1,
+ "number of WiredTiger databases to use. Each database will execute the"
+ " workload using a separate home directory and complete set of worker"
+ " threads")
+DEF_OPT_AS_UINT32(icount, 5000,
+ "number of records to initially populate. If multiple tables are "
+ "configured, each table has this many items inserted.")
+DEF_OPT_AS_BOOL(insert_rmw, 0,
+ "execute a read prior to each insert in workload phase")
+DEF_OPT_AS_UINT32(key_sz, 20, "key size")
+DEF_OPT_AS_UINT32(min_throughput, 0,
+ "abort if any throughput measured is less than this amount. Requires "
+ "sample_interval to be configured")
+DEF_OPT_AS_UINT32(max_latency, 0,
+ "abort if any latency measured exceeds this number of milliseconds."
+ " Requires sample_interval to be configured")
+DEF_OPT_AS_BOOL(pareto, 0, "use pareto 80/20 distribution for random numbers")
+DEF_OPT_AS_UINT32(populate_ops_per_txn, 0,
+ "number of operations to group into each transaction in the populate "
+ "phase, zero for auto-commit")
+DEF_OPT_AS_UINT32(populate_threads, 1,
+ "number of populate threads, 1 for bulk load")
+DEF_OPT_AS_UINT32(random_range, 0,
+ "if non zero choose a value from within this range as the key for "
+ "insert operations")
+DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value")
+DEF_OPT_AS_UINT32(report_interval, 2,
+ "output throughput information every interval seconds, 0 to disable")
+DEF_OPT_AS_UINT32(run_ops, 0,
+ "total read, insert and update workload operations")
+DEF_OPT_AS_UINT32(run_time, 0,
+ "total workload seconds")
+DEF_OPT_AS_UINT32(sample_interval, 0,
+ "performance logging every interval seconds, 0 to disable")
+DEF_OPT_AS_UINT32(sample_rate, 50,
+ "how often the latency of operations is measured. One for every operation,"
+ " two for every second operation, three for every third operation etc.")
+DEF_OPT_AS_CONFIG_STRING(sess_config, "", "session configuration string")
+DEF_OPT_AS_CONFIG_STRING(table_config,
+ "key_format=S,value_format=S,type=lsm,exclusive=true,"
+ "allocation_size=4kb,internal_page_max=64kb,leaf_page_max=4kb,"
+ "split_pct=100",
+ "table configuration string")
+DEF_OPT_AS_UINT32(table_count, 1,
+ "number of tables to run operations over. Keys are divided evenly "
+ "over the tables. Default 1, maximum 99.")
+DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' "
+ "entry is the total number of threads, and the 'inserts', 'reads' and "
+ "'updates' entries are the ratios of insert, read and update operations "
+ "done by each worker thread; If a throttle value is provided each thread "
+ "will do a maximum of that number of operations per second; multiple "
+ "workload configurations may be "
+ "specified; for example, a more complex threads configuration might be "
+ "'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' "
+ "which would create 2 threads doing nothing but reads and 8 threads "
+ "each doing 50% inserts and 25% reads and updates. Allowed configuration "
+ "values are 'count', 'throttle', 'reads', 'inserts', 'updates'. There are "
+ "also behavior modifiers, supported modifiers are 'ops_per_txn'")
+DEF_OPT_AS_CONFIG_STRING(transaction_config, "",
+ "transaction configuration string, relevant when populate_ops_per_txn "
+ "is nonzero")
+DEF_OPT_AS_STRING(table_name, "test", "table name")
+DEF_OPT_AS_UINT32(value_sz, 100, "value size")
+DEF_OPT_AS_UINT32(verbose, 1, "verbosity")
+DEF_OPT_AS_UINT32(warmup, 0,
+ "How long to run the workload phase before starting measurements")
+
+#undef DEF_OPT_AS_BOOL
+#undef DEF_OPT_AS_CONFIG_STRING
+#undef DEF_OPT_AS_STRING
+#undef DEF_OPT_AS_UINT32
diff --git a/src/third_party/wiredtiger/build_darwin/wiredtiger_config.h b/src/third_party/wiredtiger/build_darwin/wiredtiger_config.h
new file mode 100644
index 00000000000..48b94c4078d
--- /dev/null
+++ b/src/third_party/wiredtiger/build_darwin/wiredtiger_config.h
@@ -0,0 +1,151 @@
+/* wiredtiger_config.h. Generated from config.hin by configure. */
+/* build_posix/config.hin. Generated from configure.ac by autoheader. */
+
+/* Define if building universal (internal helper macro) */
+/* #undef AC_APPLE_UNIVERSAL_BUILD */
+
+/* Define to 1 to pause for debugger attach on failure. */
+/* #undef HAVE_ATTACH */
+
+/* Build the LevelDB API with Basho LevelDB support. */
+/* #undef HAVE_BASHOLEVELDB */
+
+/* Snappy support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_SNAPPY */
+
+/* Zlib support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_ZLIB */
+
+/* Define to 1 if you have the `clock_gettime' function. */
+/* #undef HAVE_CLOCK_GETTIME */
+
+/* Define to 1 for diagnostic tests. */
+/* #undef HAVE_DIAGNOSTIC */
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the `fallocate' function. */
+/* #undef HAVE_FALLOCATE */
+
+/* Define to 1 if you have the `fcntl' function. */
+#define HAVE_FCNTL 1
+
+/* Define to 1 if you have the `fdatasync' function. */
+/* #undef HAVE_FDATASYNC */
+
+/* Define to 1 if you have the `fread_unlocked' function. */
+/* #undef HAVE_FREAD_UNLOCKED */
+
+/* Define to 1 if you have the `ftruncate' function. */
+#define HAVE_FTRUNCATE 1
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Build the LevelDB API with HyperLevelDB support. */
+/* #undef HAVE_HYPERLEVELDB */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the `bz2' library (-lbz2). */
+/* #undef HAVE_LIBBZ2 */
+
+/* Define to 1 if you have the `dl' library (-ldl). */
+#define HAVE_LIBDL 1
+
+/* Define to 1 if you have the `pthread' library (-lpthread). */
+#define HAVE_LIBPTHREAD 1
+
+/* Define to 1 if you have the `rt' library (-lrt). */
+/* #undef HAVE_LIBRT */
+
+/* Define to 1 if you have the `snappy' library (-lsnappy). */
+/* #undef HAVE_LIBSNAPPY */
+
+/* Define to 1 if you have the `z' library (-lz). */
+/* #undef HAVE_LIBZ */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+/* #undef HAVE_POSIX_FADVISE */
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+/* #undef HAVE_POSIX_FALLOCATE */
+
+/* Define to 1 if you have the `posix_madvise' function. */
+#define HAVE_POSIX_MADVISE 1
+
+/* Define to 1 if you have the `posix_memalign' function. */
+#define HAVE_POSIX_MEMALIGN 1
+
+/* Define to 1 if you have the <pthread_np.h> header file. */
+/* #undef HAVE_PTHREAD_NP_H */
+
+/* Build the LevelDB API with RocksDB support. */
+/* #undef HAVE_ROCKSDB */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strtouq' function. */
+#define HAVE_STRTOUQ 1
+
+/* Define to 1 if you have the `sync_file_range' function. */
+/* #undef HAVE_SYNC_FILE_RANGE */
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Enable verbose message configuration. */
+/* #undef HAVE_VERBOSE */
+
+/* Spinlock type from mutex.h. */
+#define SPINLOCK_TYPE SPINLOCK_PTHREAD_MUTEX
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+ significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+# define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* # undef WORDS_BIGENDIAN */
+# endif
+#endif
+
+/* Default alignment of buffers used for I/O */
+#define WT_BUFFER_ALIGNMENT_DEFAULT 0
+
+/* Enable large inode numbers on Mac OS X 10.5. */
+#ifndef _DARWIN_USE_64_BIT_INODE
+# define _DARWIN_USE_64_BIT_INODE 1
+#endif
+
+/* Number of bits in a file offset, on hosts where this is settable. */
+/* #undef _FILE_OFFSET_BITS */
+
+/* Define for large files, on AIX-style hosts. */
+/* #undef _LARGE_FILES */
diff --git a/src/third_party/wiredtiger/build_freebsd/wiredtiger_config.h b/src/third_party/wiredtiger/build_freebsd/wiredtiger_config.h
new file mode 100644
index 00000000000..aaeff21168a
--- /dev/null
+++ b/src/third_party/wiredtiger/build_freebsd/wiredtiger_config.h
@@ -0,0 +1,151 @@
+/* wiredtiger_config.h. Generated from config.hin by configure. */
+/* build_posix/config.hin. Generated from configure.ac by autoheader. */
+
+/* Define if building universal (internal helper macro) */
+/* #undef AC_APPLE_UNIVERSAL_BUILD */
+
+/* Define to 1 to pause for debugger attach on failure. */
+/* #undef HAVE_ATTACH */
+
+/* Build the LevelDB API with Basho LevelDB support. */
+/* #undef HAVE_BASHOLEVELDB */
+
+/* Snappy support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_SNAPPY */
+
+/* Zlib support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_ZLIB */
+
+/* Define to 1 if you have the `clock_gettime' function. */
+#define HAVE_CLOCK_GETTIME 1
+
+/* Define to 1 for diagnostic tests. */
+/* #undef HAVE_DIAGNOSTIC */
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the `fallocate' function. */
+/* #undef HAVE_FALLOCATE */
+
+/* Define to 1 if you have the `fcntl' function. */
+#define HAVE_FCNTL 1
+
+/* Define to 1 if you have the `fdatasync' function. */
+/* #undef HAVE_FDATASYNC */
+
+/* Define to 1 if you have the `fread_unlocked' function. */
+/* #undef HAVE_FREAD_UNLOCKED */
+
+/* Define to 1 if you have the `ftruncate' function. */
+#define HAVE_FTRUNCATE 1
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Build the LevelDB API with HyperLevelDB support. */
+/* #undef HAVE_HYPERLEVELDB */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the `bz2' library (-lbz2). */
+/* #undef HAVE_LIBBZ2 */
+
+/* Define to 1 if you have the `dl' library (-ldl). */
+/* #undef HAVE_LIBDL */
+
+/* Define to 1 if you have the `pthread' library (-lpthread). */
+#define HAVE_LIBPTHREAD 1
+
+/* Define to 1 if you have the `rt' library (-lrt). */
+#define HAVE_LIBRT 1
+
+/* Define to 1 if you have the `snappy' library (-lsnappy). */
+/* #undef HAVE_LIBSNAPPY */
+
+/* Define to 1 if you have the `z' library (-lz). */
+/* #undef HAVE_LIBZ */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+#define HAVE_POSIX_FADVISE 1
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+#define HAVE_POSIX_FALLOCATE 1
+
+/* Define to 1 if you have the `posix_madvise' function. */
+#define HAVE_POSIX_MADVISE 1
+
+/* Define to 1 if you have the `posix_memalign' function. */
+#define HAVE_POSIX_MEMALIGN 1
+
+/* Define to 1 if you have the <pthread_np.h> header file. */
+#define HAVE_PTHREAD_NP_H 1
+
+/* Build the LevelDB API with RocksDB support. */
+/* #undef HAVE_ROCKSDB */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strtouq' function. */
+#define HAVE_STRTOUQ 1
+
+/* Define to 1 if you have the `sync_file_range' function. */
+/* #undef HAVE_SYNC_FILE_RANGE */
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Enable verbose message configuration. */
+/* #undef HAVE_VERBOSE */
+
+/* Spinlock type from mutex.h. */
+#define SPINLOCK_TYPE SPINLOCK_PTHREAD_MUTEX
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+ significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+# define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* # undef WORDS_BIGENDIAN */
+# endif
+#endif
+
+/* Default alignment of buffers used for I/O */
+#define WT_BUFFER_ALIGNMENT_DEFAULT 0
+
+/* Enable large inode numbers on Mac OS X 10.5. */
+#ifndef _DARWIN_USE_64_BIT_INODE
+# define _DARWIN_USE_64_BIT_INODE 1
+#endif
+
+/* Number of bits in a file offset, on hosts where this is settable. */
+/* #undef _FILE_OFFSET_BITS */
+
+/* Define for large files, on AIX-style hosts. */
+/* #undef _LARGE_FILES */
diff --git a/src/third_party/wiredtiger/build_linux/wiredtiger_config.h b/src/third_party/wiredtiger/build_linux/wiredtiger_config.h
new file mode 100644
index 00000000000..db6ea2661d0
--- /dev/null
+++ b/src/third_party/wiredtiger/build_linux/wiredtiger_config.h
@@ -0,0 +1,151 @@
+/* wiredtiger_config.h. Generated from config.hin by configure. */
+/* build_posix/config.hin. Generated from configure.ac by autoheader. */
+
+/* Define if building universal (internal helper macro) */
+/* #undef AC_APPLE_UNIVERSAL_BUILD */
+
+/* Define to 1 to pause for debugger attach on failure. */
+/* #undef HAVE_ATTACH */
+
+/* Build the LevelDB API with Basho LevelDB support. */
+/* #undef HAVE_BASHOLEVELDB */
+
+/* Snappy support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_SNAPPY */
+
+/* Zlib support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_ZLIB */
+
+/* Define to 1 if you have the `clock_gettime' function. */
+#define HAVE_CLOCK_GETTIME 1
+
+/* Define to 1 for diagnostic tests. */
+/* #undef HAVE_DIAGNOSTIC */
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the `fallocate' function. */
+/* #undef HAVE_FALLOCATE */
+
+/* Define to 1 if you have the `fcntl' function. */
+#define HAVE_FCNTL 1
+
+/* Define to 1 if you have the `fdatasync' function. */
+#define HAVE_FDATASYNC 1
+
+/* Define to 1 if you have the `fread_unlocked' function. */
+#define HAVE_FREAD_UNLOCKED 1
+
+/* Define to 1 if you have the `ftruncate' function. */
+#define HAVE_FTRUNCATE 1
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Build the LevelDB API with HyperLevelDB support. */
+/* #undef HAVE_HYPERLEVELDB */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the `bz2' library (-lbz2). */
+/* #undef HAVE_LIBBZ2 */
+
+/* Define to 1 if you have the `dl' library (-ldl). */
+#define HAVE_LIBDL 1
+
+/* Define to 1 if you have the `pthread' library (-lpthread). */
+#define HAVE_LIBPTHREAD 1
+
+/* Define to 1 if you have the `rt' library (-lrt). */
+#define HAVE_LIBRT 1
+
+/* Define to 1 if you have the `snappy' library (-lsnappy). */
+/* #undef HAVE_LIBSNAPPY */
+
+/* Define to 1 if you have the `z' library (-lz). */
+/* #undef HAVE_LIBZ */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+#define HAVE_POSIX_FADVISE 1
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+#define HAVE_POSIX_FALLOCATE 1
+
+/* Define to 1 if you have the `posix_madvise' function. */
+#define HAVE_POSIX_MADVISE 1
+
+/* Define to 1 if you have the `posix_memalign' function. */
+#define HAVE_POSIX_MEMALIGN 1
+
+/* Define to 1 if you have the <pthread_np.h> header file. */
+/* #undef HAVE_PTHREAD_NP_H */
+
+/* Build the LevelDB API with RocksDB support. */
+/* #undef HAVE_ROCKSDB */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strtouq' function. */
+#define HAVE_STRTOUQ 1
+
+/* Define to 1 if you have the `sync_file_range' function. */
+/* #undef HAVE_SYNC_FILE_RANGE */
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Enable verbose message configuration. */
+/* #undef HAVE_VERBOSE */
+
+/* Spinlock type from mutex.h. */
+#define SPINLOCK_TYPE SPINLOCK_PTHREAD_MUTEX
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+ significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+# define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* # undef WORDS_BIGENDIAN */
+# endif
+#endif
+
+/* Default alignment of buffers used for I/O */
+#define WT_BUFFER_ALIGNMENT_DEFAULT 4096
+
+/* Enable large inode numbers on Mac OS X 10.5. */
+#ifndef _DARWIN_USE_64_BIT_INODE
+# define _DARWIN_USE_64_BIT_INODE 1
+#endif
+
+/* Number of bits in a file offset, on hosts where this is settable. */
+/* #undef _FILE_OFFSET_BITS */
+
+/* Define for large files, on AIX-style hosts. */
+/* #undef _LARGE_FILES */
diff --git a/src/third_party/wiredtiger/build_posix/Make.base b/src/third_party/wiredtiger/build_posix/Make.base
new file mode 100644
index 00000000000..51a8e77cebe
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/Make.base
@@ -0,0 +1,81 @@
+ACLOCAL_AMFLAGS = -I build_posix/aclocal
+
+# BEGIN SUBDIRS, maintained by makemake and Make.subdirs
+# END SUBDIRS
+
+lib_LTLIBRARIES = libwiredtiger.la
+LDADD = $(lib_LTLIBRARIES)
+
+# The LevelDB libraries prefer to include all of the objects in a single
+# library. Create a convenience library for them.
+if LEVELDB
+noinst_LTLIBRARIES = libwiredtiger_static.la
+endif
+
+# BEGIN SOURCES, maintained by makemake and dist/filelist
+# END SOURCES
+
+bin_PROGRAMS = wt
+wt_SOURCES =\
+ src/utilities/util_backup.c \
+ src/utilities/util_cpyright.c \
+ src/utilities/util_compact.c \
+ src/utilities/util_create.c \
+ src/utilities/util_drop.c \
+ src/utilities/util_dump.c \
+ src/utilities/util_list.c \
+ src/utilities/util_load.c \
+ src/utilities/util_load_json.c \
+ src/utilities/util_loadtext.c \
+ src/utilities/util_main.c \
+ src/utilities/util_misc.c \
+ src/utilities/util_printlog.c \
+ src/utilities/util_read.c \
+ src/utilities/util_rename.c \
+ src/utilities/util_salvage.c \
+ src/utilities/util_stat.c \
+ src/utilities/util_upgrade.c \
+ src/utilities/util_verbose.c \
+ src/utilities/util_verify.c \
+ src/utilities/util_write.c
+
+man1_MANS = $(MAN1_PAGES)
+man3_MANS = $(MAN3_PAGES)
+
+include_HEADERS= wiredtiger.h src/include/wiredtiger_ext.h
+AM_CPPFLAGS = -I$(srcdir)/src/include
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = wiredtiger.pc
+
+$(srcdir)/Makefile.am: $(srcdir)/build_posix/Make.base $(srcdir)/build_posix/makemake $(srcdir)/dist/filelist
+ @cd $(srcdir)/build_posix && sh makemake
+
+libtool: $(LIBTOOL_DEPS)
+ $(SHELL) ./config.status libtool
+
+$(srcdir)/src/include/extern.h: auto-includes.chk
+$(srcdir)/src/include/wt_internal.h: auto-includes.chk
+
+auto-includes.chk: $(libwiredtiger_la_SOURCES)
+ @(cd $(srcdir)/dist && sh s_prototypes && sh s_typedef -b) && touch $@
+
+$(srcdir)/docs/index.html:
+ @cd $(srcdir)/dist && sh s_docs
+
+$(srcdir)/docs/man/man1/wt.1: $(srcdir)/docs/index.html
+$(srcdir)/docs/man/man3/wiredtiger.3: $(srcdir)/docs/index.html
+
+libwiredtiger_la_LIBADD =
+if HAVE_BUILTIN_EXTENSION_SNAPPY
+libwiredtiger_la_LIBADD += ext/compressors/snappy/libwiredtiger_snappy.la
+endif
+if HAVE_BUILTIN_EXTENSION_ZLIB
+libwiredtiger_la_LIBADD += ext/compressors/zlib/libwiredtiger_zlib.la
+endif
+
+libwiredtiger_static_la_LIBADD=$(libwiredtiger_la_LIBADD)
+libwiredtiger_static_la_SOURCES=$(libwiredtiger_la_SOURCES)
+
+clean-local:
+ rm -rf WT_TEST
diff --git a/src/third_party/wiredtiger/build_posix/Make.subdirs b/src/third_party/wiredtiger/build_posix/Make.subdirs
new file mode 100644
index 00000000000..d37acef50e1
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/Make.subdirs
@@ -0,0 +1,28 @@
+# List of sub-directories, used by makemake to create Makefile.am
+#
+# The format is:
+# <dir> [<condition>]
+#
+# If the directory exists, it is added to AUTO_SUBDIRS.
+# If a condition is included, the subdir is made conditional via AM_CONDITIONAL
+ext/collators/reverse
+ext/compressors/bzip2 BZIP2
+ext/compressors/nop
+ext/compressors/snappy SNAPPY
+ext/compressors/zlib ZLIB
+ext/datasources/helium HAVE_HELIUM
+ext/test/kvs_bdb HAVE_BERKELEY_DB
+.
+api/leveldb LEVELDB
+bench/wtperf
+examples/c
+lang/java JAVA
+examples/java JAVA
+lang/python PYTHON
+test/bloom
+test/checkpoint
+test/fops
+test/format HAVE_BERKELEY_DB
+test/huge
+test/salvage
+test/thread
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_check_class.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_check_class.m4
new file mode 100644
index 00000000000..098aa77290b
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_check_class.m4
@@ -0,0 +1,144 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_check_class.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_CHECK_CLASS
+#
+# DESCRIPTION
+#
+# AX_CHECK_CLASS tests the existence of a given Java class, either in a
+# jar or in a '.class' file.
+#
+# *Warning*: its success or failure can depend on a proper setting of the
+# CLASSPATH env. variable.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set, some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 7
+
+AU_ALIAS([AC_CHECK_CLASS], [AX_CHECK_CLASS])
+AC_DEFUN([AX_CHECK_CLASS],[
+AC_REQUIRE([AX_PROG_JAVA])
+ac_var_name=`echo $1 | sed 's/\./_/g'`
+dnl Normally I'd use an AC_CACHE_CHECK here but since the variable name is
+dnl dynamic I need an extra level of extraction
+AC_MSG_CHECKING([for $1 class])
+AC_CACHE_VAL(ax_cv_class_$ac_var_name, [
+if test x$ac_cv_prog_uudecode_base64 = xyes; then
+dnl /**
+dnl * Test.java: used to test dynamically if a class exists.
+dnl */
+dnl public class Test
+dnl {
+dnl
+dnl public static void
+dnl main( String[] argv )
+dnl {
+dnl Class lib;
+dnl if (argv.length < 1)
+dnl {
+dnl System.err.println ("Missing argument");
+dnl System.exit (77);
+dnl }
+dnl try
+dnl {
+dnl lib = Class.forName (argv[0]);
+dnl }
+dnl catch (ClassNotFoundException e)
+dnl {
+dnl System.exit (1);
+dnl }
+dnl lib = null;
+dnl System.exit (0);
+dnl }
+dnl
+dnl }
+cat << \EOF > Test.uue
+begin-base64 644 Test.class
+yv66vgADAC0AKQcAAgEABFRlc3QHAAQBABBqYXZhL2xhbmcvT2JqZWN0AQAE
+bWFpbgEAFihbTGphdmEvbGFuZy9TdHJpbmc7KVYBAARDb2RlAQAPTGluZU51
+bWJlclRhYmxlDAAKAAsBAANlcnIBABVMamF2YS9pby9QcmludFN0cmVhbTsJ
+AA0ACQcADgEAEGphdmEvbGFuZy9TeXN0ZW0IABABABBNaXNzaW5nIGFyZ3Vt
+ZW50DAASABMBAAdwcmludGxuAQAVKExqYXZhL2xhbmcvU3RyaW5nOylWCgAV
+ABEHABYBABNqYXZhL2lvL1ByaW50U3RyZWFtDAAYABkBAARleGl0AQAEKEkp
+VgoADQAXDAAcAB0BAAdmb3JOYW1lAQAlKExqYXZhL2xhbmcvU3RyaW5nOylM
+amF2YS9sYW5nL0NsYXNzOwoAHwAbBwAgAQAPamF2YS9sYW5nL0NsYXNzBwAi
+AQAgamF2YS9sYW5nL0NsYXNzTm90Rm91bmRFeGNlcHRpb24BAAY8aW5pdD4B
+AAMoKVYMACMAJAoAAwAlAQAKU291cmNlRmlsZQEACVRlc3QuamF2YQAhAAEA
+AwAAAAAAAgAJAAUABgABAAcAAABtAAMAAwAAACkqvgSiABCyAAwSD7YAFBBN
+uAAaKgMyuAAeTKcACE0EuAAaAUwDuAAasQABABMAGgAdACEAAQAIAAAAKgAK
+AAAACgAAAAsABgANAA4ADgATABAAEwASAB4AFgAiABgAJAAZACgAGgABACMA
+JAABAAcAAAAhAAEAAQAAAAUqtwAmsQAAAAEACAAAAAoAAgAAAAQABAAEAAEA
+JwAAAAIAKA==
+====
+EOF
+ if $UUDECODE Test.uue; then
+ :
+ else
+ echo "configure: __oline__: uudecode had trouble decoding base 64 file 'Test.uue'" >&AS_MESSAGE_LOG_FD
+ echo "configure: failed file was:" >&AS_MESSAGE_LOG_FD
+ cat Test.uue >&AS_MESSAGE_LOG_FD
+ ac_cv_prog_uudecode_base64=no
+ fi
+ rm -f Test.uue
+ if AC_TRY_COMMAND($JAVA $JAVAFLAGS Test $1) >/dev/null 2>&1; then
+ eval "ac_cv_class_$ac_var_name=yes"
+ else
+ eval "ac_cv_class_$ac_var_name=no"
+ fi
+ rm -f Test.class
+else
+ AX_TRY_COMPILE_JAVA([$1], , [eval "ac_cv_class_$ac_var_name=yes"],
+ [eval "ac_cv_class_$ac_var_name=no"])
+fi
+eval "ac_var_val=$`eval echo ac_cv_class_$ac_var_name`"
+eval "HAVE_$ac_var_name=$`echo ac_cv_class_$ac_var_val`"
+HAVE_LAST_CLASS=$ac_var_val
+if test x$ac_var_val = xyes; then
+ ifelse([$2], , :, [$2])
+else
+ ifelse([$3], , :, [$3])
+fi
+])
+dnl for some reason the above statement didn't fall through here?
+dnl do scripts have variable scoping?
+eval "ac_var_val=$`eval echo ac_cv_class_$ac_var_name`"
+AC_MSG_RESULT($ac_var_val)
+])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_check_junit.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_check_junit.m4
new file mode 100644
index 00000000000..724e0e0814f
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_check_junit.m4
@@ -0,0 +1,72 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_check_junit.html
+# ===========================================================================
+#
+# WiredTiger: Updated to use JUnit 4 call semantics.
+#
+# SYNOPSIS
+#
+# AX_CHECK_JUNIT
+#
+# DESCRIPTION
+#
+# AX_CHECK_JUNIT tests the availability of the Junit testing framework,
+# and set some variables for conditional compilation of the test suite by
+# automake.
+#
+# If available, JUNIT is set to a command launching the text based user
+# interface of Junit, @JAVA_JUNIT@ is set to $JAVA_JUNIT and @TESTS_JUNIT@
+# is set to $TESTS_JUNIT, otherwise they are set to empty values.
+#
+# You can use these variables in your Makefile.am file like this :
+#
+# # Some of the following classes are built only if junit is available
+# JAVA_JUNIT = Class1Test.java Class2Test.java AllJunitTests.java
+#
+# noinst_JAVA = Example1.java Example2.java @JAVA_JUNIT@
+#
+# EXTRA_JAVA = $(JAVA_JUNIT)
+#
+# TESTS_JUNIT = AllJunitTests
+#
+# TESTS = StandaloneTest1 StandaloneTest2 @TESTS_JUNIT@
+#
+# EXTRA_TESTS = $(TESTS_JUNIT)
+#
+# AllJunitTests :
+# echo "#! /bin/sh" > $@
+# echo "exec @JUNIT@ my.package.name.AllJunitTests" >> $@
+# chmod +x $@
+#
+# LICENSE
+#
+# Copyright (c) 2008 Luc Maisonobe <luc@spaceroots.org>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 5
+
+AU_ALIAS([AC_CHECK_JUNIT], [AX_CHECK_JUNIT])
+AC_DEFUN([AX_CHECK_JUNIT],[
+AC_CACHE_VAL(ac_cv_prog_JUNIT,[
+AX_CHECK_CLASS(org.junit.runner.JUnitCore)
+if test x"`eval 'echo $ac_cv_class_org_junit_runner_JUnitCore'`" != xno ; then
+ ac_cv_prog_JUNIT='$(CLASSPATH_ENV) $(JAVA) $(JAVAFLAGS) org.junit.runner.JUnitCore'
+fi])
+AC_MSG_CHECKING([for junit])
+if test x"`eval 'echo $ac_cv_prog_JUNIT'`" != x ; then
+ JUNIT="$ac_cv_prog_JUNIT"
+ JAVA_JUNIT='$(JAVA_JUNIT)'
+ TESTS_JUNIT='$(TESTS_JUNIT)'
+else
+ JUNIT=
+ JAVA_JUNIT=
+ TESTS_JUNIT=
+fi
+AC_MSG_RESULT($JAVA_JUNIT)
+AC_SUBST(JUNIT)
+AC_SUBST(JAVA_JUNIT)
+AC_SUBST(TESTS_JUNIT)])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_java_options.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_java_options.m4
new file mode 100644
index 00000000000..36c10d922bd
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_java_options.m4
@@ -0,0 +1,48 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_java_options.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_JAVA_OPTIONS
+#
+# DESCRIPTION
+#
+# AX_JAVA_OPTIONS adds configure command line options used for Java m4
+# macros. This Macro is optional.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set, some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Devin Weaver <ktohg@tritarget.com>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 6
+
+AU_ALIAS([AC_JAVA_OPTIONS], [AX_JAVA_OPTIONS])
+AC_DEFUN([AX_JAVA_OPTIONS],[
+AC_ARG_WITH(java-prefix,
+ [ --with-java-prefix=PFX prefix where Java runtime is installed (optional)])
+AC_ARG_WITH(javac-flags,
+ [ --with-javac-flags=FLAGS flags to pass to the Java compiler (optional)])
+AC_ARG_WITH(java-flags,
+ [ --with-java-flags=FLAGS flags to pass to the Java VM (optional)])
+JAVAPREFIX=$with_java_prefix
+JAVACFLAGS=$with_javac_flags
+JAVAFLAGS=$with_java_flags
+AC_SUBST(JAVAPREFIX)dnl
+AC_SUBST(JAVACFLAGS)dnl
+AC_SUBST(JAVAFLAGS)dnl
+AC_SUBST(JAVA)dnl
+AC_SUBST(JAVAC)dnl
+])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_jni_include_dir.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_jni_include_dir.m4
new file mode 100644
index 00000000000..249e3650f0d
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_jni_include_dir.m4
@@ -0,0 +1,128 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_jni_include_dir.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_JNI_INCLUDE_DIR
+#
+# DESCRIPTION
+#
+# AX_JNI_INCLUDE_DIR finds include directories needed for compiling
+# programs using the JNI interface.
+#
+# JNI include directories are usually in the Java distribution. This is
+# deduced from the value of $JAVA_HOME, $JAVAC, or the path to "javac", in
+# that order. When this macro completes, a list of directories is left in
+# the variable JNI_INCLUDE_DIRS.
+#
+# Example usage follows:
+#
+# AX_JNI_INCLUDE_DIR
+#
+# for JNI_INCLUDE_DIR in $JNI_INCLUDE_DIRS
+# do
+# CPPFLAGS="$CPPFLAGS -I$JNI_INCLUDE_DIR"
+# done
+#
+# If you want to force a specific compiler:
+#
+# - at the configure.in level, set JAVAC=yourcompiler before calling
+# AX_JNI_INCLUDE_DIR
+#
+# - at the configure level, setenv JAVAC
+#
+# Note: This macro can work with the autoconf M4 macros for Java programs.
+# This particular macro is not part of the original set of macros.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Don Anderson <dda@sleepycat.com>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 10
+
+AU_ALIAS([AC_JNI_INCLUDE_DIR], [AX_JNI_INCLUDE_DIR])
+AC_DEFUN([AX_JNI_INCLUDE_DIR],[
+# Result: JNI_INCLUDE_DIRS lists the directory holding jni.h preceded by its platform subdirectories.
+JNI_INCLUDE_DIRS=""
+# Start from JAVA_HOME when it is set, otherwise from the directory of the symlink-resolved javac.
+if test "x$JAVA_HOME" != x; then
+ _JTOPDIR="$JAVA_HOME"
+else
+ if test "x$JAVAC" = x; then
+  JAVAC=javac
+ fi
+ AC_PATH_PROG([_ACJNI_JAVAC], [$JAVAC], [no])
+ if test "x$_ACJNI_JAVAC" = xno; then
+  AC_MSG_ERROR([cannot find JDK; try setting \$JAVAC or \$JAVA_HOME])
+ fi
+ _ACJNI_FOLLOW_SYMLINKS("$_ACJNI_JAVAC")
+ _JTOPDIR=`echo "$_ACJNI_FOLLOWED" | sed -e 's://*:/:g' -e 's:/[[^/]]*$::'`
+fi
+
+case "$host_os" in
+ darwin*) _JTOPDIR=`echo "$_JTOPDIR" | sed -e 's:/[[^/]]*$::'`
+ _JINC="$_JTOPDIR/Headers";;
+ *) _JINC="$_JTOPDIR/include";;
+esac
+_AS_ECHO_LOG([_JTOPDIR=$_JTOPDIR])
+_AS_ECHO_LOG([_JINC=$_JINC])
+
+# get the likely subdirectories for system specific java includes
+case "$host_os" in
+bsdi*) _JNI_INC_SUBDIRS="bsdos";;
+freebsd*) _JNI_INC_SUBDIRS="freebsd";;
+linux*) _JNI_INC_SUBDIRS="linux genunix";;
+osf*) _JNI_INC_SUBDIRS="alpha";;
+solaris*) _JNI_INC_SUBDIRS="solaris";;
+mingw*) _JNI_INC_SUBDIRS="win32";;
+cygwin*) _JNI_INC_SUBDIRS="win32";;
+*) _JNI_INC_SUBDIRS="genunix";;
+esac
+
+# search for jni.h in the paths
+found=no
+for dir in "$_JINC" "`echo "$_JTOPDIR" | sed -e 's:/[[^/]]*$::'`"/include ; do
+ # tentatively collect the platform subdirectories present under this candidate
+ saved_CPPFLAGS="$CPPFLAGS"
+ _jni_dir_list=""
+ for JINCSUBDIR in $_JNI_INC_SUBDIRS ; do
+  if test -d "$dir/$JINCSUBDIR" ; then
+   _jni_dir_list="$_jni_dir_list $dir/$JINCSUBDIR"
+   CPPFLAGS="$CPPFLAGS -I$dir/$JINCSUBDIR"
+  fi
+ done
+ AC_CHECK_HEADER([$dir/jni.h], [found=yes])
+ CPPFLAGS="$saved_CPPFLAGS" # restore on success as well so the probe flags never leak
+ if test "$found" = yes; then JNI_INCLUDE_DIRS="$_jni_dir_list $dir"; break; fi
+done
+
+if test "$found" = no ; then
+ AC_MSG_ERROR([cannot find JDK header files])
+fi
+])
+
+# _ACJNI_FOLLOW_SYMLINKS <path>
+# Resolves <path> through every level of symbolic link reported by ls -ld,
+# finally leaving the resolved path in the variable _ACJNI_FOLLOWED
+# ----------------------------------------
+AC_DEFUN([_ACJNI_FOLLOW_SYMLINKS],[
+# resolve one symbolic link level per iteration, reporting each step
+_cur="$1"
+while ls -ld "$_cur" 2>/dev/null | grep " -> " >/dev/null; do
+ AC_MSG_CHECKING([symlink for $_cur])
+ _slink=`ls -ld "$_cur" | sed 's/.* -> //'`
+ case "$_slink" in
+ /*) _cur="$_slink";;
+ # relative target: append it to the directory part of the link. 'X' avoids triggering unwanted echo options.
+ *) _cur=`echo "X$_cur" | sed -e 's/^X//' -e 's:[[^/]]*$::'`"$_slink";;
+ esac
+ AC_MSG_RESULT([$_cur])
+done
+_ACJNI_FOLLOWED="$_cur"
+])# _ACJNI_FOLLOW_SYMLINKS
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_pkg_swig.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_pkg_swig.m4
new file mode 100644
index 00000000000..9ebdeb531b9
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_pkg_swig.m4
@@ -0,0 +1,135 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_pkg_swig.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PKG_SWIG([major.minor.micro], [action-if-found], [action-if-not-found])
+#
+# DESCRIPTION
+#
+# This macro searches for a SWIG installation on your system. If found,
+# then SWIG is AC_SUBST'd; if not found, then $SWIG is empty. If SWIG is
+# found, then SWIG_LIB is set to the SWIG library path, and AC_SUBST'd.
+#
+# You can use the optional first argument to check if the version of the
+# available SWIG is greater than or equal to the value of the argument. It
+# should have the format: N[.N[.N]] (N is a number between 0 and 999. Only
+# the first N is mandatory.) If the version argument is given (e.g.
+# 1.3.17), AX_PKG_SWIG checks that the swig package is this version number
+# or higher.
+#
+# As usual, action-if-found is executed if SWIG is found, otherwise
+# action-if-not-found is executed.
+#
+# In configure.in, use as:
+#
+# AX_PKG_SWIG(1.3.17, [], [ AC_MSG_ERROR([SWIG is required to build..]) ])
+# AX_SWIG_ENABLE_CXX
+# AX_SWIG_MULTI_MODULE_SUPPORT
+# AX_SWIG_PYTHON
+#
+# LICENSE
+#
+# Copyright (c) 2008 Sebastian Huber <sebastian-huber@web.de>
+# Copyright (c) 2008 Alan W. Irwin <irwin@beluga.phys.uvic.ca>
+# Copyright (c) 2008 Rafael Laboissiere <rafael@laboissiere.net>
+# Copyright (c) 2008 Andrew Collier <colliera@ukzn.ac.za>
+# Copyright (c) 2011 Murray Cumming <murrayc@openismus.com>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AC_DEFUN([AX_PKG_SWIG],[
+ # Locate SWIG, optionally enforce the minimum version given as the first
+ # macro argument, then run the found or not-found action. SWIG is left
+ # empty when no acceptable installation is available.
+ # Some systems have SWIG 2.0 named "swig2.0"
+ AC_PATH_PROGS([SWIG],[swig2.0 swig])
+ if test -z "$SWIG" ; then
+ m4_ifval([$3],[$3],[:])
+ elif test -z "$1" ; then
+ # No minimum version was requested, so any SWIG found is acceptable.
+ AC_MSG_CHECKING([for SWIG library])
+ SWIG_LIB=`$SWIG -swiglib`
+ AC_MSG_RESULT([$SWIG_LIB])
+ m4_ifval([$2],[$2],[])
+ else
+ AC_MSG_CHECKING([SWIG version])
+ # Anchor on the first digit so a multi-digit major version is kept whole.
+ [swig_version=`$SWIG -version 2>&1 | grep 'SWIG Version' | sed 's/^[^0-9]*\([0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*\).*/\1/'`]
+ AC_MSG_RESULT([$swig_version])
+ if test -n "$swig_version" ; then
+ # Calculate the required version number components
+ [required=$1]
+ [required_major=`echo $required | sed 's/[^0-9].*//'`]
+ test -n "$required_major" || required_major=0
+ [required=`echo $required | sed 's/[0-9]*[^0-9]//'`]
+ [required_minor=`echo $required | sed 's/[^0-9].*//'`]
+ test -n "$required_minor" || required_minor=0
+ [required=`echo $required | sed 's/[0-9]*[^0-9]//'`]
+ [required_patch=`echo $required | sed 's/[^0-9].*//'`]
+ test -n "$required_patch" || required_patch=0
+ # Calculate the available version number components
+ [available=$swig_version]
+ [available_major=`echo $available | sed 's/[^0-9].*//'`]
+ test -n "$available_major" || available_major=0
+ [available=`echo $available | sed 's/[0-9]*[^0-9]//'`]
+ [available_minor=`echo $available | sed 's/[^0-9].*//'`]
+ test -n "$available_minor" || available_minor=0
+ [available=`echo $available | sed 's/[0-9]*[^0-9]//'`]
+ [available_patch=`echo $available | sed 's/[^0-9].*//'`]
+ test -n "$available_patch" || available_patch=0
+ # Convert the version tuple into a single number for easier comparison.
+ # Using base 100 should be safe since SWIG internally uses BCD values
+ # to encode its version number.
+ required_swig_vernum=`expr $required_major \* 10000 \
+ \+ $required_minor \* 100 \+ $required_patch`
+ available_swig_vernum=`expr $available_major \* 10000 \
+ \+ $available_minor \* 100 \+ $available_patch`
+
+ if test $available_swig_vernum -lt $required_swig_vernum; then
+ # Too old: clear SWIG so callers see it as unavailable.
+ AC_MSG_WARN([SWIG version >= $1 is required. You have $swig_version.])
+ SWIG=''
+ m4_ifval([$3],[$3],[])
+ else
+ AC_MSG_CHECKING([for SWIG library])
+ SWIG_LIB=`$SWIG -swiglib`
+ AC_MSG_RESULT([$SWIG_LIB])
+ m4_ifval([$2],[$2],[])
+ fi
+ else
+ AC_MSG_WARN([cannot determine SWIG version])
+ SWIG=''
+ m4_ifval([$3],[$3],[])
+ fi
+ fi
+ # The library path is substituted unconditionally, whatever its value.
+ AC_SUBST([SWIG_LIB])
+])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_jar.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_jar.m4
new file mode 100644
index 00000000000..776e804ad9f
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_jar.m4
@@ -0,0 +1,52 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_jar.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAR
+#
+# DESCRIPTION
+#
+# AX_PROG_JAR tests for an existing jar program. It uses the environment
+# variable JAR then tests in sequence various common jar programs.
+#
+# If you want to force a specific jar program:
+#
+# - at the configure.in level, set JAR=yourjar before calling
+# AX_PROG_JAR
+#
+# - at the configure level, setenv JAR
+#
+# You can use the JAR variable in your Makefile.in, with @JAR@.
+#
+# Note: This macro depends on the autoconf M4 macros for Java programs. It
+# is VERY IMPORTANT that you download that whole set; some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of a set of macros, so I had to break it up for submission.
+#
+# The general documentation of those macros, as well as the sample
+# configure.in, is included in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Egon Willighagen <e.willighagen@science.ru.nl>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 6
+
+AU_ALIAS([AC_PROG_JAR], [AX_PROG_JAR])
+AC_DEFUN([AX_PROG_JAR],[
+AC_REQUIRE([AC_EXEEXT])dnl
+if test "x$JAVAPREFIX" = x; then # a preset JAR always wins; otherwise search PATH
+ test "x$JAR" = x && AC_CHECK_PROGS([JAR], [jar$EXEEXT])
+else # search only the bin directory of the configured prefix, passed as the path argument
+ test "x$JAR" = x && AC_CHECK_PROGS([JAR], [jar], [], [$JAVAPREFIX/bin])
+fi
+test "x$JAR" = x && AC_MSG_ERROR([no acceptable jar program found in \$PATH])
+AC_PROVIDE([$0])dnl
+])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_java.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_java.m4
new file mode 100644
index 00000000000..5471f322d25
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_java.m4
@@ -0,0 +1,115 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_java.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAVA
+#
+# DESCRIPTION
+#
+# Here is a summary of the main macros:
+#
+# AX_PROG_JAVAC: finds a Java compiler.
+#
+# AX_PROG_JAVA: finds a Java virtual machine.
+#
+# AX_CHECK_CLASS: finds if we have the given class (beware of CLASSPATH!).
+#
+# AX_CHECK_RQRD_CLASS: finds if we have the given class and stops
+# otherwise.
+#
+# AX_TRY_COMPILE_JAVA: attempt to compile user given source.
+#
+# AX_TRY_RUN_JAVA: attempt to compile and run user given source.
+#
+# AX_JAVA_OPTIONS: adds Java configure options.
+#
+# AX_PROG_JAVA tests an existing Java virtual machine. It uses the
+# environment variable JAVA then tests in sequence various common Java
+# virtual machines. For political reasons, it starts with the free ones.
+# You *must* call [AX_PROG_JAVAC] before.
+#
+# If you want to force a specific VM:
+#
+# - at the configure.in level, set JAVA=yourvm before calling AX_PROG_JAVA
+#
+# (but after AC_INIT)
+#
+# - at the configure level, setenv JAVA
+#
+# You can use the JAVA variable in your Makefile.in, with @JAVA@.
+#
+# *Warning*: its success or failure can depend on a proper setting of the
+# CLASSPATH env. variable.
+#
+# TODO: allow to exclude virtual machines (rationale: most Java programs
+# cannot run with some VM like kaffe).
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set; some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of a set of macros, so I had to break it up for submission.
+#
+# A Web page, with a link to the latest CVS snapshot is at
+# <http://www.internatif.org/bortzmeyer/autoconf-Java/>.
+#
+# This is a sample configure.in. Process this file with autoconf to
+# produce a configure script.
+#
+# AC_INIT(UnTag.java)
+#
+# dnl Checks for programs.
+# AC_CHECK_CLASSPATH
+# AX_PROG_JAVAC
+# AX_PROG_JAVA
+#
+# dnl Checks for classes
+# AX_CHECK_RQRD_CLASS(org.xml.sax.Parser)
+# AX_CHECK_RQRD_CLASS(com.jclark.xml.sax.Driver)
+#
+# AC_OUTPUT(Makefile)
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AU_ALIAS([AC_PROG_JAVA], [AX_PROG_JAVA])
+AC_DEFUN([AX_PROG_JAVA],[
+if test "x$JAVAPREFIX" = x; then # a preset JAVA always wins; otherwise search PATH
+ test "x$JAVA" = x && AC_CHECK_PROGS([JAVA], [kaffe java])
+else # search only the bin directory of the configured prefix, passed as the path argument
+ test "x$JAVA" = x && AC_CHECK_PROGS([JAVA], [kaffe java], [], [$JAVAPREFIX/bin])
+fi
+test "x$JAVA" = x && AC_MSG_ERROR([no acceptable Java virtual machine found in \$PATH])
+AX_PROG_JAVA_WORKS
+AC_PROVIDE([$0])dnl
+])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_java_works.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_java_works.m4
new file mode 100644
index 00000000000..741bd561b62
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_java_works.m4
@@ -0,0 +1,134 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_java_works.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAVA_WORKS
+#
+# DESCRIPTION
+#
+# Internal use ONLY.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set; some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of a set of macros, so I had to break it up for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AU_ALIAS([AC_PROG_JAVA_WORKS], [AX_PROG_JAVA_WORKS])
+AC_DEFUN([AX_PROG_JAVA_WORKS], [
+AC_PATH_PROG(UUDECODE, uudecode, [no])
+if test x$UUDECODE != xno; then # uudecode exists: try unpacking the embedded class file with it
+AC_CACHE_CHECK([if uudecode can decode base 64 file], ac_cv_prog_uudecode_base64, [
+dnl /**
+dnl * Test.java: reference listing; the encoded Test.class below is presumably built from it.
+dnl */
+dnl public class Test
+dnl {
+dnl
+dnl public static void
+dnl main( String[] argv )
+dnl {
+dnl System.exit (0);
+dnl }
+dnl
+dnl }
+cat << \EOF > Test.uue
+begin-base64 644 Test.class
+yv66vgADAC0AFQcAAgEABFRlc3QHAAQBABBqYXZhL2xhbmcvT2JqZWN0AQAE
+bWFpbgEAFihbTGphdmEvbGFuZy9TdHJpbmc7KVYBAARDb2RlAQAPTGluZU51
+bWJlclRhYmxlDAAKAAsBAARleGl0AQAEKEkpVgoADQAJBwAOAQAQamF2YS9s
+YW5nL1N5c3RlbQEABjxpbml0PgEAAygpVgwADwAQCgADABEBAApTb3VyY2VG
+aWxlAQAJVGVzdC5qYXZhACEAAQADAAAAAAACAAkABQAGAAEABwAAACEAAQAB
+AAAABQO4AAyxAAAAAQAIAAAACgACAAAACgAEAAsAAQAPABAAAQAHAAAAIQAB
+AAEAAAAFKrcAErEAAAABAAgAAAAKAAIAAAAEAAQABAABABMAAAACABQ=
+====
+EOF
+if $UUDECODE Test.uue; then
+ ac_cv_prog_uudecode_base64=yes
+else
+ echo "configure: __oline__: uudecode had trouble decoding base 64 file 'Test.uue'" >&AS_MESSAGE_LOG_FD
+ echo "configure: failed file was:" >&AS_MESSAGE_LOG_FD
+ cat Test.uue >&AS_MESSAGE_LOG_FD
+ ac_cv_prog_uudecode_base64=no
+fi
+rm -f Test.uue])
+fi
+if test x$ac_cv_prog_uudecode_base64 != xyes; then # no decoded class file so one must be compiled
+ rm -f Test.class
+ AC_MSG_WARN([I have to compile Test.class from scratch])
+ if test x$ac_cv_prog_javac_works = xno; then
+ AC_MSG_ERROR([Cannot compile java source. $JAVAC does not work properly])
+ fi
+ if test x$ac_cv_prog_javac_works = x; then # the compiler has not been checked yet
+ AX_PROG_JAVAC
+ fi
+fi
+AC_CACHE_CHECK(if $JAVA works, ac_cv_prog_java_works, [
+JAVA_TEST=Test.java
+CLASS_TEST=Test.class
+TEST=Test
+changequote(, )dnl
+cat << \EOF > $JAVA_TEST
+/* [#]line __oline__ "configure" */
+public class Test {
+public static void main (String args[]) {
+ System.exit (0);
+} }
+EOF
+changequote([, ])dnl
+if test x$ac_cv_prog_uudecode_base64 != xyes; then # compile only when the class file was not decoded
+ if AC_TRY_COMMAND($JAVAC $JAVACFLAGS $JAVA_TEST) && test -s $CLASS_TEST; then
+ :
+ else
+ echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+ cat $JAVA_TEST >&AS_MESSAGE_LOG_FD
+ AC_MSG_ERROR(The Java compiler $JAVAC failed (see config.log, check the CLASSPATH?))
+ fi
+fi
+if AC_TRY_COMMAND($JAVA $JAVAFLAGS $TEST) >/dev/null 2>&1; then
+ ac_cv_prog_java_works=yes
+else
+ echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+ cat $JAVA_TEST >&AS_MESSAGE_LOG_FD
+ AC_MSG_ERROR(The Java VM $JAVA failed (see config.log, check the CLASSPATH?))
+fi
+rm -fr $JAVA_TEST $CLASS_TEST Test.uue
+])
+AC_PROVIDE([$0])dnl
+]
+)
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_javac.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_javac.m4
new file mode 100644
index 00000000000..d9bcc2d7c34
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_javac.m4
@@ -0,0 +1,79 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_javac.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAVAC
+#
+# DESCRIPTION
+#
+# AX_PROG_JAVAC tests an existing Java compiler. It uses the environment
+# variable JAVAC then tests in sequence various common Java compilers. For
+# political reasons, it starts with the free ones.
+#
+# If you want to force a specific compiler:
+#
+# - at the configure.in level, set JAVAC=yourcompiler before calling
+# AX_PROG_JAVAC
+#
+# - at the configure level, setenv JAVAC
+#
+# You can use the JAVAC variable in your Makefile.in, with @JAVAC@.
+#
+# *Warning*: its success or failure can depend on a proper setting of the
+# CLASSPATH env. variable.
+#
+# TODO: allow to exclude compilers (rationale: most Java programs cannot
+# compile with some compilers like guavac).
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set; some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of a set of macros, so I had to break it up for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 6
+
+AU_ALIAS([AC_PROG_JAVAC], [AX_PROG_JAVAC])
+AC_DEFUN([AX_PROG_JAVAC],[
+if test "x$JAVAPREFIX" = x; then # a preset JAVAC always wins; otherwise search PATH
+ test "x$JAVAC" = x && AC_CHECK_PROGS([JAVAC], ["gcj -C" guavac jikes javac])
+else # search only the bin directory of the configured prefix, passed as the path argument
+ test "x$JAVAC" = x && AC_CHECK_PROGS([JAVAC], ["gcj -C" guavac jikes javac], [], [$JAVAPREFIX/bin])
+fi
+test "x$JAVAC" = x && AC_MSG_ERROR([no acceptable Java compiler found in \$PATH])
+AX_PROG_JAVAC_WORKS
+AC_PROVIDE([$0])dnl
+])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_javac_works.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_javac_works.m4
new file mode 100644
index 00000000000..7dfa1e37d89
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_prog_javac_works.m4
@@ -0,0 +1,72 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_javac_works.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAVAC_WORKS
+#
+# DESCRIPTION
+#
+# Internal use ONLY.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set; some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of a set of macros, so I had to break it up for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 6
+
+AU_ALIAS([AC_PROG_JAVAC_WORKS], [AX_PROG_JAVAC_WORKS])
+AC_DEFUN([AX_PROG_JAVAC_WORKS],[
+AC_CACHE_CHECK([if $JAVAC works], [ac_cv_prog_javac_works], [
+JAVA_TEST=Test.java
+CLASS_TEST=Test.class
+cat << \EOF > $JAVA_TEST
+/* [#]line __oline__ "configure" */
+public class Test {
+}
+EOF
+if AC_TRY_COMMAND($JAVAC $JAVACFLAGS $JAVA_TEST) >/dev/null 2>&1; then
+ ac_cv_prog_javac_works=yes
+else # log the failing source first because the error below stops configure
+ echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+ cat $JAVA_TEST >&AS_MESSAGE_LOG_FD
+ AC_MSG_ERROR([The Java compiler $JAVAC failed (see config.log, check the CLASSPATH?)])
+fi
+rm -f $JAVA_TEST $CLASS_TEST
+])
+AC_PROVIDE([$0])dnl
+])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/ax_try_compile_java.m4 b/src/third_party/wiredtiger/build_posix/aclocal/ax_try_compile_java.m4
new file mode 100644
index 00000000000..8efd091c43b
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/ax_try_compile_java.m4
@@ -0,0 +1,55 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_try_compile_java.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_TRY_COMPILE_JAVA
+#
+# DESCRIPTION
+#
+# AX_TRY_COMPILE_JAVA attempts to compile user-given source.
+#
+# *Warning*: its success or failure can depend on a proper setting of the
+# CLASSPATH env. variable.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set; some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of a set of macros, so I had to break it up for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Devin Weaver <ktohg@tritarget.com>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 7
+
+AU_ALIAS([AC_TRY_COMPILE_JAVA], [AX_TRY_COMPILE_JAVA])
+AC_DEFUN([AX_TRY_COMPILE_JAVA],[
+AC_REQUIRE([AX_PROG_JAVAC])dnl the Java compiler check must already have run
+cat << \EOF > Test.java
+/* [#]line __oline__ "configure" */
+ifelse([$1], , , [import $1;])
+public class Test {
+[$2]
+}
+EOF
+if AC_TRY_COMMAND($JAVAC $JAVACFLAGS Test.java) && test -s Test.class
+then # the compile succeeded and left a non-empty Test.class
+dnl The temporary files are not removed before the success action runs, so that action can examine them.
+ ifelse([$3], , :, [$3])
+else # the compile failed: copy the offending source into the configure log
+ echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+ cat Test.java >&AS_MESSAGE_LOG_FD
+ifelse([$4], , , [ rm -fr Test*
+ $4
+])dnl
+fi # NOTE(review): the Test* globs in this macro also match unrelated files in the build directory
+rm -fr Test*])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/cond-if.m4 b/src/third_party/wiredtiger/build_posix/aclocal/cond-if.m4
new file mode 100644
index 00000000000..df4b2c4b8ac
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/cond-if.m4
@@ -0,0 +1,14 @@
+dnl Making AC_CONFIG_FILES conditional requires AM_COND_IF, but AM_COND_IF
+dnl is new in Automake 1.11. So that it can be used without requiring that
+dnl version, a fallback definition is provided for installations lacking it.
+dnl Note that actually disabling AC_CONFIG_FILES still requires Automake
+dnl 1.11; this code is correct only in terms of the m4sh generated script.
+m4_ifndef([AM_COND_IF], [AC_DEFUN([AM_COND_IF], [
+if test -z "$$1_TRUE"; then :
+ m4_n([$2])[]dnl the if-true action
+m4_ifval([$3],
+[else
+ $3
+])dnl the else branch is emitted only when an if-false action was supplied
+fi[]dnl
+])])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/options.m4 b/src/third_party/wiredtiger/build_posix/aclocal/options.m4
new file mode 100644
index 00000000000..2682c8ea82c
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/options.m4
@@ -0,0 +1,228 @@
+# Optional configuration.
+AC_DEFUN([AM_OPTIONS], [
+
+AH_TEMPLATE(HAVE_ATTACH, [Define to 1 to pause for debugger attach on failure.])
+AC_MSG_CHECKING(if --enable-attach option specified)
+AC_ARG_ENABLE(attach,
+ [AS_HELP_STRING([--enable-attach],
+ [Configure for debugger attach on failure.])], r=$enableval, r=no)
+case "$r" in
+no) wt_cv_enable_attach=no;;
+*) AC_DEFINE(HAVE_ATTACH)
+ wt_cv_enable_attach=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_attach)
+
+AH_TEMPLATE(HAVE_BUILTIN_EXTENSION_SNAPPY,
+ [Snappy support automatically loaded.])
+AH_TEMPLATE(HAVE_BUILTIN_EXTENSION_ZLIB,
+ [Zlib support automatically loaded.])
+AC_MSG_CHECKING(if --with-builtins option specified)
+AC_ARG_WITH(builtins,
+ [AS_HELP_STRING([--with-builtins],
+ [builtin extension names (snappy, zlib).])],
+ [with_builtins=$withval],
+ [with_builtins=])
+
+# Validate and setup each builtin extension library.
+builtin_list=`echo "$with_builtins"|tr -s , ' '`
+for builtin_i in $builtin_list; do
+ case "$builtin_i" in
+ snappy) AC_DEFINE(HAVE_BUILTIN_EXTENSION_SNAPPY)
+ wt_cv_with_builtin_extension_snappy=yes;;
+ zlib) AC_DEFINE(HAVE_BUILTIN_EXTENSION_ZLIB)
+ wt_cv_with_builtin_extension_zlib=yes;;
+ *) AC_MSG_ERROR([Unknown builtin extension "$builtin_i"]);;
+ esac
+done
+AM_CONDITIONAL([HAVE_BUILTIN_EXTENSION_SNAPPY],
+ [test "$wt_cv_with_builtin_extension_snappy" = "yes"])
+AM_CONDITIONAL([HAVE_BUILTIN_EXTENSION_ZLIB],
+ [test "$wt_cv_with_builtin_extension_zlib" = "yes"])
+AC_MSG_RESULT($with_builtins)
+
+AC_MSG_CHECKING(if --enable-bzip2 option specified)
+AC_ARG_ENABLE(bzip2,
+ [AS_HELP_STRING([--enable-bzip2],
+ [Build the bzip2 compressor extension.])], r=$enableval, r=no)
+case "$r" in
+no) wt_cv_enable_bzip2=no;;
+*) wt_cv_enable_bzip2=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_bzip2)
+if test "$wt_cv_enable_bzip2" = "yes"; then
+ AC_CHECK_HEADER(bzlib.h,,
+ [AC_MSG_ERROR([--enable-bzip2 requires bzlib.h])])
+ AC_CHECK_LIB(bz2, BZ2_bzCompress,,
+ [AC_MSG_ERROR([--enable-bzip2 requires bz2 library])])
+fi
+AM_CONDITIONAL([BZIP2], [test "$wt_cv_enable_bzip2" = "yes"])
+
+AH_TEMPLATE(HAVE_DIAGNOSTIC, [Define to 1 for diagnostic tests.])
+AC_MSG_CHECKING(if --enable-diagnostic option specified)
+AC_ARG_ENABLE(diagnostic,
+ [AS_HELP_STRING([--enable-diagnostic],
+ [Configure for diagnostic tests.])], r=$enableval, r=no)
+case "$r" in
+no) wt_cv_enable_diagnostic=no;;
+*) AC_DEFINE(HAVE_DIAGNOSTIC)
+ wt_cv_enable_diagnostic=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_diagnostic)
+
+AC_MSG_CHECKING(if --enable-java option specified)
+AC_ARG_ENABLE(java,
+ [AS_HELP_STRING([--enable-java],
+ [Configure the Java API.])], r=$enableval, r=no)
+case "$r" in
+no) wt_cv_enable_java=no;;
+*) if test "$enable_shared" = "no"; then
+ AC_MSG_ERROR([--enable-java requires shared libraries])
+ fi
+ wt_cv_enable_java=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_java)
+AM_CONDITIONAL([JAVA], [test x$wt_cv_enable_java = xyes])
+
+AC_MSG_CHECKING(if --enable-leveldb option specified)
+AC_ARG_ENABLE(leveldb,
+ [AS_HELP_STRING([--enable-leveldb[[=yes|basho|hyper|rocksdb]]],
+ [Build the LevelDB API.])], r=$enableval, r=no)
+wt_cv_enable_leveldb=yes
+wt_cv_enable_basholeveldb=no
+wt_cv_enable_hyperleveldb=no
+wt_cv_enable_rocksdb=no
+case "$r" in
+yes) ;;
+no) wt_cv_enable_leveldb=no;;
+basho) wt_cv_enable_basholeveldb=yes;;
+hyper) wt_cv_enable_hyperleveldb=yes;;
+rocksdb) wt_cv_enable_rocksdb=yes;;
+*) AC_MSG_ERROR([Unknown LevelDB configuration "$r"]);;
+esac
+
+AH_TEMPLATE(HAVE_BASHOLEVELDB, [Build the LevelDB API with Basho LevelDB support.])
+if test "$wt_cv_enable_basholeveldb" = "yes"; then
+ AC_DEFINE(HAVE_BASHOLEVELDB)
+fi
+AH_TEMPLATE(HAVE_HYPERLEVELDB,
+ [Build the LevelDB API with HyperLevelDB support.])
+if test "$wt_cv_enable_hyperleveldb" = "yes"; then
+ AC_DEFINE(HAVE_HYPERLEVELDB)
+fi
+AH_TEMPLATE(HAVE_ROCKSDB, [Build the LevelDB API with RocksDB support.])
+if test "$wt_cv_enable_rocksdb" = "yes"; then
+ AC_DEFINE(HAVE_ROCKSDB)
+fi
+AC_MSG_RESULT($wt_cv_enable_leveldb)
+AM_CONDITIONAL([LEVELDB], [test "$wt_cv_enable_leveldb" = "yes"])
+AM_CONDITIONAL([HAVE_BASHOLEVELDB], [test "$wt_cv_enable_basholeveldb" = "yes"])
+AM_CONDITIONAL([HAVE_HYPERLEVELDB], [test "$wt_cv_enable_hyperleveldb" = "yes"])
+AM_CONDITIONAL([HAVE_ROCKSDB], [test "$wt_cv_enable_rocksdb" = "yes"])
+
+AC_MSG_CHECKING(if --enable-python option specified)
+AC_ARG_ENABLE(python,
+ [AS_HELP_STRING([--enable-python],
+ [Configure the python API.])], r=$enableval, r=no)
+case "$r" in
+no) wt_cv_enable_python=no;;
+*) if test "$enable_shared" = "no"; then
+ AC_MSG_ERROR([--enable-python requires shared libraries])
+ fi
+ wt_cv_enable_python=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_python)
+AM_CONDITIONAL([PYTHON], [test x$wt_cv_enable_python = xyes])
+
+AC_MSG_CHECKING(if --with-python-prefix option specified)
+AC_ARG_WITH(python-prefix,
+ [AS_HELP_STRING([--with-python-prefix=DIR],
+ [Installation prefix for Python module.])])
+AC_MSG_RESULT($with_python_prefix)
+
+AC_MSG_CHECKING(if --enable-snappy option specified)
+AC_ARG_ENABLE(snappy,
+ [AS_HELP_STRING([--enable-snappy],
+ [Build the snappy compressor extension.])], r=$enableval, r=no)
+case "$r" in
+no) if test "$wt_cv_with_builtin_extension_snappy" = "yes"; then
+ wt_cv_enable_snappy=yes
+ else
+ wt_cv_enable_snappy=no
+ fi
+ ;;
+*) if test "$wt_cv_with_builtin_extension_snappy" = "yes"; then
+ AC_MSG_ERROR(
+ [Only one of --enable-snappy --with-builtins=snappy allowed])
+ fi
+ wt_cv_enable_snappy=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_snappy)
+if test "$wt_cv_enable_snappy" = "yes"; then
+ AC_LANG_PUSH([C++])
+ AC_CHECK_HEADER(snappy.h,,
+ [AC_MSG_ERROR([--enable-snappy requires snappy.h])])
+ AC_LANG_POP([C++])
+ AC_CHECK_LIB(snappy, snappy_compress,,
+ [AC_MSG_ERROR([--enable-snappy requires snappy library])])
+fi
+AM_CONDITIONAL([SNAPPY], [test "$wt_cv_enable_snappy" = "yes"])
+
+AH_TEMPLATE(SPINLOCK_TYPE, [Spinlock type from mutex.h.])
+AC_MSG_CHECKING(if --with-spinlock option specified)
+AC_ARG_WITH(spinlock,
+ [AS_HELP_STRING([--with-spinlock],
+ [Spinlock type (pthread, pthread_adaptive or gcc).])],
+ [],
+ [with_spinlock=pthread])
+case "$with_spinlock" in
+gcc) AC_DEFINE(SPINLOCK_TYPE, SPINLOCK_GCC);;
+pthread|pthreads)
+ AC_DEFINE(SPINLOCK_TYPE, SPINLOCK_PTHREAD_MUTEX);;
+pthread_adaptive|pthreads_adaptive)
+ AC_DEFINE(SPINLOCK_TYPE, SPINLOCK_PTHREAD_MUTEX_ADAPTIVE);;
+pthread_logging|pthreads_logging)
+ AC_DEFINE(SPINLOCK_TYPE, SPINLOCK_PTHREAD_MUTEX_LOGGING);;
+*) AC_MSG_ERROR([Unknown spinlock type "$with_spinlock"]);;
+esac
+AC_MSG_RESULT($with_spinlock)
+
+AH_TEMPLATE(HAVE_VERBOSE, [Enable verbose message configuration.])
+AC_MSG_CHECKING(if --enable-verbose option specified)
+AC_ARG_ENABLE(verbose,
+ [AS_HELP_STRING([--enable-verbose],
+ [Enable verbose message configuration.])], r=$enableval, r=no)
+case "$r" in
+no) wt_cv_enable_verbose=no;;
+*) AC_DEFINE(HAVE_VERBOSE)
+ wt_cv_enable_verbose=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_verbose)
+
+AC_MSG_CHECKING(if --enable-zlib option specified)
+AC_ARG_ENABLE(zlib,
+ [AS_HELP_STRING([--enable-zlib],
+ [Build the zlib compressor extension.])], r=$enableval, r=no)
+case "$r" in
+no) if test "$wt_cv_with_builtin_extension_zlib" = "yes"; then
+ wt_cv_enable_zlib=yes
+ else
+ wt_cv_enable_zlib=no
+ fi
+ ;;
+*) if test "$wt_cv_with_builtin_extension_zlib" = "yes"; then
+ AC_MSG_ERROR(
+ [Only one of --enable-zlib --with-builtins=zlib allowed])
+ fi
+ wt_cv_enable_zlib=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_zlib)
+if test "$wt_cv_enable_zlib" = "yes"; then
+ AC_CHECK_HEADER(zlib.h,,
+ [AC_MSG_ERROR([--enable-zlib requires zlib.h])])
+ AC_CHECK_LIB(z, deflate,,
+ [AC_MSG_ERROR([--enable-zlib requires zlib library])])
+fi
+AM_CONDITIONAL([ZLIB], [test "$wt_cv_enable_zlib" = "yes"])
+
+])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/types.m4 b/src/third_party/wiredtiger/build_posix/aclocal/types.m4
new file mode 100644
index 00000000000..439034c89d2
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/types.m4
@@ -0,0 +1,47 @@
+# AM_TYPES --
+# Check for missing types, create substitutes where we can.
+AC_DEFUN([AM_TYPES], [
+ # Basic list of include files that might have types. We also use
+ # it as the list of includes directly included by wiredtiger.h.
+ std_includes="
+#include <sys/types.h>
+#include <inttypes.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>"
+ AC_SUBST(wiredtiger_includes_decl)
+ wiredtiger_includes_decl="$std_includes"
+
+ # We require FILE, pid_t, size_t, ssize_t, time_t, uintmax_t
+ # and uintptr_t.
+ AC_SUBST(FILE_t_decl)
+ AC_CHECK_TYPE(FILE *,, AC_MSG_ERROR([No FILE type.]), $std_includes)
+ AC_SUBST(pid_t_decl)
+ AC_CHECK_TYPE(pid_t,, AC_MSG_ERROR([No pid_t type.]), $std_includes)
+ AC_SUBST(size_t_decl)
+ AC_CHECK_TYPE(size_t,, AC_MSG_ERROR([No size_t type.]), $std_includes)
+ AC_SUBST(ssize_t_decl)
+ AC_CHECK_TYPE(ssize_t,, AC_MSG_ERROR([No ssize_t type.]), $std_includes)
+ AC_SUBST(time_t_decl)
+ AC_CHECK_TYPE(time_t,, AC_MSG_ERROR([No time_t type.]), $std_includes)
+
+ # We require off_t, but use a local version for portability to Windows
+ # where it's 4B, not 8B.
+ AC_SUBST(off_t_decl)
+ AC_CHECK_TYPE(off_t,
+ [off_t_decl="typedef off_t wt_off_t;"],
+ [AC_MSG_ERROR([No off_t type.])],
+ $std_includes)
+
+ # Some systems don't have a uintmax_t type (for example, FreeBSD 6.2).
+ # In this case, use an unsigned long long.
+ AC_SUBST(uintmax_t_decl)
+ AC_CHECK_TYPE(uintmax_t,, [AC_CHECK_TYPE(unsigned long long,
+ [uintmax_t_decl="typedef unsigned long long uintmax_t;"],
+ [uintmax_t_decl="typedef unsigned long uintmax_t;"],
+ $std_includes)])
+
+ AC_SUBST(uintptr_t_decl)
+ AC_CHECK_TYPE(uintptr_t,,
+ AC_MSG_ERROR([No uintptr_t type.]), $std_includes)
+])
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4
new file mode 100644
index 00000000000..07afcc3dbb6
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/version-set.m4
@@ -0,0 +1,14 @@
+dnl build by dist/s_version
+
+VERSION_MAJOR=2
+VERSION_MINOR=4
+VERSION_PATCH=1
+VERSION_STRING='"WiredTiger 2.4.1: (October 16, 2014)"'
+
+AC_SUBST(VERSION_MAJOR)
+AC_SUBST(VERSION_MINOR)
+AC_SUBST(VERSION_PATCH)
+AC_SUBST(VERSION_STRING)
+
+VERSION_NOPATCH=2.4
+AC_SUBST(VERSION_NOPATCH)
diff --git a/src/third_party/wiredtiger/build_posix/aclocal/version.m4 b/src/third_party/wiredtiger/build_posix/aclocal/version.m4
new file mode 100644
index 00000000000..70d75bc85ba
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/aclocal/version.m4
@@ -0,0 +1,2 @@
+dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version
+2.4.1
diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in
new file mode 100644
index 00000000000..feade27ae1b
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/configure.ac.in
@@ -0,0 +1,179 @@
+PACKAGE=wiredtiger
+AC_PREREQ(2.63)
+AC_INIT(WiredTiger, m4_normalize(m4_include([build_posix/aclocal/version.m4])),
+ [support@wiredtiger.com])
+
+m4_include([build_posix/aclocal/version-set.m4])
+
+AC_CONFIG_AUX_DIR([build_posix/gnu-support])
+AC_CONFIG_MACRO_DIR([build_posix/aclocal])
+AC_CONFIG_SRCDIR([RELEASE_INFO])
+
+# If CFLAGS/CXXFLAGS were not set on entry, default to "-O3 -g"
+: ${CFLAGS=-O3 -g}
+: ${CXXFLAGS=-O3 -g}
+
+AM_INIT_AUTOMAKE([1.11 foreign parallel-tests subdir-objects])
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([no])])
+
+# Configure options. The AM_OPTIONS and the libtool configuration
+# need to stay here. Moving them below the compiler and other
+# configurations causes -Wcast-align warnings and other warnings
+# on MacOS.
+AM_OPTIONS
+
+define([AC_LIBTOOL_LANG_CXX_CONFIG], [:])dnl
+define([AC_LIBTOOL_LANG_F77_CONFIG], [:])dnl
+LT_PREREQ(2.2.6)
+LT_INIT([pic-only])
+AC_SUBST([LIBTOOL_DEPS])
+
+AC_PROG_CC(cc gcc)
+# AC_PROG_CXX(c++ g++)
+
+if test "$GCC" = "yes"; then
+ # The Solaris gcc compiler gets the additional -pthreads flag.
+ if test "`uname -s`" = "SunOS"; then
+ AM_CFLAGS="$AM_CFLAGS -pthreads"
+ fi
+else
+ # The Solaris native compiler gets the additional -mt flag.
+ if test "`uname -s`" = "SunOS"; then
+ AM_CFLAGS="$AM_CFLAGS -mt"
+ fi
+fi
+
+# Java and Python APIs
+if test "$wt_cv_enable_java" = "yes" -o "$wt_cv_enable_python" = "yes"; then
+ AX_PKG_SWIG(2.0.4, [],
+ [AC_MSG_WARN([SWIG is required to rebuild Java or Python APIs.])])
+fi
+
+if test "$wt_cv_enable_java" = "yes"; then
+ JAVAC=${JAVAC-javac}
+ AX_PROG_JAVAC
+ AX_PROG_JAR
+ AX_JNI_INCLUDE_DIR
+ AX_CHECK_JUNIT
+ for JNI_INCLUDE_DIR in $JNI_INCLUDE_DIRS ; do
+ JNI_CPPFLAGS="$JNI_CPPFLAGS -I$JNI_INCLUDE_DIR"
+ done
+ AC_SUBST(JNI_CPPFLAGS)
+fi
+
+if test "$wt_cv_enable_python" = "yes"; then
+ AM_PATH_PYTHON([2.6])
+ if test -n "$with_python_prefix" ; then
+ PYTHON_INSTALL_ARG="-d $with_python_prefix"
+ fi
+ AC_SUBST(PYTHON_INSTALL_ARG)
+fi
+
+AM_TYPES
+
+AC_PROG_INSTALL
+
+AC_CHECK_HEADERS([pthread_np.h])
+AC_CHECK_LIB(pthread, pthread_create)
+AC_CHECK_LIB(dl, dlopen)
+AC_CHECK_LIB(rt, sched_yield)
+
+AC_CHECK_FUNCS([\
+ clock_gettime fallocate fcntl fread_unlocked ftruncate gettimeofday\
+ posix_fadvise posix_fallocate posix_madvise posix_memalign\
+ strtouq sync_file_range])
+
+# OS X wrongly reports that it has fdatasync
+AS_CASE([$host_os], [darwin*], [], [AC_CHECK_FUNCS([fdatasync])])
+
+AC_SYS_LARGEFILE
+
+AC_C_BIGENDIAN
+
+# Linux requires _GNU_SOURCE to be defined
+case "$host_os" in
+linux*) AM_CFLAGS="$AM_CFLAGS -D_GNU_SOURCE" ;;
+esac
+
+# Linux requires buffers aligned to 4KB boundaries for O_DIRECT to work.
+BUFFER_ALIGNMENT=0
+if test "$ac_cv_func_posix_memalign" = "yes" ; then
+ case "$host_os" in
+ linux*) BUFFER_ALIGNMENT=4096 ;;
+ esac
+fi
+AC_DEFINE_UNQUOTED(WT_BUFFER_ALIGNMENT_DEFAULT, $BUFFER_ALIGNMENT,
+ [Default alignment of buffers used for I/O])
+
+AC_SUBST(AM_CFLAGS)
+
+# test/format requires an installed Oracle Berkeley DB release tree.
+AC_MSG_CHECKING([if --with-berkeleydb=DIR option specified])
+AC_ARG_WITH(berkeleydb,
+ [AS_HELP_STRING([--with-berkeleydb=DIR],
+ [Specify installed library directory of Berkeley DB])],
+ [with_berkeleydb="$withval"], [with_berkeleydb="NO_BERKELEY_DB_LIBRARY"])
+AC_MSG_RESULT($with_berkeleydb)
+AM_CONDITIONAL([HAVE_BERKELEY_DB], [test -d $with_berkeleydb])
+AC_SUBST(BERKELEY_DB_PATH, [$with_berkeleydb])
+
+# test/format optionally supports the Levyx/Helium key/value store.
+AC_MSG_CHECKING([if --with-helium=DIR option specified])
+AC_ARG_WITH(helium,
+ [AS_HELP_STRING([--with-helium=DIR],
+ [Specify installed library directory of Helium])],
+ [with_helium="$withval"], [with_helium="NO_HELIUM_LIBRARY"])
+AC_MSG_RESULT($with_helium)
+AM_CONDITIONAL([HAVE_HELIUM], [test -d $with_helium])
+AC_SUBST(HELIUM_PATH, [$with_helium])
+
+# Sanity check the build
+AM_COND_IF([LEVELDB], [
+ AM_COND_IF([SNAPPY], [], [
+ AC_MSG_ERROR(
+ [--enable-leveldb requires --enable-snappy or --with-builtins=snappy.])
+ ])
+])
+
+
+# Warn that diagnostic builds should not be used in production
+if test "$wt_cv_enable_diagnostic" = "yes"; then
+ AC_MSG_WARN(
+ [DIAGNOSTIC BUILDS ARE NOT RECOMMENDED FOR PRODUCTION DEPLOYMENT.])
+fi
+
+# If we are building in a tree without documentation, check if doxygen is
+# available.
+if test -f "$srcdir/docs/index.html" ; then
+ wt_cv_docs_exist=yes
+else
+ wt_cv_docs_exist=no
+fi
+
+if test "$wt_cv_docs_exist" = "no"; then
+ AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [false])
+fi
+
+if test "$wt_cv_docs_exist" = "yes" -o "$DOXYGEN" = "doxygen" ; then
+ MAN1_PAGES="$srcdir/docs/man/man1/wt.1"
+ AC_SUBST(MAN1_PAGES)
+ MAN3_PAGES="$srcdir/docs/man/man3/wiredtiger.3"
+ AC_SUBST(MAN3_PAGES)
+fi
+
+# Output files
+AC_CONFIG_HEADERS([wiredtiger_config.h:build_posix/config.hin])
+
+# The LevelDB API needs some configuration knowledge
+AM_COND_IF([LEVELDB],
+ AC_CONFIG_HEADERS([api/leveldb/leveldb_wt_config.h:api/leveldb/config.hin]))
+
+# BEGIN check existence -- maintained by reconf and Make.subdirs
+# END check existence
+
+AC_CONFIG_FILES([
+ Makefile
+ wiredtiger.h:src/include/wiredtiger.in
+ wiredtiger.pc:build_posix/wiredtiger.pc.in
+])
+AC_OUTPUT
diff --git a/src/third_party/wiredtiger/build_posix/makemake b/src/third_party/wiredtiger/build_posix/makemake
new file mode 100755
index 00000000000..9ed9d252911
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/makemake
@@ -0,0 +1,35 @@
+#! /bin/sh
+#
+# Build Makefile.am
+
+# Process Make.base, insert subdirs that exist from Make.subdirs
+# (in release trees, some of the subdirs might be excluded).
+(sed -n '1,/BEGIN SUBDIRS/p' Make.base
+
+echo "SUBDIRS ="
+sed -e 's/#.*$//' -e '/^$/d' Make.subdirs | (while read dir cond ; do
+ test -d ../$dir || continue
+ if test -n "$cond" ; then
+ cat <<END_CONDITIONAL
+if ${cond}
+ SUBDIRS += $dir
+endif
+END_CONDITIONAL
+ else
+ echo "SUBDIRS += $dir"
+ fi
+done)
+
+# Write the rest of Make.base, up to SOURCES
+sed -n '/END SUBDIRS/,/BEGIN SOURCES/p' Make.base
+
+echo
+echo "libwiredtiger_la_LDFLAGS = -release @VERSION@"
+echo "libwiredtiger_la_SOURCES=\\"
+sed -e '/^[a-z]/!d' \
+ -e 's/.*/ & \\/' \
+ -e '$s/ \\$//' < ../dist/filelist
+
+# Write the rest of Make.base
+sed -n '/END SOURCES/,$p' Make.base
+) > ../Makefile.am
diff --git a/src/third_party/wiredtiger/build_posix/reconf b/src/third_party/wiredtiger/build_posix/reconf
new file mode 100755
index 00000000000..8700c5da43d
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/reconf
@@ -0,0 +1,79 @@
+#! /bin/sh
+
+t=/tmp/__configure.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+
+# Insulate against IFS from the user's env
+IFS=' '' ''
+'
+export IFS
+
+# Allow this script to be run from anywhere
+cd "`dirname \"$0\"`"
+
+# There's a cleanup function so we can easily clean out the directory.
+clean()
+{
+ # Use the Makefile to remove object files if they exist.
+ test -f Makefile && make distclean > /dev/null
+
+ # Remove automatically generated files.
+ rm -rf Makefile \
+ Makefile.am \
+ Makefile.in \
+ aclocal.m4 \
+ auto-includes.chk \
+ autom4te.cache \
+ config.hin \
+ config.hin~ \
+ config.log \
+ config.status \
+ configure \
+ gnu-support \
+ mklog
+}
+
+# We always clean things up, assume build_posix and the top-level directory
+# are the build spots.
+(cd .. && clean)
+clean
+
+while :
+ do case "$1" in
+ -c) # Clean and leave empty
+ exit 0;;
+ *) # Clean and then re-create
+ break;;
+ esac
+done
+
+# Build configure.ac
+(
+echo "# DO NOT EDIT"
+echo "# This file is built automatically from build_posix/configure.ac.in."
+
+sed -n '1,/BEGIN check existence/p' configure.ac.in
+
+sed -e 's/#.*$//' -e '/^$/d' -e '/^\.$/d' Make.subdirs | \
+while read dir cond ; do
+ test -d ../$dir || continue
+ echo 'AC_CONFIG_FILES(['$dir/Makefile'])'
+done
+
+sed -n '/END check existence/,$p' configure.ac.in
+) > ../configure.ac
+
+# Build Makefile.am
+sh ./makemake
+
+# From here on, work in the top of the tree
+cd ..
+autoreconf --install --warnings=all
+
+# Make sure any missing files are writable
+chmod 755 build_posix/gnu-support/*
+
+# Cleanup
+rm -rf autom4te.cache
+
+exit 0
diff --git a/src/third_party/wiredtiger/build_posix/wiredtiger.pc.in b/src/third_party/wiredtiger/build_posix/wiredtiger.pc.in
new file mode 100644
index 00000000000..be257efcef3
--- /dev/null
+++ b/src/third_party/wiredtiger/build_posix/wiredtiger.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: WiredTiger
+Description: The WiredTiger Data Engine
+Requires:
+Version: @PACKAGE_VERSION@
+Libs: -L${libdir} -lwiredtiger-@VERSION_NOPATCH@
+Cflags: -I${includedir}/wiredtiger-@VERSION_NOPATCH@
diff --git a/src/third_party/wiredtiger/build_solaris/wiredtiger_config.h b/src/third_party/wiredtiger/build_solaris/wiredtiger_config.h
new file mode 100644
index 00000000000..3f6dbc51155
--- /dev/null
+++ b/src/third_party/wiredtiger/build_solaris/wiredtiger_config.h
@@ -0,0 +1,146 @@
+/* wiredtiger_config.h. Generated from config.hin by configure. */
+/* build_posix/config.hin. Generated from configure.ac by autoheader. */
+
+/* Define if building universal (internal helper macro) */
+/* #undef AC_APPLE_UNIVERSAL_BUILD */
+
+/* Define to 1 to pause for debugger attach on failure. */
+/* #undef HAVE_ATTACH */
+
+/* Build the LevelDB API with Basho LevelDB support. */
+/* #undef HAVE_BASHOLEVELDB */
+
+/* Snappy support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_SNAPPY */
+
+/* Zlib support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_ZLIB */
+
+/* Define to 1 if you have the `clock_gettime' function. */
+#define HAVE_CLOCK_GETTIME 1
+
+/* Define to 1 for diagnostic tests. */
+/* #undef HAVE_DIAGNOSTIC */
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the `fallocate' function. */
+/* #undef HAVE_FALLOCATE */
+
+/* Define to 1 if you have the `fcntl' function. */
+#define HAVE_FCNTL 1
+
+/* Define to 1 if you have the `fdatasync' function. */
+#define HAVE_FDATASYNC 1
+
+/* Define to 1 if you have the `fread_unlocked' function. */
+/* #undef HAVE_FREAD_UNLOCKED */
+
+/* Define to 1 if you have the `ftruncate' function. */
+#define HAVE_FTRUNCATE 1
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Build the LevelDB API with HyperLevelDB support. */
+/* #undef HAVE_HYPERLEVELDB */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the `bz2' library (-lbz2). */
+/* #undef HAVE_LIBBZ2 */
+
+/* Define to 1 if you have the `dl' library (-ldl). */
+#define HAVE_LIBDL 1
+
+/* Define to 1 if you have the `pthread' library (-lpthread). */
+#define HAVE_LIBPTHREAD 1
+
+/* Define to 1 if you have the `rt' library (-lrt). */
+#define HAVE_LIBRT 1
+
+/* Define to 1 if you have the `snappy' library (-lsnappy). */
+/* #undef HAVE_LIBSNAPPY */
+
+/* Define to 1 if you have the `z' library (-lz). */
+/* #undef HAVE_LIBZ */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+#define HAVE_POSIX_FADVISE 1
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+#define HAVE_POSIX_FALLOCATE 1
+
+/* Define to 1 if you have the `posix_madvise' function. */
+#define HAVE_POSIX_MADVISE 1
+
+/* Define to 1 if you have the `posix_memalign' function. */
+#define HAVE_POSIX_MEMALIGN 1
+
+/* Define to 1 if you have the <pthread_np.h> header file. */
+/* #undef HAVE_PTHREAD_NP_H */
+
+/* Build the LevelDB API with RocksDB support. */
+/* #undef HAVE_ROCKSDB */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strtouq' function. */
+/* #undef HAVE_STRTOUQ */
+
+/* Define to 1 if you have the `sync_file_range' function. */
+/* #undef HAVE_SYNC_FILE_RANGE */
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Enable verbose message configuration. */
+/* #undef HAVE_VERBOSE */
+
+/* Spinlock type from mutex.h. */
+#define SPINLOCK_TYPE SPINLOCK_PTHREAD_MUTEX
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+ significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+# define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* # undef WORDS_BIGENDIAN */
+# endif
+#endif
+
+/* Default alignment of buffers used for I/O */
+#define WT_BUFFER_ALIGNMENT_DEFAULT 0
+
+/* Number of bits in a file offset, on hosts where this is settable. */
+/* #undef _FILE_OFFSET_BITS */
+
+/* Define for large files, on AIX-style hosts. */
+/* #undef _LARGE_FILES */
diff --git a/src/third_party/wiredtiger/build_win/wiredtiger_config.h b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
new file mode 100644
index 00000000000..93317b6d81d
--- /dev/null
+++ b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
@@ -0,0 +1,151 @@
+/* wiredtiger_config.h. Generated from config.hin by configure. */
+/* build_posix/config.hin. Generated from configure.ac by autoheader. */
+
+/* Define if building universal (internal helper macro) */
+/* #undef AC_APPLE_UNIVERSAL_BUILD */
+
+/* Define to 1 to pause for debugger attach on failure. */
+/* #undef HAVE_ATTACH */
+
+/* Build the LevelDB API with Basho LevelDB support. */
+/* #undef HAVE_BASHOLEVELDB */
+
+/* Snappy support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_SNAPPY */
+
+/* Zlib support automatically loaded. */
+/* #undef HAVE_BUILTIN_EXTENSION_ZLIB */
+
+/* Define to 1 if you have the `clock_gettime' function. */
+/* #undef HAVE_CLOCK_GETTIME */
+
+/* Define to 1 for diagnostic tests. */
+/* #undef HAVE_DIAGNOSTIC */
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+/* #undef HAVE_DLFCN_H */
+
+/* Define to 1 if you have the `fallocate' function. */
+/* #undef HAVE_FALLOCATE */
+
+/* Define to 1 if you have the `fcntl' function. */
+/* #undef HAVE_FCNTL */
+
+/* Define to 1 if you have the `fdatasync' function. */
+/* #undef HAVE_FDATASYNC */
+
+/* Define to 1 if you have the `fread_unlocked' function. */
+/* #undef HAVE_FREAD_UNLOCKED */
+
+/* Define to 1 if you have the `ftruncate' function. */
+/* #undef HAVE_FTRUNCATE */
+
+/* Define to 1 if you have the `gettimeofday' function. */
+/* #undef HAVE_GETTIMEOFDAY */
+
+/* Build the LevelDB API with HyperLevelDB support. */
+/* #undef HAVE_HYPERLEVELDB */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the `bz2' library (-lbz2). */
+/* #undef HAVE_LIBBZ2 */
+
+/* Define to 1 if you have the `dl' library (-ldl). */
+/* #undef HAVE_LIBDL */
+
+/* Define to 1 if you have the `pthread' library (-lpthread). */
+/* #undef HAVE_LIBPTHREAD */
+
+/* Define to 1 if you have the `rt' library (-lrt). */
+/* #undef HAVE_LIBRT */
+
+/* Define to 1 if you have the `snappy' library (-lsnappy). */
+/* #undef HAVE_LIBSNAPPY */
+
+/* Define to 1 if you have the `z' library (-lz). */
+/* #undef HAVE_LIBZ */
+
+/* Define to 1 if you have the <memory.h> header file. */
+/* #undef HAVE_MEMORY_H */
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+/* #undef HAVE_POSIX_FADVISE */
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+/* #undef HAVE_POSIX_FALLOCATE */
+
+/* Define to 1 if you have the `posix_madvise' function. */
+/* #undef HAVE_POSIX_MADVISE */
+
+/* Define to 1 if you have the `posix_memalign' function. */
+/* #undef HAVE_POSIX_MEMALIGN */
+
+/* Define to 1 if you have the <pthread_np.h> header file. */
+/* #undef HAVE_PTHREAD_NP_H */
+
+/* Build the LevelDB API with RocksDB support. */
+/* #undef HAVE_ROCKSDB */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+/* #undef HAVE_STRINGS_H */
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strtouq' function. */
+/* #undef HAVE_STRTOUQ */
+
+/* Define to 1 if you have the `sync_file_range' function. */
+/* #undef HAVE_SYNC_FILE_RANGE */
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+/* #undef HAVE_UNISTD_H */
+
+/* Enable verbose message configuration. */
+/* #undef HAVE_VERBOSE */
+
+/* Spinlock type from mutex.h. */
+#define SPINLOCK_TYPE SPINLOCK_MSVC
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+ significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+# define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* # undef WORDS_BIGENDIAN */
+# endif
+#endif
+
+/* Default alignment of buffers used for I/O */
+#define WT_BUFFER_ALIGNMENT_DEFAULT 0
+
+/* Enable large inode numbers on Mac OS X 10.5. */
+/* #ifndef _DARWIN_USE_64_BIT_INODE */
+/* # define _DARWIN_USE_64_BIT_INODE 1 */
+/* #endif */
+
+/* Number of bits in a file offset, on hosts where this is settable. */
+/* #undef _FILE_OFFSET_BITS */
+
+/* Define for large files, on AIX-style hosts. */
+/* #undef _LARGE_FILES */
diff --git a/src/third_party/wiredtiger/dist/api_config.py b/src/third_party/wiredtiger/dist/api_config.py
new file mode 100644
index 00000000000..6ca0275f228
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/api_config.py
@@ -0,0 +1,332 @@
+#!/usr/bin/env python
+
+import os, re, sys, textwrap
+import api_data
+from dist import compare_srcfile
+
+# Temporary file.
+# Generated output is written here first and then handed to
+# compare_srcfile() together with the path of the real target file.
+tmp_file = '__tmp'
+
+#####################################################################
+# Update wiredtiger.in with doxygen comments
+#####################################################################
+f='../src/include/wiredtiger.in'
+tfile = open(tmp_file, 'w')
+
+# Any run of whitespace; used to collapse descriptions onto one line.
+whitespace_re = re.compile(r'\s+')
+# A '@configempty{...}' or '@configstart{...}' marker on a '*'-prefixed
+# comment line.  Group 1 is the leading comment prefix, group 2 is the
+# method name (the text between '{' and the first comma).
+cbegin_re = re.compile(r'(\s*\*\s*)@config(?:empty|start)\{(.*?),.*\}')
+
+def gettype(c):
+    '''Return the type name of a config item.
+
+    A non-empty 'type' flag is used as-is.  Without one, an item that
+    carries a 'min' or 'max' flag is an 'int'; everything else is a
+    'string'.'''
+    flags = c.flags
+    explicit = flags.get('type')
+    if explicit:
+        return explicit
+    if 'min' in flags or 'max' in flags:
+        return 'int'
+    return 'string'
+
+def typedesc(c):
+    '''Describe what type of value is expected for the given config item.
+
+    Returns an English phrase built from the item's type (see gettype),
+    its optional 'min'/'max' flags and its optional 'choices' flag.  A
+    type name that is not in the table below raises KeyError.'''
+    flags = c.flags
+    lower = str(flags.get('min', ''))
+    upper = str(flags.get('max', ''))
+    options = flags.get('choices', [])
+    kind = gettype(c)
+
+    phrases = {
+        'boolean' : 'a boolean flag',
+        'format' : 'a format string',
+        'int' : 'an integer',
+        'list' : 'a list',
+        'category': 'a set of related configuration options defined below',
+        'string' : 'a string'}
+    desc = phrases[kind]
+
+    # Range: both bounds, only a lower bound, or only an upper bound.
+    if lower and upper:
+        desc += ' between ' + lower + ' and ' + upper
+    elif lower:
+        desc += ' greater than or equal to ' + lower
+    elif upper:
+        desc += ' no more than ' + upper
+
+    if options:
+        if kind == 'list':
+            desc += ', with values chosen from the following options: '
+        else:
+            desc += ', chosen from the following options: '
+        # Each choice is rendered as a doxygen '\c "value"' token.
+        desc += ', '.join('\\c "' + opt + '"' for opt in options)
+    elif kind == 'list':
+        desc += ' of strings'
+    return desc
+
+def parseconfig(c, name_indent=''):
+    '''Render a config item as doxygen '@config{name, description, type}'
+    text, one entry per line.
+
+    A 'category' item is followed by one entry for each of its sorted
+    sub-items, indented by four more '&nbsp;' than name_indent, and then
+    by a closing '@config{ ),,}' entry.'''
+    ctype = gettype(c)
+    is_category = (ctype == 'category')
+
+    # Collapse whitespace, end the description with exactly one period and
+    # escape commas, which separate the fields of '@config{...}'.
+    desc = whitespace_re.sub(' ', c.desc.strip()).strip('.') + '.'
+    desc = desc.replace(',', '\\,')
+
+    if c.default or ctype == 'int':
+        default = '\\c ' + str(c.default)
+    else:
+        default = 'empty'
+
+    name = name_indent + c.name
+    tdesc = typedesc(c)
+    if is_category:
+        name += ' = ('
+    else:
+        tdesc += '; default ' + default
+    # NOTE(review): the flattened source does not show whether the period
+    # is appended inside the 'else' branch or for every item; it is applied
+    # to every item here -- confirm against the original file.
+    tdesc += '.'
+    tdesc = tdesc.replace(',', '\\,')
+
+    fields = (name, desc, tdesc)
+    output = '@config{' + ', '.join(fields) + '}\n'
+    if is_category:
+        sub_indent = name_indent + ('&nbsp;' * 4)
+        for subc in sorted(c.subconfig):
+            output += parseconfig(subc, sub_indent)
+        output += '@config{ ),,}\n'
+    return output
+
+def getconfcheck(c):
+    '''Return the C initializer text for one config item's check entry:
+    the item's name, its type, its check string (see checkstr) and its
+    sub-configuration table name (see getsubconfigstr).
+
+    Wrapping uses the module-level TextWrapper 'w' that is current when
+    the function is called.'''
+    head = '{ "' + c.name + '", "' + gettype(c) + '",'
+    cstr = checkstr(c)
+    sstr = getsubconfigstr(c)
+
+    if cstr == 'NULL':
+        # No check string: let the wrapper break the whole entry.
+        return '\n\t '.join(
+            w.wrap(head + ' ' + cstr + ', ' + sstr + ' },'))
+
+    # Manually re-wrap when there is a check string to avoid ugliness
+    # between string and non-string wrapping
+    cstr = '"\n\t "'.join(w.wrap(cstr))
+    if len(head + ' ' + cstr + ',\n\t ' + sstr + '},') >= 68:
+        return head + '\n\t ' + cstr + ',\n\t ' + sstr + ' },'
+    return head + ' ' + cstr + ', ' + sstr + ' },'
+
+# Copy wiredtiger.in to the temporary file, regenerating the text of every
+# '@configstart'/'@configempty' section from api_data.methods.
+skip = False
+for line in open(f, 'r'):
+ # While skipping, the old section body is not copied; the line holding
+ # '@configend' ends the skip (a fresh '@configend' is written below).
+ if skip:
+ if '@configend' in line:
+ skip = False
+ continue
+
+ # Lines that are not section markers are copied unchanged.
+ m = cbegin_re.match(line)
+ if not m:
+ tfile.write(line)
+ continue
+
+ # A marker naming a method that api_data does not know is reported on
+ # stderr and copied unchanged.
+ prefix, config_name = m.groups()
+ if config_name not in api_data.methods:
+ print >>sys.stderr, "Missing configuration for " + config_name
+ tfile.write(line)
+ continue
+
+ # Only a '@configstart' marker starts a skip of the lines that follow.
+ skip = ('@configstart' in line)
+
+ # A method with no configuration items gets a single '@configempty' line.
+ if not api_data.methods[config_name].config:
+ tfile.write(prefix + '@configempty{' + config_name +
+ ', see dist/api_data.py}\n')
+ continue
+
+ tfile.write(prefix + '@configstart{' + config_name +
+ ', see dist/api_data.py}\n')
+
+ # Wrap to 80 columns less the width of the tab-expanded comment prefix.
+ w = textwrap.TextWrapper(width=80-len(prefix.expandtabs()),
+ break_on_hyphens=False,
+ replace_whitespace=False,
+ fix_sentence_endings=True)
+ lastname = None
+ for c in sorted(api_data.methods[config_name].config):
+ name = c.name
+ # Only a warning: the item is still processed below.
+ if '.' in name:
+ print >>sys.stderr, "Bad config key " + name
+
+ # Deal with duplicates: with complex configurations (like
+ # WT_SESSION::create), it's simpler to deal with duplicates here than
+ # manually in api_data.py.
+ # The items are sorted, so comparing with the previous name is enough.
+ if name == lastname:
+ continue
+ lastname = name
+ # Items flagged 'undoc' are left out of the generated documentation.
+ if 'undoc' in c.flags:
+ continue
+ output = parseconfig(c)
+ # Re-apply the comment prefix after every newline inside a wrapped line.
+ for l in w.wrap(output):
+ tfile.write(prefix + l.replace('\n', '\n' + prefix) + '\n')
+
+ tfile.write(prefix + '@configend\n')
+
+tfile.close()
+compare_srcfile(tmp_file, f)
+
+#####################################################################
+# Create config_def.c with defaults for each config string
+#####################################################################
+f='../src/config/config_def.c'
+tfile = open(tmp_file, 'w')
+
+# The banner names this script (dist/api_config.py) so that readers of the
+# generated file know which script to change instead.
+tfile.write('''/* DO NOT EDIT: automatically built by dist/api_config.py. */
+
+#include "wt_internal.h"
+''')
+
+# Make a TextWrapper that can wrap at commas.
+# Both word-separator patterns are replaced so that a comma is the only
+# place where a line may be split.
+w = textwrap.TextWrapper(width=64, break_on_hyphens=False)
+w.wordsep_re = w.wordsep_simple_re = re.compile(r'(,)')
+
+def checkstr(c):
+    '''Build the check string that __wt_config_check uses to validate a
+    config item, as the text of a quoted C string literal.
+
+    The string lists, comma-separated and in this order, 'min=', 'max='
+    and 'choices=[...]' for whichever of those flags the item has.
+    Returns the text NULL when the item has none of them.'''
+    flags = c.flags
+    parts = []
+
+    lower = str(flags.get('min', ''))
+    if lower:
+        parts.append('min=' + lower)
+
+    upper = str(flags.get('max', ''))
+    if upper:
+        parts.append('max=' + upper)
+
+    options = flags.get('choices', [])
+    if options:
+        # Each choice is wrapped in backslash-escaped double quotes.
+        quoted = ['\\"' + s + '\\"' for s in options]
+        parts.append('choices=[' + ','.join(quoted) + ']')
+
+    if not parts:
+        return 'NULL'
+    return '"' + ','.join(parts) + '"'
+
+def get_default(c):
+    '''Return the text of a config item's default value as it appears in
+    the generated default configuration strings.
+
+    'false' becomes '0'; a category becomes a parenthesized, comma-separated
+    'name=default' list of its sorted sub-items; 'true' and empty non-int
+    defaults become the empty string; any other default is returned with
+    its double quotes backslash-escaped.'''
+    t = gettype(c)
+    default = c.default
+
+    if default == 'false':
+        return '0'
+    if t == 'category':
+        members = ['%s=%s' % (subc.name, get_default(subc))
+            for subc in sorted(c.subconfig)]
+        return '(%s)' % ','.join(members)
+    if default != 'true' and (default or t == 'int'):
+        return str(default).replace('"', '\\"')
+    return ''
+
+# Names of the categories whose sub-configuration table was already written.
+created_subconfigs=set()
+def add_subconfig(c):
+    '''Write the confchk_<name>_subconfigs table for a category item to
+    the output file, once per category name.
+
+    Categories are tracked by name only, so a later category that reuses
+    a name does not get a table of its own.'''
+    if c.name in created_subconfigs:
+        return
+    created_subconfigs.add(c.name)
+
+    # Building the entries may write nested tables; that happens before
+    # this table itself is written.
+    entries = '\n\t'.join(getconfcheck(subc) for subc in sorted(c.subconfig))
+    table = '''
+static const WT_CONFIG_CHECK confchk_%(name)s_subconfigs[] = {
+\t%(check)s
+\t{ NULL, NULL, NULL, NULL }
+};
+''' % {
+        'name' : c.name,
+        'check' : entries,
+    }
+    tfile.write(table)
+
+def getsubconfigstr(c):
+    '''Return the name of the sub-configuration table for a config item,
+    or the text NULL when the item is not a category.
+
+    For a category the table is written first (see add_subconfig).'''
+    if gettype(c) != 'category':
+        return 'NULL'
+    add_subconfig(c)
+    return 'confchk_' + c.name + '_subconfigs'
+
+# Write structures of arrays of allowable configuration options, including a
+# NULL as a terminator for iteration.  Methods without any configuration
+# items get no table.
+for name in sorted(api_data.methods.keys()):
+    ctype = api_data.methods[name].config
+    if not ctype:
+        continue
+    # '.' in a method name becomes '_' in the C identifier.
+    table_fields = {
+        'name' : name.replace('.', '_'),
+        'check' : '\n\t'.join(getconfcheck(c) for c in sorted(ctype)),
+    }
+    tfile.write('''
+static const WT_CONFIG_CHECK confchk_%(name)s[] = {
+\t%(check)s
+\t{ NULL, NULL, NULL, NULL }
+};
+''' % table_fields)
+
+# Write the initialized list of configuration entry structures.
+tfile.write('\n')
+tfile.write('static const WT_CONFIG_ENTRY config_entries[] = {')
+
+slot=-1
+config_defines = ''
+for name in sorted(api_data.methods.keys()):
+    ctype = api_data.methods[name].config
+    slot += 1
+
+    # Build a list of #defines that reference specific slots in the list (the
+    # #defines are used to avoid a list search where we know the correct slot).
+    # The tab count uses explicit floor division so that it stays an integer
+    # whatever the interpreter's '/' does; a string cannot be repeated a
+    # fractional number of times.
+    define_name = 'WT_CONFIG_ENTRY_' + name.replace('.', '_')
+    tab_count = max(1, 6 - (len('WT_CONFIG_ENTRY_' + name) // 8))
+    config_defines += \
+        '#define\t' + define_name + '\t' * tab_count + \
+        "%2s" % str(slot) + '\n'
+
+    # Write the method name and base: the comma-separated 'name=default'
+    # list, wrapped at commas, one C string literal per wrapped line (a
+    # single empty literal when there is nothing to wrap).
+    base = ','.join('%s=%s' % (c.name, get_default(c))
+        for c in sorted(ctype))
+    base_lines = w.wrap(base) or [""]
+    tfile.write('''
+\t{ "%(name)s",
+%(config)s,''' % {
+        'config' : '\n'.join('\t "%s"' % chunk for chunk in base_lines),
+        'name' : name
+    })
+
+    # Write the checks reference, or NULL if no related checks structure.
+    tfile.write('\n\t ')
+    if ctype:
+        tfile.write('confchk_' + name.replace('.', '_'))
+    else:
+        tfile.write('NULL')
+
+    tfile.write('\n\t},')
+
+# Write a NULL as a terminator for iteration.
+tfile.write('\n\t{ NULL, NULL, NULL }')
+tfile.write('\n};\n')
+
+# Write the routine that connects the WT_CONNECTION_IMPL structure to the list
+# of configuration entry structures.
+# The literal below is not a raw string, so each '\t' in it is written to
+# the generated C file as a tab.  It defines two C functions:
+# __wt_conn_config_init, which allocates one pointer per config_entries
+# element and fills the pointers in, stopping at the entry whose method is
+# NULL, and __wt_conn_config_discard, which frees that pointer list.
+tfile.write('''
+int
+__wt_conn_config_init(WT_SESSION_IMPL *session)
+{
+\tWT_CONNECTION_IMPL *conn;
+\tconst WT_CONFIG_ENTRY *ep, **epp;
+
+\tconn = S2C(session);
+
+\t/* Build a list of pointers to the configuration information. */
+\tWT_RET(__wt_calloc_def(session,
+\t sizeof(config_entries) / sizeof(config_entries[0]), &epp));
+\tconn->config_entries = epp;
+
+\t/* Fill in the list to reference the default information. */
+\tfor (ep = config_entries;;) {
+\t\t*epp++ = ep++;
+\t\tif (ep->method == NULL)
+\t\t\tbreak;
+\t}
+\treturn (0);
+}
+
+void
+__wt_conn_config_discard(WT_SESSION_IMPL *session)
+{
+\tWT_CONNECTION_IMPL *conn;
+
+\tconn = S2C(session);
+
+\t__wt_free(session, conn->config_entries);
+}
+''')
+
+# config_def.c is complete: close the temporary file and pass it, with the
+# path of the real file, to compare_srcfile().
+tfile.close()
+compare_srcfile(tmp_file, f)
+
+# Update the config.h file with the #defines for the configuration entries.
+# config.h is copied line by line; the lines after the one containing
+# 'configuration section: BEGIN', up to the one containing
+# 'configuration section: END', are replaced by config_defines.
+tfile = open(tmp_file, 'w')
+skip = 0
+for line in open('../src/include/config.h', 'r'):
+ if skip:
+ # Old generated lines are dropped; the END line is copied again,
+ # preceded by a '/*' line (presumably it sits inside a C comment).
+ if 'configuration section: END' in line:
+ tfile.write('/*\n' + line)
+ skip = 0
+ else:
+ tfile.write(line)
+ # NOTE(review): the flattened source does not show whether this test is
+ # nested under the 'else' above -- confirm against the original file.
+ if 'configuration section: BEGIN' in line:
+ skip = 1
+ tfile.write(' */\n')
+ tfile.write(config_defines)
+tfile.close()
+compare_srcfile(tmp_file, '../src/include/config.h')
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
new file mode 100644
index 00000000000..f0a9c9742e1
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -0,0 +1,863 @@
+# This file is a python script that describes the WiredTiger API.
+
+class Error:
+    '''Description of one WiredTiger error return.
+
+    name is the error's identifier, desc a short description and
+    long_desc an optional longer one; any further keyword arguments are
+    kept in the flags dictionary.'''
+    def __init__(self, name, desc, long_desc=None, **flags):
+        self.name, self.desc = name, desc
+        self.long_desc = long_desc
+        self.flags = flags
+
+errors = [
+ Error('WT_DUPLICATE_KEY', 'attempt to insert an existing key', '''
+ This error is generated when the application attempts to insert
+ a record with the same key as an existing record without the
+ 'overwrite' configuration to WT_SESSION::open_cursor.'''),
+ Error('WT_ERROR', 'non-specific WiredTiger error', '''
+ This error is returned when an error is not covered by a
+ specific error return.'''),
+ Error('WT_NOTFOUND', 'item not found', '''
+ This error indicates an operation did not find a value to
+ return. This includes cursor search and other operations
+ where no record matched the cursor's search key such as
+ WT_CURSOR::update or WT_CURSOR::remove.'''),
+ Error('WT_PANIC', 'WiredTiger library panic', '''
+ This error indicates an underlying problem that requires the
+ application exit and restart.'''),
+ Error('WT_RESTART', 'restart the operation (internal)', undoc=True),
+ Error('WT_ROLLBACK', 'conflict between concurrent operations', '''
+ This error is generated when an operation cannot be completed
+ due to a conflict with concurrent operations. The operation
+ may be retried; if a transaction is in progress, it should be
+ rolled back and the operation retried in a new transaction.'''),
+]
+
+class Method:
+    '''Configuration of one API method: config is the list of Config
+    items the method accepts; any keyword arguments are kept in the flags
+    dictionary.'''
+    def __init__(self, config, **flags):
+        self.flags = flags
+        self.config = config
+
+class Config:
+    '''One configuration key.
+
+    name is the key, default the default value (a string) and desc the
+    documentation text.  subconfig is the list of nested Config items of
+    a type='category' key, None otherwise.  Any further keyword arguments
+    (for example type, min, max, choices, undoc) are kept in the flags
+    dictionary.
+
+    Items order by name, so sorted() on a list of Config items sorts by
+    key.'''
+    def __init__(self, name, default, desc, subconfig=None, **flags):
+        self.name = name
+        self.default = default
+        self.desc = desc
+        self.subconfig = subconfig
+        self.flags = flags
+
+    def __cmp__(self, other):
+        # Three-way comparison by name for Python 2, written without the
+        # cmp() builtin, which only exists in Python 2.
+        return (self.name > other.name) - (self.name < other.name)
+
+    def __lt__(self, other):
+        # Python 3 ignores __cmp__; sorted() only needs '<'.
+        return self.name < other.name
+
+# Metadata shared by all schema objects
+common_meta = [
+ Config('app_metadata', '', r'''
+ application-owned metadata for this object'''),
+ Config('columns', '', r'''
+ list of the column names. Comma-separated list of the form
+ <code>(column[,...])</code>. For tables, the number of entries
+ must match the total number of values in \c key_format and \c
+ value_format. For colgroups and indices, all column names must
+ appear in the list of columns for the table''',
+ type='list'),
+]
+
+source_meta = [
+ Config('source', '', r'''
+ set a custom data source URI for a column group, index or simple
+ table. By default, the data source URI is derived from the \c
+ type and the column group or index name. Applications can
+ create tables from existing data sources by supplying a \c
+ source configuration''', undoc=True),
+ Config('type', 'file', r'''
+ set the type of data source used to store a column group, index
+ or simple table. By default, a \c "file:" URI is derived from
+ the object name. The \c type configuration can be used to
+ switch to a different data source, such as LSM or an extension
+ configured by the application'''),
+]
+
+format_meta = common_meta + [
+ Config('key_format', 'u', r'''
+ the format of the data packed into key items. See @ref
+ schema_format_types for details. By default, the key_format is
+ \c 'u' and applications use WT_ITEM structures to manipulate
+ raw byte arrays. By default, records are stored in row-store
+ files: keys of type \c 'r' are record numbers and records
+ referenced by record number are stored in column-store files''',
+ type='format'),
+ Config('value_format', 'u', r'''
+ the format of the data packed into value items. See @ref
+ schema_format_types for details. By default, the value_format
+ is \c 'u' and applications use a WT_ITEM structure to
+ manipulate raw byte arrays. Value items of type 't' are
+ bitfields, and when configured with record number type keys,
+ will be stored using a fixed-length store''',
+ type='format'),
+]
+
+lsm_config = [
+ Config('lsm', '', r'''
+ options only relevant for LSM data sources''',
+ type='category', subconfig=[
+ Config('auto_throttle', 'true', r'''
+ Throttle inserts into LSM trees if flushing to disk isn't
+ keeping up''',
+ type='boolean'),
+ Config('bloom', 'true', r'''
+ create bloom filters on LSM tree chunks as they are merged''',
+ type='boolean'),
+ Config('bloom_config', '', r'''
+ config string used when creating Bloom filter files, passed
+ to WT_SESSION::create'''),
+ Config('bloom_bit_count', '16', r'''
+ the number of bits used per item for LSM bloom filters''',
+ min='2', max='1000'),
+ Config('bloom_hash_count', '8', r'''
+ the number of hash values per item used for LSM bloom
+ filters''',
+ min='2', max='100'),
+ Config('bloom_oldest', 'false', r'''
+ create a bloom filter on the oldest LSM tree chunk. Only
+ supported if bloom filters are enabled''',
+ type='boolean'),
+ Config('chunk_max', '5GB', r'''
+ the maximum size a single chunk can be. Chunks larger than this
+ size are not considered for further merges. This is a soft
+ limit, and chunks larger than this value can be created. Must
+ be larger than chunk_size''',
+ min='100MB', max='10TB'),
+ Config('chunk_size', '10MB', r'''
+ the maximum size of the in-memory chunk of an LSM tree. This
+ limit is soft - it is possible for chunks to be temporarily
+ larger than this value. This overrides the \c memory_page_max
+ setting''',
+ min='512K', max='500MB'),
+ Config('merge_max', '15', r'''
+ the maximum number of chunks to include in a merge operation''',
+ min='2', max='100'),
+ Config('merge_min', '0', r'''
+ the minimum number of chunks to include in a merge operation. If
+ set to 0 or 1 half the value of merge_max is used''',
+ max='100'),
+ ]),
+]
+
+# Per-file configuration
+file_config = format_meta + [
+ Config('block_allocation', 'best', r'''
+ configure block allocation. Permitted values are \c "first" or
+ \c "best"; the \c "first" configuration uses a first-available
+ algorithm during block allocation, the \c "best" configuration
+ uses a best-fit algorithm''',
+ choices=['first', 'best',]),
+ Config('allocation_size', '4KB', r'''
+ the file unit allocation size, in bytes, must be a power-of-two;
+ smaller values decrease the file space required by overflow
+ items, and the default value of 4KB is a good choice absent
+ requirements from the operating system or storage device''',
+ min='512B', max='128MB'),
+ Config('block_compressor', '', r'''
+ configure a compressor for file blocks. Permitted values are
+ empty (off) or \c "bzip2", \c "snappy" or custom compression
+ engine \c "name" created with WT_CONNECTION::add_compressor.
+ See @ref compression for more information'''),
+ Config('cache_resident', 'false', r'''
+ do not ever evict the object's pages; see @ref
+ tuning_cache_resident for more information''',
+ type='boolean'),
+ Config('checksum', 'uncompressed', r'''
+ configure block checksums; permitted values are <code>on</code>
+ (checksum all blocks), <code>off</code> (checksum no blocks) and
+ <code>uncompressed</code> (checksum only blocks which are not
+ compressed for any reason). The \c uncompressed setting is for
+ applications which can rely on decompression to fail if a block
+ has been corrupted''',
+ choices=['on', 'off', 'uncompressed']),
+ Config('collator', '', r'''
+ configure custom collation for keys. Value must be a collator
+ name created with WT_CONNECTION::add_collator'''),
+ Config('dictionary', '0', r'''
+ the maximum number of unique values remembered in the Btree
+ row-store leaf page value dictionary; see
+ @ref file_formats_compression for more information''',
+ min='0'),
+ Config('format', 'btree', r'''
+ the file format''',
+ choices=['btree']),
+ Config('huffman_key', '', r'''
+ configure Huffman encoding for keys. Permitted values
+ are empty (off), \c "english", \c "utf8<file>" or \c
+ "utf16<file>". See @ref huffman for more information'''),
+ Config('huffman_value', '', r'''
+ configure Huffman encoding for values. Permitted values
+ are empty (off), \c "english", \c "utf8<file>" or \c
+ "utf16<file>". See @ref huffman for more information'''),
+ Config('internal_key_truncate', 'true', r'''
+ configure internal key truncation, discarding unnecessary
+ trailing bytes on internal keys (ignored for custom
+ collators)''',
+ type='boolean'),
+ Config('internal_page_max', '4KB', r'''
+ the maximum page size for internal nodes, in bytes; the size
+ must be a multiple of the allocation size and is significant
+ for applications wanting to avoid excessive L2 cache misses
+ while searching the tree. The page maximum is the bytes of
+ uncompressed data, that is, the limit is applied before any
+ block compression is done''',
+ min='512B', max='512MB'),
+ Config('internal_item_max', '0', r'''
+ the largest key stored within an internal node, in bytes. If
+ non-zero, any key larger than the specified size will be
+ stored as an overflow item (which may require additional I/O
+ to access). If zero, a default size is chosen that permits at
+ least 8 keys per internal page''',
+ min=0),
+ Config('key_gap', '10', r'''
+ the maximum gap between instantiated keys in a Btree leaf page,
+ constraining the number of keys processed to instantiate a
+ random Btree leaf page key''',
+ min='0', undoc=True),
+ Config('leaf_page_max', '32KB', r'''
+ the maximum page size for leaf nodes, in bytes; the size must
+ be a multiple of the allocation size, and is significant for
+ applications wanting to maximize sequential data transfer from
+ a storage device. The page maximum is the bytes of uncompressed
+ data, that is, the limit is applied before any block compression
+ is done''',
+ min='512B', max='512MB'),
+ Config('leaf_item_max', '0', r'''
+ the largest key or value stored within a leaf node, in bytes.
+ If non-zero, any key or value larger than the specified size
+ will be stored as an overflow item (which may require additional
+ I/O to access). If zero, a default size is chosen that permits
+ at least 4 key and value pairs per leaf page''',
+ min=0),
+ Config('memory_page_max', '5MB', r'''
+ the maximum size a page can grow to in memory before being
+ reconciled to disk. The specified size will be adjusted to a lower
+ bound of <code>50 * leaf_page_max</code>, and an upper bound of
+ <code>cache_size / 2</code>. This limit is soft - it is possible
+ for pages to be temporarily larger than this value. This setting
+ is ignored for LSM trees, see \c chunk_size''',
+ min='512B', max='10TB'),
+ Config('os_cache_max', '0', r'''
+ maximum system buffer cache usage, in bytes. If non-zero, evict
+ object blocks from the system buffer cache after that many bytes
+ from this object are read or written into the buffer cache''',
+ min=0),
+ Config('os_cache_dirty_max', '0', r'''
+ maximum dirty system buffer cache usage, in bytes. If non-zero,
+ schedule writes for dirty blocks belonging to this object in the
+ system buffer cache after that many bytes from this object are
+ written into the buffer cache''',
+ min=0),
+ Config('prefix_compression', 'false', r'''
+ configure prefix compression on row-store leaf pages''',
+ type='boolean'),
+ Config('prefix_compression_min', '4', r'''
+ minimum gain before prefix compression will be used on row-store
+ leaf pages''',
+ min=0),
+ Config('split_pct', '75', r'''
+ the Btree page split size as a percentage of the maximum Btree
+ page size, that is, when a Btree page is split, it will be
+ split into smaller pages, where each page is the specified
+ percentage of the maximum Btree page size''',
+ min='25', max='100'),
+]
+
+# File metadata, including both configurable and non-configurable (internal)
+file_meta = file_config + [
+ Config('checkpoint', '', r'''
+ the file checkpoint entries'''),
+ Config('checkpoint_lsn', '', r'''
+ LSN of the last checkpoint'''),
+ Config('id', '', r'''
+ the file's ID number'''),
+ Config('version', '(major=0,minor=0)', r'''
+ the file version'''),
+]
+
+table_only_meta = [
+ Config('colgroups', '', r'''
+ comma-separated list of names of column groups. Each column
+ group is stored separately, keyed by the primary key of the
+ table. If no column groups are specified, all columns are
+ stored together in a single file. All value columns in the
+ table must appear in at least one column group. Each column
+ group must be created with a separate call to
+ WT_SESSION::create''', type='list'),
+]
+
+# Metadata lists for column groups, indices and tables, each the
+# concatenation of two of the lists defined above.  They are the
+# configurations of the 'colgroup.meta', 'index.meta' and 'table.meta'
+# entries in the methods table.
+colgroup_meta = common_meta + source_meta
+
+index_meta = format_meta + source_meta
+
+table_meta = format_meta + table_only_meta
+
+# Connection runtime config, shared by conn.reconfigure and wiredtiger_open
+connection_runtime_config = [
+ Config('async', '', r'''
+ asynchronous operations configuration options''',
+ type='category', subconfig=[
+ Config('enabled', 'false', r'''
+ enable asynchronous operation''',
+ type='boolean'),
+ Config('ops_max', '1024', r'''
+ maximum number of expected simultaneous asynchronous
+ operations''', min='10', max='4096'),
+ Config('threads', '2', r'''
+ the number of worker threads to service asynchronous
+ requests''',
+ min='1', max='20'), # !!! Must match WT_ASYNC_MAX_WORKERS
+ ]),
+ Config('cache_size', '100MB', r'''
+ maximum heap memory to allocate for the cache. A database should
+ configure either a cache_size or a shared_cache not both''',
+ min='1MB', max='10TB'),
+ Config('checkpoint', '', r'''
+ periodically checkpoint the database''',
+ type='category', subconfig=[
+ Config('name', '"WiredTigerCheckpoint"', r'''
+ the checkpoint name'''),
+ Config('log_size', '0', r'''
+ wait for this amount of log record bytes to be written to
+ the log between each checkpoint. A database can configure
+ both log_size and wait to set an upper bound for checkpoints;
+ setting this value above 0 configures periodic checkpoints''',
+ min='0', max='2GB'),
+ Config('wait', '0', r'''
+ seconds to wait between each checkpoint; setting this value
+ above 0 configures periodic checkpoints''',
+ min='0', max='100000'),
+ ]),
+ Config('error_prefix', '', r'''
+ prefix string for error messages'''),
+ Config('eviction_dirty_target', '80', r'''
+ continue evicting until the cache has less dirty memory than the
+ value, as a percentage of the total cache size. Dirty pages will
+ only be evicted if the cache is full enough to trigger eviction''',
+ min=10, max=99),
+ Config('eviction_target', '80', r'''
+ continue evicting until the cache has less total memory than the
+ value, as a percentage of the total cache size. Must be less than
+ \c eviction_trigger''',
+ min=10, max=99),
+ Config('eviction_trigger', '95', r'''
+ trigger eviction when the cache is using this much memory, as a
+ percentage of the total cache size''', min=10, max=99),
+ Config('lsm_manager', '', r'''
+ configure database wide options for LSM tree management''',
+ type='category', subconfig=[
+ Config('worker_thread_max', '4', r'''
+ Configure a set of threads to manage merging LSM trees in
+ the database.''',
+ min='3', max='20'), # !!! Must match WT_LSM_MAX_WORKERS
+ Config('merge', 'true', r'''
+ merge LSM chunks where possible''',
+ type='boolean')
+ ]),
+ Config('lsm_merge', 'true', r'''
+ merge LSM chunks where possible (deprecated)''',
+ type='boolean', undoc=True),
+ Config('eviction', '', r'''
+ eviction configuration options.''',
+ type='category', subconfig=[
+ Config('threads_max', '1', r'''
+ maximum number of threads WiredTiger will start to help evict
+ pages from cache. The number of threads started will vary
+ depending on the current eviction load''',
+ min=1, max=20),
+ Config('threads_min', '1', r'''
+ minimum number of threads WiredTiger will start to help evict
+ pages from cache. The number of threads currently running will
+ vary depending on the current eviction load''',
+ min=1, max=20),
+ ]),
+ Config('shared_cache', '', r'''
+ shared cache configuration options. A database should configure
+ either a cache_size or a shared_cache not both''',
+ type='category', subconfig=[
+ Config('chunk', '10MB', r'''
+ the granularity that a shared cache is redistributed''',
+ min='1MB', max='10TB'),
+ Config('reserve', '0', r'''
+ amount of cache this database is guaranteed to have
+ available from the shared cache. This setting is per
+ database. Defaults to the chunk size''', type='int'),
+ Config('name', '', r'''
+ name of a cache that is shared between databases'''),
+ Config('size', '500MB', r'''
+ maximum memory to allocate for the shared cache. Setting
+ this will update the value if one is already set''',
+ min='1MB', max='10TB')
+ ]),
+ Config('statistics', 'none', r'''
+ Maintain database statistics, which may impact performance.
+ Choosing "all" maintains all statistics regardless of cost,
+ "fast" maintains a subset of statistics that are relatively
+ inexpensive, "none" turns off all statistics. The "clear"
+ configuration resets statistics after they are gathered,
+ where appropriate (for example, a cache size statistic is
+ not cleared, while the count of cursor insert operations will
+ be cleared). When "clear" is configured for the database,
+ gathered statistics are reset each time a statistics cursor
+ is used to gather statistics, as well as each time statistics
+ are logged using the \c statistics_log configuration. See
+ @ref statistics for more information''',
+ type='list', choices=['all', 'fast', 'none', 'clear']),
+ Config('statistics_log', '', r'''
+ log any statistics the database is configured to maintain,
+ to a file. See @ref statistics for more information''',
+ type='category', subconfig=[
+ Config('on_close', 'false', r'''log statistics on database close''',
+ type='boolean'),
+ Config('path', '"WiredTigerStat.%d.%H"', r'''
+ the pathname to a file into which the log records are written,
+ may contain ISO C standard strftime conversion specifications.
+ If the value is not an absolute path name, the file is created
+ relative to the database home'''),
+ Config('sources', '', r'''
+ if non-empty, include statistics for the list of data source
+ URIs, if they are open at the time of the statistics logging.
+ The list may include URIs matching a single data source
+ ("table:mytable"), or a URI matching all data sources of a
+ particular type ("table:")''',
+ type='list'),
+ Config('timestamp', '"%b %d %H:%M:%S"', r'''
+ a timestamp prepended to each log record, may contain strftime
+ conversion specifications'''),
+ Config('wait', '0', r'''
+ seconds to wait between each write of the log records''',
+ min='0', max='100000'),
+ ]),
+ Config('verbose', '', r'''
+ enable messages for various events. Only available if WiredTiger
+ is configured with --enable-verbose. Options are given as a
+ list, such as <code>"verbose=[evictserver,read]"</code>''',
+ type='list', choices=[
+ 'api',
+ 'block',
+ 'checkpoint',
+ 'compact',
+ 'evict',
+ 'evictserver',
+ 'fileops',
+ 'log',
+ 'lsm',
+ 'metadata',
+ 'mutex',
+ 'overflow',
+ 'read',
+ 'reconcile',
+ 'recovery',
+ 'salvage',
+ 'shared_cache',
+ 'split',
+ 'temporary',
+ 'transaction',
+ 'verify',
+ 'version',
+ 'write']),
+]
+
+session_config = [
+ Config('isolation', 'read-committed', r'''
+ the default isolation level for operations in this session''',
+ choices=['read-uncommitted', 'read-committed', 'snapshot']),
+]
+
+common_wiredtiger_open = [
+ Config('buffer_alignment', '-1', r'''
+ in-memory alignment (in bytes) for buffers used for I/O. The
+ default value of -1 indicates a platform-specific alignment
+ value should be used (4KB on Linux systems, zero elsewhere)''',
+ min='-1', max='1MB'),
+ Config('checkpoint_sync', 'true', r'''
+ flush files to stable storage when closing or writing
+ checkpoints''',
+ type='boolean'),
+ Config('direct_io', '', r'''
+ Use \c O_DIRECT to access files. Options are given as a list,
+ such as <code>"direct_io=[data]"</code>. Configuring
+ \c direct_io requires care, see @ref
+ tuning_system_buffer_cache_direct_io for important warnings.
+ Including \c "data" will cause WiredTiger data files to use
+ \c O_DIRECT, including \c "log" will cause WiredTiger log files
+ to use \c O_DIRECT, and including \c "checkpoint" will cause
+ WiredTiger data files opened at a checkpoint (i.e: read only) to
+ use \c O_DIRECT''',
+ type='list', choices=['checkpoint', 'data', 'log']),
+ Config('extensions', '', r'''
+ list of shared library extensions to load (using dlopen).
+ Any values specified to a library extension are passed to
+ WT_CONNECTION::load_extension as the \c config parameter
+ (for example,
+ <code>extensions=(/path/ext.so={entry=my_entry})</code>)''',
+ type='list'),
+ Config('file_extend', '', r'''
+ file extension configuration. If set, extend files of the set
+ type in allocations of the set size, instead of a block at a
+ time as each new block is written. For example,
+ <code>file_extend=(data=16MB)</code>''',
+ type='list', choices=['data', 'log']),
+ Config('hazard_max', '1000', r'''
+ maximum number of simultaneous hazard pointers per session
+ handle''',
+ min='15'),
+ Config('log', '', r'''
+ enable logging''',
+ type='category', subconfig=[
+ Config('archive', 'true', r'''
+ automatically archive unneeded log files''',
+ type='boolean'),
+ Config('enabled', 'false', r'''
+ enable logging subsystem''',
+ type='boolean'),
+ Config('file_max', '100MB', r'''
+ the maximum size of log files''',
+ min='100KB', max='2GB'),
+ Config('path', '""', r'''
+ the path to a directory into which the log files are written.
+ If the value is not an absolute path name, the files are created
+ relative to the database home'''),
+ ]),
+ Config('mmap', 'true', r'''
+ Use memory mapping to access files when possible''',
+ type='boolean'),
+ Config('multiprocess', 'false', r'''
+ permit sharing between processes (will automatically start an
+ RPC server for primary processes and use RPC for secondary
+ processes). <b>Not yet supported in WiredTiger</b>''',
+ type='boolean'),
+ Config('session_max', '100', r'''
+ maximum expected number of sessions (including server
+ threads)''',
+ min='1'),
+ Config('transaction_sync', '', r'''
+ how to sync log records when the transaction commits''',
+ type='category', subconfig=[
+ Config('enabled', 'false', r'''
+ whether to sync the log on every commit by default, can
+ be overridden by the \c sync setting to
+ WT_SESSION::begin_transaction''',
+ type='boolean'),
+ Config('method', 'fsync', r'''
+ the method used to ensure log records are stable on disk,
+ see @ref tune_durability for more information''',
+ choices=['dsync', 'fsync', 'none']),
+ ]),
+]
+
+methods = {
+'file.meta' : Method(file_meta),
+
+'colgroup.meta' : Method(colgroup_meta),
+
+'index.meta' : Method(index_meta),
+
+'table.meta' : Method(table_meta),
+
+'cursor.close' : Method([]),
+
+'session.close' : Method([]),
+
+'session.compact' : Method([
+ Config('timeout', '1200', r'''
+ maximum amount of time to allow for compact in seconds. The
+ actual amount of time spent in compact may exceed the configured
+ value. A value of zero disables the timeout''',
+ type='int'),
+]),
+
+'session.create' :
+ Method(table_only_meta + file_config + lsm_config + source_meta + [
+ Config('exclusive', 'false', r'''
+ fail if the object exists. When false (the default), if the
+ object exists, check that its settings match the specified
+ configuration''',
+ type='boolean'),
+]),
+
+'session.drop' : Method([
+ Config('force', 'false', r'''
+ return success if the object does not exist''',
+ type='boolean'),
+ Config('remove_files', 'true', r'''
+ should the underlying files be removed?''',
+ type='boolean'),
+]),
+
+'session.log_printf' : Method([]),
+
+'session.open_cursor' : Method([
+ Config('append', 'false', r'''
+ append the value as a new record, creating a new record
+ number key; valid only for cursors with record number keys''',
+ type='boolean'),
+ Config('bulk', 'false', r'''
+ configure the cursor for bulk-loading, a fast, initial load
+ path (see @ref tune_bulk_load for more information). Bulk-load
+ may only be used for newly created objects and cursors
+ configured for bulk-load only support the WT_CURSOR::insert
+ and WT_CURSOR::close methods. When bulk-loading row-store
+ objects, keys must be loaded in sorted order. The value is
+ usually a true/false flag; when bulk-loading fixed-length
+ column store objects, the special value \c bitmap allows
+ chunks of a memory resident bitmap to be loaded directly into
+ a file by passing a \c WT_ITEM to WT_CURSOR::set_value where
+ the \c size field indicates the number of records in the
+ bitmap (as specified by the object's \c value_format
+ configuration). Bulk-loaded bitmap values must end on a byte
+ boundary relative to the bit count (except for the last set
+ of values loaded)'''),
+ Config('checkpoint', '', r'''
+ the name of a checkpoint to open (the reserved name
+ "WiredTigerCheckpoint" opens the most recent internal
+ checkpoint taken for the object). The cursor does not
+ support data modification'''),
+ Config('dump', '', r'''
+ configure the cursor for dump format inputs and outputs: "hex"
+ selects a simple hexadecimal format, "json" selects a JSON format
+ with each record formatted as fields named by column names if
+ available, and "print" selects a format where only non-printing
+ characters are hexadecimal encoded. These formats are compatible
+ with the @ref util_dump and @ref util_load commands''',
+ choices=['hex', 'json', 'print']),
+ Config('next_random', 'false', r'''
+ configure the cursor to return a pseudo-random record from
+ the object; valid only for row-store cursors. Cursors
+ configured with \c next_random=true only support the
+ WT_CURSOR::next and WT_CURSOR::close methods. See @ref
+ cursor_random for details''',
+ type='boolean'),
+ Config('overwrite', 'true', r'''
+ configures whether the cursor's insert, update and remove
+ methods check the existing state of the record. If \c overwrite
+ is \c false, WT_CURSOR::insert fails with ::WT_DUPLICATE_KEY
+ if the record exists, WT_CURSOR::update and WT_CURSOR::remove
+ fail with ::WT_NOTFOUND if the record does not exist''',
+ type='boolean'),
+ Config('raw', 'false', r'''
+ ignore the encodings for the key and value, manage data as if
+ the formats were \c "u". See @ref cursor_raw for details''',
+ type='boolean'),
+ Config('readonly', 'false', r'''
+ only query operations are supported by this cursor. An error is
+ returned if a modification is attempted using the cursor. The
+ default is false for all cursor types except for log and metadata
+ cursors''',
+ type='boolean'),
+ Config('skip_sort_check', 'false', r'''
+ skip the check of the sort order of each bulk-loaded key''',
+ type='boolean', undoc=True),
+ Config('statistics', '', r'''
+ Specify the statistics to be gathered. Choosing "all" gathers
+ statistics regardless of cost and may include traversing
+ on-disk files; "fast" gathers a subset of relatively
+ inexpensive statistics. The selection must agree with the
+ database \c statistics configuration specified to
+ ::wiredtiger_open or WT_CONNECTION::reconfigure. For example,
+ "all" or "fast" can be configured when the database is
+ configured with "all", but the cursor open will fail if "all"
+ is specified when the database is configured with "fast",
+ and the cursor open will fail in all cases when the database
+ is configured with "none". If \c statistics is not configured,
+ the default configuration is the database configuration.
+ The "clear" configuration resets statistics after gathering
+ them, where appropriate (for example, a cache size statistic
+ is not cleared, while the count of cursor insert operations
+ will be cleared). See @ref statistics for more information''',
+ type='list', choices=['all', 'fast', 'clear']),
+ Config('target', '', r'''
+ if non-empty, backup the list of objects; valid only for a
+ backup data source''',
+ type='list'),
+]),
+
+'session.rename' : Method([]),
+'session.salvage' : Method([
+ Config('force', 'false', r'''
+ force salvage even of files that do not appear to be WiredTiger
+ files''',
+ type='boolean'),
+]),
+'session.truncate' : Method([]),
+'session.upgrade' : Method([]),
+'session.verify' : Method([
+ Config('dump_address', 'false', r'''
+ Display addresses and page types as pages are verified, using
+ the application's message handler, intended for debugging''',
+ type='boolean'),
+ Config('dump_blocks', 'false', r'''
+ Display the contents of on-disk blocks as they are verified, using
+ the application's message handler, intended for debugging''',
+ type='boolean'),
+ Config('dump_offsets', '', r'''
+ Display the contents of specific on-disk blocks, using
+ the application's message handler, intended for debugging''',
+ type='list'),
+ Config('dump_pages', 'false', r'''
+ Display the contents of in-memory pages as they are verified, using
+ the application's message handler, intended for debugging''',
+ type='boolean')
+]),
+
+'session.begin_transaction' : Method([
+ Config('isolation', '', r'''
+ the isolation level for this transaction; defaults to the
+ session's isolation level''',
+ choices=['read-uncommitted', 'read-committed', 'snapshot']),
+ Config('name', '', r'''
+ name of the transaction for tracing and debugging'''),
+ Config('priority', 0, r'''
+ priority of the transaction for resolving conflicts.
+ Transactions with higher values are less likely to abort''',
+ min='-100', max='100'),
+ Config('sync', '', r'''
+ whether to sync log records when the transaction commits,
+ inherited from ::wiredtiger_open \c transaction_sync''',
+ type='boolean'),
+]),
+
+'session.commit_transaction' : Method([]),
+'session.rollback_transaction' : Method([]),
+
+'session.checkpoint' : Method([
+ Config('drop', '', r'''
+ specify a list of checkpoints to drop.
+ The list may additionally contain one of the following keys:
+ \c "from=all" to drop all checkpoints,
+ \c "from=<checkpoint>" to drop all checkpoints after and
+ including the named checkpoint, or
+ \c "to=<checkpoint>" to drop all checkpoints before and
+ including the named checkpoint. Checkpoints cannot be
+ dropped while a hot backup is in progress or if open in
+ a cursor''', type='list'),
+ Config('force', 'false', r'''
+ by default, checkpoints may be skipped if the underlying object
+ has not been modified, this option forces the checkpoint''',
+ type='boolean'),
+ Config('name', '', r'''
+ if non-empty, specify a name for the checkpoint (note that
+ checkpoints including LSM trees may not be named)'''),
+ Config('target', '', r'''
+ if non-empty, checkpoint the list of objects''', type='list'),
+]),
+
+'connection.add_collator' : Method([]),
+'connection.add_compressor' : Method([]),
+'connection.add_data_source' : Method([]),
+'connection.add_extractor' : Method([]),
+'connection.async_new_op' : Method([
+ Config('append', 'false', r'''
+ append the value as a new record, creating a new record
+ number key; valid only for operations with record number keys''',
+ type='boolean'),
+ Config('overwrite', 'true', r'''
+ configures whether the cursor's insert, update and remove
+ methods check the existing state of the record. If \c overwrite
+ is \c false, WT_CURSOR::insert fails with ::WT_DUPLICATE_KEY
+ if the record exists, WT_CURSOR::update and WT_CURSOR::remove
+ fail with ::WT_NOTFOUND if the record does not exist''',
+ type='boolean'),
+ Config('raw', 'false', r'''
+ ignore the encodings for the key and value, manage data as if
+ the formats were \c "u". See @ref cursor_raw for details''',
+ type='boolean'),
+ Config('timeout', '1200', r'''
+ maximum amount of time to allow for compact in seconds. The
+ actual amount of time spent in compact may exceed the configured
+ value. A value of zero disables the timeout''',
+ type='int'),
+]),
+'connection.close' : Method([
+ Config('leak_memory', 'false', r'''
+ don't free memory during close''',
+ type='boolean'),
+]),
+'connection.reconfigure' : Method(connection_runtime_config),
+
+'connection.load_extension' : Method([
+ Config('config', '', r'''
+ configuration string passed to the entry point of the
+ extension as its WT_CONFIG_ARG argument'''),
+ Config('entry', 'wiredtiger_extension_init', r'''
+ the entry point of the extension, called to initialize the
+ extension when it is loaded. The signature of the function
+ must match ::wiredtiger_extension_init'''),
+ Config('terminate', 'wiredtiger_extension_terminate', r'''
+ an optional function in the extension that is called before
+ the extension is unloaded during WT_CONNECTION::close. The
+ signature of the function must match
+ ::wiredtiger_extension_terminate'''),
+]),
+
+'connection.open_session' : Method(session_config),
+
+'session.reconfigure' : Method(session_config),
+
+# There are 4 variants of the wiredtiger_open configurations.
+# wiredtiger_open:
+# Configuration values allowed in the application's configuration
+# argument to the wiredtiger_open call.
+# wiredtiger_open_basecfg:
+# Configuration values allowed in the WiredTiger.basecfg file (remove
+# creation-specific configuration strings and add a version string).
+# wiredtiger_open_usercfg:
+# Configuration values allowed in the WiredTiger.config file (remove
+# creation-specific configuration strings).
+# wiredtiger_open_all:
+# All of the above configuration values combined
+'wiredtiger_open' : Method(
+ connection_runtime_config +
+ common_wiredtiger_open + [
+ Config('config_base', 'true', r'''
+ write the base configuration file if creating the database,
+ see @ref config_base for more information''',
+ type='boolean'),
+ Config('create', 'false', r'''
+ create the database if it does not exist''',
+ type='boolean'),
+ Config('exclusive', 'false', r'''
+ fail if the database already exists, generally used with the
+ \c create option''',
+ type='boolean'),
+ Config('use_environment_priv', 'false', r'''
+ use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
+ variables regardless of whether or not the process is running
+ with special privileges. See @ref home for more information''',
+ type='boolean'),
+]),
+'wiredtiger_open_basecfg' : Method(
+ connection_runtime_config +
+ common_wiredtiger_open + [
+ Config('version', '(major=0,minor=0)', r'''
+ the file version'''),
+]),
+'wiredtiger_open_usercfg' : Method(
+ connection_runtime_config +
+ common_wiredtiger_open
+),
+'wiredtiger_open_all' : Method(
+ connection_runtime_config +
+ common_wiredtiger_open + [
+ Config('config_base', 'true', r'''
+ write the base configuration file if creating the database,
+ see @ref config_base for more information''',
+ type='boolean'),
+ Config('create', 'false', r'''
+ create the database if it does not exist''',
+ type='boolean'),
+ Config('exclusive', 'false', r'''
+ fail if the database already exists, generally used with the
+ \c create option''',
+ type='boolean'),
+ Config('use_environment_priv', 'false', r'''
+ use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
+ variables regardless of whether or not the process is running
+ with special privileges. See @ref home for more information''',
+ type='boolean'),
+ Config('version', '(major=0,minor=0)', r'''
+ the file version'''),
+]),
+}
diff --git a/src/third_party/wiredtiger/dist/api_err.py b/src/third_party/wiredtiger/dist/api_err.py
new file mode 100644
index 00000000000..352bfd5ca94
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/api_err.py
@@ -0,0 +1,111 @@
+# Output C #defines for errors into wiredtiger.in and the associated error
+# message code in strerror.c.
+
+import re, textwrap
+
+import api_data
+from dist import compare_srcfile
+
+# Update the #defines in the wiredtiger.in file.
+tmp_file = '__tmp'
+tfile = open(tmp_file, 'w')
+skip = 0
+for line in open('../src/include/wiredtiger.in', 'r'):
+ if not skip:
+ tfile.write(line)
+ if line.count('Error return section: END'):
+ tfile.write(line)
+ skip = 0
+ elif line.count('Error return section: BEGIN'):
+ tfile.write(' */\n')
+ skip = 1
+
+ # We don't want our error returns to conflict with any other
+ # package, so use an uncommon range, specifically, -31,800 to
+ # -31,999.
+ v = -31800
+ for err in api_data.errors:
+ if 'undoc' in err.flags:
+ tfile.write('/*! @cond internal */\n')
+ tfile.write('/*!%s.%s */\n' %
+ (('\n * ' if err.long_desc else ' ') +
+ err.desc[0].upper() + err.desc[1:],
+ ''.join('\n * ' + l for l in textwrap.wrap(
+ textwrap.dedent(err.long_desc).strip(), 77)) +
+ '\n' if err.long_desc else ''))
+ tfile.write('#define\t%s\t%d\n' % (err.name, v))
+ v -= 1
+ if 'undoc' in err.flags:
+ tfile.write('/*! @endcond */\n')
+ tfile.write('/*\n')
+tfile.close()
+compare_srcfile(tmp_file, '../src/include/wiredtiger.in')
+
+# Output the wiredtiger_strerror code.
+tmp_file = '__tmp'
+tfile = open(tmp_file, 'w')
+tfile.write('''/* DO NOT EDIT: automatically built by dist/api_err.py. */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_strerror --
+ * Return a string for any error value.
+ */
+const char *
+wiredtiger_strerror(int error)
+{
+ static char errbuf[64];
+ char *p;
+
+ if (error == 0)
+ return ("Successful return: 0");
+
+ switch (error) {
+''')
+
+for err in api_data.errors:
+ tfile.write('\tcase ' + err.name + ':\n')
+ tfile.write('\t\treturn ("' + err.name + ': ' + err.desc + '");\n')
+
+tfile.write('''\
+ default:
+ if (error > 0 && (p = strerror(error)) != NULL)
+ return (p);
+ break;
+ }
+
+ /*
+ * !!!
+ * Not thread-safe, but this is never supposed to happen.
+ */
+ (void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error);
+ return (errbuf);
+}
+''')
+tfile.close()
+compare_srcfile(tmp_file, '../src/conn/api_strerror.c')
+
+# Update the error documentation block.
+doc = '../src/docs/error-handling.dox'
+tmp_file = '__tmp'
+tfile = open(tmp_file, 'w')
+skip = 0
+for line in open(doc, 'r'):
+ if not skip:
+ tfile.write(line)
+ if line.count('IGNORE_BUILT_BY_API_ERR_END'):
+ tfile.write(line)
+ skip = 0
+ elif line.count('IGNORE_BUILT_BY_API_ERR_BEGIN'):
+ tfile.write('@endif\n\n')
+ skip = 1
+
+ for err in api_data.errors:
+ if 'undoc' in err.flags:
+ continue
+ tfile.write(
+ '@par <code>' + err.name.upper() + '</code>\n' +
+ " ".join(err.long_desc.split()) + '\n\n')
+tfile.close()
+compare_srcfile(tmp_file, doc)
diff --git a/src/third_party/wiredtiger/dist/db.py b/src/third_party/wiredtiger/dist/db.py
new file mode 100644
index 00000000000..06a9484d1f9
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/db.py
@@ -0,0 +1,24 @@
+# A simple python script to build a file that can be bulk-loaded into a
+# WiredTiger database for smoke-testing.
+
+import getopt, random, sys
+
+dmin = 7 # Minimum data size
+dmax = 837 # Maximum data size
+
+seed = None # Random number seed
+pairs = 100000 # Key/data pairs to output
+
+opts, args = getopt.getopt(sys.argv[1:], "m:n:s:")
+for o, a in opts:
+ if o == "-m":
+ dmax = int(a)
+ elif o == "-n":
+ pairs = int(a)
+ elif o == "-s":
+ seed = int(a)
+
+random.seed(seed)
+for i in range(pairs):
+ fmt = "%010d\ndata: %0" + str(random.randrange(dmin, dmax)) + "d"
+ print(fmt % (i, i))
diff --git a/src/third_party/wiredtiger/dist/dist.py b/src/third_party/wiredtiger/dist/dist.py
new file mode 100644
index 00000000000..6994a9128af
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/dist.py
@@ -0,0 +1,35 @@
+import filecmp, glob, os, re, shutil
+
+# source_files --
+# Generate the WiredTiger source file names. This is a generator: the
+# names are yielded one at a time, not returned as a list. Unless
+# skip_includes is true, the files matching ../src/include/*.[hi] are
+# yielded; the names read from the 'filelist' and 'extlist' files (opened
+# relative to the current directory) are yielded with '..' joined in front.
+def source_files(skip_includes=False):
+ if not skip_includes:
+ for line in glob.iglob('../src/include/*.[hi]'):
+ yield line
+ # Only lines that start with a word character name a file; lines such as
+ # '#' comments and blank lines do not match and are skipped.
+ file_re = re.compile(r'^\w')
+ for line in open('filelist', 'r'):
+ if file_re.match(line):
+ yield os.path.join('..', line.rstrip())
+ for line in open('extlist', 'r'):
+ if file_re.match(line):
+ yield os.path.join('..', line.rstrip())
+
+# source_dirs --
+# Return the set of directory names (os.path.dirname) of the files
+# produced by source_files().
+def source_dirs():
+ dirs = set()
+ for f in source_files():
+ dirs.add(os.path.dirname(f))
+ return dirs
+
+# print_source_dirs --
+# Print each name returned by source_dirs() on its own line.
+def print_source_dirs():
+ for d in source_dirs():
+ # Use the function-call form of print, as compare_srcfile does: the
+ # "print d" statement form is a syntax error under Python 3.
+ print(d)
+
+# compare_srcfile --
+# Compare two files, and if they differ, update the source file.
+# src is also written from tmp when src is not an existing file. The
+# comparison is by content (shallow=False). A message naming src is
+# printed when it is updated, and tmp is deleted with os.remove().
+# NOTE(review): indentation is collapsed in this listing; presumably
+# os.remove(tmp) runs whether or not src was updated -- confirm.
+def compare_srcfile(tmp, src):
+ if not os.path.isfile(src) or not filecmp.cmp(tmp, src, shallow=False):
+ print('Updating ' + src)
+ shutil.copyfile(tmp, src)
+ os.remove(tmp)
diff --git a/src/third_party/wiredtiger/dist/extlist b/src/third_party/wiredtiger/dist/extlist
new file mode 100644
index 00000000000..524dcb85992
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/extlist
@@ -0,0 +1,9 @@
+# extlist --
+# List of extension source files for WiredTiger library.
+
+ext/collators/reverse/reverse_collator.c
+ext/compressors/bzip2/bzip2_compress.c
+ext/compressors/nop/nop_compress.c
+ext/compressors/snappy/snappy_compress.c
+ext/compressors/zlib/zlib_compress.c
+ext/datasources/helium/helium.c
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist
new file mode 100644
index 00000000000..6fa967d1504
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/filelist
@@ -0,0 +1,166 @@
+# filelist --
+# List of source files for WiredTiger POSIX library.
+
+src/async/async_api.c
+src/async/async_op.c
+src/async/async_worker.c
+src/block/block_addr.c
+src/block/block_ckpt.c
+src/block/block_compact.c
+src/block/block_ext.c
+src/block/block_map.c
+src/block/block_mgr.c
+src/block/block_open.c
+src/block/block_read.c
+src/block/block_session.c
+src/block/block_slvg.c
+src/block/block_vrfy.c
+src/block/block_write.c
+src/bloom/bloom.c
+src/btree/bt_compact.c
+src/btree/bt_curnext.c
+src/btree/bt_curprev.c
+src/btree/bt_cursor.c
+src/btree/bt_debug.c
+src/btree/bt_delete.c
+src/btree/bt_discard.c
+src/btree/bt_evict.c
+src/btree/bt_handle.c
+src/btree/bt_huffman.c
+src/btree/bt_io.c
+src/btree/bt_misc.c
+src/btree/bt_ovfl.c
+src/btree/bt_page.c
+src/btree/bt_read.c
+src/btree/bt_ret.c
+src/btree/bt_slvg.c
+src/btree/bt_stat.c
+src/btree/bt_sync.c
+src/btree/bt_upgrade.c
+src/btree/bt_vrfy.c
+src/btree/bt_vrfy_dsk.c
+src/btree/bt_walk.c
+src/btree/col_modify.c
+src/btree/col_srch.c
+src/btree/rec_evict.c
+src/btree/rec_split.c
+src/btree/rec_track.c
+src/btree/rec_write.c
+src/btree/row_key.c
+src/btree/row_modify.c
+src/btree/row_srch.c
+src/config/config.c
+src/config/config_api.c
+src/config/config_check.c
+src/config/config_collapse.c
+src/config/config_concat.c
+src/config/config_def.c
+src/config/config_ext.c
+src/config/config_upgrade.c
+src/conn/api_strerror.c
+src/conn/api_version.c
+src/conn/conn_api.c
+src/conn/conn_cache.c
+src/conn/conn_cache_pool.c
+src/conn/conn_ckpt.c
+src/conn/conn_dhandle.c
+src/conn/conn_handle.c
+src/conn/conn_log.c
+src/conn/conn_open.c
+src/conn/conn_stat.c
+src/conn/conn_sweep.c
+src/cursor/cur_backup.c
+src/cursor/cur_bulk.c
+src/cursor/cur_config.c
+src/cursor/cur_ds.c
+src/cursor/cur_dump.c
+src/cursor/cur_file.c
+src/cursor/cur_index.c
+src/cursor/cur_json.c
+src/cursor/cur_log.c
+src/cursor/cur_metadata.c
+src/cursor/cur_stat.c
+src/cursor/cur_std.c
+src/cursor/cur_table.c
+src/log/log.c
+src/log/log_auto.c
+src/log/log_slot.c
+src/lsm/lsm_cursor.c
+src/lsm/lsm_manager.c
+src/lsm/lsm_merge.c
+src/lsm/lsm_meta.c
+src/lsm/lsm_stat.c
+src/lsm/lsm_tree.c
+src/lsm/lsm_work_unit.c
+src/lsm/lsm_worker.c
+src/meta/meta_apply.c
+src/meta/meta_ckpt.c
+src/meta/meta_ext.c
+src/meta/meta_table.c
+src/meta/meta_track.c
+src/meta/meta_turtle.c
+src/os_posix/os_abort.c
+src/os_posix/os_alloc.c
+src/os_posix/os_dir.c
+src/os_posix/os_dlopen.c
+src/os_posix/os_errno.c
+src/os_posix/os_exist.c
+src/os_posix/os_fallocate.c
+src/os_posix/os_filesize.c
+src/os_posix/os_flock.c
+src/os_posix/os_fsync.c
+src/os_posix/os_ftruncate.c
+src/os_posix/os_getline.c
+src/os_posix/os_getopt.c
+src/os_posix/os_map.c
+src/os_posix/os_mtx_cond.c
+src/os_posix/os_mtx_rw.c
+src/os_posix/os_once.c
+src/os_posix/os_open.c
+src/os_posix/os_path.c
+src/os_posix/os_priv.c
+src/os_posix/os_remove.c
+src/os_posix/os_rename.c
+src/os_posix/os_rw.c
+src/os_posix/os_sleep.c
+src/os_posix/os_strtouq.c
+src/os_posix/os_thread.c
+src/os_posix/os_time.c
+src/os_posix/os_yield.c
+src/packing/pack_api.c
+src/packing/pack_impl.c
+src/packing/pack_stream.c
+src/schema/schema_create.c
+src/schema/schema_drop.c
+src/schema/schema_list.c
+src/schema/schema_open.c
+src/schema/schema_plan.c
+src/schema/schema_project.c
+src/schema/schema_rename.c
+src/schema/schema_stat.c
+src/schema/schema_truncate.c
+src/schema/schema_util.c
+src/schema/schema_worker.c
+src/session/session_api.c
+src/session/session_compact.c
+src/session/session_dhandle.c
+src/session/session_salvage.c
+src/support/cksum.c
+src/support/err.c
+src/support/filename.c
+src/support/global.c
+src/support/hash_city.c
+src/support/hash_fnv.c
+src/support/hazard.c
+src/support/hex.c
+src/support/huffman.c
+src/support/mutex.c
+src/support/pow.c
+src/support/rand.c
+src/support/scratch.c
+src/support/stat.c
+src/txn/txn.c
+src/txn/txn_ckpt.c
+src/txn/txn_ext.c
+src/txn/txn_log.c
+src/txn/txn_recover.c
diff --git a/src/third_party/wiredtiger/dist/filelist.win b/src/third_party/wiredtiger/dist/filelist.win
new file mode 100644
index 00000000000..813bfb8b43d
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/filelist.win
@@ -0,0 +1,167 @@
+# filelist --
+# List of source files for WiredTiger Windows library.
+
+src/async/async_api.c
+src/async/async_op.c
+src/async/async_worker.c
+src/block/block_addr.c
+src/block/block_ckpt.c
+src/block/block_compact.c
+src/block/block_ext.c
+src/block/block_map.c
+src/block/block_mgr.c
+src/block/block_open.c
+src/block/block_read.c
+src/block/block_session.c
+src/block/block_slvg.c
+src/block/block_vrfy.c
+src/block/block_write.c
+src/bloom/bloom.c
+src/btree/bt_compact.c
+src/btree/bt_curnext.c
+src/btree/bt_curprev.c
+src/btree/bt_cursor.c
+src/btree/bt_debug.c
+src/btree/bt_delete.c
+src/btree/bt_discard.c
+src/btree/bt_evict.c
+src/btree/bt_handle.c
+src/btree/bt_huffman.c
+src/btree/bt_io.c
+src/btree/bt_misc.c
+src/btree/bt_ovfl.c
+src/btree/bt_page.c
+src/btree/bt_read.c
+src/btree/bt_ret.c
+src/btree/bt_slvg.c
+src/btree/bt_stat.c
+src/btree/bt_sync.c
+src/btree/bt_upgrade.c
+src/btree/bt_vrfy.c
+src/btree/bt_vrfy_dsk.c
+src/btree/bt_walk.c
+src/btree/col_modify.c
+src/btree/col_srch.c
+src/btree/rec_evict.c
+src/btree/rec_split.c
+src/btree/rec_track.c
+src/btree/rec_write.c
+src/btree/row_key.c
+src/btree/row_modify.c
+src/btree/row_srch.c
+src/config/config.c
+src/config/config_api.c
+src/config/config_check.c
+src/config/config_collapse.c
+src/config/config_concat.c
+src/config/config_def.c
+src/config/config_ext.c
+src/config/config_upgrade.c
+src/conn/api_strerror.c
+src/conn/api_version.c
+src/conn/conn_api.c
+src/conn/conn_cache.c
+src/conn/conn_cache_pool.c
+src/conn/conn_ckpt.c
+src/conn/conn_dhandle.c
+src/conn/conn_handle.c
+src/conn/conn_log.c
+src/conn/conn_open.c
+src/conn/conn_stat.c
+src/conn/conn_sweep.c
+src/cursor/cur_backup.c
+src/cursor/cur_bulk.c
+src/cursor/cur_config.c
+src/cursor/cur_ds.c
+src/cursor/cur_dump.c
+src/cursor/cur_file.c
+src/cursor/cur_index.c
+src/cursor/cur_json.c
+src/cursor/cur_log.c
+src/cursor/cur_metadata.c
+src/cursor/cur_stat.c
+src/cursor/cur_std.c
+src/cursor/cur_table.c
+src/log/log.c
+src/log/log_auto.c
+src/log/log_slot.c
+src/lsm/lsm_cursor.c
+src/lsm/lsm_manager.c
+src/lsm/lsm_merge.c
+src/lsm/lsm_meta.c
+src/lsm/lsm_stat.c
+src/lsm/lsm_tree.c
+src/lsm/lsm_work_unit.c
+src/lsm/lsm_worker.c
+src/meta/meta_apply.c
+src/meta/meta_ckpt.c
+src/meta/meta_ext.c
+src/meta/meta_table.c
+src/meta/meta_track.c
+src/meta/meta_turtle.c
+src/os_posix/os_abort.c
+src/os_posix/os_alloc.c
+src/os_posix/os_getline.c
+src/os_posix/os_getopt.c
+src/os_posix/os_strtouq.c
+src/os_win/os_dir.c
+src/os_win/os_dlopen.c
+src/os_win/os_errno.c
+src/os_win/os_exist.c
+src/os_win/os_fallocate.c
+src/os_win/os_filesize.c
+src/os_win/os_flock.c
+src/os_win/os_fsync.c
+src/os_win/os_ftruncate.c
+src/os_win/os_map.c
+src/os_win/os_mtx_cond.c
+src/os_win/os_mtx_rw.c
+src/os_win/os_once.c
+src/os_win/os_open.c
+src/os_win/os_path.c
+src/os_win/os_priv.c
+src/os_win/os_remove.c
+src/os_win/os_rename.c
+src/os_win/os_rw.c
+src/os_win/os_sleep.c
+src/os_win/os_thread.c
+src/os_win/os_time.c
+src/os_win/os_vsnprintf.c
+src/os_win/os_yield.c
+src/packing/pack_api.c
+src/packing/pack_impl.c
+src/packing/pack_stream.c
+src/schema/schema_create.c
+src/schema/schema_drop.c
+src/schema/schema_list.c
+src/schema/schema_open.c
+src/schema/schema_plan.c
+src/schema/schema_project.c
+src/schema/schema_rename.c
+src/schema/schema_stat.c
+src/schema/schema_truncate.c
+src/schema/schema_util.c
+src/schema/schema_worker.c
+src/session/session_api.c
+src/session/session_compact.c
+src/session/session_dhandle.c
+src/session/session_salvage.c
+src/support/cksum.c
+src/support/err.c
+src/support/filename.c
+src/support/global.c
+src/support/hash_city.c
+src/support/hash_fnv.c
+src/support/hazard.c
+src/support/hex.c
+src/support/huffman.c
+src/support/mutex.c
+src/support/pow.c
+src/support/rand.c
+src/support/scratch.c
+src/support/stat.c
+src/txn/txn.c
+src/txn/txn_ckpt.c
+src/txn/txn_ext.c
+src/txn/txn_log.c
+src/txn/txn_recover.c
diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py
new file mode 100644
index 00000000000..0fc609a69e3
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/flags.py
@@ -0,0 +1,183 @@
+# Output a C header file using the minimum number of distinct bits to ensure
+# flags don't collide.
+
+import os, re, sys
+from dist import compare_srcfile
+
+flags = {
+###################################################
+# Internal routine flag declarations
+###################################################
+ 'cache_flush' : [
+ 'SYNC_CHECKPOINT',
+ 'SYNC_CLOSE',
+ 'SYNC_DISCARD',
+ 'SYNC_DISCARD_FORCE',
+ 'SYNC_WRITE_LEAVES',
+ ],
+ 'file_types' : [
+ 'FILE_TYPE_CHECKPOINT',
+ 'FILE_TYPE_DATA',
+ 'FILE_TYPE_LOG',
+ ],
+ 'log_scan' : [
+ 'LOGSCAN_FIRST',
+ 'LOGSCAN_FROM_CKP',
+ 'LOGSCAN_ONE',
+ 'LOGSCAN_RECOVER',
+ ],
+ 'log_write' : [
+ 'LOG_DSYNC',
+ 'LOG_FLUSH',
+ 'LOG_FSYNC',
+ ],
+ 'page_read' : [
+ 'READ_CACHE',
+ 'READ_COMPACT',
+ 'READ_NO_GEN',
+ 'READ_NO_EVICT',
+ 'READ_NO_WAIT',
+ 'READ_PREV',
+ 'READ_SKIP_INTL',
+ 'READ_SKIP_LEAF',
+ 'READ_TRUNCATE',
+ 'READ_WONT_NEED',
+ ],
+ 'rec_write' : [
+ 'EVICTING',
+ 'SKIP_UPDATE_ERR',
+ 'SKIP_UPDATE_RESTORE',
+ ],
+ 'txn_log_checkpoint' : [
+ 'TXN_LOG_CKPT_FAIL',
+ 'TXN_LOG_CKPT_PREPARE',
+ 'TXN_LOG_CKPT_START',
+ 'TXN_LOG_CKPT_STOP',
+ ],
+ 'verbose' : [
+ 'VERB_API',
+ 'VERB_BLOCK',
+ 'VERB_CHECKPOINT',
+ 'VERB_COMPACT',
+ 'VERB_EVICT',
+ 'VERB_EVICTSERVER',
+ 'VERB_FILEOPS',
+ 'VERB_LOG',
+ 'VERB_LSM',
+ 'VERB_METADATA',
+ 'VERB_MUTEX',
+ 'VERB_OVERFLOW',
+ 'VERB_READ',
+ 'VERB_RECONCILE',
+ 'VERB_RECOVERY',
+ 'VERB_SALVAGE',
+ 'VERB_SHARED_CACHE',
+ 'VERB_SPLIT',
+ 'VERB_TEMPORARY',
+ 'VERB_TRANSACTION',
+ 'VERB_VERIFY',
+ 'VERB_VERSION',
+ 'VERB_WRITE',
+ ],
+
+###################################################
+# Structure flag declarations
+###################################################
+ 'conn' : [
+ 'CONN_CACHE_POOL',
+ 'CONN_CKPT_SYNC',
+ 'CONN_EVICTION_RUN',
+ 'CONN_LEAK_MEMORY',
+ 'CONN_LSM_MERGE',
+ 'CONN_PANIC',
+ 'CONN_SERVER_RUN',
+ 'CONN_SERVER_ASYNC',
+ 'CONN_SERVER_CHECKPOINT',
+ 'CONN_SERVER_LSM',
+ 'CONN_SERVER_STATISTICS',
+ 'CONN_SERVER_SWEEP',
+ 'CONN_WAS_BACKUP',
+ ],
+ 'session' : [
+ 'SESSION_CAN_WAIT',
+ 'SESSION_DISCARD_FORCE',
+ 'SESSION_INTERNAL',
+ 'SESSION_LOGGING_INMEM',
+ 'SESSION_NO_CACHE',
+ 'SESSION_NO_CACHE_CHECK',
+ 'SESSION_NO_DATA_HANDLES',
+ 'SESSION_NO_LOGGING',
+ 'SESSION_NO_SCHEMA_LOCK',
+ 'SESSION_SALVAGE_CORRUPT_OK',
+ 'SESSION_SCHEMA_LOCKED',
+ 'SESSION_SERVER_ASYNC',
+ ],
+}
+
+flag_cnt = {} # Dictionary [flag] : [reference count]
+flag_name = {} # Dictionary [flag] : [name ...]
+name_mask = {} # Dictionary [name] : [used flag mask]
+
+# Step through the flags dictionary and build our local dictionaries.
+for method in flags.items():
+ name_mask[method[0]] = 0x0
+ for flag in method[1]:
+ if flag == '__NONE__':
+ continue
+ if flag not in flag_cnt:
+ flag_cnt[flag] = 1
+ flag_name[flag] = []
+ else:
+ flag_cnt[flag] += 1
+ flag_name[flag].append(method[0])
+
+# Create list of possible bit masks.
+bits = [2 ** i for i in range(0, 32)]
+
+# Walk the list of flags in reverse, sorted-by-reference count order. For
+# each flag, find a bit that's not currently in use by any method using the
+# flag.
+flag_bit = {} # Dictionary [flag] : [bit value]
+for f in sorted(flag_cnt.items(),\
+ key = lambda k_v : (k_v[1], k_v[0]), reverse = True):
+ # Start with all 32 bits available, then clear the bits already used by
+ # each name that references this flag.
+ mask = 0xffffffff
+ for m in flag_name[f[0]]:
+ mask &= ~name_mask[m]
+ if mask == 0:
+ # Write the message directly to stderr: the Python 2 only
+ # "print >>sys.stderr" statement raises a TypeError under Python 3.
+ sys.stderr.write(
+ "flags.py: ran out of flags at " + m + " method\n")
+ sys.exit(1)
+ # Reduce the mask to the lowest bit that is still available.
+ for b in bits:
+ if mask & b:
+ mask = b
+ break
+ flag_bit[f[0]] = mask
+ # Record the chosen bit as used by every name that references the flag.
+ for m in flag_name[f[0]]:
+ name_mask[m] |= mask
+
+# Print out the flag masks in hex.
+# Assumes tab stops set to 8 characters.
+flag_info = ''
+for f in sorted(flag_cnt.items()):
+ flag_info += "#define\tWT_%s%s%#010x\n" %\
+ (f[0],\
+ "\t" * max(1, 6 - int((len('WT_') + len(f[0])) / 8)),\
+ flag_bit[f[0]])
+
+# Update the flags.h file with the flags information.
+tmp_file = '__tmp'
+tfile = open(tmp_file, 'w')
+skip = 0
+for line in open('../src/include/flags.h', 'r'):
+ if skip:
+ if line.count('flags section: END'):
+ tfile.write('/*\n' + line)
+ skip = 0
+ else:
+ tfile.write(line)
+ if line.count('flags section: BEGIN'):
+ skip = 1
+ tfile.write(' */\n')
+ tfile.write(flag_info)
+tfile.close()
+compare_srcfile(tmp_file, '../src/include/flags.h')
diff --git a/src/third_party/wiredtiger/dist/java_doc.py b/src/third_party/wiredtiger/dist/java_doc.py
new file mode 100644
index 00000000000..d44ccb12160
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/java_doc.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+# This program pulls the function names from wiredtiger.in and generates
+# an input file for Java SWIG that adds doxygen copydoc comments to functions.
+
+import os, re, sys
+import api_data
+from dist import compare_srcfile
+
+# Temporary file.
+tmp_file = '__tmp'
+
+#####################################################################
+# Build java_doc.i from wiredtiger.in with doxygen copydoc entries
+#####################################################################
+f='../src/include/wiredtiger.in'
+o='../lang/java/java_doc.i'
+tfile = open(tmp_file, 'w')
+
+tfile.write('''/* DO NOT EDIT: automatically built by dist/java_doc.py. */
+
+''')
+
+cclass_re = re.compile('^struct __([a-z_]*) {')
+cfunc_re = re.compile('\t.*? __F\(([a-z_]*)\)')
+
+curr_class = ""
+for line in open(f, 'r'):
+
+ m = cclass_re.match(line)
+ if m:
+ curr_class = m.group(1)
+
+ if curr_class == "":
+ continue
+
+ m = cfunc_re.match(line)
+ if m:
+ tfile.write('COPYDOC(__' + curr_class.lower() + ', ' +
+ curr_class.upper() + ', ' + m.group(1) + ')\n')
+
+tfile.close()
+compare_srcfile(tmp_file, o)
+
diff --git a/src/third_party/wiredtiger/dist/log.py b/src/third_party/wiredtiger/dist/log.py
new file mode 100644
index 00000000000..2f8fbea5294
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/log.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python
+
+import os, re, sys, textwrap
+from dist import compare_srcfile
+import log_data
+
+# Temporary file.
+tmp_file = '__tmp'
+
+# Map log record field types to:
+# (C type, pack type, printf format, printf arg(s))
+field_types = {
+ 'string' : ('const char *', 'S', '%s', 'arg'),
+ 'item' : ('WT_ITEM *', 'u', '%.*s',
+ '(int)arg.size, (const char *)arg.data'),
+ 'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg'),
+ 'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg'),
+ 'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg'),
+}
+
+def cintype(f):
+ return field_types[f[0]][0]
+
+def couttype(f):
+ type = cintype(f)
+ # We already have a pointer to a WT_ITEM
+ if f[0] == 'item':
+ return type
+ if type[-1] != '*':
+ type += ' '
+ return type + '*'
+
+def clocaltype(f):
+ type = cintype(f)
+ # Allocate a WT_ITEM struct on the stack
+ if f[0] == 'item':
+ return type[:-2]
+ return type
+
+def pack_fmt(fields):
+ return ''.join(field_types[f[0]][1] for f in fields)
+
+def op_pack_fmt(r):
+ return 'II' + pack_fmt(r.fields)
+
+def rec_pack_fmt(r):
+ return 'I' + pack_fmt(r.fields)
+
+def printf_fmt(f):
+ return field_types[f[0]][2]
+
+def printf_arg(f):
+ arg = field_types[f[0]][3].replace('arg', f[1])
+ return '\n\t ' + arg if f[0] == 'item' else ' ' + arg
+
+#####################################################################
+# Update wiredtiger.in with #defines for log record and operation types
+#####################################################################
+log_defines = (
+ ''.join('/*! %s */\n#define\t%s\t%d\n' % (r.desc, r.macro_name(), i)
+ for i, r in enumerate(log_data.rectypes)) +
+ ''.join('/*! %s */\n#define\t%s\t%d\n' % (r.desc, r.macro_name(), i)
+ for i, r in enumerate(log_data.optypes,start=1))
+)
+
+tfile = open(tmp_file, 'w')
+skip = 0
+for line in open('../src/include/wiredtiger.in', 'r'):
+ if skip:
+ if 'Log record declarations: END' in line:
+ tfile.write('/*\n' + line)
+ skip = 0
+ else:
+ tfile.write(line)
+ if 'Log record declarations: BEGIN' in line:
+ skip = 1
+ tfile.write(' */\n')
+ tfile.write('/*! invalid operation */\n')
+ tfile.write('#define\tWT_LOGOP_INVALID\t0\n')
+ tfile.write(log_defines)
+tfile.close()
+compare_srcfile(tmp_file, '../src/include/wiredtiger.in')
+
+#####################################################################
+# Create log_auto.c with handlers for each record / operation type.
+#####################################################################
+f='../src/log/log_auto.c'
+tfile = open(tmp_file, 'w')
+
+tfile.write('/* DO NOT EDIT: automatically built by dist/log.py. */\n')
+
+tfile.write('''
+#include "wt_internal.h"
+
+int
+__wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp)
+{
+ WT_ITEM *logrec;
+
+ WT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec));
+ WT_CLEAR(*(WT_LOG_RECORD *)logrec->data);
+ logrec->size = offsetof(WT_LOG_RECORD, record);
+
+ *logrecp = logrec;
+ return (0);
+}
+
+void
+__wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp)
+{
+ WT_UNUSED(session);
+ __wt_scr_free(logrecp);
+}
+
+int
+__wt_logrec_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, uint32_t *rectypep)
+{
+ uint64_t rectype;
+
+ WT_UNUSED(session);
+ WT_RET(__wt_vunpack_uint(pp, WT_PTRDIFF(end, *pp), &rectype));
+ *rectypep = (uint32_t)rectype;
+ return (0);
+}
+
+int
+__wt_logop_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end,
+ uint32_t *optypep, uint32_t *opsizep)
+{
+ return (__wt_struct_unpack(
+ session, *pp, WT_PTRDIFF(end, *pp), "II", optypep, opsizep));
+}
+''')
+
+# Emit code to read, write and print log operations (within a log record)
+for optype in log_data.optypes:
+ if not optype.fields:
+ continue
+
+ tfile.write('''
+int
+__wt_logop_%(name)s_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ %(arg_decls)s)
+{
+ const char *fmt = WT_UNCHECKED_STRING(%(fmt)s);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = %(macro)s;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0%(arg_names)s));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize%(arg_names)s));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+''' % {
+ 'name' : optype.name,
+ 'macro' : optype.macro_name(),
+ 'arg_decls' : ', '.join(
+ '%s%s%s' % (cintype(f), '' if cintype(f)[-1] == '*' else ' ', f[1])
+ for f in optype.fields),
+ 'arg_names' : ''.join(', %s' % f[1] for f in optype.fields),
+ 'fmt' : op_pack_fmt(optype)
+})
+
+ tfile.write('''
+int
+__wt_logop_%(name)s_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ %(arg_decls)s)
+{
+ const char *fmt = WT_UNCHECKED_STRING(%(fmt)s);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size%(arg_names)s));
+ WT_ASSERT(session, optype == %(macro)s);
+
+ *pp += size;
+ return (0);
+}
+''' % {
+ 'name' : optype.name,
+ 'macro' : optype.macro_name(),
+ 'arg_decls' : ', '.join(
+ '%s%sp' % (couttype(f), f[1]) for f in optype.fields),
+ 'arg_names' : ''.join(', %sp' % f[1] for f in optype.fields),
+ 'fmt' : op_pack_fmt(optype)
+})
+
+ tfile.write('''
+int
+__wt_logop_%(name)s_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ %(arg_decls)s
+
+ WT_RET(__wt_logop_%(name)s_unpack(
+ session, pp, end%(arg_addrs)s));
+
+ fprintf(out, " \\"optype\\": \\"%(name)s\\",\\n");
+ %(print_args)s
+ return (0);
+}
+''' % {
+ 'name' : optype.name,
+ 'arg_decls' : '\n\t'.join('%s%s%s;' %
+ (clocaltype(f), '' if clocaltype(f)[-1] == '*' else ' ', f[1])
+ for f in optype.fields),
+ 'arg_addrs' : ''.join(', &%s' % f[1] for f in optype.fields),
+ 'print_args' : '\n\t'.join(
+ 'fprintf(out, " \\"%s\\": \\"%s\\",\\n",%s);' %
+ (f[1], printf_fmt(f), printf_arg(f))
+ for f in optype.fields),
+})
+
+# Emit the printlog entry point
+tfile.write('''
+int
+__wt_txn_op_printlog(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t optype, opsize;
+
+ /* Peek at the size and the type. */
+ WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize));
+ end = *pp + opsize;
+
+ switch (optype) {''')
+
+for optype in log_data.optypes:
+ if not optype.fields:
+ continue
+
+ tfile.write('''
+ case %(macro)s:
+ WT_RET(%(print_func)s(session, pp, end, out));
+ break;
+''' % {
+ 'macro' : optype.macro_name(),
+ 'print_func' : '__wt_logop_' + optype.name + '_print',
+})
+
+tfile.write('''
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+''')
+
+tfile.close()
+compare_srcfile(tmp_file, f)
diff --git a/src/third_party/wiredtiger/dist/log_data.py b/src/third_party/wiredtiger/dist/log_data.py
new file mode 100644
index 00000000000..f46e9e80dda
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/log_data.py
@@ -0,0 +1,63 @@
+# Data for log.py, describes the format of log records
+
+# There are a small number of main log record types.
+#
+# Some log record types, such as transaction commit, also include a list of
+# "log operations" within the same log record. Both log record types and log
+# operations are described here.
+
+class LogRecordType:
+ def __init__(self, name, desc, fields):
+ self.name = name
+ self.desc = desc
+ self.fields = fields
+
+ def macro_name(self):
+ return 'WT_LOGREC_%s' % self.name.upper()
+
+ def prname(self):
+ return '__logrec_print_' + self.name
+
+rectypes = [
+ # A database-wide checkpoint.
+ LogRecordType('checkpoint', 'checkpoint', [
+ ('WT_LSN', 'ckpt_lsn'), ('uint32', 'nsnapshot'), ('item', 'snapshot')]),
+
+ # Common case: a transaction commit
+ LogRecordType('commit', 'transaction commit', [('uint64', 'txnid')]),
+
+ # Mark the start / end of a file sync operation (usually when a file is
+ # closed). These log records aren't required during recovery, but we use
+ # the allocated LSN to reduce the amount of work recovery has to do, and
+ # they are useful for debugging recovery.
+ LogRecordType('file_sync', 'file sync', [
+ ('uint32', 'fileid'), ('int', 'start')]),
+
+ # Debugging message in the log
+ LogRecordType('message', 'message', [('string', 'message')]),
+]
+
+class LogOperationType:
+ def __init__(self, name, desc, fields):
+ self.name = name
+ self.desc = desc
+ self.fields = fields
+
+ def macro_name(self):
+ return 'WT_LOGOP_%s' % self.name.upper()
+
+optypes = [
+ LogOperationType('col_put', 'column put',
+ [('uint32', 'fileid'), ('recno', 'recno'), ('item', 'value')]),
+ LogOperationType('col_remove', 'column remove',
+ [('uint32', 'fileid'), ('recno', 'recno')]),
+ LogOperationType('col_truncate', 'column truncate',
+ [('uint32', 'fileid'), ('recno', 'start'), ('recno', 'stop')]),
+ LogOperationType('row_put', 'row put',
+ [('uint32', 'fileid'), ('item', 'key'), ('item', 'value')]),
+ LogOperationType('row_remove', 'row remove',
+ [('uint32', 'fileid'), ('item', 'key')]),
+ LogOperationType('row_truncate', 'row truncate',
+ [('uint32', 'fileid'), ('item', 'start'), ('item', 'stop'),
+ ('uint32', 'mode')]),
+]
diff --git a/src/third_party/wiredtiger/dist/package/debian/README.Debian b/src/third_party/wiredtiger/dist/package/debian/README.Debian
new file mode 100644
index 00000000000..2028c0740dc
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/README.Debian
@@ -0,0 +1,8 @@
+wiredtiger for Debian
+---------------------
+
+This is a package of the WiredTiger database library for Debian-based
+systems. For more information on WiredTiger, please visit
+http://www.wiredtiger.com or contact us at info@wiredtiger.com
+
+ -- Alex <alexg@wiredtiger.com> Tue, 01 Apr 2014 15:50:02 +1100
diff --git a/src/third_party/wiredtiger/dist/package/debian/README.source b/src/third_party/wiredtiger/dist/package/debian/README.source
new file mode 100644
index 00000000000..ddd6dc94c3d
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/README.source
@@ -0,0 +1,9 @@
+wiredtiger for Debian
+---------------------
+
+<this file describes information about the source package, see Debian policy
+manual section 4.14. You WILL either need to modify or delete this file>
+
+
+
+
diff --git a/src/third_party/wiredtiger/dist/package/debian/changelog b/src/third_party/wiredtiger/dist/package/debian/changelog
new file mode 100644
index 00000000000..1481a506d6d
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/changelog
@@ -0,0 +1,5 @@
+wiredtiger (2.1.2-1) UNRELEASED; urgency=low
+
+ * Initial release of WiredTiger
+
+ -- Alex <alexg@wiredtiger.com> Tue, 01 Apr 2014 15:50:02 +1100
diff --git a/src/third_party/wiredtiger/dist/package/debian/compat b/src/third_party/wiredtiger/dist/package/debian/compat
new file mode 100644
index 00000000000..45a4fb75db8
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/compat
@@ -0,0 +1 @@
+8
diff --git a/src/third_party/wiredtiger/dist/package/debian/control b/src/third_party/wiredtiger/dist/package/debian/control
new file mode 100644
index 00000000000..5ad2b71b4b9
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/control
@@ -0,0 +1,36 @@
+Source: wiredtiger
+Priority: extra
+Maintainer: Alex Gorrod <alexg@wiredtiger.com>
+Build-Depends: debhelper (>= 8.0.0), autotools-dev
+Standards-Version: 3.9.4
+Section: libs
+Homepage: http://www.wiredtiger.com
+#Vcs-Git: git://git.debian.org/collab-maint/wiredtiger.git
+#Vcs-Browser: http://git.debian.org/?p=collab-maint/wiredtiger.git;a=summary
+
+Package: libwiredtiger-dev
+Architecture: any
+Section: libdevel
+Priority: extra
+Depends: ${misc:Depends}
+Description: WiredTiger Database Libraries [development]
+ This is the development package which contains headers and static
+ libraries for the WiredTiger database library.
+
+Package: libwiredtiger
+Architecture: any
+Depends: ${shlibs:Depends},
+ ${misc:Depends}
+Description: WiredTiger Database Libraries [runtime]
+ This is the runtime package for programs that use the WiredTiger
+ database library.
+
+Package: wiredtiger-util
+Architecture: any
+Section: database
+Priority: extra
+Depends: ${shlibs:Depends},
+ ${misc:Depends}
+Description: WiredTiger Database Utilities
+ This package provides tools for manipulating WiredTiger databases.
+
diff --git a/src/third_party/wiredtiger/dist/package/debian/copyright b/src/third_party/wiredtiger/dist/package/debian/copyright
new file mode 100644
index 00000000000..1394ad8b4bd
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/copyright
@@ -0,0 +1,26 @@
+Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: wiredtiger
+Source: <http://www.wiredtiger.com>
+
+Files: *
+Copyright: (c) 2008-2014 WiredTiger, Inc.
+License:
+ This program is free software: you can redistribute it and/or modify it under
+ the terms of either version 2 or version 3 of the GNU General Public License
+ as published by the Free Software Foundation.
+ .
+ On Debian GNU/Linux systems, the complete text of the GNU General
+ Public License can be found in `/usr/share/common-licenses/GPL-2' and
+ `/usr/share/common-licenses/GPL-3'.
+ .
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ details.
+ .
+ For a license to use the WiredTiger software under conditions other than those
+ described by the GNU General Public License, or for technical support for this
+ software, contact WiredTiger, Inc. at info@wiredtiger.com.
+ .
+ For further information, see the licensing section in the documentation.
+
diff --git a/src/third_party/wiredtiger/dist/package/debian/docs b/src/third_party/wiredtiger/dist/package/debian/docs
new file mode 100644
index 00000000000..50bd824bb7b
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/docs
@@ -0,0 +1,2 @@
+NEWS
+README
diff --git a/src/third_party/wiredtiger/dist/package/debian/files b/src/third_party/wiredtiger/dist/package/debian/files
new file mode 100644
index 00000000000..53662d0c48e
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/files
@@ -0,0 +1,3 @@
+libwiredtiger-dev_2.1.2-1_amd64.deb libdevel extra
+libwiredtiger_2.1.2-1_amd64.deb libs extra
+wiredtiger-util_2.1.2-1_amd64.deb database extra
diff --git a/src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.dirs b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.dirs
new file mode 100644
index 00000000000..da07fddd09b
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.dirs
@@ -0,0 +1,2 @@
+usr/include
+usr/lib
diff --git a/src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.install b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.install
new file mode 100644
index 00000000000..deb99408b27
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.install
@@ -0,0 +1,2 @@
+usr/include/*
+usr/lib/pkgconfig/*
diff --git a/src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.substvars b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.substvars
new file mode 100644
index 00000000000..abd3ebebc30
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger-dev.substvars
@@ -0,0 +1 @@
+misc:Depends=
diff --git a/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.dirs b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.dirs
new file mode 100644
index 00000000000..68457717bd8
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.dirs
@@ -0,0 +1 @@
+usr/lib
diff --git a/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.install b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.install
new file mode 100644
index 00000000000..27fae7a0850
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.install
@@ -0,0 +1,2 @@
+usr/lib/libwiredtiger*.a
+usr/lib/libwiredtiger*.so
diff --git a/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.postinst.debhelper b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.postinst.debhelper
new file mode 100644
index 00000000000..3d89d3ef629
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.postinst.debhelper
@@ -0,0 +1,5 @@
+# Automatically added by dh_makeshlibs
+if [ "$1" = "configure" ]; then
+ ldconfig
+fi
+# End automatically added section
diff --git a/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.postrm.debhelper b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.postrm.debhelper
new file mode 100644
index 00000000000..7f44047270f
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.postrm.debhelper
@@ -0,0 +1,5 @@
+# Automatically added by dh_makeshlibs
+if [ "$1" = "remove" ]; then
+ ldconfig
+fi
+# End automatically added section
diff --git a/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.substvars b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.substvars
new file mode 100644
index 00000000000..1e00e6fd7a6
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/libwiredtiger.substvars
@@ -0,0 +1,2 @@
+shlibs:Depends=libc6 (>= 2.14)
+misc:Depends=
diff --git a/src/third_party/wiredtiger/dist/package/debian/rules b/src/third_party/wiredtiger/dist/package/debian/rules
new file mode 100755
index 00000000000..312e24d2e6f
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/rules
@@ -0,0 +1,13 @@
+#!/usr/bin/make -f
+# -*- makefile -*-
+# Sample debian/rules that uses debhelper.
+# This file was originally written by Joey Hess and Craig Small.
+# As a special exception, when this file is copied by dh-make into a
+# dh-make output file, you may use that output file without restriction.
+# This special exception was added by Craig Small in version 0.37 of dh-make.
+
+# Uncomment this to turn on verbose mode.
+#export DH_VERBOSE=1
+
+%:
+ dh $@ --with autotools-dev
diff --git a/src/third_party/wiredtiger/dist/package/debian/shlibs.local b/src/third_party/wiredtiger/dist/package/debian/shlibs.local
new file mode 100644
index 00000000000..a3b3face389
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/shlibs.local
@@ -0,0 +1 @@
+libwiredtiger 2.1.2 wiredtiger (>> 2.1.2-0), wiredtiger (<< 2.1.2-99)
diff --git a/src/third_party/wiredtiger/dist/package/debian/source/format b/src/third_party/wiredtiger/dist/package/debian/source/format
new file mode 100644
index 00000000000..163aaf8d82b
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
diff --git a/src/third_party/wiredtiger/dist/package/debian/watch b/src/third_party/wiredtiger/dist/package/debian/watch
new file mode 100644
index 00000000000..9287dca3d91
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/watch
@@ -0,0 +1,8 @@
+# Watch control file for uscan
+# to check for upstream updates and more.
+# See uscan(1) for format
+
+# Compulsory line, this is a version 3 file
+version=3
+
+http://source.wiredtiger.com/releases/wiredtiger-(.*)\.tar\.bz2
diff --git a/src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.dirs b/src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.dirs
new file mode 100644
index 00000000000..e7724817552
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.dirs
@@ -0,0 +1 @@
+usr/bin
diff --git a/src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.install b/src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.install
new file mode 100644
index 00000000000..1df36c612fb
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.install
@@ -0,0 +1 @@
+usr/bin/*
diff --git a/src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.substvars b/src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.substvars
new file mode 100644
index 00000000000..4dd9c7cf955
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/wiredtiger-util.substvars
@@ -0,0 +1,2 @@
+shlibs:Depends=libc6 (>= 2.14), wiredtiger (>> 2.1.2-0), wiredtiger (<< 2.1.2-99)
+misc:Depends=
diff --git a/src/third_party/wiredtiger/dist/package/debian/wiredtiger.doc-base b/src/third_party/wiredtiger/dist/package/debian/wiredtiger.doc-base
new file mode 100644
index 00000000000..faa994f156f
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/debian/wiredtiger.doc-base
@@ -0,0 +1,12 @@
+Document: wiredtiger
+Title: Debian wiredtiger Manual
+Author: WiredTiger, Inc.
+Abstract: WiredTiger is a database storage engine library.
+Section: library
+
+Format: postscript
+Files: /usr/share/doc/wiredtiger/wiredtiger.ps.gz
+
+Format: HTML
+Index: /usr/share/doc/wiredtiger/html/index.html
+Files: /usr/share/doc/wiredtiger/html/*.html
diff --git a/src/third_party/wiredtiger/dist/package/wiredtiger.spec b/src/third_party/wiredtiger/dist/package/wiredtiger.spec
new file mode 100644
index 00000000000..4925672a3ba
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/package/wiredtiger.spec
@@ -0,0 +1,58 @@
+Name: wiredtiger
+Version: 2.4.1
+Release: 1%{?dist}
+Summary: WiredTiger data storage engine
+
+Group: Development/Libraries
+License: GPLV2 or GPLV3
+URL: www.wiredtiger.com
+Source0: http://source.wiredtiger.com/releases/%{name}-%{version}.tar.bz2
+BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
+
+BuildRequires: python-devel java-devel
+Requires: jemalloc
+
+%description
+
+WiredTiger is a data storage engine that provides APIs for efficiently
+storing data in highly concurrent applications. It includes functionality
+for automatically maintaining indexes. It implements both row and column
+store formats - so that all types of data can be stored space efficiently.
+
+WiredTiger is a library that can be accessed via C, Python and Java APIs.
+
+
+%prep
+%autosetup
+
+
+%build
+%configure --enable-java --enable-bzip2 --enable-snappy --enable-zlib
+# Stop the build setting up an rpath
+sed -i 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' libtool
+sed -i 's|^runpath_var=LD_RUN_PATH|runpath_var=DIE_RPATH_DIE|g' libtool
+make %{?_smp_mflags}
+
+
+%install
+rm -rf %{buildroot}
+make install DESTDIR=%{buildroot}
+# Need to resolve make install with --enable-python before we can
+# install the python API.
+# python setup.py install -O1 --skip-build --root $RPM_BUILD_ROOT
+
+%clean
+rm -rf %{buildroot}
+
+
+%files
+%defattr(-,root,root,-)
+%doc README LICENSE NEWS
+%{_bindir}/*
+%{_datadir}/*
+%{_includedir}/*
+%{_libdir}/*
+
+
+%changelog
+
diff --git a/src/third_party/wiredtiger/dist/s_all b/src/third_party/wiredtiger/dist/s_all
new file mode 100644
index 00000000000..1b171bdeafd
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_all
@@ -0,0 +1,84 @@
+#! /bin/sh
+
+# Run standard scripts.
+t=__wt.$$
+trap 'rm -f $t *.pyc __tmp __wt.*' 0 1 2 3 13 15
+
+# We require python which may not be installed.
+type python > /dev/null 2>&1 || {
+ echo 's_all: python not found'
+ exit 1
+}
+
+run()    # $1: command line to run, $2: description printed as progress
+{
+    printf 'WiredTiger: %s...' "$2"
+    $1 > $t
+    # Show "skipped" notices inline, indent any other output, else say OK.
+    if grep 'skipped' $t > /dev/null 2>&1; then
+        printf " " && cat $t
+    elif test -s $t; then
+        echo
+        sed -e 's/^/ /' $t
+    else
+        echo ' OK'
+    fi
+    rm -f $t
+    return 0
+}
+
+echo 'dist/s_all run started...'
+
+force=
+reconf=0
+while :
+ do case "$1" in
+ -A) # Reconfigure the library build.
+ reconf=1
+ shift;;
+ -f) # Force versions to be updated
+ force="-f"
+ shift;;
+ *)
+ break;;
+ esac
+done
+
+run "sh ./s_version $force" "Updating files that include the package version"
+
+test "$reconf" -eq 0 || {
+ (cd ../build_posix &&
+ run "sh ./reconf" "Rebuilding GNU tools library support")
+}
+
+run "python api_config.py" "building WiredTiger API"
+run "python api_err.py" "building WiredTiger error returns"
+
+run "python flags.py" "building flags"
+run "python log.py" "building logging layer"
+run "python serial.py" "building serial function support"
+run "python stat.py" "building statistics support"
+run "python java_doc.py" "building Java documentation index"
+
+run "sh ./s_typedef -b" "building standard typedefs"
+run "sh ./s_prototypes" "building function prototypes"
+run "sh ./s_readme $force" "building README file"
+run "sh ./s_tags" "building tags files"
+
+run "sh ./s_copyright" "checking copyright notices"
+run "sh ./s_define" "checking for unused #defines"
+run "sh ./s_funcs" "checking for unused functions"
+run "sh ./s_getopt" "checking for incorrect getopt usage"
+run "sh ./s_longlines" "checking for long lines"
+run "sh ./s_stat" "checking for unused statistics fields"
+run "sh ./s_string" "checking string spelling"
+run "python style.py" "checking style (pass 1)"
+run "sh ./s_style" "checking style (pass 2)"
+run "sh ./s_symbols" "checking external symbol names"
+run "sh ./s_typedef -c" "checking for unused typedefs"
+run "sh ./s_whitespace" "checking whitespace"
+run "sh ./s_win" "checking windows config"
+
+run "sh ./s_docs" "generating documentation"
+
+echo 'dist/s_all run finished'
diff --git a/src/third_party/wiredtiger/dist/s_copyright b/src/third_party/wiredtiger/dist/s_copyright
new file mode 100755
index 00000000000..893b2451f2c
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_copyright
@@ -0,0 +1,101 @@
+#! /bin/sh
+
+# Check the copyrights.
+
+c1=__wt.1$$
+c2=__wt.2$$
+c3=__wt.3$$
+c4=__wt.4$$
+trap 'rm -f $c1 $c2 $c3 $c4; exit 0' 0 1 2 3 13 15
+
+year=`date +%Y`
+
+cat > $c1 <<ENDOFTEXT
+ * Copyright (c) 2008-$year WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ENDOFTEXT
+
+# Copyright for files WiredTiger does not own.
+cat > $c2 <<ENDOFTEXT
+ * Public Domain 2008-$year WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ENDOFTEXT
+
+cat > $c3 <<ENDOFTEXT
+# Copyright (c) 2008-$year WiredTiger, Inc.
+# All rights reserved.
+#
+# See the file LICENSE for redistribution information.
+ENDOFTEXT
+
+cat > $c4 <<ENDOFTEXT
+# Public Domain 2008-$year WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+ENDOFTEXT
+
+check()    # $1: file to check, as a path relative to the top of the tree
+{
+    # Skip files in which WiredTiger holds no rights.
+    if egrep "skip $1" s_copyright.list > /dev/null; then
+        return;
+    fi
+
+    # It's okay if the file doesn't exist: we may be running in a release
+    # tree with some files removed.  Test the argument, not the caller's $i.
+    test -f ../$1 || return
+
+    # Accept any known header: $c1/$c2 (" * " style) or $c3/$c4 ("#" style).
+    if sed -e 2,4p -e 5q -e d ../$1 | diff - $c1 > /dev/null; then
+        return;
+    fi
+    if sed -e 2,3p -e 4q -e d ../$1 | diff - $c2 > /dev/null; then
+        return;
+    fi
+    if sed -e 3,5p -e 6q -e d ../$1 | diff - $c3 > /dev/null; then
+        return;
+    fi
+    if sed -e 3,4p -e 5q -e d ../$1 | diff - $c4 > /dev/null; then
+        return;
+    fi
+    if sed -e 1,2p -e 3q -e d ../$1 | diff - $c4 > /dev/null; then
+        return;
+    fi
+
+    echo "$1: copyright information is incorrect"
+}
+
+# Search for files, skipping some well-known 3rd party directories.
+for i in `cd .. &&
+ find [a-z]* -name '*.[chi]' \
+ -o -name '*.cxx' -o -name '*.in' -o -name '*.java' -o -name '*.py' |
+ sed -e '/Makefile.in/d' \
+ -e '/^build_posix\//d' \
+ -e '/api\/leveldb\/basho\//d' \
+ -e '/api\/leveldb\/hyperleveldb\//d' \
+ -e '/api\/leveldb\/leveldb\//d' \
+ -e '/api\/leveldb\/rocksdb\//d' \
+ -e '/test\/3rdparty\//d' \
+ -e 's/^\.\///'`
+do
+ check $i
+done
+
+# A few special cases: LICENSE, documentation, wt utility, some of which
+# have more than one copyright notice in the file.
+s="Copyright (c) 2008-$year WiredTiger, Inc."
+special_copyright()
+{
+ cnt=`grep "$s" ../$1 | wc -l`
+ if test $cnt -ne $2; then
+ echo "$1: copyright information is incorrect"
+ fi
+}
+
+special_copyright LICENSE 1
+special_copyright src/docs/build-javadoc.sh 1
+special_copyright src/docs/style/footer.html 2
+special_copyright src/utilities/util_cpyright.c 2
diff --git a/src/third_party/wiredtiger/dist/s_copyright.list b/src/third_party/wiredtiger/dist/s_copyright.list
new file mode 100644
index 00000000000..ed1d8a655f5
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_copyright.list
@@ -0,0 +1,41 @@
+skip api/leveldb/leveldb_wt_config.in
+skip build_win/wiredtiger_config.h
+skip dist/api_config.py
+skip dist/api_data.py
+skip dist/api_err.py
+skip dist/db.py
+skip dist/dist.py
+skip dist/flags.py
+skip dist/java_doc.py
+skip dist/log.py
+skip dist/log_data.py
+skip dist/serial.py
+skip dist/stat.py
+skip dist/stat_data.py
+skip dist/style.py
+skip lang/java/java_doc.i
+skip lang/java/src/com/wiredtiger/db/AsyncOp.java
+skip lang/java/src/com/wiredtiger/db/AsyncOpType.java
+skip lang/java/src/com/wiredtiger/db/Connection.java
+skip lang/java/src/com/wiredtiger/db/Cursor.java
+skip lang/java/src/com/wiredtiger/db/SearchStatus.java
+skip lang/java/src/com/wiredtiger/db/Session.java
+skip lang/java/src/com/wiredtiger/db/wiredtiger.java
+skip lang/java/src/com/wiredtiger/db/wiredtigerConstants.java
+skip lang/java/src/com/wiredtiger/db/wiredtigerJNI.java
+skip lang/java/wiredtiger_wrap.c
+skip lang/python/wiredtiger/__init__.py
+skip lang/python/wiredtiger_wrap.c
+skip src/config/config_def.c
+skip src/conn/api_strerror.c
+skip src/docs/tools/doxypy.py
+skip src/include/extern.h
+skip src/include/flags.h
+skip src/include/queue.h
+skip src/log/log_auto.c
+skip src/support/stat.c
+skip test/packing/intpack-test.c
+skip test/packing/intpack-test2.c
+skip test/packing/packing-test.c
+skip tools/stat_data.py
+skip wiredtiger_config.h
diff --git a/src/third_party/wiredtiger/dist/s_define b/src/third_party/wiredtiger/dist/s_define
new file mode 100644
index 00000000000..7809bf14918
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_define
@@ -0,0 +1,34 @@
+#! /bin/sh
+
+# Complain about unused #defines.
+t=__wt.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+
+# List of files to search.
+l=`sed -e 's,#.*,,' -e '/^$/d' -e 's,^,../,' filelist`
+l="$l `echo ../src/include/*.i ../src/utilities/*.c ../test/*/*.c`"
+
+(
+# Copy out the list of #defines we don't use, but it's OK.
+sed -e '/^$/d' -e '/^#/d' < s_define.list
+
+# Get the list of #defines.
+# Ignore the list of configuration objects
+# Ignore the list of statistic "keys" generated for applications.
+search=`cat ../src/include/*.[hi] ../src/include/*.in |
+ sed -e '/configuration section: BEGIN/,/configuration section: END/d' \
+ -e '/Statistics section: BEGIN/,/Statistics section: END/d' |
+ egrep '^#define' |
+ sed 's/#define[ ][ ]*\([A-Za-z_][A-Za-z0-9_]*\).*/\1/' |
+ sort -u`
+
+# Print the list of macros, followed by the occurrences: we're looking for
+# macros that only appear once.
+echo "$search"
+fgrep -who "$search" $l
+
+) | sort | uniq -u > $t
+
+test -s $t && cat $t
+
+exit 0
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list
new file mode 100644
index 00000000000..75f0f886915
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_define.list
@@ -0,0 +1,131 @@
+# List of WiredTiger #defines that are "unused", but it's OK.
+ALIGN_CHECK
+API_CALL
+API_CALL_NOCONF
+API_SESSION_INIT
+FLD_CLR
+F_CAS_ATOMIC
+F_CLR_ATOMIC
+F_SET_ATOMIC
+IS_INIT_LSN
+LF_CLR
+LF_SET
+LLONG_MAX
+LLONG_MIN
+MAX_LSN
+SIZE_CHECK
+TXNID_LE
+TXN_API_CALL
+TXN_API_CALL_NOCONF
+TXN_API_END
+WIN32_LEAN_AND_MEAN
+WT_ATOMIC_ADD1
+WT_ATOMIC_ADD2
+WT_ATOMIC_CAS1
+WT_ATOMIC_CAS2
+WT_ATOMIC_CAS_VAL1
+WT_ATOMIC_CAS_VAL2
+WT_ATOMIC_CAS_VAL4
+WT_ATOMIC_FETCH_ADD1
+WT_ATOMIC_FETCH_ADD2
+WT_ATOMIC_FETCH_ADD4
+WT_ATOMIC_STORE1
+WT_ATOMIC_STORE2
+WT_ATOMIC_SUB1
+WT_ATOMIC_SUB2
+WT_BARRIER
+WT_BLOCK_DESC_SIZE
+WT_CACHE_LINE_ALIGNMENT
+WT_DEADLOCK
+WT_DEBUG_BYTE
+WT_HANDLE_CLOSED
+WT_HANDLE_NULLABLE
+WT_READ_BARRIER
+WT_REF_SIZE
+WT_SPINLOCK_MAX
+WT_STAT_ATOMIC_DECR
+WT_STAT_ATOMIC_DECRV
+WT_STAT_ATOMIC_INCR
+WT_STAT_ATOMIC_INCRV
+WT_STAT_DECRV
+WT_STAT_FAST_ATOMIC_DECR
+WT_STAT_FAST_ATOMIC_DECRV
+WT_STAT_FAST_ATOMIC_INCR
+WT_STAT_FAST_ATOMIC_INCRV
+WT_STAT_FAST_CONN_ATOMIC_DECRV
+WT_STAT_FAST_CONN_ATOMIC_INCRV
+WT_STAT_FAST_CONN_DECRV
+WT_STAT_FAST_DATA_DECRV
+WT_STAT_FAST_DECR
+WT_STAT_FAST_DECRV
+WT_STAT_FAST_INCRV
+WT_STAT_FAST_SET
+__F
+__WIREDTIGER_EXT_H_
+__WIREDTIGER_H_
+__WT_ATOMIC_ADD
+__WT_ATOMIC_CAS
+__WT_ATOMIC_CAS_VAL
+__WT_ATOMIC_FETCH_ADD
+__WT_ATOMIC_STORE
+__WT_ATOMIC_SUB
+
+# List of queue.h #defines that are "unused", but it's OK.
+LIST_EMPTY
+LIST_ENTRY
+LIST_FIRST
+LIST_FOREACH
+LIST_HEAD
+LIST_HEAD_INITIALIZER
+LIST_INIT
+LIST_INSERT_AFTER
+LIST_INSERT_BEFORE
+LIST_INSERT_HEAD
+LIST_NEXT
+LIST_REMOVE
+QMD_TRACE_ELEM
+QMD_TRACE_HEAD
+QUEUE_MACRO_DEBUG
+SLIST_EMPTY
+SLIST_ENTRY
+SLIST_FIRST
+SLIST_FOREACH
+SLIST_FOREACH_PREVPTR
+SLIST_HEAD
+SLIST_HEAD_INITIALIZER
+SLIST_INIT
+SLIST_INSERT_AFTER
+SLIST_INSERT_HEAD
+SLIST_NEXT
+SLIST_REMOVE
+SLIST_REMOVE_HEAD
+STAILQ_CONCAT
+STAILQ_EMPTY
+STAILQ_ENTRY
+STAILQ_FIRST
+STAILQ_FOREACH
+STAILQ_HEAD
+STAILQ_HEAD_INITIALIZER
+STAILQ_INIT
+STAILQ_INSERT_AFTER
+STAILQ_INSERT_HEAD
+STAILQ_INSERT_TAIL
+STAILQ_LAST
+STAILQ_NEXT
+STAILQ_REMOVE
+STAILQ_REMOVE_HEAD
+STAILQ_REMOVE_HEAD_UNTIL
+TAILQ_CONCAT
+TAILQ_EMPTY
+TAILQ_ENTRY
+TAILQ_FOREACH_REVERSE
+TAILQ_HEAD
+TAILQ_HEAD_INITIALIZER
+TAILQ_INSERT_AFTER
+TAILQ_INSERT_BEFORE
+TAILQ_LAST
+TAILQ_NEXT
+TAILQ_PREV
+TRACEBUF
+TRASHIT
+_DB_QUEUE_H_
diff --git a/src/third_party/wiredtiger/dist/s_docs b/src/third_party/wiredtiger/dist/s_docs
new file mode 100755
index 00000000000..cf5f3962c19
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_docs
@@ -0,0 +1,216 @@
+#! /bin/sh
+# The EXIT trap only cleans up: an "exit" there would override "exit $e".
+t=__wt.$$
+trap 'rm -f $t /tmp/__doxy' 0; trap 'exit 1' 1 2 3 13 15
+
+# Skip this when building release packages: docs are built separately
+test -n "$WT_RELEASE_BUILD" && exit 0
+
+# We require doxygen which may not be installed.
+type doxygen > /dev/null 2>&1 || {
+ echo 'skipped: doxygen not found'
+ exit 0
+}
+
+. ../RELEASE_INFO
+
+e=0
+
+changelog()
+{
+ # Write the docs change log page: a title block followed by ../NEWS.
+ {
+ printf '%s\n' "WiredTiger Change Log" "=====================" ""
+ cat ../NEWS
+ } > ../src/docs/changelog.md
+}
+
+wtperf_config()
+{
+ # Compile doxy.c and use ed to splice its output into wtperf.dox; Linux
+ # ed writes line numbers to stderr, so discard both stdout and stderr.
+ cc -o /tmp/__doxy ../bench/wtperf/doxy.c &&
+ (echo '/START_AUTO_GENERATED_WTPERF_CONFIGURATION/+3,/STOP_AUTO_GENERATED_WTPERF_CONFIGURATION/-1d'
+ echo 'i'
+ echo ''
+ echo '.'
+ echo ".r !/tmp/__doxy"
+ echo 'a'
+ echo ''
+ echo '.'
+ echo 'w'
+ echo 'q') | ed ../src/docs/wtperf.dox 1>/dev/null 2>/dev/null &&
+ rm -f /tmp/__doxy
+}
+
+structurechk()
+{
+ # @page names should match the source file name ('-' is treated as '_')
+ (cd ../src/docs &&
+ grep @page *.dox |
+ sed 's/\([^:]*\)\.dox:.*@page \([^ ]*\) .*/\1 \2/g' |
+ sed 's/-/_/g' | awk '{ if ($1 != $2) { print $1 " != " $2; } }') > $t
+ test -s $t && {
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ echo "@page references don't match source file names"
+ sed -e 's/^/ /' < $t
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ e=1
+ }
+
+ # sections are a global name space for doxygen, and must be uniquely
+ # named or you can get the wrong results. For example, "@section foo ABC"
+ # and "@section foo DEF" will both appear as "ABC" or "DEF". Only the
+ # section name is compared, so differing titles don't hide a duplicate.
+ (cd ../src/docs &&
+ sed -n 's/.*@section \([^ ]*\).*/\1/p' *.dox | sort | uniq -d) > $t
+ test -s $t && {
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ echo '@section references that are not uniquely named'
+ sed -e 's/^/ /' < $t
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ e=1
+ }
+ # we want a simple tree structure for navigation, otherwise clicking in
+ # the navigation tree can jump to a different point in the tree; only
+ # the page name is compared, not the rest of the line
+ (cd ../src/docs &&
+ sed -n 's/.*@subpage \([^ ]*\).*/\1/p' *.dox | sort | uniq -d) > $t
+ test -s $t && {
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ echo 'multiple @subpage references for the same page'
+ sed -e 's/^/ /' < $t
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ e=1
+ }
+}
+
+spellchk()
+{
+ # If aspell has been installed, run a spell check; unknown words set e=1.
+ type aspell > /dev/null 2>&1 || return
+
+ (cd ../src/docs &&
+ cat *.dox | aspell --lang=en --personal=./spell.ok list) |
+ sort -u > $t
+ test -s $t && {
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ echo 'Documentation spelling notes'
+ echo 'Update src/docs/spell.ok to remove warnings.'
+ sed -e 's/^/ /' < $t
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ e=1
+ }
+}
+
+valid_build()
+{
+ # Complain (and set e) if there are pages we don't reference directly.
+ sed -n '/<table.*directory/,/\/table/p' < ../docs/pages.html | \
+ grep href > /dev/null && {
+ echo 'Unreferenced page: see docs/pages.html for the list.'
+ e=1
+ }
+ classf=`ls ../docs/struct___* 2>/dev/null` # reported below; e is not set
+ for c in $classf; do
+ echo "$c: Need to add class to PREDEFINED in src/docs/Doxyfile"
+ done
+}
+
+build()
+{
+ # Build from scratch on demand ($1 non-zero): remove and recreate ../docs.
+ [ "$1" -eq 0 ] || (cd .. && rm -rf docs && mkdir docs)
+
+ # Run doxygen to generate warnings for the base HTML documentation.
+ #
+ # We omit Python because warnings are expected there (the code generated
+ # by swig does not have named arguments, but we want to document them
+ # as if they do).
+ (cd ../src/docs &&
+ (eval cat Doxyfile $filter ; cat <<EOF
+QUIET=YES
+EOF
+) | doxygen -
+ test -s doxygen.log && cat doxygen.log) > $t 2>&1
+ test -s $t && {
+ cat $t
+ e=1
+ }
+
+ # Add optional extras; "../" is prepended as doxygen runs in ../src/docs
+ EXTRAS="../lang/java/src/com/wiredtiger/db ../lang/python/wiredtiger.py"
+ EXTRA_INPUT=""
+ for f in $EXTRAS ; do
+ [ -e "$f" ] && EXTRA_INPUT="$EXTRA_INPUT ../$f"
+ done
+
+ # Run again to generate the full doc set with Python and Java.
+ [ "$additional_languages" -eq 1 ] && [ "x$EXTRA_INPUT" != "x" ] && (
+ cd ../src/docs &&
+ (eval cat Doxyfile $filter ; cat <<EOF
+QUIET=YES
+INPUT+=$EXTRA_INPUT
+EOF
+) | doxygen -)
+
+ # Fix up bad links doxygen generates in navtree.js
+ (cd ../docs &&
+ sed -i~ -e 's,/\.html,/,' -e 's,\.html\.html,.html,' navtree.js &&
+ rm -f navtree.js~)
+
+ # Fixup the man pages generated by Doxygen. We want the command line
+ # documentation to be the main man page, but also install a man page
+ # for the WiredTiger header into the library section.
+ [ "$additional_languages" -eq 1 ] &&
+ (cd ../docs && mkdir -p man/man1 &&
+ mv man/man3/command_line.3 man/man1/wt.1 &&
+ sed -i~ -e 's/command_line/wt/g' man/man1/wt.1 &&
+ rm -f man/man1/wt.1~ &&
+ mv man/man3/basic_api.3 man/ && rm -f man/man3/* &&
+ mv man/basic_api.3 man/man3/wiredtiger.3 &&
+ sed -i~ -e 's/basic_api/WiredTiger/g' man/man3/wiredtiger.3 &&
+ rm -f man/man3/wiredtiger.3~)
+}
+
+clean=0
+additional_languages=1
+filter="|sed '/PROJECT_NUMBER/s,=.*,=\"Version $WIREDTIGER_VERSION\",'"
+while :
+ do case "$1" in
+ -a) # Build from scratch
+ clean=1
+ shift;;
+ -l) # Generate the top-level landing page in ../docs/top
+ filter="$filter| sed '/GENERATE_MAN/s,=.*,=NO,';"
+ filter="$filter cat top/Doxyfile"
+ additional_languages=0
+ shift;;
+ -p) # Generate PDFs
+ filter="$filter| sed '/GENERATE_LATEX/s,=.*,=YES,'"
+ shift;;
+ -t) # Include the TODO list
+ filter="$filter| sed '/GENERATE_TODOLIST/s,=.*,=YES,'"
+ shift;;
+ *)
+ break;;
+ esac
+done
+
+# Generate the change log
+changelog
+
+# Generate the list of wtperf configuration options.
+wtperf_config
+
+# Spell and structure check the documentation.
+spellchk
+structurechk
+
+# Build the documentation.
+build $clean
+
+# Any post-build validity checks we want to make.
+valid_build
+
+exit $e
diff --git a/src/third_party/wiredtiger/dist/s_funcs b/src/third_party/wiredtiger/dist/s_funcs
new file mode 100644
index 00000000000..3769ccc4aa7
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_funcs
@@ -0,0 +1,29 @@
+#! /bin/sh
+
+# Complain about unused functions: print names that appear exactly once.
+t=__wt.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+
+# List of files to search: filelist entries plus .i and utility sources.
+l=`sed -e 's,#.*,,' -e '/^$/d' -e 's,^,../,' filelist`
+l="$l `echo ../src/*/*.i ../src/utilities/*.c`"
+
+(
+# Copy out the functions we don't use, but it's OK.
+sed -e '/^$/d' -e '/^#/d' < s_funcs.list
+
+# Get the list of functions: identifiers that start a line, followed by "(".
+search=`egrep -h '^[a-zA-Z0-9_][a-zA-Z0-9_]*\(' $l | sed -e 's/(.*//' | sort -u`
+
+# Print the list of functions, followed by the occurrences: we're looking for
+# functions that only appear once
+echo "$search"
+sed -n '/{/,/^}/p' $l | fgrep -wo "$search"
+# Also count uses inside #define bodies, including continuation lines.
+sed -n '/^#define/,/[^\\]$/p' ../src/include/*.h ../src/include/*.in |
+ fgrep -who "$search"
+) | sort | uniq -u > $t
+
+test -s $t && cat $t
+
+exit 0
diff --git a/src/third_party/wiredtiger/dist/s_funcs.list b/src/third_party/wiredtiger/dist/s_funcs.list
new file mode 100644
index 00000000000..4bb9796c11f
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_funcs.list
@@ -0,0 +1,44 @@
+# List of functions that aren't found by s_funcs, but that's OK.
+WT_CURDUMP_PASS
+__bit_ffs
+__bit_nclr
+__ovfl_discard_dump
+__ovfl_reuse_dump
+__ovfl_txnc_dump
+__wt_bloom_drop
+__wt_bloom_get
+__wt_bulk_insert_fix
+__wt_bulk_insert_row
+__wt_bulk_insert_var
+__wt_cache_dump
+__wt_config_getone
+__wt_cursor_get_raw_value
+__wt_debug_addr
+__wt_debug_addr_print
+__wt_debug_offset
+__wt_debug_set_verbose
+__wt_debug_tree
+__wt_debug_tree_all
+__wt_debug_tree_shape
+__wt_fsync
+__wt_lex_compare
+__wt_lex_compare_skip
+__wt_log_read
+__wt_log_scan
+__wt_nlpo2
+__wt_nlpo2_round
+__wt_print_huffman_code
+__wt_try_readlock
+wiredtiger_config_parser_open
+wiredtiger_pack_int
+wiredtiger_pack_item
+wiredtiger_pack_str
+wiredtiger_pack_uint
+wiredtiger_struct_pack
+wiredtiger_struct_size
+wiredtiger_struct_unpack
+wiredtiger_unpack_int
+wiredtiger_unpack_item
+wiredtiger_unpack_start
+wiredtiger_unpack_str
+wiredtiger_unpack_uint
diff --git a/src/third_party/wiredtiger/dist/s_getopt b/src/third_party/wiredtiger/dist/s_getopt
new file mode 100644
index 00000000000..745de80503a
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_getopt
@@ -0,0 +1,16 @@
+#! /bin/sh
+# On a signal, exit (the EXIT trap then cleans up) rather than carrying on.
+t=__wt.$$
+trap 'rm -f $t' 0; trap 'exit 1' 1 2 3 13 15
+
+# Complain if someone uses the wrong getopt.
+find ../src ../test ../bench -name '*.c' | xargs egrep '[^a-z_]getopt\(' > $t
+
+test -s $t && {
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ echo 'Calls to the C library version of getopt.'
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ cat $t
+ exit 1
+}
+exit 0
diff --git a/src/third_party/wiredtiger/dist/s_longlines b/src/third_party/wiredtiger/dist/s_longlines
new file mode 100644
index 00000000000..15ca5603385
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_longlines
@@ -0,0 +1,17 @@
+#! /bin/sh
+
+# Check for long lines: print file:line for each line over 80 columns.
+t=__wt.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+# Files to check; include/extern.h and support/stat.c are skipped.
+l=`(cd .. &&
+ find bench/wtperf examples ext src test -name '*.[chisy]' &&
+ find dist -name '*.py' &&
+ find src -name '*.in') |
+ sed -e '/include\/extern\.h/d'\
+ -e '/support\/stat\.c/d'`
+# Tabs are expanded to 8 columns before the line length is measured.
+for f in $l ; do
+ expand -t8 < ../$f | awk -- \
+ "{if(length(\$0) > 80) printf(\"%s:%d\\n\", \"$f\", NR)}"
+done
diff --git a/src/third_party/wiredtiger/dist/s_prototypes b/src/third_party/wiredtiger/dist/s_prototypes
new file mode 100755
index 00000000000..f29b96a1f55
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_prototypes
@@ -0,0 +1,41 @@
+#! /bin/sh
+
+# Build a list of internal function and variable prototypes.
+t=__wt.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+
+(
+cat <<EOF
+/* DO NOT EDIT: automatically built by dist/s_prototypes. */
+
+EOF
+# Join each non-static __wt_ declaration, squeeze spaces, emit as extern.
+for i in `sed -e '/^[a-z]/!d' filelist`; do
+ sed -n \
+ -e '/^__wt_[a-z]/!{' \
+ -e h \
+ -e d \
+ -e '}' \
+ -e x \
+ -e '/^static/d' \
+ -e x \
+ -e ': loop' \
+ -e H \
+ -e n \
+ -e '/;/b end' \
+ -e '/^{/!b loop' \
+ -e ': end' \
+ -e x \
+ -e 's/ =.*$//' \
+ -e '/#/!s/\n/ /g' \
+ -e 's/\* /\*/g' \
+ -e 's/[ ][ ]*/ /g' \
+ -e 's/^/extern /' \
+ -e 's/WT_GCC_FUNC_/WT_GCC_/' \
+ -e 's/$/;/p' \
+ < ../$i
+done) > $t
+
+f=../src/include/extern.h
+cmp $t $f > /dev/null 2>&1 ||
+ (echo "Building $f" && rm -f $f && cp $t $f)
diff --git a/src/third_party/wiredtiger/dist/s_readme b/src/third_party/wiredtiger/dist/s_readme
new file mode 100644
index 00000000000..0a2cce16e41
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_readme
@@ -0,0 +1,54 @@
+#! /bin/sh
+# Rebuild ../README; version variables are presumably set by ../RELEASE_INFO.
+t=__wt.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+f=../README
+
+. ../RELEASE_INFO
+
+force=no
+while :
+ do case "$1" in
+ -f) # Force versions to be updated
+ force=yes
+ shift;;
+ *)
+ break;;
+ esac
+done
+
+# If the version hasn't changed and we aren't forcing the issue, we're done.
+# Don't generate a new README file just because the date changed unless forced:
+# that happens all the time.
+if test "$force" = no ; then
+ cnt=`(sed -e q < $f; echo "$WIREDTIGER_VERSION_STRING") |
+ sed -e 's/:.*//' | sort -u | wc -l`
+ test $cnt -eq 1 && exit 0
+fi
+
+cat << END_TEXT > $t
+$WIREDTIGER_VERSION_STRING
+
+This is version $WIREDTIGER_VERSION of WiredTiger.
+
+WiredTiger release packages and documentation can be found at:
+
+ http://source.wiredtiger.com/
+
+Information on configuring, building and installing WiredTiger can be
+found at:
+
+ http://source.wiredtiger.com/$WIREDTIGER_VERSION/install.html
+
+WiredTiger licensing information can be found at:
+
+ http://source.wiredtiger.com/license.html
+
+For general questions and discussion, please use the WiredTiger mailing
+list:
+
+ http://groups.google.com/group/wiredtiger-users
+END_TEXT
+
+cmp $t $f > /dev/null 2>&1 ||
+ (echo "Building $f" && rm -f $f && cp $t $f)
diff --git a/src/third_party/wiredtiger/dist/s_release b/src/third_party/wiredtiger/dist/s_release
new file mode 100755
index 00000000000..ec85341d0ff
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_release
@@ -0,0 +1,55 @@
+#!/bin/sh
+# Build a WiredTiger release package.
+
+set -e
+
+. ../RELEASE_INFO || exit 1
+
+RELEASE_DIR=`pwd`/../releases
+mkdir -p $RELEASE_DIR
+
+pkgver="$1"
+if test -z "$pkgver" ; then
+ pkgver="$WIREDTIGER_VERSION"
+fi
+PKG="wiredtiger-$pkgver"
+DEST="$RELEASE_DIR/$PKG"
+
+rm -rf $DEST ; mkdir -p $DEST
+EXCLUSIONS=`sed -e '/^#/d' -e 's/^/--exclude /' < s_release.list`
+
+if [ -d ../.hg ] ; then
+ echo "Running 'hg archive' to copy the tree"
+ (cd .. && hg archive $EXCLUSIONS $DEST)
+elif [ -d ../.git ] ; then
+ echo "Running 'git archive' to copy the tree"
+ (cd .. && git archive HEAD) | (cd $DEST && tar xf - $EXCLUSIONS)
+else
+ echo "$0 must be run in a Git or Mercurial tree"
+ exit 1
+fi
+
+echo "Running 'dist/s_all' in the release tree"
+(cd "$DEST/dist" && env WT_RELEASE_BUILD=yes sh s_all -A > /dev/null)
+
+echo "Running swig to generate the Java and Python API"
+(cd "$DEST/build_posix" &&
+ ../configure --enable-java --enable-python &&
+ (cd lang/java && make ../../../lang/java/wiredtiger_wrap.c) &&
+ (cd lang/python && make ../../../lang/python/wiredtiger_wrap.c) &&
+ make distclean &&
+ # Prune empty directories bottom-up in one pass; unlike "xargs rmdir",
+ # -exec runs nothing (and so cannot fail) when no directory matches.
+ find . -depth -type d -empty -exec rmdir {} \;) > /dev/null
+
+echo "Building documentation"
+(cd "$DEST/dist" && sh s_docs > /dev/null)
+
+echo "Packing release into $RELEASE_DIR/$PKG.tar.bz2"
+(cd "$RELEASE_DIR" && tar cf - $PKG | bzip2 -9 > $PKG.tar.bz2)
+
+echo "Packing documentation into $RELEASE_DIR/$PKG-docs.tar.bz2"
+(cd "$RELEASE_DIR" && tar cf - $PKG/LICENSE $PKG/NEWS $PKG/README $PKG/docs | \
+ bzip2 -9 > $PKG-docs.tar.bz2)
+
+rm -r $DEST
diff --git a/src/third_party/wiredtiger/dist/s_release.list b/src/third_party/wiredtiger/dist/s_release.list
new file mode 100644
index 00000000000..4f67e4cdb5b
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_release.list
@@ -0,0 +1,9 @@
+# Exclusions from release packages: each non-comment line is passed as an
+# "--exclude" argument to "hg archive", or to "tar" for a Git tree (s_release).
+lang/python/src
+src/server
+test/format
+test/packing
+test/salvage
+test/snapshot
+test/thread
diff --git a/src/third_party/wiredtiger/dist/s_release_docs b/src/third_party/wiredtiger/dist/s_release_docs
new file mode 100755
index 00000000000..965f59662e8
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_release_docs
@@ -0,0 +1,79 @@
+#!/bin/sh
+# Publish a WiredTiger release's documentation to the Git doc repository.
+
+set -e
+
+. ../RELEASE_INFO || exit 1
+
+TOPDIR=`pwd`/..
+RELEASE_DIR=$TOPDIR/releases
+DOC_DIR=$TOPDIR/../wiredtiger.github.com
+DOCFILE=$TOPDIR/src/docs/top/main.dox
+tmp=$0.tmp
+
+RELEASE_PACKAGE=`ls $RELEASE_DIR | tail -n 1`
+
+# Parse command line options.
+while getopts d:r: OPT; do
+ case "$OPT" in
+ d)
+ DOC_DIR=$OPTARG
+ ;;
+ r)
+ RELEASE_PACKAGE=$OPTARG
+ ;;
+ \?)
+ # getopts issues an error message
+ echo "usage: $0 [-d doc_dir] [-r release_package]" >&2
+ exit 1
+ ;;
+ esac
+done
+
+# Remove the switches we parsed above.
+shift `expr $OPTIND - 1`
+# Fail before anything is modified if either input is missing.
+if [ ! -d "$DOC_DIR" ]; then
+ echo "Invalid Git doc repository $DOC_DIR" >&2; exit 1
+fi
+if [ ! -f "$RELEASE_DIR/$RELEASE_PACKAGE" ]; then
+ echo "Invalid release package: $RELEASE_DIR/$RELEASE_PACKAGE" >&2; exit 1
+fi
+
+pkgver="$WIREDTIGER_VERSION"
+
+# Find the old versions in the documentation.
+oldrel=`grep current $DOCFILE | cut -d ' ' -f 2 | cut -d '<' -f 1`
+prevrel=`grep previous $DOCFILE | cut -d ' ' -f 2 | cut -d '<' -f 1`
+
+OLD_VERSION_MAJOR=`echo $oldrel | cut -d '.' -f 1`
+OLD_VERSION_MINOR=`echo $oldrel | cut -d '.' -f 2`
+OLD_VERSION_PATCH=`echo $oldrel | cut -d '.' -f 3`
+PREV_VERSION_MAJOR=`echo $prevrel | cut -d '.' -f 1`
+PREV_VERSION_MINOR=`echo $prevrel | cut -d '.' -f 2`
+PREV_VERSION_PATCH=`echo $prevrel | cut -d '.' -f 3`
+
+# Update the release versions on the landing page (operands quoted: may be empty).
+sed -e "s/$oldrel/$pkgver/" $DOCFILE > $tmp && mv $tmp $DOCFILE
+if [ "$OLD_VERSION_MINOR" != "$WIREDTIGER_VERSION_MINOR" ]; then
+ sed -e "s/$prevrel/$oldrel/" $DOCFILE > $tmp && mv $tmp $DOCFILE
+fi
+
+echo "Rebuild documentation root"
+(cd "$TOPDIR/dist" && sh s_docs -l > /dev/null)
+
+# Copy the new files into the documentation repository
+(cd $TOPDIR && cp docs/top/* $DOC_DIR/)
+
+# Unpack the documentation into the right location
+(mkdir -p $DOC_DIR/$pkgver && cd $RELEASE_DIR && \
+ cp $RELEASE_PACKAGE $DOC_DIR/releases/ &&
+ tar xjf $RELEASE_PACKAGE -C $DOC_DIR/$pkgver --strip-components 2 wiredtiger-$pkgver/docs)
+
+(cd $DOC_DIR && git add . $pkgver releases/wiredtiger-$pkgver.tar.bz2 && \
+ git commit -m "Release $pkgver")
+
+echo "Finished packaging documentation, you should now push the results. Run:"
+echo "cd $DOC_DIR && git push origin"
+echo "To backout changes run 'git reset HEAD~1'"
+
diff --git a/src/third_party/wiredtiger/dist/s_stat b/src/third_party/wiredtiger/dist/s_stat
new file mode 100644
index 00000000000..152097f14be
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_stat
@@ -0,0 +1,33 @@
+#! /bin/sh
+# Complain about unused statistics fields; exits 1 if any are found.
+# The EXIT trap only cleans up, so it cannot override that exit status.
+t=__wt.$$
+trap 'rm -f $t' 0; trap 'exit 1' 1 2 3 13 15
+
+# List of files to search: skip stat.c, it lists all of the fields by
+# definition.
+l=`sed \
+ -e '/src\/support\/stat.c/d' \
+ -e 's,#.*,,' \
+ -e '/^$/d' \
+ -e 's,^,../,' filelist`
+l="$l `echo ../src/include/*.i`"
+
+(
+# Get the list of statistics fields.
+search=`sed \
+ -e 's/^ WT_STATS \([a-z_*]*\);$/\1/p' \
+ -e d ../src/include/stat.h |
+ sort`
+
+echo "$search"
+fgrep -who "$search" $l) | sort | uniq -u > $t
+
+test -s $t && {
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ echo 'unused statistics fields'
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ cat $t
+ exit 1
+}
+exit 0
diff --git a/src/third_party/wiredtiger/dist/s_string b/src/third_party/wiredtiger/dist/s_string
new file mode 100644
index 00000000000..89ba5b130d0
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_string
@@ -0,0 +1,37 @@
+#!/bin/sh -
+#
+# Check spelling in comments and quoted strings from the source files.
+
+t=__wt.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+
+# Insulate against locale-specific sort order
+LC_ALL=C
+export LC_ALL
+
+# If aspell has not been installed, quit
+type aspell > /dev/null 2>&1 || {
+ echo 'skipped: aspell not found'
+ exit 0
+}
+
+check() { # Print words in ../$1 that aspell flags and s_string.ok lacks.
+ aspell --mode=ccpp --lang=en list < ../$1 |
+ sort -u |
+ comm -23 /dev/stdin s_string.ok > $t
+ test -s $t && {
+ echo "==== $1"
+ cat $t
+ }
+}
+
+# List of files to spellchk.
+l=`(cd .. &&
+ find examples ext src test -name '*.[chisy]' &&
+ find src -name '*.in')`
+
+for f in $l; do
+ check $f
+done
+
+exit 0
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
new file mode 100644
index 00000000000..e980df43b68
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -0,0 +1,1147 @@
+AAAAA
+AAAAAA
+AAAAAAAAAA
+AAAAAAAAAAAAA
+AAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAA
+ABCDEFGHIJKLMNOPQRSTUVWXYZ
+ADDR
+AIX
+AJ
+API
+APIs
+ARG
+ARGS
+ASYNC
+Addr
+Ailamaki
+Alakuijala
+Alloc
+Async
+Athanassoulis
+Athlon
+BASHOLEVELDB
+BBBBB
+BBBBBB
+BBBBBBBBBB
+BBBBBBBBBBBBB
+BBBBBBBBBBBBBBBB
+BBBBBBBBBBBBBBBBBB
+BDB
+BDB's
+BIGENDIAN
+BOOL
+BSR
+BTREE
+BUF
+BUFs
+BUILTIN
+BZ
+BZIP
+Bitfield
+Bitwise
+Bsearch
+Btree
+Bzip
+CAS
+CELL's
+CELLs
+CHECKKEY
+CKPT
+CLMPRSTdehikrsuv
+CLR
+CMP
+COL's
+CONCAT
+CONFIG
+CPUID
+CPUs
+CRC
+CURSORs
+CURSTD
+CallsCustDate
+Checksum
+Checksums
+CityHash
+CloseHandle
+Comparator
+Config
+CreateFileMapping
+CreateThread
+Crummey
+CustomersPhone
+DATAITEMs
+DECL
+DESC
+DHANDLE
+DLFCN
+DNE
+DOI
+DSRC
+DUPLICATEV
+DbCursor
+DbEnv
+Decrement
+EAGAIN
+EB
+EBUSY
+EINTR
+EINVAL
+EMSG
+EMail
+ENOENT
+ENOMEM
+ENOTSUP
+ENV
+EOF
+ERET
+ESET
+ETIME
+ETIMEDOUT
+EXLOG
+EXTLIST
+Enqueue
+Env
+Eron
+FADVISE
+FALLOCATE
+FALLTHROUGH
+FCNTL
+FDATASYNC
+FH
+FLD
+FLS
+FNV
+FORALL
+FOREACH
+FREAD
+FREELIST
+FTRUNCATE
+Fasttrack
+Filesystems
+FindFirstFile
+Fk
+FlushFileBuffers
+FreeBSD
+FreeBSD's
+FreeLibrary
+Freelist
+Fsync
+Ftruncate
+Fuerst
+GCC
+GETTIME
+GETTIMEOFDAY
+GIDs
+Gcc
+Geoff
+GetFileAttributesEx
+GetFileSizeEx
+GetLastError
+GetModuleHandleEx
+GetProcAddress
+Givargis
+Google
+HHHH
+HHHHLL
+HHHLL
+HYPERLEVELDB
+HyperLevelDB
+IEC
+IEEE
+IFF
+IKEY
+IMPL
+IMPL's
+INDX
+INIT
+INITIALIZER
+INMEM
+INODE
+INSERT's
+INTL
+INUSE
+ISSET
+ITEMs
+Inline
+Intra
+Ippokratis
+Iu
+JPEG
+JSON
+Jyrki
+KEYFIRST
+KV
+KVS
+Kanowski's
+Kounavis
+LEX
+LF
+LIBBZ
+LIBDL
+LIBPTHREAD
+LIBRT
+LIBSNAPPY
+LIBZ
+LLLLLL
+LLLLLLL
+LNO
+LOGREC
+LOGSCAN
+LRU
+LSB
+LSM
+LSN
+LSNs
+LZO
+LeafGreen
+LevelDB
+Levyx
+Llqr
+Llqrt
+LoadLoad
+LockFile
+Lookup
+MADVISE
+MALLOC
+MAXID
+MBUF
+MEM
+MEMALIGN
+MERCHANTABILITY
+MSVC
+MUTEX
+MVCC
+Manos
+MapViewOfFile
+Marsaglia's
+Mellor
+Metadata
+Mewhort
+MoveFile
+Multi
+Multithreaded
+Mutex
+Mutexes
+NEEDKEY
+NEEDVALUE
+NOLL
+NONINFRINGEMENT
+NOTFOUND
+NOTREACHED
+NOVALUE
+NOWRITE
+NRECS
+NUL
+NULLs
+NetBSD
+NoAddr
+Noll
+Nul
+ONPAGE
+OPTYPE
+OUTBUFF
+OVFL
+PADDR
+PAGE's
+PARAM
+POSIX
+PRIu
+PRIu64
+PSIZE
+PTHREAD
+PTR
+Pagesize
+Pandis
+Phong
+PlatformSDK
+Posix
+Pre
+Prepend
+Qsort
+RCS
+READONLY
+RECNO
+REF's
+REFs
+REQ
+RET
+RLE
+RNG
+ROCKSDB
+RPC
+RUNDIR
+Radu
+Recno
+Recurse
+RedHat
+Redistributions
+Relock
+Resize
+RocksDB
+SCHED
+SIMD
+SLIST
+SLVG
+SML
+SOURCE's
+SPARC
+SPINLOCK
+SQL
+SSD
+SSq
+STAILQ
+STRTOUQ
+STRUCT
+SYS
+Scalability
+Scalable
+Seigh
+SetEndOfFile
+SetFilePointerEx
+Sevii
+SiH
+Skiplist
+SleepConditionVariableCS
+Solaris
+Spinlock
+Spinlocks
+Split's
+Stoica
+StoreLoad
+Strsep
+Subtree
+Subtrees
+TAILQ
+TODO
+TOOSMALL
+TORTIOUS
+TRK
+TXN
+TXNC
+TXNID
+Timespec
+Timestamp
+TryCV
+TxnID
+UID
+UIDs
+UINT
+ULINE
+UNISTD
+UPD
+UPDATEs
+URI
+URIs
+UTF
+UnixLib
+Unmap
+UnmapViewOfFile
+Unmarshall
+Unregister
+VARCHAR
+VLDB
+VMSG
+Vanishingly
+Vc
+Vixie
+Vo
+Vv
+VxWorks
+WIREDTIGER
+WaitForSingleObject
+WakeAllConditionVariable
+Wconditional
+WeakHashLen
+Werror
+Wformat
+WinNT
+WiredTiger
+WiredTiger's
+WiredTigerCheckpoint
+WiredTigerHome
+WiredTigerInit
+WiredTigerLog
+WiredTigerStat
+WiredTigerTxn
+WithSeeds
+Wmissing
+Wuninitialized
+XP
+ZLIB
+Zlib
+__wt_epoch
+abcdef
+abcdefghijklmnopqrstuvwxyz
+addfrag
+addl
+addr
+addrs
+af
+agc
+alfred
+alloc
+allocator
+allocsize
+amd
+ao
+ap
+api
+apip
+arg
+argc
+args
+argv
+async
+asyncopp
+autockpt
+autocommit
+autoheader
+bInheritHandle
+basecfg
+bdb
+bigram
+bitcnt
+bitfield
+bitfields
+bitpos
+bitstr
+bitstring
+bitwise
+bm
+bnd
+boolean
+br
+breakpoint
+bsearch
+bt
+btcur
+btdsk
+btmem
+btree
+btrees
+buf
+bufs
+bufsz
+builtin
+builtins
+bytelock
+bytestring
+bz
+bzCompressEnd
+bzCompressInit
+bzDecompress
+bzDecompressEnd
+bzDecompressInit
+bzalloc
+bzfree
+bzip
+calloc
+catfmt
+cb
+cd
+centric
+cfg
+cfkos
+change's
+checkfrag
+checkpointed
+checkpointer
+checkpointing
+checksum
+checksums
+chk
+chongo
+cip
+cityhash
+ckpt
+ckptfrag
+ckptlist
+cksum
+clr
+clsm
+cmd
+cmp
+cnt
+colcheck
+colgroup
+colgroups
+collatorp
+comparator
+comparep
+compat
+concat
+cond
+conf
+config
+conn
+connectionp
+const
+copydoc
+copyin
+copyout
+cp
+cpuid
+crc
+cref
+ctime
+ctype
+curbackup
+curbtree
+curbulk
+curconfig
+curdump
+curfile
+curindex
+curlog
+curmetadata
+cursoring
+cursorp
+curstat
+curtable
+cust
+cv
+cxa
+data's
+database's
+datalen
+datasets
+datasource
+datastore
+dbc
+decile
+deciles
+decl
+decr
+decrement
+decrementing
+deflateEnd
+deflateInit
+defno
+del
+delfmt
+dequeue
+dequeued
+der
+dereference
+desc
+dest
+dev
+dhandle
+dhandles
+dirlist
+dl
+dlclose
+dlfcn
+dlh
+dll
+dlopen
+dlsym
+dmsg
+ds
+dsk
+dsrc
+dst
+dstlen
+dsync
+dt
+dtype
+dumpable
+dumpcmp
+dumpfile
+dup
+eg
+emp
+encodings
+endian
+endif
+english
+enqueue
+enqueued
+enum's
+env
+eof
+eop
+errhandler
+errno
+errv
+errx
+esc
+eventv
+evictable
+evictserver
+exactp
+exe
+execop
+extern
+extlist
+extlists
+fadvise
+fallocate
+fblocks
+fclose
+fcntl
+fdatasync
+feof
+ffc
+fflush
+ffs
+fgetln
+fh
+fileID
+filefrag
+fileid
+filename
+filenames
+fileop
+fileops
+filesize
+filesystem
+fillms
+firstfit
+fixup
+flcs
+floatnum
+fmt
+fmterr
+fnv
+foc
+fopen
+fotxn
+fp
+fprintf
+fread
+free'd
+freelist
+fs
+fsm
+fstat
+fsync
+fsyncs
+ftruncate
+func
+funcs
+gcc
+gdb
+getfiles
+getid
+getline
+getone
+getoneraw
+getones
+getonesn
+getopt
+getraw
+gettime
+gettimeofday
+getv
+gobare
+goesc
+gostring
+gostruct
+goutf
+hashval
+havesize
+hdr
+highjack
+hin
+hrow
+html
+huffman
+hval
+hw
+iS
+iSh
+icount
+idx
+ifdef's
+ifndef
+ikey
+imref
+incr
+incrementing
+incrv
+indices
+indirects
+indx
+infeasible
+inflateEnd
+inflateInit
+init
+initn
+initsize
+inline
+inlined
+inmem
+inode
+insertK
+insertV
+instantiation
+intl
+intnum
+intpack
+ints
+inttypes
+inuse
+io
+ip
+ispo
+iteratively
+jnr
+jrx
+json
+kb
+kcell
+keycmp
+keygen
+keyname
+keyv
+kv
+kvraw
+kvs
+kvsbdb
+lang
+latencies
+lbrace
+lbracket
+lbz
+ld
+ldl
+len
+lenp
+level's
+lex
+lexicographically
+lf
+lfence
+libdatasource
+libs
+libwiredtiger
+lld
+llll
+llu
+lno
+loadtext
+localtime
+lockdown
+logf
+logmgr
+lognum
+logput
+logread
+logrec
+logsize
+logtest
+lookup
+lookups
+lpthread
+lr
+lrt
+lru
+lseek
+lsm
+lsn
+lsnappy
+lu
+lz
+lzo
+madvise
+majorp
+malloc
+marshall
+marshalled
+maxid
+maxintlitem
+maxintlpage
+maxleafitem
+maxleafpage
+mb
+mem
+memalign
+memalloc
+membar
+memcpy
+memfree
+memmove
+memset
+memsize
+mergeable
+metaconf
+metadata
+metafile
+mfence
+minorp
+minprefix
+mkdir
+mmap
+mnt
+msg
+msgv
+msvc
+mtx
+multi
+multiblock
+multicore
+multiprocess
+multisocket
+multithread
+multithreaded
+munmap
+mutex
+mutexes
+myfile
+mytable
+mytxn
+namespace
+namespaces
+nbits
+nbsp
+nchunks
+nclr
+nd
+negint
+newbar
+newfile
+newname
+newuri
+nextprev
+nfilename
+nhex
+nl
+nlpo
+nocase
+nonliteral
+noop
+nop
+noraw
+notfound
+notset
+notsup
+notyet
+nowrite
+np
+nr
+nset
+nthread
+nul
+nuls
+numSymbols
+numbare
+oc
+offpage
+oindex
+ok
+oldname
+ondisk
+onint
+onpage
+oo
+opendir
+openfile
+opsq
+optype
+os
+ovfl
+ownp
+packv
+parens
+parserp
+patchp
+pathname
+pathnames
+perf
+pfx
+poptable
+pos
+posint
+posix
+pre
+prealloc
+preload
+prepend
+prepended
+prepending
+presize
+primary's
+printf
+printlog
+priv
+progname
+ps
+pse
+psp
+pthread
+pushms
+putK
+putV
+pv
+py
+qSS
+qdown
+qrrSS
+qs
+qsort
+quartile
+qup
+rS
+rbrace
+rbracket
+rdlock
+rduppo
+readlock
+readnear
+readnext
+readonly
+readprev
+readserver
+readunlock
+realloc
+recno
+recnos
+reconfig
+reconfiguring
+recsize
+rectype
+recurse
+refp
+reinitialization
+req
+rescan
+resize
+resizing
+resultp
+ret
+retp
+rf
+rle
+rng
+rpc
+rref
+run's
+runlength
+runtime
+rwlock
+rwlocks
+rwunlock
+rx
+sH
+sHQ
+savepoints
+sb
+scanp
+sched
+schemas
+schematab
+scr
+sd
+searchable
+sed
+seqno
+serializable
+sessionp
+setstr
+setv
+sfence
+sii
+sizeof
+sizep
+sizev
+skiplist
+skiplists
+sl
+slotsp
+slvg
+snaplen
+snprintf
+sp
+spinlock
+spinlocks
+sprintf
+src
+srch
+srvr
+sset
+ssize
+startup
+statlog
+stbar
+stdarg
+stderr
+stdin
+stdint
+stdlib
+stdout
+stepinit
+stepkv
+stepnext
+stepp
+steprec
+str
+strcmp
+strdup
+strerror
+strftime
+strget
+stringin
+strlen
+strncmp
+strncpy
+strndup
+strsep
+strtoll
+strtouq
+struct
+structs
+su
+subgetraw
+subgets
+subinit
+sublicense
+subtree
+subtrees
+sunique
+superset
+sw
+sys
+syserr
+sz
+t's
+tV
+tablename
+tbackup
+tblock
+tcopyright
+tcreate
+tcursors
+tdatabase
+tdisplay
+tdrop
+tdump
+tdumpfile
+tempty
+th
+thazard
+tid
+timestamp
+tinsert
+tlist
+tload
+tmp
+toffpage
+tokenizer
+toklen
+tokname
+tokstart
+toktype
+toverflow
+tparent
+tprintlog
+transactional
+transactionally
+trecno
+treeconfig
+trename
+trepeat
+treplacement
+trk
+trk's
+troot
+trunc
+trylock
+trywrlock
+tsalvage
+tsplit
+tstat
+tstate
+tt
+ttracking
+tupdate
+tupgrade
+tvalue
+tverbose
+tverify
+twiredtiger
+twrite
+txn
+txnc
+txnid
+txnidp
+txnmin
+typedef
+uB
+uid
+uint
+uintmax
+unbare
+uncompressing
+uncompresssed
+undef
+unesc
+unescaped
+uninstantiated
+unistd
+unix
+unjams
+unlinked
+unmap
+unmarshall
+unmarshalled
+unmerged
+unmodify
+unpackv
+unreferenced
+unregister
+unsized
+unterminated
+untyped
+upd
+update's
+upg
+uri
+uri's
+usecs
+usedp
+usercfg
+usr
+utf
+util
+uu
+va
+valuep
+valuev
+vanishingly
+variable's
+vcell
+verrx
+versa
+vlcs
+vmsg
+vpack
+vprintf
+vrfy
+vsize
+vslot
+vsnprintf
+vtype
+vunpack
+vupdate
+walk's
+wiredtiger
+workFactor
+wrapup
+writelock
+writeunlock
+wrlock
+ws
+wti
+wtperf
+wts
+xF
+xff
+xxxx
+xxxxx
+xxxxxx
+zalloc
+zfree
+zlib
+zu
diff --git a/src/third_party/wiredtiger/dist/s_style b/src/third_party/wiredtiger/dist/s_style
new file mode 100755
index 00000000000..e36924dffb9
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_style
@@ -0,0 +1,183 @@
+#! /bin/sh
+
+# General style correction and cleanup.
+t=__wt.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+
+cd ..
+
+# Turn a C file into a line per function so we can use grep on it.
+file_parse()
+{
+ sed -n \
+ -e '/^{$/,/^}$/{=;p;}' $1 |
+ sed 'N;s/\n/:/' |
+ sed -e '/./{H;/^[0-9][0-9]*:}$/!d;}' \
+ -e x \
+ -e 's/\n/ /g' \
+ -e p \
+ -e '{s/.*//;x;}'
+}
+
+# Returns in functions after a jump to the error label, or an infinite loop
+# where there's a jump to the error label after the error label.
+for f in `find bench examples ext src test -name '*.[ci]'`; do
+ file_parse $f |
+ egrep '(WT_ERR|WT_ERR_MSG|WT_ERR_NOTFOUND_OK|WT_ERR_TEST|WT_ILLEGAL_VALUE_ERR)\(.*(WT_ASSERT_RET|WT_ILLEGAL_VALUE|WT_RET|WT_RET_MSG|WT_RET_NOTFOUND_OK|WT_RET_TEST|WT_VERBOSE_RET|WT_VERBOSE_RETVAL)\(.*err:|[^a-z_]err:.*(WT_ERR|WT_ERR_MSG|WT_ERR_NOTFOUND_OK|WT_ERR_TEST|WT_ILLEGAL_VALUE_ERR)\(' |
+ sed 's/:.*//' > $t
+
+ test -s $t && {
+ echo "$f: return after a jump to the error label or a jump to the error label after the error label"
+ sed 's/^/function @ line:/' < $t
+ }
+done
+
+# Return of 0 in functions after a jump to the error label.
+for f in `find bench examples ext src test -name '*.[ci]'`; do
+ file_parse $f |
+ egrep -v '[^a-z_]err:.*return \(ret|[^a-z_]err:.*WT_RET' |
+ egrep '[^a-z_]err:.*return \(0\);' |
+ sed 's/:.*//' > $t
+
+ test -s $t && {
+ echo "$f: error label followed by a return of 0"
+ sed 's/^/function @ line:/' < $t
+ }
+done
+
+for f in \
+ `find bench examples ext src test -name '*.[chisy]' -o -name '*.in' |
+ sed -e '/Makefile.in/d' \
+ -e '/build_win\/wiredtiger_config.h/d'`; do
+ if grep "^[^}]*while (0);" $f > $t; then
+ echo "$f: while (0) has trailing semi-colon"
+ cat $t
+ fi
+
+ if grep "(unsigned)" $f > $t; then
+ echo "$f: (unsigned) cast is wrong"
+ cat $t
+ fi
+
+ if grep WT_DEADLOCK $f | grep -v '#define.WT_DEADLOCK' > $t; then
+ echo "$f: WT_DEADLOCK deprecated in favor of WT_ROLLBACK"
+ cat $t
+ fi
+
+ if ! expr "$f" : 'examples/c/.*' > /dev/null &&
+ ! expr "$f" : 'ext/datasources/helium/helium.c' > /dev/null &&
+ ! expr "$f" : 'src/include/os.h' > /dev/null &&
+ grep "%zu" $f | grep -v 'SIZET_FMT' > $t; then
+ echo "$f: %zu needs to be fixed for Windows"
+ cat $t
+ fi
+
+ egrep -w 'off_t' $f > $t
+ test -s $t && {
+ echo "$f: off_t type declaration, use wt_off_t"
+ cat $t
+ }
+
+ # Direct calls to functions we're not supposed to use in the library.
+ # We don't check for all of them, just a few of the common ones.
+ if ! expr "$f" : 'bench/.*' > /dev/null &&
+ ! expr "$f" : 'examples/.*' > /dev/null &&
+ ! expr "$f" : 'ext/.*' > /dev/null &&
+ ! expr "$f" : 'test/.*' > /dev/null &&
+ ! expr "$f" : '.*/utilities/.*' > /dev/null; then
+ if ! expr "$f" : '.*/os_alloc.c' > /dev/null &&
+ egrep '[[:space:]]free[(]|[[:space:]]strdup[(]|[[:space:]]strndup[(]|[[:space:]]malloc[(]|[[:space:]]calloc[(]|[[:space:]]realloc[(]' $f > $t; then
+ test -s $t && {
+ echo "$f: call to illegal function"
+ cat $t
+ }
+ fi
+ if ! expr "$f" : '.*/os_strtouq.c' > /dev/null &&
+ egrep '[[:space:]]strtouq[(]' $f > $t; then
+ test -s $t && {
+ echo "$f: call to illegal function"
+ cat $t
+ }
+ fi
+ if egrep '[[:space:]]exit[(]' $f > $t; then
+ test -s $t && {
+ echo "$f: call to illegal function"
+ cat $t
+ }
+ fi
+ fi
+
+ # Declaration of an integer return variable.
+ if ! expr "$f" : 'bench/.*' > /dev/null &&
+ ! expr "$f" : 'examples/.*' > /dev/null &&
+ ! expr "$f" : 'test/.*' > /dev/null &&
+ ! expr "$f" : 'ext/.*' > /dev/null; then
+ egrep -w ret $f | egrep 'int.*[, ]ret[,;]' > $t
+ test -s $t && {
+ echo "$f: explicit declaration of \"ret\""
+ cat $t
+ }
+ fi
+
+ # Early exits from critical loops
+ sed -n -e '/API_CALL.*;$/,/API_END.*;/{=;p;}' \
+ -e '/LSM_.*ENTER*;$/,/LSM_.*LEAVE*;/{=;p;}' \
+ -e '/va_start/,/va_end/{=;p;}' $f | \
+ sed 'N;s/\n/:/' | \
+ egrep -w 'return|WT_RET' | \
+ sed -e "s,^,$f:," -e 's/$/ [return skips API_END call]/'
+
+ tr -cd '[:alnum:][:space:][:punct:]' < $f |
+ unexpand |
+ sed -e 's/){/) {/' \
+ -e 's/\([ ]\)for(/\1for (/' \
+ -e 's/\([ ]\)if(/\1if (/' \
+ -e 's/\([ ]\)index(/\1strchr(/' \
+ -e 's/\([ ]\)return(/\1return (/' \
+ -e 's/\([ ]\)return \([^()]*\);/\1return (\2);/' \
+ -e 's/\([ ]\)rindex(/\1strrchr(/' \
+ -e 's/\([ ]\)sizeof (/\1sizeof(/g' \
+ -e 's/\([ ]\)switch(/\1switch (/' \
+ -e 's/\([ ]\)while(/\1while (/' \
+ -e 's/\([ ,]\)uint\([ ,]\)/\1u_int\2/g' \
+ -e 's/\([ ,]\)u_int8_t\([ ,]\)/\1uint8_t\2/g' \
+ -e 's/\([ ,]\)u_int16_t\([ ,]\)/\1uint16_t\2/g' \
+ -e 's/\([ ,]\)u_int32_t\([ ,]\)/\1uint32_t\2/g' \
+ -e 's/\([ ,]\)u_int64_t\([ ,]\)/\1uint64_t\2/g' \
+ -e 's/\([ ,]\)u_quad\([ ,]\)/\1uint64_t\2/g' \
+ -e 's/\([|&=+-]\) *\([^*]\)/\1 \2/' \
+ -e 's/(void) \([a-zA-Z_]\)/(void)\1/' \
+ -e '/for /!s/;;$/;/' \
+ -e 's/(EOPNOTSUPP)/(ENOTSUP)/' \
+ -e 's/(unsigned)/(u_int)/' \
+ -e 's/hazard reference/hazard pointer/' \
+ -e 's/^#define /#define /' >$t
+
+ cmp $t $f > /dev/null 2>&1 || (echo "modifying $f" && cp $t $f)
+done
+
+# Check Python coding standards: check for tab characters (printf supplies the tab).
+egrep "`printf '\t'`" tools/*.py test/suite/*.py |
+ sed 's/:.*//' |
+ sort -u |
+ sed 's/^/ /' > $t
+test -s $t && {
+ echo '[tab] characters appear in test suite scripts:'
+ cat $t
+}
+# Check Python coding standards: check for trailing semi-colons.
+egrep ';$' tools/*.py test/suite/*.py > $t
+test -s $t && {
+ echo 'trailing semi-colons in tools or test suite Python code:'
+ cat $t
+}
+
+# Check for common typos (Wikipedia's list).
+find bench examples ext src test \
+ -name '*.[chisy]' -o -name '*.in' -o -name '*.dox' |
+xargs egrep -w 'a a|an an|and and|are are|be be|by by|for for|from from|if if|in in|is is|it it|of of|the the|this this|to to|was was|were were|when when|with with|a an|an a|a the|the a' > $t
+ test -s $t && {
+ echo "paired typo"
+ echo "============================"
+ cat $t
+ }
diff --git a/src/third_party/wiredtiger/dist/s_symbols b/src/third_party/wiredtiger/dist/s_symbols
new file mode 100644
index 00000000000..e590ab6f62c
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_symbols
@@ -0,0 +1,56 @@
+#! /bin/sh
+
+# Check for illegal external symbols.
+# The exit trap only cleans up, so the "exit 1" in check() is not masked.
+t=__a.c
+trap 'rm -f $t' 0; trap 'exit 1' 1 2 3 13 15
+
+case `uname` in
+Darwin)
+ NM='nm -gUo $f | egrep " T | D " | sed "s/ _/ /"'
+ ;;
+*)
+ # We require GNU nm, which may not be installed.
+ type nm > /dev/null 2>&1 &&
+ (nm --version | grep 'GNU nm') > /dev/null 2>&1 || {
+ echo 'skipped: GNU nm not found'
+ exit 0
+ }
+ NM='nm --extern-only --defined-only --print-file-name $f'
+ ;;
+esac
+
+
+check()
+{
+ (sed -e '/^#/d' s_symbols.list &&
+ eval $NM |
+ sed 's/.* //' |
+ egrep -v '^__wt') |
+ sort |
+ uniq -u > $t
+
+ test -s $t && {
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ echo 'unexpected external symbols in the WiredTiger library'
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ cat $t
+ exit 1
+ }
+
+ exit 0
+}
+
+# This check would normally be done after the library is built, but this way
+# we don't forget about a symbol during development. Check the previously
+# built library, if it exists.
+for d in .libs build_posix/.libs; do
+ f="$d/libwiredtiger.a"
+ test -f $f && check $f
+
+ f="../$d/libwiredtiger.a"
+ test -f $f && check $f
+done
+
+echo "skipped: libwiredtiger.a not found"
+exit 0
diff --git a/src/third_party/wiredtiger/dist/s_symbols.list b/src/third_party/wiredtiger/dist/s_symbols.list
new file mode 100644
index 00000000000..d3803bc3afa
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_symbols.list
@@ -0,0 +1,19 @@
+# List of OK external symbols.
+wiredtiger_config_parser_open
+wiredtiger_open
+wiredtiger_pack_close
+wiredtiger_pack_int
+wiredtiger_pack_item
+wiredtiger_pack_start
+wiredtiger_pack_str
+wiredtiger_pack_uint
+wiredtiger_strerror
+wiredtiger_struct_pack
+wiredtiger_struct_size
+wiredtiger_struct_unpack
+wiredtiger_unpack_int
+wiredtiger_unpack_item
+wiredtiger_unpack_start
+wiredtiger_unpack_str
+wiredtiger_unpack_uint
+wiredtiger_version
diff --git a/src/third_party/wiredtiger/dist/s_tags b/src/third_party/wiredtiger/dist/s_tags
new file mode 100644
index 00000000000..908b5eb7e0d
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_tags
@@ -0,0 +1,44 @@
+#! /bin/sh
+
+# Build tags file.
+#
+t=__a.c
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+
+# Skip this when building release packages
+test -n "$WT_RELEASE_BUILD" && exit 0
+
+# We require ctags which may not be installed; report the skip on stdout.
+type ctags > /dev/null 2>&1 || {
+ echo 'skipped: ctags not found'
+ exit 0
+}
+
+# Test to see what flags this ctags binary supports.
+# Use the -d, -t, -w and --language-force=C flags to ctags if available.
+flags=""
+echo "f() { int a; }" > $t
+for i in -d -t -w --language-force=C; do
+ if ctags $i $t 2>/dev/null; then
+ flags="$i $flags"
+ fi
+done
+
+# Generate a tags file for the build directory
+(cd ../build_posix
+rm -f tags
+ctags $flags ../src/include/*.in ../src/*/*.[chi] 2>/dev/null)
+
+# Put the shared tags file in the include directory, it's at the same level in
+# the tree as the other source files.
+(cd ../src/include
+rm -f tags
+ctags $flags ../include/*.in ../*/*.[chi] 2>/dev/null)
+
+# Link to the tags file from standard build and source directories.
+dirs="`python -c 'import dist; dist.print_source_dirs()'` ../src/os_win"
+for i in $dirs; do
+ if ! expr "$i" : ".*/include" > /dev/null; then
+ (cd $i && rm -f tags && ln -s ../include/tags .)
+ fi
+done
diff --git a/src/third_party/wiredtiger/dist/s_typedef b/src/third_party/wiredtiger/dist/s_typedef
new file mode 100644
index 00000000000..6b230223baa
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_typedef
@@ -0,0 +1,80 @@
+#! /bin/sh
+
+t=__wt.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+
+# Insulate against locale-specific sort order and IFS from the user's env
+LC_ALL=C
+export LC_ALL
+IFS=' '' ''
+'
+export IFS
+
+build() {
+ # Rebuild the forward type declarations section of wt_internal.h.
+ f=../src/include/wt_internal.h
+ (sed -e '/Forward type declarations .*: BEGIN/{' \
+ -e 'n' \
+ -e 'q' \
+ -e '}' < $f
+
+ l=`ls ../src/include/*.[hi] ../src/include/*.in |
+ sed -e '/wiredtiger.*/d' -e '/queue.h/d'`
+ egrep -h '^[ ]*(struct|union)[ ]*__.*[ ]*{' $l | \
+ sed -e 's/^[ ]*//' -e 's/[ ]*{.*//' | sort | \
+ while read t n; do
+ upper=`echo $n | sed -e 's/^__//' | tr '[a-z]' '[A-Z]'`
+ echo "$t $n;"
+ echo " typedef $t $n $upper;"
+ done
+
+ echo '/*'
+ sed -e '/Forward type declarations .*: END/,${' \
+ -e 'p' \
+ -e '}' \
+ -e 'd' < $f) > $t
+ cmp $t $f > /dev/null 2>&1 ||
+ (echo "Building $f" && rm -f $f && cp $t $f)
+}
+
+check() {
+ # Complain about unused #typedefs.
+ # List of files to search.
+ l=`sed -e 's,#.*,,' -e '/^$/d' -e 's,^,../,' filelist`
+ l="$l `echo ../src/utilities/*.c`"
+
+ (
+ # Get the list of typedefs
+ search=`cat ../src/include/*.h ../src/include/*.in |
+ sed -e 's/^struct.*typedef.* \(.*\);$/\1/p' \
+ -e 's/^union.*typedef.* \(.*\);$/\1/p' \
+ -e d |
+ sort -u`
+ echo "$search"
+ fgrep -who "$search" $l
+ ) | sort | uniq -u > $t
+
+ test -s $t && cat $t
+}
+
+usage()
+{
+ echo 'usage: s_typedef [-bc]' >&2
+ exit 1
+}
+test "$#" -eq 1 || usage
+while :
+ do case "$1" in
+ -b) # -b builds the typedefs
+ build
+ shift;;
+ -c) # -c checks the typedefs
+ check
+ shift;;
+ *)
+ test "$#" -eq 0 || usage
+ break;;
+ esac
+done
+
+exit 0
diff --git a/src/third_party/wiredtiger/dist/s_version b/src/third_party/wiredtiger/dist/s_version
new file mode 100755
index 00000000000..a09aebd282a
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_version
@@ -0,0 +1,60 @@
+#!/bin/sh
+
+# Propagate version changes to the necessary files.
+. ../RELEASE_INFO
+
+m4dir=../build_posix/aclocal
+rpmspec=./package/wiredtiger.spec
+tmp_file=__tmp
+
+force=no
+while :
+ do case "$1" in
+ -f) # Force versions to be updated
+ force=yes
+ shift;;
+ *)
+ break;;
+ esac
+done
+
+# If the version hasn't changed and we're not forcing the issue, we're done.
+if test "$force" = no -a \
+ -f $m4dir/version.m4 -a \
+ -f $m4dir/version-set.m4 ; then
+ eval `grep '^VERSION_[A-Z]*=' $m4dir/version-set.m4`
+ if test x${WIREDTIGER_VERSION_MAJOR} = x${VERSION_MAJOR} -a \
+ x${WIREDTIGER_VERSION_MINOR} = x${VERSION_MINOR} -a \
+ x${WIREDTIGER_VERSION_PATCH} = x${VERSION_PATCH} ; then
+ exit 0
+ fi
+fi
+
+dotted_version=${WIREDTIGER_VERSION_MAJOR}.${WIREDTIGER_VERSION_MINOR}.${WIREDTIGER_VERSION_PATCH}
+echo "Building $m4dir/version.m4"
+cat <<EOF > $m4dir/version.m4
+dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version
+${dotted_version}
+EOF
+
+echo "Building $m4dir/version-set.m4"
+cat <<EOF > $m4dir/version-set.m4
+dnl build by dist/s_version
+
+VERSION_MAJOR=${WIREDTIGER_VERSION_MAJOR}
+VERSION_MINOR=${WIREDTIGER_VERSION_MINOR}
+VERSION_PATCH=${WIREDTIGER_VERSION_PATCH}
+VERSION_STRING='"${WIREDTIGER_VERSION_STRING}"'
+
+AC_SUBST(VERSION_MAJOR)
+AC_SUBST(VERSION_MINOR)
+AC_SUBST(VERSION_PATCH)
+AC_SUBST(VERSION_STRING)
+
+VERSION_NOPATCH=${WIREDTIGER_VERSION_MAJOR}.${WIREDTIGER_VERSION_MINOR}
+AC_SUBST(VERSION_NOPATCH)
+EOF
+
+echo "Building $rpmspec"
+sed -e "s/Version: .*/Version: ${dotted_version}/" $rpmspec \
+ > $tmp_file && mv $tmp_file $rpmspec
diff --git a/src/third_party/wiredtiger/dist/s_whitespace b/src/third_party/wiredtiger/dist/s_whitespace
new file mode 100644
index 00000000000..3a51b251bfe
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_whitespace
@@ -0,0 +1,30 @@
+#! /bin/sh
+
+# Single space and remove trailing whitespace from source files.
+t=__wt.$$
+trap 'rm -f $t; exit 0' 0 1 2 3 13 15
+
+ws()
+{
+ sed -e 's/[ ][ ]*$//' \
+ -e '/^$/N' \
+ -e '/\n$/D' < $1 > $t
+ cmp $t $1 > /dev/null 2>&1 || (echo "$1" && cp $t $1)
+}
+
+cd ..
+
+for f in `find dist -name '*.py' -o -name 's_*'`; do
+ ws $f
+done
+
+for f in `find examples ext src test \
+ -name '*.[chi]' -o \
+ -name '*.dox' -o \
+ -name '*.in' -o \
+ -name 'Makefile.am'`; do
+ if expr "$f" : ".*/Makefile.in" > /dev/null; then
+ continue
+ fi
+ ws $f
+done
diff --git a/src/third_party/wiredtiger/dist/s_win b/src/third_party/wiredtiger/dist/s_win
new file mode 100755
index 00000000000..cb889f2ba49
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/s_win
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+t=__wt.$$
+trap 'rm -f $t' 0 1 2 3 13 15
+
+egrep '#define|#undef' \
+ ../build_posix/config.hin \
+ ../build_win/wiredtiger_config.h |
+ sed 's/^.*#//' |
+ awk '{print $2}' |
+ egrep -v '^(LT_OBJDIR|PACKAGE|VERSION)' |
+ sort | uniq -u > $t
+
+test -s $t && {
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ echo 'Windows #defines missing from build_win/wiredtiger_config.h'
+ echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ cat $t
+ exit 1
+}
+exit 0
diff --git a/src/third_party/wiredtiger/dist/serial.py b/src/third_party/wiredtiger/dist/serial.py
new file mode 100644
index 00000000000..6abfa5bc96f
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/serial.py
@@ -0,0 +1,189 @@
+# Output serialization functions.
+
+import textwrap
+from dist import compare_srcfile
+
+class SerialArg:
+ def __init__(self, typestr, name, sized=0):
+ self.typestr = typestr
+ self.name = name
+ self.sized = sized
+
+class Serial:
+ def __init__(self, name, args):
+ self.name = name
+ self.args = args
+
+msgtypes = [
+Serial('col_append', [
+ SerialArg('WT_INSERT_HEAD *', 'ins_head'),
+ SerialArg('WT_INSERT ***', 'ins_stack'),
+ SerialArg('WT_INSERT *', 'new_ins', 1),
+ SerialArg('uint64_t *', 'recnop'),
+ SerialArg('u_int', 'skipdepth'),
+ ]),
+
+Serial('insert', [
+ SerialArg('WT_INSERT_HEAD *', 'ins_head'),
+ SerialArg('WT_INSERT ***', 'ins_stack'),
+ SerialArg('WT_INSERT *', 'new_ins', 1),
+ SerialArg('u_int', 'skipdepth'),
+ ]),
+
+Serial('update', [
+ SerialArg('WT_UPDATE **', 'srch_upd'),
+ SerialArg('WT_UPDATE *', 'upd', 1),
+ ]),
+]
+
+# decl --
+# Return a declaration for the variable.
+def decl(l):
+ o = l.typestr
+ if o[-1] != '*':
+ o += ' '
+ return o + l.name
+
+# decl_p --
+# Return a declaration for a reference to the variable, which requires
+# another level of indirection.
+def decl_p(l):
+ o = l.typestr
+ if o[-1] != '*':
+ o += ' '
+ return o + '*' + l.name + 'p'
+
+# output --
+# Create serialized function calls.
+def output(entry, f):
+ # Function declaration.
+ f.write('static inline int\n__wt_' + entry.name + '_serial(\n')
+ o = 'WT_SESSION_IMPL *session, WT_PAGE *page'
+ for l in entry.args:
+ if l.sized:
+ o += ', ' + decl_p(l) + ', size_t ' + l.name + '_size'
+ else:
+ o += ', ' + decl(l)
+ o += ')'
+ f.write('\n'.join('\t' + l for l in textwrap.wrap(o, 70)))
+ f.write('\n{')
+
+ # Local variable declarations.
+ for l in entry.args:
+ if l.sized:
+ f.write('''
+\t''' + decl(l) + ''' = *''' + l.name + '''p;
+\tWT_DECL_RET;
+\tsize_t incr_mem;
+''')
+
+ # Clear memory references we now own.
+ for l in entry.args:
+ if l.sized:
+ f.write('''
+\t/* Clear references to memory we now own. */
+\t*''' + l.name + '''p = NULL;
+''')
+
+ # Check the page write generation hasn't wrapped.
+ f.write('''
+\t/*
+\t * Check to see if the page's write generation is about to wrap (wildly
+\t * unlikely as it implies 4B updates between clean page reconciliations,
+\t * but technically possible), and fail the update.
+\t *
+\t * The check is outside of the serialization mutex because the page's
+\t * write generation is going to be a hot cache line, so technically it's
+\t * possible for the page's write generation to wrap between the test and
+\t * our subsequent modification of it. However, the test is (4B-1M), and
+\t * there cannot be a million threads that have done the test but not yet
+\t * completed their modification.
+\t */
+\t WT_RET(__page_write_gen_wrapped_check(page));
+''')
+
+ # Call the worker function.
+ if entry.name != "update":
+ f.write('''
+\t/* Acquire the page's spinlock, call the worker function. */
+\tWT_PAGE_LOCK(session, page);''')
+
+ f.write('''
+\tret = __''' + entry.name + '''_serial_func(
+''')
+ o = 'session'
+ if entry.name == "update":
+ o += ', page'
+ for l in entry.args:
+ o += ', ' + l.name
+ o += ');'
+ f.write('\n'.join('\t ' + l for l in textwrap.wrap(o, 70)))
+
+ if entry.name != "update":
+ f.write('''
+\tWT_PAGE_UNLOCK(session, page);''')
+
+ f.write('''
+
+\t/* Free unused memory on error. */
+\tif (ret != 0) {
+''')
+ for l in entry.args:
+ if not l.sized:
+ continue
+ f.write(
+ '\t\t__wt_free(session, ' + l.name + ');\n')
+ f.write('''
+\t\treturn (ret);
+\t}
+''')
+
+ f.write('''
+\t/*
+\t * Increment in-memory footprint after releasing the mutex: that's safe
+\t * because the structures we added cannot be discarded while visible to
+\t * any running transaction, and we're a running transaction, which means
+\t * there can be no corresponding delete until we complete.
+\t */
+\tincr_mem = 0;
+''')
+ for l in entry.args:
+ if not l.sized:
+ continue
+ f.write('\tWT_ASSERT(session, ' +
+ l.name + '_size != 0);\n')
+ f.write('\tincr_mem += ' + l.name + '_size;\n')
+ f.write('''\tif (incr_mem != 0)
+\t\t__wt_cache_page_inmem_incr(session, page, incr_mem);
+
+\t/* Mark the page dirty after updating the footprint. */
+\t__wt_page_modify_set(session, page);
+
+\treturn (0);
+}
+
+''')
+
+#####################################################################
+# Update serial.i.
+#####################################################################
+tmp_file = '__tmp'
+tfile = open(tmp_file, 'w')
+skip = 0
+for line in open('../src/include/serial.i', 'r'):
+ if not skip:
+ tfile.write(line)
+ if line.count('Serialization function section: END'):
+ tfile.write(line)
+ skip = 0
+ elif line.count('Serialization function section: BEGIN'):
+ tfile.write(' */\n\n')
+ skip = 1
+
+ for entry in msgtypes:
+ output(entry, tfile)
+
+ tfile.write('/*\n')
+
+tfile.close()
+compare_srcfile(tmp_file, '../src/include/serial.i')
diff --git a/src/third_party/wiredtiger/dist/stat.py b/src/third_party/wiredtiger/dist/stat.py
new file mode 100644
index 00000000000..6a3a1b74db3
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/stat.py
@@ -0,0 +1,183 @@
+# Read the source files and output the statistics #defines plus the
+# initialize and refresh code.
+
+import re, string, sys, textwrap
+from dist import compare_srcfile
+
+# Read the source files.
+from stat_data import dsrc_stats, connection_stats
+
+def print_struct(title, name, base, stats):
+ '''Print the structures for the stat.h file.'''
+ f.write('/*\n')
+ f.write(' * Statistics entries for ' + title + '.\n')
+ f.write(' */\n')
+ f.write(
+ '#define\tWT_' + name.upper() + '_STATS_BASE\t' + str(base) + '\n')
+ f.write('struct __wt_' + name + '_stats {\n')
+
+ for l in stats:
+ f.write('\tWT_STATS ' + l.name + ';\n')
+ f.write('};\n\n')
+
+# Update the #defines in the stat.h file.
+tmp_file = '__tmp'
+f = open(tmp_file, 'w')
+skip = 0
+for line in open('../src/include/stat.h', 'r'):
+ if not skip:
+ f.write(line)
+ if line.count('Statistics section: END'):
+ f.write(line)
+ skip = 0
+ elif line.count('Statistics section: BEGIN'):
+ f.write('\n')
+ skip = 1
+ print_struct(
+ 'connections', 'connection', 1000, connection_stats)
+ print_struct('data sources', 'dsrc', 2000, dsrc_stats)
+f.close()
+compare_srcfile(tmp_file, '../src/include/stat.h')
+
+def print_defines():
+ '''Print the #defines for the wiredtiger.in file.'''
+ f.write('''
+/*!
+ * @name Connection statistics
+ * @anchor statistics_keys
+ * @anchor statistics_conn
+ * Statistics are accessed through cursors with \c "statistics:" URIs.
+ * Individual statistics can be queried through the cursor using the following
+ * keys. See @ref data_statistics for more information.
+ * @{
+ */
+''')
+ for v, l in enumerate(connection_stats, 1000):
+ f.write('/*! %s */\n' % '\n * '.join(textwrap.wrap(l.desc, 70)))
+ f.write('#define\tWT_STAT_CONN_' + l.name.upper() + "\t" *
+ max(1, 6 - int((len('WT_STAT_CONN_' + l.name)) / 8)) +
+ str(v) + '\n')
+ f.write('''
+/*!
+ * @}
+ * @name Statistics for data sources
+ * @anchor statistics_dsrc
+ * @{
+ */
+''')
+ for v, l in enumerate(dsrc_stats, 2000):
+ f.write('/*! %s */\n' % '\n * '.join(textwrap.wrap(l.desc, 70)))
+ f.write('#define\tWT_STAT_DSRC_' + l.name.upper() + "\t" *
+ max(1, 6 - int((len('WT_STAT_DSRC_' + l.name)) / 8)) +
+ str(v) + '\n')
+ f.write('/*! @} */\n')
+
+# Update the #defines in the wiredtiger.in file.
+tmp_file = '__tmp'
+f = open(tmp_file, 'w')
+skip = 0
+for line in open('../src/include/wiredtiger.in', 'r'):
+ if not skip:
+ f.write(line)
+ if line.count('Statistics section: END'):
+ f.write(line)
+ skip = 0
+ elif line.count('Statistics section: BEGIN'):
+ f.write(' */\n')
+ skip = 1
+ print_defines()
+ f.write('/*\n')
+f.close()
+compare_srcfile(tmp_file, '../src/include/wiredtiger.in')
+
+def print_func(name, list):
+ '''Write the stat.c init, refresh and (except for 'connection') aggregate functions.'''
+ f.write('''
+void
+__wt_stat_init_''' + name + '''_stats(WT_''' + name.upper() + '''_STATS *stats)
+{
+\t/* Clear, so can also be called for reinitialization. */
+\tmemset(stats, 0, sizeof(*stats));
+
+''')
+ for l in sorted(list):
+ o = '\tstats->' + l.name + '.desc = "' + l.desc + '";\n'
+ if len(o) + 7 > 80:
+ o = o.replace('= ', '=\n\t ')
+ f.write(o)
+ f.write('''}
+''')
+
+ f.write('''
+void
+__wt_stat_refresh_''' + name + '''_stats(void *stats_arg)
+{
+\tWT_''' + name.upper() + '''_STATS *stats;
+
+\tstats = (WT_''' + name.upper() + '''_STATS *)stats_arg;
+''')
+ for l in sorted(list):
+ # no_clear: don't clear the value.
+ if 'no_clear' not in l.flags:
+ f.write('\tstats->' + l.name + '.v = 0;\n')
+ f.write('}\n')
+
+ # Aggregation is only interesting for data-source statistics.
+ if name == 'connection':
+ return
+
+ f.write('''
+void
+__wt_stat_aggregate_''' + name +
+'''_stats(const void *child, const void *parent)
+{
+\tWT_''' + name.upper() + '''_STATS *c, *p;
+
+\tc = (WT_''' + name.upper() + '''_STATS *)child;
+\tp = (WT_''' + name.upper() + '''_STATS *)parent;
+''')
+ for l in sorted(list):
+ if 'no_aggregate' in l.flags:
+ continue
+ elif 'max_aggregate' in l.flags:
+ o = 'if (c->' + l.name + '.v > p->' + l.name +\
+ '.v)\n\t p->' + l.name + '.v = c->' + l.name + '.v;'
+ else:
+ o = 'p->' + l.name + '.v += c->' + l.name + '.v;'
+ f.write('\t' + o + '\n')
+ f.write('}\n')
+
+# Write the stat initialization and refresh routines to the stat.c file.
+f = open(tmp_file, 'w')
+f.write('/* DO NOT EDIT: automatically built by dist/stat.py. */\n\n')
+f.write('#include "wt_internal.h"\n')
+
+print_func('dsrc', dsrc_stats)
+print_func('connection', connection_stats)
+f.close()
+compare_srcfile(tmp_file, '../src/support/stat.c')
+
+
+# Update the statlog file with the entries we can scale per second.
+scale_info = 'no_scale_per_second_list = [\n'
+clear_info = 'no_clear_list = [\n'
+for l in sorted(connection_stats):
+ if 'no_scale' in l.flags:
+ scale_info += ' \'' + l.desc + '\',\n'
+ if 'no_clear' in l.flags:
+ clear_info += ' \'' + l.desc + '\',\n'
+for l in sorted(dsrc_stats):
+ if 'no_scale' in l.flags:
+ scale_info += ' \'' + l.desc + '\',\n'
+ if 'no_clear' in l.flags:
+ clear_info += ' \'' + l.desc + '\',\n'
+scale_info += ']\n'
+clear_info += ']\n'
+
+tmp_file = '__tmp'
+f = open(tmp_file, 'w')
+f.write('# DO NOT EDIT: automatically built by dist/stat.py.\n\n')
+f.write(scale_info)
+f.write(clear_info)
+f.close()
+compare_srcfile(tmp_file, '../tools/stat_data.py')
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
new file mode 100644
index 00000000000..9f40ce142df
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -0,0 +1,410 @@
+# Auto-generate statistics #defines, with initialization, clear and aggregate
+# functions.
+#
+# NOTE: Statistics reports show individual objects as operations per second.
+# All objects where that does not make sense should have the word 'currently'
+# or the phrase 'in the cache' in their text description, for example, 'files
+# currently open'.
+# NOTE: All statistics descriptions must have a prefix string followed by ':'.
+#
+# Optional configuration flags:
+# no_clear Value ignored by the statistics refresh function
+# no_aggregate Ignore the value when aggregating statistics
+# max_aggregate Take the maximum value when aggregating statistics
+# no_scale Don't scale value per second in the logging tool script
+
+from operator import attrgetter
+import sys
+
+class Stat:
+ def __init__(self, name, desc, flags=''):
+ self.name = name
+ if ':' not in desc:
+ print >>sys.stderr, 'Missing prefix in: ' + desc
+ self.desc = desc
+ self.flags = flags
+
+ def __cmp__(self, other):
+ return cmp(self.name, other.name)
+
+##########################################
+# CONNECTION statistics
+##########################################
+# Each entry is Stat(name, description[, flags]).  The section banners inside
+# the list only group entries for the reader: once built, the list is
+# re-sorted by name (see the statement following the list).
+connection_stats = [
+ ##########################################
+ # System statistics
+ ##########################################
+ Stat('cond_wait', 'conn: pthread mutex condition wait calls'),
+ Stat('file_open', 'conn: files currently open', 'no_clear,no_scale'),
+ Stat('memory_allocation', 'conn: memory allocations'),
+ Stat('memory_free', 'conn: memory frees'),
+ Stat('memory_grow', 'conn: memory re-allocations'),
+ Stat('read_io', 'conn: total read I/Os'),
+ Stat('rwlock_read', 'conn: pthread mutex shared lock read-lock calls'),
+ Stat('rwlock_write',
+ 'conn: pthread mutex shared lock write-lock calls'),
+ Stat('write_io', 'conn: total write I/Os'),
+
+ ##########################################
+ # Async API statistics
+ ##########################################
+ Stat('async_alloc_race', 'async: number of allocation state races'),
+ Stat('async_alloc_view', 'async: number of op slots viewed for alloc'),
+ Stat('async_flush', 'async: number of async flush calls'),
+ Stat('async_full', 'async: number of times op allocation failed'),
+ Stat('async_cur_queue', 'async: current work queue length'),
+ Stat('async_max_queue', 'async: maximum work queue length',
+ 'max_aggregate,no_scale'),
+ Stat('async_nowork', 'async: number of times worker found no work'),
+ Stat('async_op_alloc', 'async: op allocations'),
+ Stat('async_op_compact', 'async: op compact calls'),
+ Stat('async_op_insert', 'async: op insert calls'),
+ Stat('async_op_remove', 'async: op remove calls'),
+ Stat('async_op_search', 'async: op search calls'),
+ Stat('async_op_update', 'async: op update calls'),
+
+ ##########################################
+ # Block manager statistics
+ ##########################################
+ Stat('block_byte_map_read', 'block manager: mapped bytes read'),
+ Stat('block_byte_read', 'block manager: bytes read'),
+ Stat('block_byte_write', 'block manager: bytes written'),
+ Stat('block_map_read', 'block manager: mapped blocks read'),
+ Stat('block_preload', 'block manager: blocks pre-loaded'),
+ Stat('block_read', 'block manager: blocks read'),
+ Stat('block_write', 'block manager: blocks written'),
+
+ ##########################################
+ # Cache and eviction statistics
+ ##########################################
+ Stat('cache_bytes_dirty',
+ 'cache: tracked dirty bytes in the cache', 'no_scale'),
+ Stat('cache_bytes_inuse',
+ 'cache: bytes currently in the cache', 'no_clear,no_scale'),
+ Stat('cache_bytes_max',
+ 'cache: maximum bytes configured', 'no_clear,no_scale'),
+ Stat('cache_bytes_read', 'cache: bytes read into cache'),
+ Stat('cache_bytes_write', 'cache: bytes written from cache'),
+ Stat('cache_eviction_clean', 'cache: unmodified pages evicted'),
+ Stat('cache_eviction_deepen',
+ 'cache: page split during eviction deepened the tree'),
+ Stat('cache_eviction_dirty', 'cache: modified pages evicted'),
+ Stat('cache_eviction_checkpoint',
+ 'cache: checkpoint blocked page eviction'),
+ Stat('cache_eviction_fail',
+ 'cache: pages selected for eviction unable to be evicted'),
+ Stat('cache_eviction_force',
+ 'cache: pages evicted because they exceeded the in-memory maximum'),
+ Stat('cache_eviction_force_fail',
+ 'cache: failed eviction of pages that exceeded the ' +
+ 'in-memory maximum'),
+ Stat('cache_eviction_hazard',
+ 'cache: hazard pointer blocked page eviction'),
+ Stat('cache_eviction_internal', 'cache: internal pages evicted'),
+ Stat('cache_eviction_queue_empty',
+ 'cache: eviction server candidate queue empty when topping up'),
+ Stat('cache_eviction_queue_not_empty',
+ 'cache: eviction server candidate queue not empty when topping up'),
+ Stat('cache_eviction_server_evicting',
+ 'cache: eviction server evicting pages'),
+ Stat('cache_eviction_server_not_evicting',
+ 'cache: eviction server populating queue, but not evicting pages'),
+ Stat('cache_eviction_slow',
+ 'cache: eviction server unable to reach eviction goal'),
+ Stat('cache_eviction_split', 'cache: pages split during eviction'),
+ Stat('cache_eviction_walk', 'cache: pages walked for eviction'),
+ Stat('cache_pages_dirty',
+ 'cache: tracked dirty pages in the cache', 'no_scale'),
+ Stat('cache_pages_inuse',
+ 'cache: pages currently held in the cache', 'no_clear,no_scale'),
+ Stat('cache_read', 'cache: pages read into cache'),
+ Stat('cache_write', 'cache: pages written from cache'),
+
+ ##########################################
+ # Dhandle statistics
+ ##########################################
+ Stat('dh_session_handles', 'dhandle: session dhandles swept'),
+ Stat('dh_session_sweeps', 'dhandle: session sweep attempts'),
+
+ ##########################################
+ # Logging statistics
+ ##########################################
+ Stat('log_buffer_grow',
+ 'log: log buffer size increases'),
+ Stat('log_buffer_size',
+ 'log: total log buffer size', 'no_clear,no_scale'),
+ Stat('log_bytes_user', 'log: user provided log bytes written'),
+ Stat('log_bytes_written', 'log: log bytes written'),
+ Stat('log_close_yields',
+ 'log: yields waiting for previous log file close'),
+ Stat('log_max_filesize', 'log: maximum log file size', 'no_clear'),
+ Stat('log_reads', 'log: log read operations'),
+ Stat('log_scan_records', 'log: records processed by log scan'),
+ Stat('log_scan_rereads', 'log: log scan records requiring two reads'),
+ Stat('log_scans', 'log: log scan operations'),
+ Stat('log_sync', 'log: log sync operations'),
+ Stat('log_writes', 'log: log write operations'),
+
+ Stat('log_slot_consolidated', 'log: logging bytes consolidated'),
+ Stat('log_slot_closes', 'log: consolidated slot closures'),
+ Stat('log_slot_joins', 'log: consolidated slot joins'),
+ Stat('log_slot_races', 'log: consolidated slot join races'),
+ Stat('log_slot_switch_fails',
+ 'log: slots selected for switching that were unavailable'),
+ Stat('log_slot_toobig', 'log: record size exceeded maximum'),
+ Stat('log_slot_toosmall',
+ 'log: failed to find a slot large enough for record'),
+ Stat('log_slot_transitions', 'log: consolidated slot join transitions'),
+
+ ##########################################
+ # Reconciliation statistics
+ ##########################################
+ Stat('rec_pages', 'reconciliation: page reconciliation calls'),
+ Stat('rec_pages_eviction',
+ 'reconciliation: page reconciliation calls for eviction'),
+ Stat('rec_split_stashed_bytes',
+ 'reconciliation: split bytes currently awaiting free',
+ 'no_clear,no_scale'),
+ Stat('rec_split_stashed_objects',
+ 'reconciliation: split objects currently awaiting free',
+ 'no_clear,no_scale'),
+
+ ##########################################
+ # Transaction statistics
+ ##########################################
+ Stat('txn_begin', 'txn: transaction begins'),
+ Stat('txn_checkpoint', 'txn: transaction checkpoints'),
+ Stat('txn_checkpoint_running',
+ 'txn: transaction checkpoint currently running',
+ 'no_aggregate,no_clear,no_scale'),
+ Stat('txn_pinned_range',
+ 'txn: transaction range of IDs currently pinned',
+ 'no_aggregate,no_clear,no_scale'),
+ Stat('txn_commit', 'txn: transactions committed'),
+ Stat('txn_fail_cache',
+ 'txn: transaction failures due to cache overflow'),
+ Stat('txn_rollback', 'txn: transactions rolled back'),
+
+ ##########################################
+ # LSM statistics
+ ##########################################
+ Stat('lsm_checkpoint_throttle',
+ 'LSM: sleep for LSM checkpoint throttle'),
+ Stat('lsm_merge_throttle', 'LSM: sleep for LSM merge throttle'),
+ Stat('lsm_rows_merged', 'LSM: rows merged in an LSM tree'),
+ Stat('lsm_work_queue_app', 'LSM: App work units currently queued',
+ 'no_clear,no_scale'),
+ Stat('lsm_work_queue_manager', 'LSM: Merge work units currently queued',
+ 'no_clear,no_scale'),
+ Stat('lsm_work_queue_max', 'LSM: tree queue hit maximum'),
+ Stat('lsm_work_queue_switch', 'LSM: Switch work units currently queued',
+ 'no_clear,no_scale'),
+ Stat('lsm_work_units_created',
+ 'LSM: tree maintenance operations scheduled'),
+ Stat('lsm_work_units_discarded',
+ 'LSM: tree maintenance operations discarded'),
+ Stat('lsm_work_units_done',
+ 'LSM: tree maintenance operations executed'),
+
+ ##########################################
+ # Session operations
+ ##########################################
+ Stat('session_cursor_open',
+ 'session: open cursor count', 'no_clear,no_scale'),
+ Stat('session_open',
+ 'session: open session count', 'no_clear,no_scale'),
+
+ ##########################################
+ # Total Btree cursor operations
+ ##########################################
+ Stat('cursor_create', 'Btree: cursor create calls'),
+ Stat('cursor_insert', 'Btree: cursor insert calls'),
+ Stat('cursor_next', 'Btree: cursor next calls'),
+ Stat('cursor_prev', 'Btree: cursor prev calls'),
+ Stat('cursor_remove', 'Btree: cursor remove calls'),
+ Stat('cursor_reset', 'Btree: cursor reset calls'),
+ Stat('cursor_search', 'Btree: cursor search calls'),
+ Stat('cursor_search_near', 'Btree: cursor search near calls'),
+ Stat('cursor_update', 'Btree: cursor update calls'),
+]
+
+# Order the connection statistics by their 'name' attribute.
+connection_stats = sorted(connection_stats, key=attrgetter('name'))
+
+##########################################
+# Data source statistics
+##########################################
+# Each entry is Stat(name, description[, flags]).  The section banners inside
+# the list only group entries for the reader: once built, the list is
+# re-sorted by name (see the statement following the list).
+dsrc_stats = [
+ ##########################################
+ # Session operations
+ ##########################################
+ Stat('session_compact', 'session: object compaction'),
+ Stat('session_cursor_open',
+ 'session: open cursor count', 'no_clear,no_scale'),
+
+ ##########################################
+ # Cursor operations
+ ##########################################
+ Stat('cursor_create', 'cursor: create calls'),
+ Stat('cursor_insert', 'cursor: insert calls'),
+ Stat('cursor_insert_bulk', 'cursor: bulk-loaded cursor-insert calls'),
+ Stat('cursor_insert_bytes',
+ 'cursor: cursor-insert key and value bytes inserted'),
+ Stat('cursor_next', 'cursor: next calls'),
+ Stat('cursor_prev', 'cursor: prev calls'),
+ Stat('cursor_remove', 'cursor: remove calls'),
+ Stat('cursor_remove_bytes', 'cursor: cursor-remove key bytes removed'),
+ Stat('cursor_reset', 'cursor: reset calls'),
+ Stat('cursor_search', 'cursor: search calls'),
+ Stat('cursor_search_near', 'cursor: search near calls'),
+ Stat('cursor_update', 'cursor: update calls'),
+ Stat('cursor_update_bytes',
+ 'cursor: cursor-update value bytes updated'),
+
+ ##########################################
+ # Btree statistics
+ ##########################################
+ Stat('btree_column_deleted',
+ 'btree: column-store variable-size deleted values', 'no_scale'),
+ Stat('btree_column_fix',
+ 'btree: column-store fixed-size leaf pages', 'no_scale'),
+ Stat('btree_column_internal',
+ 'btree: column-store internal pages', 'no_scale'),
+ Stat('btree_column_variable',
+ 'btree: column-store variable-size leaf pages', 'no_scale'),
+ Stat('btree_compact_rewrite', 'btree: pages rewritten by compaction'),
+ Stat('btree_entries', 'btree: number of key/value pairs', 'no_scale'),
+ Stat('btree_fixed_len',
+ 'btree: fixed-record size', 'no_aggregate,no_scale'),
+ Stat('btree_maximum_depth',
+ 'btree: maximum tree depth', 'max_aggregate,no_scale'),
+ Stat('btree_maxintlitem',
+ 'btree: maximum internal page item size', 'no_aggregate,no_scale'),
+ Stat('btree_maxintlpage',
+ 'btree: maximum internal page size', 'no_aggregate,no_scale'),
+ Stat('btree_maxleafitem',
+ 'btree: maximum leaf page item size', 'no_aggregate,no_scale'),
+ Stat('btree_maxleafpage',
+ 'btree: maximum leaf page size', 'no_aggregate,no_scale'),
+ Stat('btree_overflow', 'btree: overflow pages', 'no_scale'),
+ Stat('btree_row_internal',
+ 'btree: row-store internal pages', 'no_scale'),
+ Stat('btree_row_leaf', 'btree: row-store leaf pages', 'no_scale'),
+
+ ##########################################
+ # LSM statistics
+ ##########################################
+ Stat('bloom_count', 'LSM: bloom filters in the LSM tree', 'no_scale'),
+ Stat('bloom_false_positive', 'LSM: bloom filter false positives'),
+ Stat('bloom_hit', 'LSM: bloom filter hits'),
+ Stat('bloom_miss', 'LSM: bloom filter misses'),
+ Stat('bloom_page_evict',
+ 'LSM: bloom filter pages evicted from cache'),
+ Stat('bloom_page_read', 'LSM: bloom filter pages read into cache'),
+ Stat('bloom_size', 'LSM: total size of bloom filters', 'no_scale'),
+ Stat('lsm_checkpoint_throttle',
+ 'LSM: sleep for LSM checkpoint throttle'),
+ Stat('lsm_chunk_count',
+ 'LSM: chunks in the LSM tree', 'no_aggregate,no_scale'),
+ Stat('lsm_generation_max',
+ 'LSM: highest merge generation in the LSM tree',
+ 'max_aggregate,no_scale'),
+ Stat('lsm_lookup_no_bloom',
+ 'LSM: queries that could have benefited ' +
+ 'from a Bloom filter that did not exist'),
+ Stat('lsm_merge_throttle', 'LSM: sleep for LSM merge throttle'),
+
+ ##########################################
+ # Block manager statistics
+ ##########################################
+ Stat('block_alloc', 'block manager: blocks allocated'),
+ Stat('allocation_size',
+ 'block manager: file allocation unit size',
+ 'no_aggregate,no_scale'),
+ Stat('block_checkpoint_size',
+ 'block manager: checkpoint size', 'no_scale'),
+ Stat('block_extension',
+ 'block manager: allocations requiring file extension'),
+ Stat('block_free', 'block manager: blocks freed'),
+ Stat('block_magic',
+ 'block manager: file magic number', 'no_aggregate,no_scale'),
+ Stat('block_major', 'block manager: file major version number',
+ 'no_aggregate,no_scale'),
+ Stat('block_minor',
+ 'block manager: minor version number', 'no_aggregate,no_scale'),
+ Stat('block_reuse_bytes',
+ 'block manager: file bytes available for reuse'),
+ Stat('block_size', 'block manager: file size in bytes', 'no_scale'),
+
+ ##########################################
+ # Cache and eviction statistics
+ ##########################################
+ Stat('cache_bytes_read', 'cache: bytes read into cache'),
+ Stat('cache_bytes_write', 'cache: bytes written from cache'),
+ Stat('cache_eviction_clean', 'cache: unmodified pages evicted'),
+ Stat('cache_eviction_checkpoint',
+ 'cache: checkpoint blocked page eviction'),
+ Stat('cache_eviction_dirty', 'cache: modified pages evicted'),
+ Stat('cache_eviction_fail',
+ 'cache: data source pages selected for eviction unable' +
+ ' to be evicted'),
+ Stat('cache_eviction_hazard',
+ 'cache: hazard pointer blocked page eviction'),
+ Stat('cache_eviction_internal', 'cache: internal pages evicted'),
+ Stat('cache_overflow_value',
+ 'cache: overflow values cached in memory', 'no_scale'),
+ Stat('cache_read', 'cache: pages read into cache'),
+ Stat('cache_read_overflow', 'cache: overflow pages read into cache'),
+ Stat('cache_write', 'cache: pages written from cache'),
+
+ ##########################################
+ # Compression statistics
+ ##########################################
+ Stat('compress_raw_ok', 'compression: raw compression call succeeded'),
+ Stat('compress_raw_fail',
+ 'compression: raw compression call failed, no additional' +
+ ' data available'),
+ Stat('compress_raw_fail_temporary',
+ 'compression: raw compression call failed, additional' +
+ ' data available'),
+ Stat('compress_read', 'compression: compressed pages read'),
+ Stat('compress_write', 'compression: compressed pages written'),
+ Stat('compress_write_fail',
+ 'compression: page written failed to compress'),
+ Stat('compress_write_too_small',
+ 'compression: page written was too small to compress'),
+
+ ##########################################
+ # Reconciliation statistics
+ ##########################################
+ Stat('rec_dictionary', 'reconciliation: dictionary matches'),
+ Stat('rec_overflow_key_internal',
+ 'reconciliation: internal-page overflow keys'),
+ Stat('rec_overflow_key_leaf',
+ 'reconciliation: leaf-page overflow keys'),
+ Stat('rec_overflow_value', 'reconciliation: overflow values written'),
+ Stat('rec_page_match', 'reconciliation: page checksum matches'),
+ Stat('rec_page_delete', 'reconciliation: pages deleted'),
+ Stat('rec_pages', 'reconciliation: page reconciliation calls'),
+ Stat('rec_pages_eviction',
+ 'reconciliation: page reconciliation calls for eviction'),
+ Stat('rec_prefix_compression',
+ 'reconciliation: leaf page key bytes discarded using' +
+ ' prefix compression'),
+ Stat('rec_suffix_compression',
+ 'reconciliation: internal page key bytes discarded using' +
+ ' suffix compression'),
+ Stat('rec_multiblock_internal',
+ 'reconciliation: internal page multi-block writes'),
+ Stat('rec_multiblock_leaf',
+ 'reconciliation: leaf page multi-block writes'),
+ Stat('rec_multiblock_max',
+ 'reconciliation: maximum blocks required for a page',
+ 'max_aggregate,no_scale'),
+
+ ##########################################
+ # Transaction statistics
+ ##########################################
+ Stat('txn_update_conflict', 'txn: update conflicts'),
+]
+
+# Order the data-source statistics by their 'name' attribute.
+dsrc_stats = sorted(dsrc_stats, key=attrgetter('name'))
diff --git a/src/third_party/wiredtiger/dist/style.py b/src/third_party/wiredtiger/dist/style.py
new file mode 100755
index 00000000000..70eb7fb19ab
--- /dev/null
+++ b/src/third_party/wiredtiger/dist/style.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+# Check the style of WiredTiger C code.
+from dist import source_files
+import re, sys
+
+# Complain if a function comment is missing.
+def missing_comment():
+ for f in source_files():
+ skip_re = re.compile(r'DO NOT EDIT: automatically built')
+ func_re = re.compile(
+ r'(/\*(?:[^\*]|\*[^/])*\*/)?\n\w[\w ]+\n(\w+)', re.DOTALL)
+ s = open(f, 'r').read()
+ if skip_re.search(s):
+ continue
+ for m in func_re.finditer(s):
+ if not m.group(1) or \
+ not m.group(1).startswith('/*\n * %s --\n' % m.group(2)):
+ print "%s:%d: missing comment for %s" % \
+ (f, s[:m.start(2)].count('\n'), m.group(2))
+
+# Display lines that could be joined.
+def lines_could_join():
+ skip_re = re.compile(r'__asm__')
+ match_re = re.compile('(^[ \t].*\()\n^[ \t]*([^\n]*)', re.MULTILINE)
+ for f in source_files():
+ s = open(f, 'r').read()
+ if skip_re.search(s):
+ continue
+
+ for m in match_re.finditer(s):
+ if len(m.group(1).expandtabs()) + \
+ len(m.group(2).expandtabs()) < 80:
+ print f + ': lines may be combined: '
+ print '\t' + m.group(1).lstrip() + m.group(2)
+ print
+
+
+missing_comment()
+
+# Don't display lines that could be joined by default; in some cases, the code
+# isn't maintained by WiredTiger, or the line splitting enhances readability.
+if len(sys.argv) > 1:
+ lines_could_join()
diff --git a/src/third_party/wiredtiger/examples/c/Makefile.am b/src/third_party/wiredtiger/examples/c/Makefile.am
new file mode 100644
index 00000000000..adf87c3cc5d
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/Makefile.am
@@ -0,0 +1,32 @@
+# Link every example against libwiredtiger.la from the top of the build tree
+# and compile with src/include from the top of the source tree on the include
+# path.
+LDADD = $(top_builddir)/libwiredtiger.la
+AM_CPPFLAGS = -I$(top_srcdir)/src/include
+
+# The example programs (noinst_: built, but not installed).
+noinst_PROGRAMS = \
+ ex_access \
+ ex_all \
+ ex_async \
+ ex_call_center \
+ ex_config \
+ ex_config_parse \
+ ex_cursor \
+ ex_data_source \
+ ex_extending \
+ ex_file \
+ ex_hello \
+ ex_log \
+ ex_pack \
+ ex_process \
+ ex_schema \
+ ex_scope \
+ ex_stat \
+ ex_thread
+
+# The examples can be run with no arguments as simple smoke tests
+TESTS = $(noinst_PROGRAMS)
+
+# Export WIREDTIGER_HOME set to a name generated by mktemp (WT_HOME.XXXX
+# template), then remove and re-create that directory so it starts out empty.
+AM_TESTS_ENVIRONMENT = WIREDTIGER_HOME=`mktemp -d WT_HOME.XXXX` ; export WIREDTIGER_HOME ; rm -rf $$WIREDTIGER_HOME ; mkdir $$WIREDTIGER_HOME ;
+# automake 1.11 compatibility
+TESTS_ENVIRONMENT = $(AM_TESTS_ENVIRONMENT)
+
+# Remove everything matching WT_HOME*, *.core, WiredTiger* and *.wt from the
+# build directory.
+clean-local:
+ rm -rf WT_HOME* *.core WiredTiger* *.wt
diff --git a/src/third_party/wiredtiger/examples/c/ex_access.c b/src/third_party/wiredtiger/examples/c/ex_access.c
new file mode 100644
index 00000000000..522db567d71
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_access.c
@@ -0,0 +1,98 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_access.c
+ * demonstrates how to create and access a simple table.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+static const char *home;
+
+/*
+ * main --
+ *	Open a connection and a session, create the "table:access" table, insert
+ *	one record, list every record in the table and close the connection.
+ *	Returns the value returned by WT_CONNECTION::close, or the error from
+ *	opening the connection or session.
+ *
+ *	NOTE(review): apart from the connection/session open, intermediate
+ *	return codes are overwritten without being checked; presumably this
+ *	keeps the documentation snippets short -- confirm before relying on
+ *	this example's error handling.
+ */
+int
+main(void)
+{
+	/*! [access example connection] */
+	WT_CONNECTION *conn;
+	WT_CURSOR *cursor;
+	WT_SESSION *session;
+	const char *key, *value;
+	int ret;
+
+	/*
+	 * Create a clean test directory for this run of the test program if the
+	 * environment variable isn't already set (as is done by make check).
+	 */
+	if (getenv("WIREDTIGER_HOME") == NULL) {
+		home = "WT_HOME";
+		ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+	} else
+		home = NULL;
+
+	/* Open a connection to the database, creating it if necessary. */
+	if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0 ||
+	    (ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+		/*
+		 * home is NULL when the directory comes from WIREDTIGER_HOME;
+		 * passing NULL for a %s conversion is undefined, so report the
+		 * environment variable's value in that case.
+		 */
+		fprintf(stderr, "Error connecting to %s: %s\n",
+		    home == NULL ? getenv("WIREDTIGER_HOME") : home,
+		    wiredtiger_strerror(ret));
+		return (ret);
+	}
+	/*! [access example connection] */
+
+	/*! [access example table create] */
+	ret = session->create(session,
+	    "table:access", "key_format=S,value_format=S");
+	/*! [access example table create] */
+
+	/*! [access example cursor open] */
+	ret = session->open_cursor(session,
+	    "table:access", NULL, NULL, &cursor);
+	/*! [access example cursor open] */
+
+	/*! [access example cursor insert] */
+	cursor->set_key(cursor, "key1");	/* Insert a record. */
+	cursor->set_value(cursor, "value1");
+	ret = cursor->insert(cursor);
+	/*! [access example cursor insert] */
+
+	/*! [access example cursor list] */
+	ret = cursor->reset(cursor);	/* Restart the scan. */
+	while ((ret = cursor->next(cursor)) == 0) {
+		ret = cursor->get_key(cursor, &key);
+		ret = cursor->get_value(cursor, &value);
+
+		printf("Got record: %s : %s\n", key, value);
+	}
+	/*! [access example cursor list] */
+
+	/*! [access example close] */
+	ret = conn->close(conn, NULL);
+	/*! [access example close] */
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c
new file mode 100644
index 00000000000..1339e90eb2c
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_all.c
@@ -0,0 +1,1125 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_all.c
+ * Containing a call to every method in the WiredTiger API.
+ *
+ * It doesn't do anything very useful, just demonstrates how to call each
+ * method. This file is used to populate the API reference with code
+ * fragments.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef _WIN32
+#include <unistd.h>
+#else
+#include "windows_shim.h"
+#endif
+#include <sys/stat.h>
+
+#include <wiredtiger.h>
+
+int add_collator(WT_CONNECTION *conn);
+int add_extractor(WT_CONNECTION *conn);
+int backup(WT_SESSION *session);
+int checkpoint_ops(WT_SESSION *session);
+int connection_ops(WT_CONNECTION *conn);
+int cursor_ops(WT_SESSION *session);
+int cursor_search_near(WT_CURSOR *cursor);
+int cursor_statistics(WT_SESSION *session);
+int pack_ops(WT_SESSION *session);
+int session_ops(WT_SESSION *session);
+int transaction_ops(WT_CONNECTION *conn, WT_SESSION *session);
+
+static const char * const progname = "ex_all";
+static const char *home;
+
+int
+cursor_ops(WT_SESSION *session)
+{
+ WT_CURSOR *cursor;
+ int ret;
+
+ /*! [Open a cursor] */
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, NULL, &cursor);
+ /*! [Open a cursor] */
+
+ /*! [Open a cursor on the metadata] */
+ ret = session->open_cursor(
+ session, "metadata:", NULL, NULL, &cursor);
+ /*! [Open a cursor on the metadata] */
+
+ {
+ WT_CURSOR *duplicate;
+ const char *key = "some key";
+ /*! [Duplicate a cursor] */
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, NULL, &cursor);
+ cursor->set_key(cursor, key);
+ ret = cursor->search(cursor);
+
+ /* Duplicate the cursor. */
+ ret = session->open_cursor(session, NULL, cursor, NULL, &duplicate);
+ /*! [Duplicate a cursor] */
+ }
+
+ {
+ WT_CURSOR *overwrite_cursor;
+ const char *key = "some key", *value = "some value";
+ /*! [Reconfigure a cursor] */
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, NULL, &cursor);
+ cursor->set_key(cursor, key);
+
+ /* Reconfigure the cursor to overwrite the record. */
+ ret = session->open_cursor(
+ session, NULL, cursor, "overwrite", &overwrite_cursor);
+ ret = cursor->close(cursor);
+
+ overwrite_cursor->set_value(overwrite_cursor, value);
+ ret = overwrite_cursor->insert(cursor);
+ /*! [Reconfigure a cursor] */
+ }
+
+ {
+ /*! [boolean configuration string example] */
+ ret = session->open_cursor(session, "table:mytable", NULL,
+ "overwrite", &cursor);
+ ret = session->open_cursor(session, "table:mytable", NULL,
+ "overwrite=true", &cursor);
+ ret = session->open_cursor(session, "table:mytable", NULL,
+ "overwrite=1", &cursor);
+ /*! [boolean configuration string example] */
+ }
+
+ {
+ /*! [open a named checkpoint] */
+ ret = session->open_cursor(session,
+ "table:mytable", NULL, "checkpoint=midnight", &cursor);
+ /*! [open a named checkpoint] */
+ }
+
+ {
+ /*! [open the default checkpoint] */
+ ret = session->open_cursor(session,
+ "table:mytable", NULL, "checkpoint=WiredTigerCheckpoint", &cursor);
+ /*! [open the default checkpoint] */
+ }
+
+ {
+ /*! [Get the cursor's string key] */
+ const char *key; /* Get the cursor's string key. */
+ ret = cursor->get_key(cursor, &key);
+ /*! [Get the cursor's string key] */
+ }
+
+ {
+ /*! [Set the cursor's string key] */
+ /* Set the cursor's string key. */
+ const char *key = "another key";
+ cursor->set_key(cursor, key);
+ /*! [Set the cursor's string key] */
+ }
+
+ {
+ /*! [Get the cursor's record number key] */
+ uint64_t recno; /* Get the cursor's record number key. */
+ ret = cursor->get_key(cursor, &recno);
+ /*! [Get the cursor's record number key] */
+ }
+
+ {
+ /*! [Set the cursor's record number key] */
+ uint64_t recno = 37; /* Set the cursor's record number key. */
+ cursor->set_key(cursor, recno);
+ /*! [Set the cursor's record number key] */
+ }
+
+ {
+ /*! [Get the cursor's composite key] */
+ /* Get the cursor's "SiH" format composite key. */
+ const char *first;
+ int32_t second;
+ uint16_t third;
+ ret = cursor->get_key(cursor, &first, &second, &third);
+ /*! [Get the cursor's composite key] */
+ }
+
+ {
+ /*! [Set the cursor's composite key] */
+ /* Set the cursor's "SiH" format composite key. */
+ cursor->set_key(cursor, "first", (int32_t)5, (uint16_t)7);
+ /*! [Set the cursor's composite key] */
+ }
+
+ {
+ /*! [Get the cursor's string value] */
+ const char *value; /* Get the cursor's string value. */
+ ret = cursor->get_value(cursor, &value);
+ /*! [Get the cursor's string value] */
+ }
+
+ {
+ /*! [Set the cursor's string value] */
+ /* Set the cursor's string value. */
+ const char *value = "another value";
+ cursor->set_value(cursor, value);
+ /*! [Set the cursor's string value] */
+ }
+
+ {
+ /*! [Get the cursor's raw value] */
+ WT_ITEM value; /* Get the cursor's raw value. */
+ ret = cursor->get_value(cursor, &value);
+ /*! [Get the cursor's raw value] */
+ }
+
+ {
+ /*! [Set the cursor's raw value] */
+ WT_ITEM value; /* Set the cursor's raw value. */
+ value.data = "another value";
+ value.size = strlen("another value");
+ cursor->set_value(cursor, &value);
+ /*! [Set the cursor's raw value] */
+ }
+
+ /*! [Return the next record] */
+ ret = cursor->next(cursor);
+ /*! [Return the next record] */
+
+ /*! [Return the previous record] */
+ ret = cursor->prev(cursor);
+ /*! [Return the previous record] */
+
+ /*! [Reset the cursor] */
+ ret = cursor->reset(cursor);
+ /*! [Reset the cursor] */
+
+ {
+ WT_CURSOR *other = NULL;
+ /*! [Cursor comparison] */
+ int compare;
+ ret = cursor->compare(cursor, other, &compare);
+ if (compare == 0) {
+ /* Cursors reference the same key */
+ } else if (compare < 0) {
+ /* Cursor key less than other key */
+ } else if (compare > 0) {
+ /* Cursor key greater than other key */
+ }
+ /*! [Cursor comparison] */
+ }
+
+ {
+ /*! [Search for an exact match] */
+ const char *key = "some key";
+ cursor->set_key(cursor, key);
+ ret = cursor->search(cursor);
+ /*! [Search for an exact match] */
+ }
+
+ ret = cursor_search_near(cursor);
+
+ {
+ /*! [Insert a new record or overwrite an existing record] */
+ /* Insert a new record or overwrite an existing record. */
+ const char *key = "some key", *value = "some value";
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, NULL, &cursor);
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ ret = cursor->insert(cursor);
+ /*! [Insert a new record or overwrite an existing record] */
+ }
+
+ {
+ /*! [Insert a new record and fail if the record exists] */
+ /* Insert a new record and fail if the record exists. */
+ const char *key = "some key", *value = "some value";
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, "overwrite=false", &cursor);
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ ret = cursor->insert(cursor);
+ /*! [Insert a new record and fail if the record exists] */
+ }
+
+ {
+ /*! [Insert a new record and assign a record number] */
+ /* Insert a new record and assign a record number. */
+ uint64_t recno;
+ const char *value = "some value";
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, "append", &cursor);
+ cursor->set_value(cursor, value);
+ ret = cursor->insert(cursor);
+ if (ret == 0)
+ ret = cursor->get_key(cursor, &recno);
+ /*! [Insert a new record and assign a record number] */
+ }
+
+ {
+ /*! [Update an existing record or insert a new record] */
+ const char *key = "some key", *value = "some value";
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, NULL, &cursor);
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ ret = cursor->update(cursor);
+ /*! [Update an existing record or insert a new record] */
+ }
+
+ {
+ /*! [Update an existing record and fail if DNE] */
+ const char *key = "some key", *value = "some value";
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, "overwrite=false", &cursor);
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ ret = cursor->update(cursor);
+ /*! [Update an existing record and fail if DNE] */
+ }
+
+ {
+ /*! [Remove a record] */
+ const char *key = "some key";
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, NULL, &cursor);
+ cursor->set_key(cursor, key);
+ ret = cursor->remove(cursor);
+ /*! [Remove a record] */
+ }
+
+ {
+ /*! [Remove a record and fail if DNE] */
+ const char *key = "some key";
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, "overwrite=false", &cursor);
+ cursor->set_key(cursor, key);
+ ret = cursor->remove(cursor);
+ /*! [Remove a record and fail if DNE] */
+ }
+
+ {
+ /*! [Display an error] */
+ const char *key = "non-existent key";
+ cursor->set_key(cursor, key);
+ if ((ret = cursor->remove(cursor)) != 0) {
+ fprintf(stderr,
+ "cursor.remove: %s\n", wiredtiger_strerror(ret));
+ return (ret);
+ }
+ /*! [Display an error] */
+ }
+
+ /*! [Close the cursor] */
+ ret = cursor->close(cursor);
+ /*! [Close the cursor] */
+
+ return (ret);
+}
+
+/*
+ * cursor_search_near --
+ * Documentation snippets for WT_CURSOR::search_near: an exact-or-adjacent
+ * lookup, a forward scan over keys greater than or equal to the search
+ * key, and a backward scan over keys less than the search key.
+ *
+ * The value returned is whatever ended the final cursor->prev loop, so it
+ * is non-zero even when every call behaved as expected.
+ */
+int
+cursor_search_near(WT_CURSOR *cursor)
+{
+ int exact, ret;
+ const char *key = "some key";
+
+ /*! [Search for an exact or adjacent match] */
+ cursor->set_key(cursor, key);
+ ret = cursor->search_near(cursor, &exact);
+ if (ret == 0) {
+ if (exact == 0) {
+ /* an exact match */
+ } else if (exact < 0) {
+ /* returned smaller key */
+ } else if (exact > 0) {
+ /* returned larger key */
+ }
+ }
+ /*! [Search for an exact or adjacent match] */
+
+ /*! [Forward scan greater than or equal] */
+ cursor->set_key(cursor, key);
+ ret = cursor->search_near(cursor, &exact);
+ if (ret == 0 && exact >= 0) {
+ /* include first key returned in the scan */
+ }
+
+ while ((ret = cursor->next(cursor)) == 0) {
+ /* the rest of the scan */
+ }
+ /*! [Forward scan greater than or equal] */
+
+ /*! [Backward scan less than] */
+ cursor->set_key(cursor, key);
+ ret = cursor->search_near(cursor, &exact);
+ if (ret == 0 && exact < 0) {
+ /* include first key returned in the scan */
+ }
+
+ while ((ret = cursor->prev(cursor)) == 0) {
+ /* the rest of the scan */
+ }
+ /*! [Backward scan less than] */
+
+ return (ret);
+}
+
+/*
+ * checkpoint_ops --
+ * Documentation snippets for WT_SESSION::checkpoint: a plain checkpoint,
+ * named checkpoints, target lists and the drop= variants.
+ *
+ * Every call overwrites ret, so only the result of the last checkpoint
+ * call is returned to the caller.
+ */
+int
+checkpoint_ops(WT_SESSION *session)
+{
+ int ret;
+
+ /*! [Checkpoint examples] */
+ /* Checkpoint the database. */
+ ret = session->checkpoint(session, NULL);
+
+ /* Checkpoint of the database, creating a named snapshot. */
+ ret = session->checkpoint(session, "name=June01");
+
+ /*
+ * Checkpoint a list of objects.
+ * JSON parsing requires quoting the list of target URIs.
+ */
+ ret = session->
+ checkpoint(session, "target=(\"table:table1\",\"table:table2\")");
+
+ /*
+ * Checkpoint a list of objects, creating a named snapshot.
+ * JSON parsing requires quoting the list of target URIs.
+ */
+ ret = session->
+ checkpoint(session, "target=(\"table:mytable\"),name=midnight");
+
+ /* Checkpoint the database, discarding all previous snapshots. */
+ ret = session->checkpoint(session, "drop=(from=all)");
+
+ /* Checkpoint the database, discarding the "midnight" snapshot. */
+ ret = session->checkpoint(session, "drop=(midnight)");
+
+ /*
+ * Checkpoint the database, discarding all snapshots after and
+ * including "noon".
+ */
+ ret = session->checkpoint(session, "drop=(from=noon)");
+
+ /*
+ * Checkpoint the database, discarding all snapshots before and
+ * including "midnight".
+ */
+ ret = session->checkpoint(session, "drop=(to=midnight)");
+
+ /*
+ * Create a checkpoint of a table, creating the "July01" snapshot and
+ * discarding the "May01" and "June01" snapshots.
+ * JSON parsing requires quoting the list of target URIs.
+ */
+ ret = session->checkpoint(session,
+ "target=(\"table:mytable\"),name=July01,drop=(May01,June01)");
+ /*! [Checkpoint examples] */
+
+ /*! [JSON quoting example] */
+ /*
+ * Checkpoint a list of objects.
+ * JSON parsing requires quoting the list of target URIs.
+ */
+ ret = session->
+ checkpoint(session, "target=(\"table:table1\",\"table:table2\")");
+ /*! [JSON quoting example] */
+
+ return (ret);
+}
+
+/*
+ * cursor_statistics --
+ * Documentation snippets for opening statistics cursors on the database
+ * ("statistics:") and on a single table, with the "fast", "all" and
+ * "clear" configurations.
+ *
+ * Each open stores its handle in the same local variable and none of the
+ * cursors is closed here; only the result of the last open is returned.
+ */
+int
+cursor_statistics(WT_SESSION *session)
+{
+ WT_CURSOR *cursor;
+ int ret;
+
+ /*! [Statistics cursor database] */
+ ret = session->open_cursor(
+ session, "statistics:", NULL, NULL, &cursor);
+ /*! [Statistics cursor database] */
+
+ /*! [Statistics cursor table] */
+ ret = session->open_cursor(
+ session, "statistics:table:mytable", NULL, NULL, &cursor);
+ /*! [Statistics cursor table] */
+
+ /*! [Statistics cursor table fast] */
+ ret = session->open_cursor(session,
+ "statistics:table:mytable", NULL, "statistics=(fast)", &cursor);
+ /*! [Statistics cursor table fast] */
+
+ /*! [Statistics clear configuration] */
+ ret = session->open_cursor(session,
+ "statistics:", NULL, "statistics=(fast,clear)", &cursor);
+ /*! [Statistics clear configuration] */
+
+ /*! [Statistics cursor clear configuration] */
+ ret = session->open_cursor(session,
+ "statistics:table:mytable",
+ NULL, "statistics=(all,clear)", &cursor);
+ /*! [Statistics cursor clear configuration] */
+
+ return (ret);
+}
+
+/*
+ * session_ops --
+ * Documentation snippets for the WT_SESSION methods: reconfigure, create
+ * (with a range of table configurations, each followed by a drop so the
+ * next create starts clean), compact, rename, salvage, truncate, upgrade,
+ * verify and drop.
+ *
+ * Sections guarded by MIGHT_NOT_RUN are compiled only when that macro is
+ * defined. The function finishes by closing the session it was given, so
+ * the caller must not use the handle afterwards; the value returned is
+ * the result of that close.
+ */
+int
+session_ops(WT_SESSION *session)
+{
+ int ret;
+
+ /*! [Reconfigure a session] */
+ ret = session->reconfigure(session, "isolation=snapshot");
+ /*! [Reconfigure a session] */
+
+ /*! [Create a table] */
+ ret = session->create(session,
+ "table:mytable", "key_format=S,value_format=S");
+ /*! [Create a table] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /*! [Create a column-store table] */
+ ret = session->create(session,
+ "table:mytable", "key_format=r,value_format=S");
+ /*! [Create a column-store table] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /*! [Create a table with columns] */
+ /*
+ * Create a table with columns: keys are record numbers, values are
+ * (string, signed 32-bit integer, unsigned 16-bit integer).
+ */
+ ret = session->create(session, "table:mytable",
+ "key_format=r,value_format=SiH,"
+ "columns=(id,department,salary,year-started)");
+ /*! [Create a table with columns] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /*
+ * This example code gets run, and the compression libraries might not
+ * be loaded, causing the create to fail. The documentation requires
+ * the code snippets, use #ifdef's to avoid running it.
+ */
+#ifdef MIGHT_NOT_RUN
+ /*! [Create a bzip2 compressed table] */
+ ret = session->create(session,
+ "table:mytable",
+ "block_compressor=bzip2,key_format=S,value_format=S");
+ /*! [Create a bzip2 compressed table] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /*! [Create a snappy compressed table] */
+ ret = session->create(session,
+ "table:mytable",
+ "block_compressor=snappy,key_format=S,value_format=S");
+ /*! [Create a snappy compressed table] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /*! [Create a zlib compressed table] */
+ ret = session->create(session,
+ "table:mytable",
+ "block_compressor=zlib,key_format=S,value_format=S");
+ /*! [Create a zlib compressed table] */
+ ret = session->drop(session, "table:mytable", NULL);
+#endif
+
+ /*! [Configure checksums to uncompressed] */
+ ret = session->create(session, "table:mytable",
+ "key_format=S,value_format=S,checksum=uncompressed");
+ /*! [Configure checksums to uncompressed] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /*! [Configure dictionary compression on] */
+ ret = session->create(session, "table:mytable",
+ "key_format=S,value_format=S,dictionary=1000");
+ /*! [Configure dictionary compression on] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /*! [Configure key prefix compression on] */
+ ret = session->create(session, "table:mytable",
+ "key_format=S,value_format=S,prefix_compression=true");
+ /*! [Configure key prefix compression on] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+#ifdef MIGHT_NOT_RUN
+ /* Requires sync_file_range */
+ /*! [os_cache_dirty_max configuration] */
+ ret = session->create(
+ session, "table:mytable", "os_cache_dirty_max=500MB");
+ /*! [os_cache_dirty_max configuration] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /* Requires posix_fadvise */
+ /*! [os_cache_max configuration] */
+ ret = session->create(session, "table:mytable", "os_cache_max=1GB");
+ /*! [os_cache_max configuration] */
+ ret = session->drop(session, "table:mytable", NULL);
+#endif
+ /*! [Configure block_allocation] */
+ ret = session->create(session, "table:mytable",
+ "key_format=S,value_format=S,block_allocation=first");
+ /*! [Configure block_allocation] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /*! [Create a cache-resident object] */
+ ret = session->create(session,
+ "table:mytable", "key_format=r,value_format=S,cache_resident=true");
+ /*! [Create a cache-resident object] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ {
+ /* Create a table for the session operations. */
+ ret = session->create(
+ session, "table:mytable", "key_format=S,value_format=S");
+
+ /*! [Compact a table] */
+ ret = session->compact(session, "table:mytable", NULL);
+ /*! [Compact a table] */
+
+ /*! [Rename a table] */
+ ret = session->rename(session, "table:old", "table:new", NULL);
+ /*! [Rename a table] */
+
+ /*! [Salvage a table] */
+ ret = session->salvage(session, "table:mytable", NULL);
+ /*! [Salvage a table] */
+
+ /*! [Truncate a table] */
+ ret = session->truncate(session, "table:mytable", NULL, NULL, NULL);
+ /*! [Truncate a table] */
+
+ {
+ /*
+ * Insert a pair of keys so we can truncate a range.
+ */
+ WT_CURSOR *cursor;
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, NULL, &cursor);
+ cursor->set_key(cursor, "June01");
+ cursor->set_value(cursor, "value");
+ ret = cursor->update(cursor);
+ cursor->set_key(cursor, "June30");
+ cursor->set_value(cursor, "value");
+ ret = cursor->update(cursor);
+ ret = cursor->close(cursor);
+
+ {
+ /*! [Truncate a range] */
+ WT_CURSOR *start, *stop;
+
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, NULL, &start);
+ start->set_key(start, "June01");
+ ret = start->search(start);
+
+ ret = session->open_cursor(
+ session, "table:mytable", NULL, NULL, &stop);
+ stop->set_key(stop, "June30");
+ ret = stop->search(stop);
+
+ ret = session->truncate(session, NULL, start, stop, NULL);
+ /*! [Truncate a range] */
+ }
+ }
+
+ /*! [Upgrade a table] */
+ ret = session->upgrade(session, "table:mytable", NULL);
+ /*! [Upgrade a table] */
+
+ /*! [Verify a table] */
+ ret = session->verify(session, "table:mytable", NULL);
+ /*! [Verify a table] */
+
+ /*! [Drop a table] */
+ ret = session->drop(session, "table:mytable", NULL);
+ /*! [Drop a table] */
+ }
+
+ /*! [Close a session] */
+ ret = session->close(session, NULL);
+ /*! [Close a session] */
+
+ return (ret);
+}
+
+/*
+ * transaction_ops --
+ * Documentation snippets for transactions: commit or rollback depending
+ * on the result of an update, a single transaction run at snapshot
+ * isolation, session-level isolation configuration and querying the
+ * pinned transaction ID range.
+ *
+ * The "session isolation configuration" snippet opens a new session into
+ * the session parameter, replacing the caller's handle for the remainder
+ * of the function; neither session is closed here. The value returned is
+ * the result of the final transaction_pinned_range call.
+ */
+int
+transaction_ops(WT_CONNECTION *conn, WT_SESSION *session)
+{
+ WT_CURSOR *cursor;
+ int ret;
+
+ /*! [transaction commit/rollback] */
+ /*
+ * Cursors may be opened before or after the transaction begins, and in
+ * either case, subsequent operations are included in the transaction.
+ * Opening cursors before the transaction begins allows applications to
+ * cache cursors and use them for multiple operations.
+ */
+ ret =
+ session->open_cursor(session, "table:mytable", NULL, NULL, &cursor);
+ ret = session->begin_transaction(session, NULL);
+
+ cursor->set_key(cursor, "key");
+ cursor->set_value(cursor, "value");
+ switch (ret = cursor->update(cursor)) {
+ case 0: /* Update success */
+ ret = session->commit_transaction(session, NULL);
+ /*
+ * If commit_transaction succeeds, cursors remain positioned; if
+ * commit_transaction fails, the transaction was rolled-back and
+ * all cursors are reset.
+ */
+ break;
+ case WT_ROLLBACK: /* Update conflict */
+ default: /* Other error */
+ ret = session->rollback_transaction(session, NULL);
+ /* The rollback_transaction call resets all cursors. */
+ break;
+ }
+
+ /*
+ * Cursors remain open and may be used for multiple transactions.
+ */
+ /*! [transaction commit/rollback] */
+ ret = cursor->close(cursor);
+
+ /*! [transaction isolation] */
+ /* A single transaction configured for snapshot isolation. */
+ ret =
+ session->open_cursor(session, "table:mytable", NULL, NULL, &cursor);
+ ret = session->begin_transaction(session, "isolation=snapshot");
+ cursor->set_key(cursor, "some-key");
+ cursor->set_value(cursor, "some-value");
+ ret = cursor->update(cursor);
+ ret = session->commit_transaction(session, NULL);
+ /*! [transaction isolation] */
+
+ /*! [session isolation configuration] */
+ /* Open a session configured for read-uncommitted isolation. */
+ ret = conn->open_session(
+ conn, NULL, "isolation=read_uncommitted", &session);
+ /*! [session isolation configuration] */
+
+ /*! [session isolation re-configuration] */
+ /* Re-configure a session for snapshot isolation. */
+ ret = session->reconfigure(session, "isolation=snapshot");
+ /*! [session isolation re-configuration] */
+
+ {
+ /*! [transaction pinned range] */
+ /* Check the transaction ID range pinned by the session handle. */
+ uint64_t range;
+
+ ret = session->transaction_pinned_range(session, &range);
+ /*! [transaction pinned range] */
+ }
+
+ return (ret);
+}
+
+/*! [Implement WT_COLLATOR] */
+/*
+ * A simple example of the collator API: compare the keys as strings.
+ *
+ * On return *cmp is -1 if value1 sorts before value2, 0 if the two are
+ * equal and 1 if value1 sorts after value2; the function itself always
+ * returns 0.
+ *
+ * NOTE(review): the loop stops at a NUL byte rather than at the item
+ * size, so this assumes both items hold NUL-terminated strings -- confirm
+ * against the key format of any table that uses this collator.
+ */
+static int
+my_compare(WT_COLLATOR *collator, WT_SESSION *session,
+    const WT_ITEM *value1, const WT_ITEM *value2, int *cmp)
+{
+    const char *p1, *p2;
+
+    /* Unused parameters */
+    (void)collator;
+    (void)session;
+
+    /* Skip the common prefix of the two strings. */
+    p1 = (const char *)value1->data;
+    p2 = (const char *)value2->data;
+    while (*p1 != '\0' && *p1 == *p2)
+        p1++, p2++;
+
+    /*
+     * Order by the first differing byte, first argument relative to the
+     * second. (Subtracting the first byte from the second, as an earlier
+     * version did, reports the reverse of string order.)
+     */
+    *cmp = (*p1 > *p2) - (*p1 < *p2);
+    return (0);
+}
+/*! [Implement WT_COLLATOR] */
+
+/*
+ * add_collator --
+ * Documentation snippet: register my_compare with the connection under
+ * the name "my_collator". The WT_COLLATOR structure is static, so it
+ * remains valid after this function returns. Returns the result of
+ * WT_CONNECTION::add_collator.
+ */
+int
+add_collator(WT_CONNECTION *conn)
+{
+ int ret;
+
+ /*! [WT_COLLATOR register] */
+ static WT_COLLATOR my_collator = { my_compare, NULL, NULL };
+ ret = conn->add_collator(conn, "my_collator", &my_collator, NULL);
+ /*! [WT_COLLATOR register] */
+
+ return (ret);
+}
+
+/*! [WT_EXTRACTOR] */
+/*
+ * A simple example of the extractor API: use the whole value as the
+ * result. The result item is pointed at the value's existing buffer (no
+ * bytes are copied) and the key is ignored. Always returns 0.
+ */
+static int
+my_extract(WT_EXTRACTOR *extractor, WT_SESSION *session,
+ const WT_ITEM *key, const WT_ITEM *value, WT_ITEM *result)
+{
+ /* Unused parameters */
+ (void)extractor;
+ (void)session;
+ (void)key;
+
+ result->data = value->data;
+ result->size = value->size;
+ return (0);
+}
+/*! [WT_EXTRACTOR] */
+
+/*
+ * add_extractor --
+ * Documentation snippet: register my_extract with the connection under
+ * the name "my_extractor". The WT_EXTRACTOR structure is static, so it
+ * remains valid after this function returns. Returns the result of
+ * WT_CONNECTION::add_extractor.
+ */
+int
+add_extractor(WT_CONNECTION *conn)
+{
+ int ret;
+
+ /*! [WT_EXTRACTOR register] */
+ static WT_EXTRACTOR my_extractor = {my_extract};
+
+ ret = conn->add_extractor(conn, "my_extractor", &my_extractor, NULL);
+ /*! [WT_EXTRACTOR register] */
+
+ return (ret);
+}
+
+/*
+ * connection_ops --
+ * Documentation snippets for the WT_CONNECTION methods: load_extension
+ * (compiled only when MIGHT_NOT_RUN is defined), add_collator and
+ * add_extractor, reconfigure, get_home, is_new, open_session,
+ * configure_method and close.
+ *
+ * The session opened here is handed to session_ops, which closes it. The
+ * function finishes by closing the connection it was given, so the caller
+ * must not use the handle afterwards; the value returned is the result of
+ * that close.
+ */
+int
+connection_ops(WT_CONNECTION *conn)
+{
+ int ret;
+
+#ifdef MIGHT_NOT_RUN
+ /*! [Load an extension] */
+ ret = conn->load_extension(conn, "my_extension.dll", NULL);
+
+ ret = conn->load_extension(conn,
+ "datasource/libdatasource.so",
+ "config=[device=/dev/sd1,alignment=64]");
+ /*! [Load an extension] */
+#endif
+
+ ret = add_collator(conn);
+ ret = add_extractor(conn);
+
+ /*! [Reconfigure a connection] */
+ ret = conn->reconfigure(conn, "eviction_target=75");
+ /*! [Reconfigure a connection] */
+
+ /*! [Get the database home directory] */
+ printf("The database home is %s\n", conn->get_home(conn));
+ /*! [Get the database home directory] */
+
+ /*! [Check if the database is newly created] */
+ if (conn->is_new(conn)) {
+ /* First time initialization. */
+ }
+ /*! [Check if the database is newly created] */
+
+ {
+ /*! [Open a session] */
+ WT_SESSION *session;
+ ret = conn->open_session(conn, NULL, NULL, &session);
+ /*! [Open a session] */
+
+ ret = session_ops(session);
+ }
+
+ /*! [Configure method configuration] */
+ /*
+ * Applications opening a cursor for the data-source object "my_data"
+ * have an additional configuration option "entries", which is an
+ * integer type, defaults to 5, and must be an integer between 1 and 10.
+ */
+ ret = conn->configure_method(conn,
+ "session.open_cursor",
+ "my_data:", "entries=5", "int", "min=1,max=10");
+
+ /*
+ * Applications opening a cursor for the data-source object "my_data"
+ * have an additional configuration option "devices", which is a list
+ * of strings.
+ */
+ ret = conn->configure_method(conn,
+ "session.open_cursor", "my_data:", "devices", "list", NULL);
+ /*! [Configure method configuration] */
+
+ /*! [Close a connection] */
+ ret = conn->close(conn, NULL);
+ /*! [Close a connection] */
+
+ return (ret);
+}
+
+/*
+ * pack_ops --
+ * Documentation snippets for the packing API, all using the format "iSh":
+ * compute the packed size of (42, "hello", -3), pack those values into a
+ * 100-byte buffer, then unpack them again.
+ *
+ * Returns the result of the final wiredtiger_struct_unpack call; the
+ * results of the earlier calls are overwritten.
+ */
+int
+pack_ops(WT_SESSION *session)
+{
+ int ret;
+
+ {
+ /*! [Get the packed size] */
+ size_t size;
+ ret = wiredtiger_struct_size(session, &size, "iSh", 42, "hello", -3);
+ /*! [Get the packed size] */
+ /* The pack snippet below uses a 100-byte buffer. */
+ assert(size < 100);
+ }
+
+ {
+ /*! [Pack fields into a buffer] */
+ char buf[100];
+ ret = wiredtiger_struct_pack(
+ session, buf, sizeof(buf), "iSh", 42, "hello", -3);
+ /*! [Pack fields into a buffer] */
+
+ {
+ /*! [Unpack fields from a buffer] */
+ int i;
+ char *s;
+ short h;
+ ret = wiredtiger_struct_unpack(
+ session, buf, sizeof(buf), "iSh", &i, &s, &h);
+ /*! [Unpack fields from a buffer] */
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * backup --
+ * Documentation snippets for backups: create a backup directory, walk a
+ * "backup:" cursor and copy each file it names with cp(1), then take a
+ * named checkpoint that replaces the previous one of the same name.
+ *
+ * Returns the result of the final checkpoint call; the results of the
+ * earlier calls are overwritten.
+ */
+int
+backup(WT_SESSION *session)
+{
+    char buf[1024];
+
+    /*! [backup]*/
+    WT_CURSOR *cursor;
+    const char *filename;
+    int ret;
+
+    /*
+     * Create the backup directory, accessible only to its owner. (A mode
+     * of 077 would deny the owner all access, and the copies below would
+     * then fail for any non-privileged user.)
+     */
+    ret = mkdir("/path/database.backup", 0700);
+
+    /* Open the backup data source. */
+    ret = session->open_cursor(session, "backup:", NULL, NULL, &cursor);
+
+    /* Copy the list of files. */
+    while (
+        (ret = cursor->next(cursor)) == 0 &&
+        (ret = cursor->get_key(cursor, &filename)) == 0) {
+        (void)snprintf(buf, sizeof(buf),
+            "cp /path/database/%s /path/database.backup/%s",
+            filename, filename);
+        ret = system(buf);
+    }
+    /* Running off the end of the list is the normal way to finish. */
+    if (ret == WT_NOTFOUND)
+        ret = 0;
+    if (ret != 0)
+        fprintf(stderr, "%s: cursor next(backup:) failed: %s\n",
+            progname, wiredtiger_strerror(ret));
+
+    ret = cursor->close(cursor);
+    /*! [backup]*/
+
+    /*! [backup of a checkpoint]*/
+    ret = session->checkpoint(session, "drop=(from=June01),name=June01");
+    /*! [backup of a checkpoint]*/
+
+    return (ret);
+}
+
+/*
+ * main --
+ * Entry point for the "all examples" program: pick the database home,
+ * run connection_ops against a freshly opened connection, then reopen and
+ * close the database once per wiredtiger_open configuration snippet and
+ * finally print the library version.
+ *
+ * The home variable is assigned here but not declared in this function.
+ * Sections guarded by MIGHT_NOT_RUN are compiled only when that macro is
+ * defined. The exit status is the result of the last wiredtiger_open (or,
+ * with MIGHT_NOT_RUN, the last connection close) that was executed.
+ */
+int
+main(void)
+{
+ WT_CONNECTION *conn;
+ int ret;
+
+ /*
+ * Create a clean test directory for this run of the test program if the
+ * environment variable isn't already set (as is done by make check).
+ */
+ if (getenv("WIREDTIGER_HOME") == NULL) {
+ home = "WT_HOME";
+ ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+ } else
+ home = NULL;
+
+ /*! [Open a connection] */
+ ret = wiredtiger_open(home, NULL, "create,cache_size=500M", &conn);
+ /*! [Open a connection] */
+
+ if (ret == 0)
+ ret = connection_ops(conn);
+ /*
+ * The connection has been closed.
+ */
+
+#ifdef MIGHT_NOT_RUN
+ /*
+ * This example code gets run, and the compression libraries might not
+ * be installed, causing the open to fail. The documentation requires
+ * the code snippets, use #ifdef's to avoid running it.
+ */
+ /*! [Configure bzip2 extension] */
+ ret = wiredtiger_open(home, NULL,
+ "create,"
+ "extensions=[/usr/local/lib/libwiredtiger_bzip2.so]", &conn);
+ /*! [Configure bzip2 extension] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Configure snappy extension] */
+ ret = wiredtiger_open(home, NULL,
+ "create,"
+ "extensions=[/usr/local/lib/libwiredtiger_snappy.so]", &conn);
+ /*! [Configure snappy extension] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Configure zlib extension] */
+ ret = wiredtiger_open(home, NULL,
+ "create,"
+ "extensions=[/usr/local/lib/libwiredtiger_zlib.so]", &conn);
+ /*! [Configure zlib extension] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*
+ * This example code gets run, and direct I/O might not be available,
+ * causing the open to fail. The documentation requires code snippets,
+ * use #ifdef's to avoid running it.
+ */
+ /* Might Not Run: direct I/O may not be available. */
+ /*! [Configure direct_io for data files] */
+ ret = wiredtiger_open(home, NULL, "create,direct_io=[data]", &conn);
+ /*! [Configure direct_io for data files] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+#endif
+
+ /*! [Configure file_extend] */
+ ret = wiredtiger_open(
+ home, NULL, "create,file_extend=(data=16MB)", &conn);
+ /*! [Configure file_extend] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Eviction configuration] */
+ /*
+ * Configure eviction to begin at 90% full, and run until the cache
+ * is only 75% dirty.
+ */
+ ret = wiredtiger_open(home, NULL,
+ "create,eviction_trigger=90,eviction_dirty_target=75", &conn);
+ /*! [Eviction configuration] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Eviction worker configuration] */
+ /* Configure up to four eviction threads */
+ ret = wiredtiger_open(home, NULL,
+ "create,eviction_trigger=90,eviction=(threads_max=4)", &conn);
+ /*! [Eviction worker configuration] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Statistics configuration] */
+ ret = wiredtiger_open(home, NULL, "create,statistics=(all)", &conn);
+ /*! [Statistics configuration] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Statistics logging] */
+ ret = wiredtiger_open(
+ home, NULL, "create,statistics_log=(wait=30)", &conn);
+ /*! [Statistics logging] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Statistics logging with a table] */
+ ret = wiredtiger_open(home, NULL,
+ "create, statistics_log=("
+ "sources=(\"lsm:table1\",\"lsm:table2\"), wait=5)",
+ &conn);
+ /*! [Statistics logging with a table] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Statistics logging with all tables] */
+ ret = wiredtiger_open(home, NULL,
+ "create, statistics_log=(sources=(\"lsm:\"), wait=5)",
+ &conn);
+ /*! [Statistics logging with all tables] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+#ifdef MIGHT_NOT_RUN
+ /*
+ * This example code gets run, and a non-existent log file path might
+ * cause the open to fail. The documentation requires code snippets,
+ * use #ifdef's to avoid running it.
+ */
+ /*! [Statistics logging with path] */
+ ret = wiredtiger_open(home, NULL,
+ "create,"
+ "statistics_log=(wait=120,path=/log/log.%m.%d.%y)", &conn);
+ /*! [Statistics logging with path] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*
+ * Don't run this code, because memory checkers get very upset when we
+ * leak memory.
+ */
+ (void)wiredtiger_open(home, NULL, "create", &conn);
+ /*! [Connection close leaking memory] */
+ ret = conn->close(conn, "leak_memory=true");
+ /*! [Connection close leaking memory] */
+#endif
+
+ /*! [Get the WiredTiger library version #1] */
+ printf("WiredTiger version %s\n", wiredtiger_version(NULL, NULL, NULL));
+ /*! [Get the WiredTiger library version #1] */
+
+ {
+ /*! [Get the WiredTiger library version #2] */
+ int major_v, minor_v, patch;
+ (void)wiredtiger_version(&major_v, &minor_v, &patch);
+ printf("WiredTiger version is %d, %d (patch %d)\n",
+ major_v, minor_v, patch);
+ /*! [Get the WiredTiger library version #2] */
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_async.c b/src/third_party/wiredtiger/examples/c/ex_async.c
new file mode 100644
index 00000000000..0c8f83e4aac
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_async.c
@@ -0,0 +1,223 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_async.c
+ * demonstrates how to use the asynchronous API.
+ */
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+#ifndef _WIN32
+#include <unistd.h>
+#else
+#include "windows_shim.h"
+#endif
+
+#include <wiredtiger.h>
+
+/*
+ * ATOMIC_ADD --
+ * Add val to v and evaluate to the updated value. Lint builds use a plain
+ * addition, Windows builds use _InterlockedExchangeAdd and other builds
+ * use the __sync_add_and_fetch builtin. The Windows form adds val to the
+ * intrinsic's result, presumably because the intrinsic yields the value
+ * from before the addition -- confirm against the compiler documentation.
+ */
+#if defined(_lint)
+#define ATOMIC_ADD(v, val) ((v) += (val), (v))
+#elif defined(_WIN32)
+#define ATOMIC_ADD(v, val) (_InterlockedExchangeAdd(&(v), val) + val)
+#else
+#define ATOMIC_ADD(v, val) __sync_add_and_fetch(&(v), val)
+#endif
+
+/* Database home passed to wiredtiger_open; always NULL in this example. */
+static const char * const home = NULL;
+/* Last WiredTiger error code reported to the asynchronous callback. */
+static int global_error = 0;
+
+/*! [async example callback implementation] */
+/*
+ * The callback structure handed to the asynchronous API, extended with a
+ * count of the records found by search operations. The WT_ASYNC_CALLBACK
+ * member comes first so the callback can cast the pointer it is given
+ * back to an ASYNC_KEYS.
+ */
+typedef struct {
+ WT_ASYNC_CALLBACK iface;
+ uint32_t num_keys;
+} ASYNC_KEYS;
+
+/*
+ * async_callback --
+ * Called with each asynchronous operation and its result code. If the
+ * operation failed, the error is printed and saved in global_error and
+ * the callback returns 1. For a search operation the key and value are
+ * retrieved and printed and the count of keys found is incremented
+ * atomically. Otherwise the callback returns 0, or for a search the
+ * result of the get_value call (the get_key result is overwritten).
+ */
+static int
+async_callback(WT_ASYNC_CALLBACK *cb,
+ WT_ASYNC_OP *op, int wiredtiger_error, uint32_t flags)
+{
+ ASYNC_KEYS *asynckey = (ASYNC_KEYS *)cb;
+ WT_ASYNC_OPTYPE type;
+ WT_ITEM k, v;
+ const char *key, *value;
+ uint64_t id;
+ int ret;
+
+ (void)flags; /* Unused */
+
+ ret = 0;
+
+ /*! [async get type] */
+ /* Retrieve the operation's WT_ASYNC_OPTYPE type. */
+ type = op->get_type(op);
+ /*! [async get type] */
+
+ /*! [async get identifier] */
+ /* Retrieve the operation's 64-bit identifier. */
+ id = op->get_id(op);
+ /*! [async get identifier] */
+
+ /* Check for a WiredTiger error. */
+ if (wiredtiger_error != 0) {
+ fprintf(stderr,
+ "ID %" PRIu64 " error %d: %s\n",
+ id, wiredtiger_error,
+ wiredtiger_strerror(wiredtiger_error));
+ global_error = wiredtiger_error;
+ return (1);
+ }
+
+ /* If doing a search, retrieve the key/value pair. */
+ if (type == WT_AOP_SEARCH) {
+ /*! [async get the operation's string key] */
+ ret = op->get_key(op, &k);
+ key = k.data;
+ /*! [async get the operation's string key] */
+ /*! [async get the operation's string value] */
+ ret = op->get_value(op, &v);
+ value = v.data;
+ /*! [async get the operation's string value] */
+ ATOMIC_ADD(asynckey->num_keys, 1);
+ printf("Id %" PRIu64 " got record: %s : %s\n", id, key, value);
+ }
+ return (ret);
+}
+/*! [async example callback implementation] */
+
+/* The single callback instance shared by every operation in this example. */
+static ASYNC_KEYS ex_asynckeys = { {async_callback}, 0 };
+
+/* Number of key/value pairs inserted and then searched for. */
+#define MAX_KEYS 15
+
+/*
+ * main --
+ * Entry point for the asynchronous API example: open a connection with
+ * async operations enabled, create a table, insert MAX_KEYS records
+ * asynchronously, flush, start an asynchronous compaction, search for the
+ * inserted keys asynchronously and close the connection.
+ *
+ * Returns the result of the connection close, or the error from
+ * async_new_op if a handle cannot be allocated for a reason other than
+ * EBUSY.
+ */
+int
+main(void)
+{
+    WT_ASYNC_OP *op;
+    WT_CONNECTION *conn;
+    WT_SESSION *session;
+    int i, ret;
+    /*
+     * One key and one value buffer per operation, presumably so queued
+     * operations never share storage -- confirm against the async API's
+     * buffer ownership rules.
+     */
+    char k[MAX_KEYS][16], v[MAX_KEYS][16];
+
+    /*! [async example connection] */
+    ret = wiredtiger_open(home, NULL,
+        "create,cache_size=100MB,"
+        "async=(enabled=true,ops_max=20,threads=2)", &conn);
+    /*! [async example connection] */
+
+    /*! [async example table create] */
+    ret = conn->open_session(conn, NULL, NULL, &session);
+    ret = session->create(
+        session, "table:async", "key_format=S,value_format=S");
+    /*! [async example table create] */
+
+    /* Insert a set of keys asynchronously. */
+    for (i = 0; i < MAX_KEYS; i++) {
+        /*! [async handle allocation] */
+        while ((ret = conn->async_new_op(conn,
+            "table:async", NULL, &ex_asynckeys.iface, &op)) != 0) {
+            /*
+             * If we used up all the handles, pause and retry to
+             * give the workers a chance to catch up.
+             */
+            fprintf(stderr,
+                "asynchronous operation handle not available\n");
+            if (ret == EBUSY)
+                sleep(1);
+            else
+                return (ret);
+        }
+        /*! [async handle allocation] */
+
+        /*! [async insert] */
+        /*
+         * Set the operation's string key and value, and then do
+         * an asynchronous insert.
+         */
+        /*! [async set the operation's string key] */
+        /* Bound the write by this row's size, not the whole array's. */
+        snprintf(k[i], sizeof(k[i]), "key%d", i);
+        op->set_key(op, k[i]);
+        /*! [async set the operation's string key] */
+
+        /*! [async set the operation's string value] */
+        snprintf(v[i], sizeof(v[i]), "value%d", i);
+        op->set_value(op, v[i]);
+        /*! [async set the operation's string value] */
+
+        ret = op->insert(op);
+        /*! [async insert] */
+    }
+
+    /*! [async flush] */
+    /* Wait for all outstanding operations to complete. */
+    ret = conn->async_flush(conn);
+    /*! [async flush] */
+
+    /*! [async compaction] */
+    /*
+     * Compact a table asynchronously, limiting the run-time to 5 minutes.
+     */
+    ret = conn->async_new_op(
+        conn, "table:async", "timeout=300", &ex_asynckeys.iface, &op);
+    ret = op->compact(op);
+    /*! [async compaction] */
+
+    /* Search for the keys we just inserted, asynchronously. */
+    for (i = 0; i < MAX_KEYS; i++) {
+        while ((ret = conn->async_new_op(conn,
+            "table:async", NULL, &ex_asynckeys.iface, &op)) != 0) {
+            /*
+             * If we used up all the handles, pause and retry to
+             * give the workers a chance to catch up.
+             */
+            fprintf(stderr,
+                "asynchronous operation handle not available\n");
+            if (ret == EBUSY)
+                sleep(1);
+            else
+                return (ret);
+        }
+
+        /*! [async search] */
+        /*
+         * Set the operation's string key, and then do an
+         * asynchronous search.
+         */
+        snprintf(k[i], sizeof(k[i]), "key%d", i);
+        op->set_key(op, k[i]);
+        ret = op->search(op);
+        /*! [async search] */
+    }
+
+    /*
+     * Connection close automatically does an async_flush so it will wait
+     * for all queued search operations to complete.
+     */
+    ret = conn->close(conn, NULL);
+
+    /* num_keys is a uint32_t, so print it with the matching macro. */
+    printf("Searched for %" PRIu32 " keys\n", ex_asynckeys.num_keys);
+
+    return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_call_center.c b/src/third_party/wiredtiger/examples/c/ex_call_center.c
new file mode 100644
index 00000000000..14ab8f37f56
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_call_center.c
@@ -0,0 +1,248 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_call_center.c
+ * This is an example application that demonstrates how to map a
+ * moderately complex SQL application into WiredTiger.
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+static const char *home;
+
+/*! [call-center decl] */
+/*
+ * In SQL, the tables are described as follows:
+ *
+ * CREATE TABLE Customers(id INTEGER PRIMARY KEY,
+ * name VARCHAR(30), address VARCHAR(50), phone VARCHAR(15))
+ * CREATE INDEX CustomersPhone ON Customers(phone)
+ *
+ * CREATE TABLE Calls(id INTEGER PRIMARY KEY, call_date DATE,
+ * cust_id INTEGER, emp_id INTEGER, call_type VARCHAR(12),
+ * notes VARCHAR(25))
+ * CREATE INDEX CallsCustDate ON Calls(cust_id, call_date)
+ *
+ * In this example, both tables will use record numbers for their IDs, which
+ * will be the key. The C structs for the records are as follows.
+ */
+
+/*
+ * Customer records: the C form of one row of the Customers table described
+ * in the SQL above.
+ */
+typedef struct {
+ uint64_t id; /* Primary key; a record number in this example */
+ const char *name; /* Customer name */
+ const char *address; /* Customer address */
+ const char *phone; /* Phone number; the column indexed in the SQL above */
+} CUSTOMER;
+
+/*
+ * Call records: the C form of one row of the Calls table described in the
+ * SQL above.
+ */
+typedef struct {
+ uint64_t id; /* Primary key; a record number in this example */
+ uint64_t call_date; /* Date of the call (DATE in the SQL above) */
+ uint64_t cust_id; /* ID of the customer the call belongs to */
+ uint64_t emp_id; /* Employee ID recorded for the call */
+ const char *call_type; /* Call type string, e.g. "billing" */
+ const char *notes; /* Free-form notes string */
+} CALL;
+/*! [call-center decl] */
+
+/*
+ * main --
+ *	Create and populate the customers and calls tables, then run the two
+ *	example queries: look up a customer by phone number through an index,
+ *	and list that customer's three most recent calls by walking an index
+ *	backwards.
+ *
+ *	Returns 1 if the database cannot be opened, otherwise the return value
+ *	of WT_CONNECTION::close.
+ */
+int
+main(void)
+{
+	int count, exact, ret;
+	WT_CONNECTION *conn;
+	WT_SESSION *session;
+	WT_CURSOR *cursor;
+	CUSTOMER cust, *custp, cust_sample[] = {
+		{ 0, "Professor Oak", "LeafGreen Avenue", "123-456-7890" },
+		{ 0, "Lorelei", "Sevii Islands", "098-765-4321" },
+		{ 0, NULL, NULL, NULL }
+	};
+	CALL call, *callp, call_sample[] = {
+		{ 0, 32, 1, 2, "billing", "unavailable" },
+		{ 0, 33, 1, 2, "billing", "available" },
+		{ 0, 34, 1, 2, "reminder", "unavailable" },
+		{ 0, 35, 1, 2, "reminder", "available" },
+		{ 0, 0, 0, 0, NULL, NULL }
+	};
+
+	/*
+	 * Create a clean test directory for this run of the test program if the
+	 * environment variable isn't already set (as is done by make check).
+	 */
+	if (getenv("WIREDTIGER_HOME") == NULL) {
+		home = "WT_HOME";
+		ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+	} else
+		home = NULL;
+
+	if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) {
+		fprintf(stderr, "Error connecting to %s: %s\n",
+		    home, wiredtiger_strerror(ret));
+		return (1);
+	}
+	/* Note: further error checking omitted for clarity. */
+
+	/*! [call-center work] */
+	ret = conn->open_session(conn, NULL, NULL, &session);
+
+	/*
+	 * Create the customers table, give names and types to the columns.
+	 * The columns will be stored in two groups: "main" and "address",
+	 * created below.
+	 */
+	ret = session->create(session, "table:customers",
+	    "key_format=r,"
+	    "value_format=SSS,"
+	    "columns=(id,name,address,phone),"
+	    "colgroups=(main,address)");
+
+	/* Create the main column group with value columns except address. */
+	ret = session->create(session,
+	    "colgroup:customers:main", "columns=(name,phone)");
+
+	/* Create the address column group with just the address. */
+	ret = session->create(session,
+	    "colgroup:customers:address", "columns=(address)");
+
+	/* Create an index on the customer table by phone number. */
+	ret = session->create(session,
+	    "index:customers:phone", "columns=(phone)");
+
+	/* Populate the customers table with some data. */
+	ret = session->open_cursor(
+	    session, "table:customers", NULL, "append", &cursor);
+	for (custp = cust_sample; custp->name != NULL; custp++) {
+		cursor->set_value(cursor,
+		    custp->name, custp->address, custp->phone);
+		ret = cursor->insert(cursor);
+	}
+	ret = cursor->close(cursor);
+
+	/*
+	 * Create the calls table, give names and types to the columns. All the
+	 * columns will be stored together, so no column groups are declared.
+	 */
+	ret = session->create(session, "table:calls",
+	    "key_format=r,"
+	    "value_format=qrrSS,"
+	    "columns=(id,call_date,cust_id,emp_id,call_type,notes)");
+
+	/*
+	 * Create an index on the calls table with a composite key of cust_id
+	 * and call_date.
+	 */
+	ret = session->create(session, "index:calls:cust_date",
+	    "columns=(cust_id,call_date)");
+
+	/* Populate the calls table with some data. */
+	ret = session->open_cursor(
+	    session, "table:calls", NULL, "append", &cursor);
+	for (callp = call_sample; callp->call_type != NULL; callp++) {
+		cursor->set_value(cursor, callp->call_date, callp->cust_id,
+		    callp->emp_id, callp->call_type, callp->notes);
+		ret = cursor->insert(cursor);
+	}
+	ret = cursor->close(cursor);
+
+	/*
+	 * First query: a call arrives. In SQL:
+	 *
+	 * SELECT id, name FROM Customers WHERE phone=?
+	 *
+	 * Use the customers:phone index, lookup by phone number to fill the
+	 * customer record. The cursor will have a key format of "S" for a
+	 * string because the customers:phone index has a single column
+	 * ("phone"), which is of type "S".
+	 *
+	 * Specify the columns we want: the customer ID and the name. This
+	 * means the cursor's value format will be "rS".
+	 */
+	ret = session->open_cursor(session,
+	    "index:customers:phone(id,name)", NULL, NULL, &cursor);
+	cursor->set_key(cursor, "123-456-7890");
+	ret = cursor->search(cursor);
+	if (ret == 0) {
+		ret = cursor->get_value(cursor, &cust.id, &cust.name);
+		printf("Read customer record for %s (ID %" PRIu64 ")\n",
+		    cust.name, cust.id);
+	}
+	ret = cursor->close(cursor);
+
+	/*
+	 * Next query: get the recent order history. In SQL:
+	 *
+	 * SELECT * FROM Calls WHERE cust_id=? ORDER BY call_date DESC LIMIT 3
+	 *
+	 * Use the calls:cust_date index to find the matching calls. Since it
+	 * is in increasing order by date for a given customer, we want to start
+	 * with the last record for the customer and work backwards.
+	 *
+	 * Specify a subset of columns to be returned. (Note that if these were
+	 * all covered by the index, the primary would not have to be accessed.)
+	 * Stop after getting 3 records.
+	 */
+	ret = session->open_cursor(session,
+	    "index:calls:cust_date(cust_id,call_type,notes)",
+	    NULL, NULL, &cursor);
+
+	/*
+	 * The keys in the index are (cust_id,call_date) -- we want the largest
+	 * call date for a given cust_id. Search for (cust_id+1,0), then work
+	 * backwards.
+	 *
+	 * The key is passed through a variable argument list, so the call_date
+	 * component (format "q", a 64-bit integer) must be given as a 64-bit
+	 * value: a plain int constant would be read back with the wrong width.
+	 */
+	cust.id = 1;
+	cursor->set_key(cursor, cust.id + 1, (int64_t)0);
+	ret = cursor->search_near(cursor, &exact);
+
+	/*
+	 * If the table is empty, search_near will return WT_NOTFOUND, else the
+	 * cursor will be positioned on a matching key if one exists, or an
+	 * adjacent key if one does not. If the positioned key is equal to or
+	 * larger than the search key, go back one.
+	 */
+	if (ret == 0 && exact >= 0)
+		ret = cursor->prev(cursor);
+	for (count = 0; ret == 0 && count < 3; ++count) {
+		/* Don't look at the call record unless it was really read. */
+		if ((ret = cursor->get_value(cursor,
+		    &call.cust_id, &call.call_type, &call.notes)) != 0)
+			break;
+		if (call.cust_id != cust.id)
+			break;
+		printf("Call record: customer %" PRIu64 " (%s: %s)\n",
+		    call.cust_id, call.call_type, call.notes);
+		ret = cursor->prev(cursor);
+	}
+	/*! [call-center work] */
+
+	ret = conn->close(conn, NULL);
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_config.c b/src/third_party/wiredtiger/examples/c/ex_config.c
new file mode 100644
index 00000000000..cb8ab02b393
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_config.c
@@ -0,0 +1,90 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_config.c
+ * This is an example demonstrating how to configure various database and
+ * table properties.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+static const char *home;
+
+/*
+ * main --
+ *	Open a database with a configured cache size, create a table, and walk
+ *	a "config:" cursor inside a named transaction, printing each key/value
+ *	pair.
+ *
+ *	Returns 1 if the database cannot be opened, otherwise the return value
+ *	of WT_CONNECTION::close.
+ */
+int
+main(void)
+{
+	int ret;
+	WT_CONNECTION *conn;
+	WT_SESSION *session;
+	WT_CURSOR *cursor;
+	const char *key, *value;
+
+	/*
+	 * Create a clean test directory for this run of the test program if the
+	 * environment variable isn't already set (as is done by make check).
+	 */
+	if (getenv("WIREDTIGER_HOME") == NULL) {
+		home = "WT_HOME";
+		ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+	} else
+		home = NULL;
+
+	/*! [configure cache size] */
+	if ((ret = wiredtiger_open(home, NULL,
+	    "create,cache_size=500M", &conn)) != 0) {
+		fprintf(stderr, "Error connecting to %s: %s\n",
+		    home, wiredtiger_strerror(ret));
+		/* conn was never set: nothing below can safely run. */
+		return (1);
+	}
+	/*! [configure cache size] */
+
+	/*! [create a table] */
+	ret = conn->open_session(conn, NULL, NULL, &session);
+
+	ret = session->create(session,
+	    "table:access", "key_format=S,value_format=S");
+	/*! [create a table] */
+
+	/*! [transaction] */
+	ret = session->begin_transaction(session, "priority=100,name=mytxn");
+
+	ret = session->open_cursor(session, "config:", NULL, NULL, &cursor);
+
+	while ((ret = cursor->next(cursor)) == 0) {
+		ret = cursor->get_key(cursor, &key);
+		ret = cursor->get_value(cursor, &value);
+		printf("configuration value: %s = %s\n", key, value);
+	}
+
+	ret = session->commit_transaction(session, NULL);
+	/*! [transaction] */
+
+	ret = conn->close(conn, NULL);
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_config_parse.c b/src/third_party/wiredtiger/examples/c/ex_config_parse.c
new file mode 100644
index 00000000000..543c53f508c
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_config_parse.c
@@ -0,0 +1,165 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_config_parse.c
+ * This is an example demonstrating how to parse WiredTiger compatible
+ * configuration strings.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+/*
+ * main --
+ *	Exercise the configuration parser on a fixed configuration string:
+ *	open/close, get a single value, iterate all values, get a nested value
+ *	with dot notation, and traverse a nested structure with a sub-parser.
+ *
+ *	Returns the first error encountered, otherwise the return value of the
+ *	final WT_CONFIG_PARSER::close.
+ */
+int
+main(void)
+{
+	int ret;
+
+	/*! [Create a configuration parser] */
+	WT_CONFIG_ITEM k, v;
+	WT_CONFIG_PARSER *parser;
+	const char *config_string =
+	    "path=/dev/loop,page_size=1024,log=(archive=true,file_max=20MB)";
+
+	if ((ret = wiredtiger_config_parser_open(
+	    NULL, config_string, strlen(config_string), &parser)) != 0) {
+		fprintf(stderr, "Error creating configuration parser: %s\n",
+		    wiredtiger_strerror(ret));
+		return (ret);
+	}
+	if ((ret = parser->close(parser)) != 0) {
+		fprintf(stderr, "Error closing configuration parser: %s\n",
+		    wiredtiger_strerror(ret));
+		return (ret);
+	}
+	/*! [Create a configuration parser] */
+
+	if ((ret = wiredtiger_config_parser_open(
+	    NULL, config_string, strlen(config_string), &parser)) != 0) {
+		fprintf(stderr, "Error creating configuration parser: %s\n",
+		    wiredtiger_strerror(ret));
+		return (ret);
+	}
+
+	{
+	/*! [get] */
+	int64_t my_page_size;
+	/*
+	 * Retrieve the value of the integer configuration string "page_size".
+	 */
+	if ((ret = parser->get(parser, "page_size", &v)) != 0) {
+		fprintf(stderr,
+		    "page_size configuration: %s\n", wiredtiger_strerror(ret));
+		/* Release the parser, but report the lookup failure. */
+		(void)parser->close(parser);
+		return (ret);
+	}
+	my_page_size = v.val;
+	/*! [get] */
+
+	ret = parser->close(parser);
+
+	(void)my_page_size;
+	}
+
+	{
+	if ((ret = wiredtiger_config_parser_open(
+	    NULL, config_string, strlen(config_string), &parser)) != 0) {
+		fprintf(stderr, "Error creating configuration parser: %s\n",
+		    wiredtiger_strerror(ret));
+		return (ret);
+	}
+	/*! [next] */
+	/*
+	 * Retrieve and print the values of the configuration strings.
+	 */
+	while ((ret = parser->next(parser, &k, &v)) == 0) {
+		printf("%.*s:", (int)k.len, k.str);
+		if (v.type == WT_CONFIG_ITEM_NUM)
+			printf("%d\n", (int)v.val);
+		else
+			printf("%.*s\n", (int)v.len, v.str);
+	}
+	/*! [next] */
+	ret = parser->close(parser);
+	}
+
+	if ((ret = wiredtiger_config_parser_open(
+	    NULL, config_string, strlen(config_string), &parser)) != 0) {
+		fprintf(stderr, "Error creating configuration parser: %s\n",
+		    wiredtiger_strerror(ret));
+		return (ret);
+	}
+
+	/*! [nested get] */
+	/*
+	 * Retrieve the value of the nested log file_max configuration string
+	 * using dot shorthand. Utilize the configuration parsing automatic
+	 * conversion of value strings into an integer.
+	 */
+	v.type = WT_CONFIG_ITEM_NUM;
+	if ((ret = parser->get(parser, "log.file_max", &v)) != 0) {
+		fprintf(stderr,
+		    "log.file_max configuration: %s\n",
+		    wiredtiger_strerror(ret));
+		/* Release the parser, but report the lookup failure. */
+		(void)parser->close(parser);
+		return (ret);
+	}
+	printf("log file max: %d\n", (int)v.val);
+	/*! [nested get] */
+	ret = parser->close(parser);
+
+	if ((ret = wiredtiger_config_parser_open(
+	    NULL, config_string, strlen(config_string), &parser)) != 0) {
+		fprintf(stderr, "Error creating configuration parser: %s\n",
+		    wiredtiger_strerror(ret));
+		return (ret);
+	}
+	/*! [nested traverse] */
+	{
+	WT_CONFIG_PARSER *sub_parser;
+	while ((ret = parser->next(parser, &k, &v)) == 0) {
+		if (v.type == WT_CONFIG_ITEM_STRUCT) {
+			printf("Found nested configuration: %.*s\n",
+			    (int)k.len, k.str);
+			if ((ret = wiredtiger_config_parser_open(
+			    NULL, v.str, v.len, &sub_parser)) != 0) {
+				fprintf(stderr,
+				    "Error creating nested configuration "
+				    "parser: %s\n",
+				    wiredtiger_strerror(ret));
+				/*
+				 * Close the outer parser without losing the
+				 * error that is being returned.
+				 */
+				(void)parser->close(parser);
+				return (ret);
+			}
+			while ((ret = sub_parser->next(
+			    sub_parser, &k, &v)) == 0)
+				printf("\t%.*s\n", (int)k.len, k.str);
+			ret = sub_parser->close(sub_parser);
+		}
+	}
+	/*! [nested traverse] */
+	ret = parser->close(parser);
+	}
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_cursor.c b/src/third_party/wiredtiger/examples/c/ex_cursor.c
new file mode 100644
index 00000000000..e8f9b7fa9e8
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_cursor.c
@@ -0,0 +1,227 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_cursor.c
+ * This is an example demonstrating some cursor types and operations.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+int cursor_reset(WT_CURSOR *cursor);
+int cursor_forward_scan(WT_CURSOR *cursor);
+int cursor_reverse_scan(WT_CURSOR *cursor);
+int cursor_search(WT_CURSOR *cursor);
+int cursor_search_near(WT_CURSOR *cursor);
+int cursor_insert(WT_CURSOR *cursor);
+int cursor_update(WT_CURSOR *cursor);
+int cursor_remove(WT_CURSOR *cursor);
+
+static const char *home;
+
+/*! [cursor next] */
+int
+cursor_forward_scan(WT_CURSOR *cursor)
+{
+	const char *k, *v;
+	int rc;
+
+	/*
+	 * Step forward one record at a time, reading its key and value. The
+	 * first non-zero return from WT_CURSOR::next ends the scan and is
+	 * what the caller gets back.
+	 */
+	for (;;) {
+		rc = cursor->next(cursor);
+		if (rc != 0)
+			break;
+		rc = cursor->get_key(cursor, &k);
+		rc = cursor->get_value(cursor, &v);
+	}
+	return (rc);
+}
+/*! [cursor next] */
+
+/*! [cursor prev] */
+int
+cursor_reverse_scan(WT_CURSOR *cursor)
+{
+	const char *k, *v;
+	int rc;
+
+	/*
+	 * Step backward one record at a time, reading its key and value. The
+	 * first non-zero return from WT_CURSOR::prev ends the scan and is
+	 * what the caller gets back.
+	 */
+	for (;;) {
+		rc = cursor->prev(cursor);
+		if (rc != 0)
+			break;
+		rc = cursor->get_key(cursor, &k);
+		rc = cursor->get_value(cursor, &v);
+	}
+	return (rc);
+}
+/*! [cursor prev] */
+
+/*! [cursor reset] */
+int
+cursor_reset(WT_CURSOR *cursor)
+{
+	int rc;
+
+	/* Hand the cursor to WT_CURSOR::reset and pass its result back. */
+	rc = cursor->reset(cursor);
+	return (rc);
+}
+/*! [cursor reset] */
+
+/*! [cursor search] */
+/*
+ * cursor_search --
+ *	Look up the key "foo" and, if it is found, read the matching value.
+ *	Returns 0 on success, otherwise the error from the search or from
+ *	reading the value.
+ */
+int
+cursor_search(WT_CURSOR *cursor)
+{
+	const char *value;
+	int ret;
+
+	cursor->set_key(cursor, "foo");
+
+	/* Only read the value when the search itself succeeded. */
+	if ((ret = cursor->search(cursor)) == 0)
+		ret = cursor->get_value(cursor, &value);
+
+	return (ret);
+}
+/*! [cursor search] */
+
+/*! [cursor search near] */
+int
+cursor_search_near(WT_CURSOR *cursor)
+{
+	const char *found_key, *found_value;
+	int cmp, rc;
+
+	cursor->set_key(cursor, "foo");
+
+	/* Nothing to read unless the search positioned the cursor. */
+	rc = cursor->search_near(cursor, &cmp);
+	if (rc != 0)
+		return (rc);
+
+	/*
+	 * -1: returned key smaller than search key; 1: returned key larger
+	 * than search key. In both cases fetch the key the cursor landed on.
+	 * On 0 (exact match found) there is nothing to fetch.
+	 */
+	if (cmp == -1 || cmp == 1)
+		rc = cursor->get_key(cursor, &found_key);
+
+	rc = cursor->get_value(cursor, &found_value);
+	return (rc);
+}
+/*! [cursor search near] */
+
+/*! [cursor insert] */
+int
+cursor_insert(WT_CURSOR *cursor)
+{
+	int rc;
+
+	/* Stage the key/value pair, then insert it. */
+	cursor->set_key(cursor, "foo");
+	cursor->set_value(cursor, "bar");
+	rc = cursor->insert(cursor);
+
+	return (rc);
+}
+/*! [cursor insert] */
+
+/*! [cursor update] */
+int
+cursor_update(WT_CURSOR *cursor)
+{
+	int rc;
+
+	/* Stage the key and its replacement value, then update. */
+	cursor->set_key(cursor, "foo");
+	cursor->set_value(cursor, "newbar");
+	rc = cursor->update(cursor);
+
+	return (rc);
+}
+/*! [cursor update] */
+
+/*! [cursor remove] */
+int
+cursor_remove(WT_CURSOR *cursor)
+{
+	int rc;
+
+	/* Name the record by key, then remove it. */
+	cursor->set_key(cursor, "foo");
+	rc = cursor->remove(cursor);
+
+	return (rc);
+}
+/*! [cursor remove] */
+
+/*
+ * main --
+ *	Open a database, demonstrate three ways of opening a cursor, then run
+ *	the basic cursor operations above against a simple string table.
+ *
+ *	Returns 1 if the connection or session cannot be opened, otherwise
+ *	the return value of WT_CONNECTION::close.
+ */
+int
+main(void)
+{
+	WT_CONNECTION *conn;
+	WT_CURSOR *cursor;
+	WT_SESSION *session;
+	int ret;
+
+	/*
+	 * Create a clean test directory for this run of the test program if the
+	 * environment variable isn't already set (as is done by make check).
+	 */
+	if (getenv("WIREDTIGER_HOME") == NULL) {
+		home = "WT_HOME";
+		ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+	} else
+		home = NULL;
+
+	/* Open a connection to the database, creating it if necessary. */
+	if ((ret = wiredtiger_open(
+	    home, NULL, "create,statistics=(fast)", &conn)) != 0) {
+		fprintf(stderr, "Error connecting to %s: %s\n",
+		    home, wiredtiger_strerror(ret));
+		/* conn was never set: nothing below can safely run. */
+		return (1);
+	}
+
+	/* Open a session for the current thread's work. */
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+		fprintf(stderr, "Error opening a session on %s: %s\n",
+		    home, wiredtiger_strerror(ret));
+		/* session was never set: release the connection and stop. */
+		(void)conn->close(conn, NULL);
+		return (1);
+	}
+
+	ret = session->create(session, "table:world",
+	    "key_format=r,value_format=5sii,"
+	    "columns=(id,country,population,area)");
+
+	/*! [open cursor #1] */
+	ret = session->open_cursor(session, "table:world", NULL, NULL, &cursor);
+	/*! [open cursor #1] */
+	/* Close each example cursor before the handle is reused. */
+	ret = cursor->close(cursor);
+
+	/*! [open cursor #2] */
+	ret = session->open_cursor(session,
+	    "table:world(country,population)", NULL, NULL, &cursor);
+	/*! [open cursor #2] */
+	ret = cursor->close(cursor);
+
+	/*! [open cursor #3] */
+	ret = session->open_cursor(session, "statistics:", NULL, NULL, &cursor);
+	/*! [open cursor #3] */
+	ret = cursor->close(cursor);
+
+	/* Create a simple string table to illustrate basic operations. */
+	ret = session->create(session, "table:map",
+	    "key_format=S,value_format=S");
+	ret = session->open_cursor(session, "table:map", NULL, NULL, &cursor);
+	ret = cursor_insert(cursor);
+	ret = cursor_reset(cursor);
+	ret = cursor_forward_scan(cursor);
+	ret = cursor_reset(cursor);
+	ret = cursor_reverse_scan(cursor);
+	ret = cursor_search_near(cursor);
+	ret = cursor_update(cursor);
+	ret = cursor_remove(cursor);
+	ret = cursor->close(cursor);
+
+	/* Note: closing the connection implicitly closes open session(s). */
+	if ((ret = conn->close(conn, NULL)) != 0)
+		fprintf(stderr, "Error closing %s: %s\n",
+		    home, wiredtiger_strerror(ret));
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_data_source.c b/src/third_party/wiredtiger/examples/c/ex_data_source.c
new file mode 100644
index 00000000000..b6fc143a586
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_data_source.c
@@ -0,0 +1,661 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_data_source.c
+ * demonstrates how to create and access a data source
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wiredtiger.h>
+
+/*! [WT_EXTENSION_API declaration] */
+#include <wiredtiger_ext.h>
+
+static WT_EXTENSION_API *wt_api;
+
+static void
+my_data_source_init(WT_CONNECTION *connection)
+{
+	WT_EXTENSION_API *api;
+
+	/* Fetch the connection's extension API and keep it for later calls. */
+	api = connection->get_extension_api(connection);
+	wt_api = api;
+}
+/*! [WT_EXTENSION_API declaration] */
+
+/*! [WT_DATA_SOURCE create] */
+static int
+my_create(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *uri, WT_CONFIG_ARG *config)
+/*! [WT_DATA_SOURCE create] */
+{
+	/*
+	 * Example create method: it creates nothing, and only demonstrates the
+	 * extension API's message, error-string and scratch-buffer calls.
+	 * Returns 0, or ENOMEM if the scratch buffer cannot be allocated.
+	 */
+
+	/* Unused parameters */
+	(void)config;
+	(void)uri;
+	(void)dsrc;
+
+	{
+	const char *text = "string";
+	/*! [WT_EXTENSION_API err_printf] */
+	(void)wt_api->err_printf(
+	    wt_api, session, "extension error message: %s", text);
+	/*! [WT_EXTENSION_API err_printf] */
+	}
+
+	{
+	const char *text = "string";
+	/*! [WT_EXTENSION_API msg_printf] */
+	(void)wt_api->msg_printf(
+	    wt_api, session, "extension message: %s", text);
+	/*! [WT_EXTENSION_API msg_printf] */
+	}
+
+	{
+	int error = 0;
+	/*! [WT_EXTENSION_API strerror] */
+	(void)wt_api->err_printf(wt_api, session,
+	    "WiredTiger error return: %s", wt_api->strerror(error));
+	/*! [WT_EXTENSION_API strerror] */
+	}
+
+	{
+	/*! [WT_EXTENSION_API scr_alloc] */
+	void *scratch;
+
+	scratch = wt_api->scr_alloc(wt_api, session, 512);
+	if (scratch == NULL) {
+		(void)wt_api->err_printf(wt_api, session,
+		    "buffer allocation: %s", wiredtiger_strerror(ENOMEM));
+		return (ENOMEM);
+	}
+	/*! [WT_EXTENSION_API scr_alloc] */
+
+	/*! [WT_EXTENSION_API scr_free] */
+	wt_api->scr_free(wt_api, session, scratch);
+	/*! [WT_EXTENSION_API scr_free] */
+	}
+
+	return (0);
+}
+
+/*! [WT_DATA_SOURCE compact] */
+static int
+my_compact(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *uri, WT_CONFIG_ARG *config)
+/*! [WT_DATA_SOURCE compact] */
+{
+	int status = 0;		/* This stub always reports success */
+
+	/* None of the arguments are used by this stub. */
+	(void)config;
+	(void)uri;
+	(void)session;
+	(void)dsrc;
+
+	return (status);
+}
+
+/*! [WT_DATA_SOURCE drop] */
+static int
+my_drop(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *uri, WT_CONFIG_ARG *config)
+/*! [WT_DATA_SOURCE drop] */
+{
+	int status = 0;		/* This stub always reports success */
+
+	/* None of the arguments are used by this stub. */
+	(void)config;
+	(void)uri;
+	(void)session;
+	(void)dsrc;
+
+	return (status);
+}
+
+/*
+ * data_source_cursor --
+ *	Stand-in for an underlying data-source call; always returns 0.
+ */
+static int
+data_source_cursor(void)
+{
+	int status = 0;
+
+	return (status);
+}
+
+/*
+ * data_source_error --
+ *	Map a data-source return value to a message string: "one" for 0,
+ *	"two" for anything else.
+ */
+static const char *
+data_source_error(int v)
+{
+	if (v != 0)
+		return ("two");
+	return ("one");
+}
+
+/*
+ * data_source_notify --
+ *	Transaction notification callback installed by my_cursor_insert; it
+ *	ignores all of its arguments and returns 0.
+ */
+static int
+data_source_notify(
+    WT_TXN_NOTIFY *handler, WT_SESSION *session, uint64_t txnid, int committed)
+{
+	int status = 0;
+
+	/* Unused parameters */
+	(void)committed;
+	(void)txnid;
+	(void)session;
+	(void)handler;
+
+	return (status);
+}
+
+/*
+ * my_cursor_next --
+ *	Placeholder next method for the example cursor; always returns 0.
+ */
+static int
+my_cursor_next(WT_CURSOR *wtcursor)
+{
+	(void)wtcursor;				/* Unused parameter */
+
+	return (0);
+}
+/*
+ * my_cursor_prev --
+ *	Placeholder prev method for the example cursor; always returns 0.
+ */
+static int
+my_cursor_prev(WT_CURSOR *wtcursor)
+{
+	(void)wtcursor;				/* Unused parameter */
+
+	return (0);
+}
+/*
+ * my_cursor_reset --
+ *	Placeholder reset method for the example cursor; always returns 0.
+ */
+static int
+my_cursor_reset(WT_CURSOR *wtcursor)
+{
+	(void)wtcursor;				/* Unused parameter */
+
+	return (0);
+}
+/*
+ * my_cursor_search --
+ *	Placeholder search method for the example cursor; always returns 0.
+ */
+static int
+my_cursor_search(WT_CURSOR *wtcursor)
+{
+	(void)wtcursor;				/* Unused parameter */
+
+	return (0);
+}
+/*
+ * my_cursor_search_near --
+ *	Placeholder search_near method for the example cursor; always returns
+ *	0 and leaves *exactp untouched.
+ */
+static int
+my_cursor_search_near(WT_CURSOR *wtcursor, int *exactp)
+{
+	/* Unused parameters */
+	(void)exactp;
+	(void)wtcursor;
+
+	return (0);
+}
+/*
+ * my_cursor_insert --
+ *	Insert method for the example cursor. It inserts nothing: the body
+ *	demonstrates the transaction and collation calls of the extension API.
+ *	Returns 0 on success, otherwise the error from registering the
+ *	transaction notification or from the collation call.
+ */
+static int my_cursor_insert(WT_CURSOR *wtcursor)
+{
+	WT_SESSION *session = NULL;
+	int ret;
+
+	/* Unused parameters */
+	(void)wtcursor;
+
+	{
+	int is_snapshot_isolation, isolation_level;
+	/*! [WT_EXTENSION transaction isolation level] */
+	isolation_level = wt_api->transaction_isolation_level(wt_api, session);
+	if (isolation_level == WT_TXN_ISO_SNAPSHOT)
+		is_snapshot_isolation = 1;
+	else
+		is_snapshot_isolation = 0;
+	/*! [WT_EXTENSION transaction isolation level] */
+	(void)is_snapshot_isolation;
+	}
+
+	{
+	/*! [WT_EXTENSION transaction ID] */
+	uint64_t transaction_id;
+
+	transaction_id = wt_api->transaction_id(wt_api, session);
+	/*! [WT_EXTENSION transaction ID] */
+	(void)transaction_id;
+	}
+
+	{
+	/*! [WT_EXTENSION transaction oldest] */
+	uint64_t transaction_oldest;
+
+	transaction_oldest = wt_api->transaction_oldest(wt_api);
+	/*! [WT_EXTENSION transaction oldest] */
+	(void)transaction_oldest;
+	}
+
+	{
+	/*! [WT_EXTENSION transaction notify] */
+	WT_TXN_NOTIFY handler;
+	handler.notify = data_source_notify;
+	ret = wt_api->transaction_notify(wt_api, session, &handler);
+	/*! [WT_EXTENSION transaction notify] */
+
+	/* Report the failure now; ret is reused by the collation call below. */
+	if (ret != 0)
+		return (ret);
+	}
+
+	{
+	uint64_t transaction_id = 1;
+	int is_visible;
+	/*! [WT_EXTENSION transaction visible] */
+	is_visible =
+	    wt_api->transaction_visible(wt_api, session, transaction_id);
+	/*! [WT_EXTENSION transaction visible] */
+	(void)is_visible;
+	}
+
+	{
+	const char *key1 = NULL, *key2 = NULL;
+	uint32_t key1_len = 0, key2_len = 0;
+	WT_COLLATOR *collator = NULL;
+	/*! [WT_EXTENSION collate] */
+	WT_ITEM first, second;
+	int cmp;
+
+	first.data = key1;
+	first.size = key1_len;
+	second.data = key2;
+	second.size = key2_len;
+
+	ret = wt_api->collate(wt_api, session, collator, &first, &second, &cmp);
+	/* cmp is only meaningful when the comparison itself succeeded. */
+	if (ret != 0)
+		return (ret);
+	if (cmp == 0)
+		printf("key1 collates identically to key2\n");
+	else if (cmp < 0)
+		printf("key1 collates less than key2\n");
+	else
+		printf("key1 collates greater than key2\n");
+	/*! [WT_EXTENSION collate] */
+	}
+
+	return (ret);
+}
+
+/*
+ * my_cursor_update --
+ *	Placeholder update method for the example cursor; always returns 0.
+ */
+static int
+my_cursor_update(WT_CURSOR *wtcursor)
+{
+	(void)wtcursor;				/* Unused parameter */
+
+	return (0);
+}
+/*
+ * my_cursor_remove --
+ *	Placeholder remove method for the example cursor; always returns 0.
+ */
+static int
+my_cursor_remove(WT_CURSOR *wtcursor)
+{
+	(void)wtcursor;				/* Unused parameter */
+
+	return (0);
+}
+/*
+ * my_cursor_close --
+ *	Placeholder close method for the example cursor; always returns 0
+ *	and frees nothing.
+ */
+static int
+my_cursor_close(WT_CURSOR *wtcursor)
+{
+	(void)wtcursor;				/* Unused parameter */
+
+	return (0);
+}
+
+/*! [WT_DATA_SOURCE open_cursor] */
+/*
+ * Cursor type for the example data source. The WT_CURSOR is the first member
+ * so that a MY_CURSOR pointer can be cast to a WT_CURSOR pointer, which is
+ * how my_open_cursor returns it.
+ */
+typedef struct __my_cursor {
+ WT_CURSOR wtcursor; /* WiredTiger cursor, must come first */
+
+ /*
+ * Local cursor information: for example, we might want to have a
+ * reference to the extension functions.
+ */
+ WT_EXTENSION_API *wtext; /* Extension functions */
+} MY_CURSOR;
+
+/*
+ * my_open_cursor --
+ * Example WT_DATA_SOURCE open_cursor callback. Allocates a MY_CURSOR,
+ * installs the cursor method callbacks, hands the cursor back through
+ * new_cursor, and then demonstrates a series of extension API calls
+ * (config_get, collator_config, err_printf and the metadata functions).
+ *
+ * Returns errno if the allocation fails, the error returned by a failing
+ * extension API call, WT_ERROR if data_source_cursor fails, and 0 otherwise.
+ */
+static int
+my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config, WT_CURSOR **new_cursor)
+{
+ MY_CURSOR *cursor;
+
+ /* Allocate and initialize a WiredTiger cursor. */
+ if ((cursor = calloc(1, sizeof(*cursor))) == NULL)
+ return (errno);
+
+ cursor->wtcursor.next = my_cursor_next;
+ cursor->wtcursor.prev = my_cursor_prev;
+ cursor->wtcursor.reset = my_cursor_reset;
+ cursor->wtcursor.search = my_cursor_search;
+ cursor->wtcursor.search_near = my_cursor_search_near;
+ cursor->wtcursor.insert = my_cursor_insert;
+ cursor->wtcursor.update = my_cursor_update;
+ cursor->wtcursor.remove = my_cursor_remove;
+ cursor->wtcursor.close = my_cursor_close;
+
+ /*
+ * Configure local cursor information.
+ */
+
+ /*
+ * Return combined cursor to WiredTiger.
+ *
+ * NOTE(review): the error returns further down do not free this
+ * allocation; confirm whether the caller releases *new_cursor when
+ * open_cursor fails.
+ */
+ *new_cursor = (WT_CURSOR *)cursor;
+
+/*! [WT_DATA_SOURCE open_cursor] */
+ {
+ int ret = 0;
+ (void)dsrc; /* Unused parameters */
+ (void)session;
+ (void)uri;
+ (void)new_cursor;
+
+ {
+ /*! [WT_EXTENSION_CONFIG boolean] */
+ WT_CONFIG_ITEM v;
+ int my_data_source_overwrite;
+
+ /*
+ * Retrieve the value of the boolean type configuration string
+ * "overwrite".
+ */
+ if ((ret = wt_api->config_get(
+ wt_api, session, config, "overwrite", &v)) != 0) {
+ (void)wt_api->err_printf(wt_api, session,
+ "overwrite configuration: %s", wiredtiger_strerror(ret));
+ return (ret);
+ }
+ my_data_source_overwrite = v.val != 0;
+ /*! [WT_EXTENSION_CONFIG boolean] */
+
+ (void)my_data_source_overwrite;
+ }
+
+ {
+ /*! [WT_EXTENSION_CONFIG integer] */
+ WT_CONFIG_ITEM v;
+ int64_t my_data_source_page_size;
+
+ /*
+ * Retrieve the value of the integer type configuration string
+ * "page_size".
+ */
+ if ((ret = wt_api->config_get(
+ wt_api, session, config, "page_size", &v)) != 0) {
+ (void)wt_api->err_printf(wt_api, session,
+ "page_size configuration: %s", wiredtiger_strerror(ret));
+ return (ret);
+ }
+ my_data_source_page_size = v.val;
+ /*! [WT_EXTENSION_CONFIG integer] */
+
+ (void)my_data_source_page_size;
+ }
+
+ {
+ /*! [WT_EXTENSION config_get] */
+ WT_CONFIG_ITEM v;
+ const char *my_data_source_key;
+
+ /*
+ * Retrieve the value of the string type configuration string
+ * "key_format".
+ */
+ if ((ret = wt_api->config_get(
+ wt_api, session, config, "key_format", &v)) != 0) {
+ (void)wt_api->err_printf(wt_api, session,
+ "key_format configuration: %s", wiredtiger_strerror(ret));
+ return (ret);
+ }
+
+ /*
+ * Values returned from WT_EXTENSION_API::config_get in the str field
+ * are not nul-terminated; the associated length must be used instead.
+ */
+ if (v.len == 1 && v.str[0] == 'r')
+ my_data_source_key = "recno";
+ else
+ my_data_source_key = "bytestring";
+ /*! [WT_EXTENSION config_get] */
+
+ (void)my_data_source_key;
+ }
+
+ {
+ /*! [WT_EXTENSION collator config] */
+ WT_COLLATOR *collator;
+ int collator_owned;
+ /*
+ * Configure the appropriate collator.
+ */
+ if ((ret = wt_api->collator_config(
+ wt_api, session, config, &collator, &collator_owned)) != 0) {
+ (void)wt_api->err_printf(wt_api, session,
+ "collator configuration: %s", wiredtiger_strerror(ret));
+ return (ret);
+ }
+ /*! [WT_EXTENSION collator config] */
+ }
+
+ /*! [WT_DATA_SOURCE error message] */
+ /*
+ * If an underlying function fails, log the error and then return an
+ * error within WiredTiger's name space.
+ */
+ if ((ret = data_source_cursor()) != 0) {
+ (void)wt_api->err_printf(wt_api,
+ session, "my_open_cursor: %s", data_source_error(ret));
+ return (WT_ERROR);
+ }
+ /*! [WT_DATA_SOURCE error message] */
+
+ {
+ /*! [WT_EXTENSION metadata insert] */
+ /*
+ * Insert a new WiredTiger metadata record.
+ */
+ const char *key = "datasource_uri";
+ const char *value = "data source uri's record";
+
+ if ((ret = wt_api->metadata_insert(wt_api, session, key, value)) != 0) {
+ (void)wt_api->err_printf(wt_api, session,
+ "%s: metadata insert: %s", key, wiredtiger_strerror(ret));
+ return (ret);
+ }
+ /*! [WT_EXTENSION metadata insert] */
+ }
+
+ {
+ /*! [WT_EXTENSION metadata remove] */
+ /*
+ * Remove a WiredTiger metadata record.
+ */
+ const char *key = "datasource_uri";
+
+ if ((ret = wt_api->metadata_remove(wt_api, session, key)) != 0) {
+ (void)wt_api->err_printf(wt_api, session,
+ "%s: metadata remove: %s", key, wiredtiger_strerror(ret));
+ return (ret);
+ }
+ /*! [WT_EXTENSION metadata remove] */
+ }
+
+ {
+ /*! [WT_EXTENSION metadata search] */
+ /*
+ * Search for a WiredTiger metadata record and print its value.
+ */
+ const char *key = "datasource_uri";
+ const char *value;
+
+ if ((ret =
+ wt_api->metadata_search(wt_api, session, key, &value)) != 0) {
+ (void)wt_api->err_printf(wt_api, session,
+ "%s: metadata search: %s", key, wiredtiger_strerror(ret));
+ return (ret);
+ }
+ printf("metadata: %s has a value of %s\n", key, value);
+ /*! [WT_EXTENSION metadata search] */
+ }
+
+ {
+ /*! [WT_EXTENSION metadata update] */
+ /*
+ * Update a WiredTiger metadata record (insert it if it does not yet
+ * exist, update it if it does).
+ */
+ const char *key = "datasource_uri";
+ const char *value = "data source uri's record";
+
+ if ((ret = wt_api->metadata_update(wt_api, session, key, value)) != 0) {
+ (void)wt_api->err_printf(wt_api, session,
+ "%s: metadata update: %s", key, wiredtiger_strerror(ret));
+ return (ret);
+ }
+ /*! [WT_EXTENSION metadata update] */
+ }
+
+ }
+ return (0);
+}
+
+/*! [WT_DATA_SOURCE rename] */
+static int
+my_rename(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *uri, const char *newname, WT_CONFIG_ARG *config)
+/*! [WT_DATA_SOURCE rename] */
+{
+    /*
+     * Placeholder implementation: the example has nothing to rename, so
+     * every argument is ignored and success is reported.
+     */
+    (void)config;
+    (void)newname;
+    (void)uri;
+    (void)session;
+    (void)dsrc;
+
+    return (0);
+}
+
+/*! [WT_DATA_SOURCE salvage] */
+static int
+my_salvage(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *uri, WT_CONFIG_ARG *config)
+/*! [WT_DATA_SOURCE salvage] */
+{
+    /*
+     * Placeholder implementation: no salvage work is done, so every
+     * argument is ignored and success is reported.
+     */
+    (void)config;
+    (void)uri;
+    (void)session;
+    (void)dsrc;
+
+    return (0);
+}
+
+/*! [WT_DATA_SOURCE truncate] */
+static int
+my_truncate(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *uri, WT_CONFIG_ARG *config)
+/*! [WT_DATA_SOURCE truncate] */
+{
+    /*
+     * Placeholder implementation: nothing is truncated, so every argument
+     * is ignored and success is reported.
+     */
+    (void)config;
+    (void)uri;
+    (void)session;
+    (void)dsrc;
+
+    return (0);
+}
+
+/*! [WT_DATA_SOURCE range truncate] */
+static int
+my_range_truncate(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    WT_CURSOR *start, WT_CURSOR *stop)
+/*! [WT_DATA_SOURCE range truncate] */
+{
+    /*
+     * Placeholder implementation: no range is truncated, so both cursors
+     * and the handles are ignored and success is reported.
+     */
+    (void)stop;
+    (void)start;
+    (void)session;
+    (void)dsrc;
+
+    return (0);
+}
+
+/*! [WT_DATA_SOURCE verify] */
+static int
+my_verify(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *uri, WT_CONFIG_ARG *config)
+/*! [WT_DATA_SOURCE verify] */
+{
+    /*
+     * Placeholder implementation: nothing is verified, so every argument
+     * is ignored and success is reported.
+     */
+    (void)config;
+    (void)uri;
+    (void)session;
+    (void)dsrc;
+
+    return (0);
+}
+
+/*! [WT_DATA_SOURCE checkpoint] */
+static int
+my_checkpoint(WT_DATA_SOURCE *dsrc, WT_SESSION *session, WT_CONFIG_ARG *config)
+/*! [WT_DATA_SOURCE checkpoint] */
+{
+    /*
+     * Placeholder implementation: no checkpoint is taken, so every
+     * argument is ignored and success is reported.
+     */
+    (void)config;
+    (void)session;
+    (void)dsrc;
+
+    return (0);
+}
+
+/*! [WT_DATA_SOURCE terminate] */
+static int
+my_terminate(WT_DATA_SOURCE *dsrc, WT_SESSION *session)
+/*! [WT_DATA_SOURCE terminate] */
+{
+    /*
+     * Placeholder implementation: there is nothing to release, so both
+     * handles are ignored and success is reported.
+     */
+    (void)session;
+    (void)dsrc;
+
+    return (0);
+}
+
+/*
+ * main --
+ *     Open a connection, register the example data source and demonstrate
+ *     WT_CONNECTION::configure_method for each supported value type.
+ *
+ *     Returns the error from a failed open, otherwise the result of the
+ *     last configure_method call (earlier results are overwritten).
+ */
+int
+main(void)
+{
+    WT_CONNECTION *conn;
+    WT_SESSION *session;
+    int ret;
+
+    /*
+     * Stop if the connection or session cannot be opened: every later
+     * call dereferences conn, which is only set on success.
+     */
+    if ((ret = wiredtiger_open(NULL, NULL, "create", &conn)) != 0) {
+        fprintf(stderr,
+            "Error connecting: %s\n", wiredtiger_strerror(ret));
+        return (ret);
+    }
+    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+        fprintf(stderr,
+            "Error opening a session: %s\n", wiredtiger_strerror(ret));
+        (void)conn->close(conn, NULL);
+        return (ret);
+    }
+
+    my_data_source_init(conn);
+
+    {
+    /*! [WT_DATA_SOURCE register] */
+    static WT_DATA_SOURCE my_dsrc = {
+        my_create,
+        my_compact,
+        my_drop,
+        my_open_cursor,
+        my_rename,
+        my_salvage,
+        my_truncate,
+        my_range_truncate,
+        my_verify,
+        my_checkpoint,
+        my_terminate
+    };
+    ret = conn->add_data_source(conn, "dsrc:", &my_dsrc, NULL);
+    /*! [WT_DATA_SOURCE register] */
+    }
+
+    /*! [WT_DATA_SOURCE configure boolean] */
+    /* my_boolean defaults to true. */
+    ret = conn->configure_method(conn,
+        "session.open_cursor", NULL, "my_boolean=true", "boolean", NULL);
+    /*! [WT_DATA_SOURCE configure boolean] */
+
+    /*! [WT_DATA_SOURCE configure integer] */
+    /* my_integer defaults to 5. */
+    ret = conn->configure_method(conn,
+        "session.open_cursor", NULL, "my_integer=5", "int", NULL);
+    /*! [WT_DATA_SOURCE configure integer] */
+
+    /*! [WT_DATA_SOURCE configure string] */
+    /* my_string defaults to "name". */
+    ret = conn->configure_method(conn,
+        "session.open_cursor", NULL, "my_string=name", "string", NULL);
+    /*! [WT_DATA_SOURCE configure string] */
+
+    /*! [WT_DATA_SOURCE configure list] */
+    /* my_list defaults to "first" and "second". */
+    ret = conn->configure_method(conn,
+        "session.open_cursor",
+        NULL, "my_list=[first, second]", "list", NULL);
+    /*! [WT_DATA_SOURCE configure list] */
+
+    /*! [WT_DATA_SOURCE configure integer with checking] */
+    /*
+     * Limit the number of devices to between 1 and 30; the default is 5.
+     */
+    ret = conn->configure_method(conn,
+        "session.open_cursor", NULL, "devices=5", "int", "min=1, max=30");
+    /*! [WT_DATA_SOURCE configure integer with checking] */
+
+    /*! [WT_DATA_SOURCE configure string with checking] */
+    /*
+     * Limit the target string to one of /device, /home or /target; default
+     * to /home.
+     */
+    ret = conn->configure_method(conn,
+        "session.open_cursor", NULL, "target=/home", "string",
+        "choices=[/device, /home, /target]");
+    /*! [WT_DATA_SOURCE configure string with checking] */
+
+    /*! [WT_DATA_SOURCE configure list with checking] */
+    /*
+     * Limit the paths list to one or more of /device, /home, /mnt or
+     * /target; default to /mnt.
+     */
+    ret = conn->configure_method(conn,
+        "session.open_cursor", NULL, "paths=[/mnt]", "list",
+        "choices=[/device, /home, /mnt, /target]");
+    /*! [WT_DATA_SOURCE configure list with checking] */
+
+    /*! [WT_EXTENSION_API default_session] */
+    (void)wt_api->msg_printf(wt_api, NULL, "configuration complete");
+    /*! [WT_EXTENSION_API default_session] */
+
+    (void)conn->close(conn, NULL);
+
+    return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_extending.c b/src/third_party/wiredtiger/examples/c/ex_extending.c
new file mode 100644
index 00000000000..f043dd5b383
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_extending.c
@@ -0,0 +1,132 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_extending.c
+ * This is an example demonstrating ways to extend WiredTiger with
+ * extractors, collators and loadable modules.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+#ifdef _WIN32
+#define strcasecmp stricmp
+#endif
+
+static const char *home;
+
+/*! [case insensitive comparator] */
+/*
+ * Collator callback that orders two items by comparing their data as C
+ * strings with strcasecmp, i.e. ignoring case. Always returns 0 and stores
+ * the comparison result in *cmp.
+ */
+static int
+__compare_nocase(WT_COLLATOR *collator, WT_SESSION *session,
+    const WT_ITEM *v1, const WT_ITEM *v2, int *cmp)
+{
+    /* The comparison keeps no state, so neither handle is consulted. */
+    (void)collator;
+    (void)session;
+
+    *cmp = strcasecmp((const char *)v1->data, (const char *)v2->data);
+    return (0);
+}
+
+static WT_COLLATOR nocasecoll = { __compare_nocase, NULL, NULL };
+/*! [case insensitive comparator] */
+
+/*! [n character comparator] */
+/*
+ * Collator that looks at no more than the leading N characters of each
+ * string. The limit is per-collator data, so WT_COLLATOR is embedded as the
+ * first member of a larger structure that also carries the limit.
+ */
+typedef struct {
+    WT_COLLATOR iface;
+    uint32_t maxlen;
+} PREFIX_COLLATOR;
+
+/*
+ * Compare the data of the two items with strncmp, bounded by the maxlen of
+ * the enclosing PREFIX_COLLATOR. Always returns 0 and stores the comparison
+ * result in *cmp.
+ */
+static int
+__compare_prefixes(WT_COLLATOR *collator, WT_SESSION *session,
+    const WT_ITEM *v1, const WT_ITEM *v2, int *cmp)
+{
+    /* Recover the enclosing structure to reach the configured limit. */
+    PREFIX_COLLATOR *self = (PREFIX_COLLATOR *)collator;
+
+    (void)session; /* unused */
+
+    *cmp = strncmp(
+        (const char *)v1->data, (const char *)v2->data, self->maxlen);
+    return (0);
+}
+
+static PREFIX_COLLATOR pcoll10 = { {__compare_prefixes, NULL, NULL}, 10 };
+/*! [n character comparator] */
+
+/*
+ * main --
+ *     Open a database, register the two example collators, open a session
+ *     and close the connection again.
+ *
+ *     Returns 0 on success, or the WiredTiger error of the open or close
+ *     call that failed.
+ */
+int
+main(void)
+{
+    int ret;
+    WT_CONNECTION *conn;
+    WT_SESSION *session;
+    const char *dir, *envhome;
+
+    /*
+     * Create a clean test directory for this run of the test program if the
+     * environment variable isn't already set (as is done by make check).
+     */
+    if ((envhome = getenv("WIREDTIGER_HOME")) == NULL) {
+        home = "WT_HOME";
+        ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+    } else
+        home = NULL;
+
+    /* home may be NULL; never hand a NULL pointer to a %s conversion. */
+    dir = home == NULL ? envhome : home;
+
+    /*
+     * Open a connection to the database, creating it if necessary. Stop
+     * on failure: conn is only valid once the open has succeeded.
+     */
+    if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) {
+        fprintf(stderr, "Error connecting to %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+        return (ret);
+    }
+
+    /*! [add collator nocase] */
+    ret = conn->add_collator(conn, "nocase", &nocasecoll, NULL);
+    /*! [add collator nocase] */
+    /*! [add collator prefix10] */
+    ret = conn->add_collator(conn, "prefix10", &pcoll10.iface, NULL);
+
+    /* Open a session for the current thread's work. */
+    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+        fprintf(stderr, "Error opening a session on %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+        /* Report the session failure, not the result of the close. */
+        (void)conn->close(conn, NULL);
+        return (ret);
+    }
+
+    /* XXX Do some work... */
+
+    /* Note: closing the connection implicitly closes open session(s). */
+    if ((ret = conn->close(conn, NULL)) != 0)
+    /*! [add collator prefix10] */
+        fprintf(stderr, "Error closing %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+
+    return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_file.c b/src/third_party/wiredtiger/examples/c/ex_file.c
new file mode 100644
index 00000000000..4170d1b099d
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_file.c
@@ -0,0 +1,72 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_file.c
+ * This is an example demonstrating how to configure an individual file.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+static const char *home;
+
+/*
+ * main --
+ *     Open a database and create a file object with explicit page and item
+ *     size limits.
+ *
+ *     Returns the WiredTiger error if the connection or session cannot be
+ *     opened; otherwise the result of the create call, or EXIT_FAILURE if
+ *     closing the connection fails.
+ */
+int
+main(void)
+{
+    WT_CONNECTION *conn;
+    WT_SESSION *session;
+    const char *dir, *envhome;
+    int ret;
+
+    /*
+     * Create a clean test directory for this run of the test program if the
+     * environment variable isn't already set (as is done by make check).
+     */
+    if ((envhome = getenv("WIREDTIGER_HOME")) == NULL) {
+        home = "WT_HOME";
+        ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+    } else
+        home = NULL;
+
+    /* home may be NULL; never hand a NULL pointer to a %s conversion. */
+    dir = home == NULL ? envhome : home;
+
+    if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) {
+        fprintf(stderr, "Error connecting to %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+        return (ret);
+    }
+    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+        fprintf(stderr, "Error connecting to %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+        /* Release the connection that was opened above. */
+        (void)conn->close(conn, NULL);
+        return (ret);
+    }
+    /* Note: further error checking omitted for clarity. */
+
+    /*! [file create] */
+    ret = session->create(session, "file:example",
+        "key_format=u,"
+        "internal_page_max=32KB,internal_item_max=1KB,"
+        "leaf_page_max=1MB,leaf_item_max=32KB");
+    /*! [file create] */
+
+    return (conn->close(conn, NULL) == 0 ? ret : EXIT_FAILURE);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_hello.c b/src/third_party/wiredtiger/examples/c/ex_hello.c
new file mode 100644
index 00000000000..c94c1072f61
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_hello.c
@@ -0,0 +1,75 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_hello.c
+ * This is an example demonstrating how to create and connect to a
+ * database.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+static const char *home;
+
+/*
+ * main --
+ *     Create (if necessary) and connect to a database, open a session and
+ *     close the connection again.
+ *
+ *     Returns 0 on success, or the WiredTiger error of the call that
+ *     failed.
+ */
+int
+main(void)
+{
+    WT_CONNECTION *conn;
+    WT_SESSION *session;
+    const char *dir, *envhome;
+    int ret;
+
+    /*
+     * Create a clean test directory for this run of the test program if the
+     * environment variable isn't already set (as is done by make check).
+     */
+    if ((envhome = getenv("WIREDTIGER_HOME")) == NULL) {
+        home = "WT_HOME";
+        ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+    } else
+        home = NULL;
+
+    /* home may be NULL; never hand a NULL pointer to a %s conversion. */
+    dir = home == NULL ? envhome : home;
+
+    /*
+     * Open a connection to the database, creating it if necessary. Stop
+     * on failure: conn is only valid once the open has succeeded.
+     */
+    if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) {
+        fprintf(stderr, "Error connecting to %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+        return (ret);
+    }
+
+    /* Open a session for the current thread's work. */
+    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+        fprintf(stderr, "Error opening a session on %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+        /* Report the session failure, not the result of the close. */
+        (void)conn->close(conn, NULL);
+        return (ret);
+    }
+
+    /* Do some work... */
+
+    /* Note: closing the connection implicitly closes open session(s). */
+    if ((ret = conn->close(conn, NULL)) != 0)
+        fprintf(stderr, "Error closing %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+
+    return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_log.c b/src/third_party/wiredtiger/examples/c/ex_log.c
new file mode 100644
index 00000000000..8ac7d079da1
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_log.c
@@ -0,0 +1,344 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_log.c
+ * This is an example demonstrating how to use logging and log cursors.
+ */
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef _WIN32
+#include <unistd.h>
+#else
+/* snprintf is not supported on <= VS2013 */
+#define snprintf _snprintf
+#endif
+
+#include <wiredtiger.h>
+
+static const char *home1 = "WT_HOME_LOG_1";
+static const char *home2 = "WT_HOME_LOG_2";
+
+static const char * const uri = "table:logtest";
+
+#define CONN_CONFIG "create,cache_size=100MB,log=(archive=false,enabled=true)"
+#define MAX_KEYS 10
+
+/*
+ * setup_copy --
+ *     Open the second database (home2), open a session on it and create
+ *     the test table there.
+ *
+ *     On success *wt_connp and *sessionp hold the new handles and 0 is
+ *     returned. On failure the WiredTiger error is returned; if the
+ *     connection was already opened it is left open in *wt_connp.
+ */
+static int
+setup_copy(WT_CONNECTION **wt_connp, WT_SESSION **sessionp)
+{
+    int ret;
+
+    if ((ret = wiredtiger_open(home2, NULL,
+        CONN_CONFIG, wt_connp)) != 0) {
+        /* Name the directory that was actually opened. */
+        fprintf(stderr, "Error connecting to %s: %s\n",
+            home2, wiredtiger_strerror(ret));
+        return (ret);
+    }
+
+    /* *sessionp is only valid if the session was opened. */
+    if ((ret = (*wt_connp)->open_session(
+        *wt_connp, NULL, NULL, sessionp)) != 0) {
+        fprintf(stderr, "Error opening a session on %s: %s\n",
+            home2, wiredtiger_strerror(ret));
+        return (ret);
+    }
+
+    ret = (*sessionp)->create(*sessionp, uri,
+        "key_format=S,value_format=S");
+    return (ret);
+}
+
+/*
+ * compare_tables --
+ *     Walk the test table in both sessions in step and check that they
+ *     hold the same key/value pairs in the same order.
+ *
+ *     Returns 0 if the tables are identical, 1 if a record differs or the
+ *     copy holds extra records, and the WiredTiger error otherwise (this
+ *     includes WT_NOTFOUND when the copy holds fewer records). Both
+ *     cursors are closed on every path.
+ */
+static int
+compare_tables(WT_SESSION *session, WT_SESSION *sess_copy)
+{
+    WT_CURSOR *cursor, *curs_copy;
+    int mismatch, ret, tret;
+    const char *key, *key_copy, *value, *value_copy;
+
+    if ((ret = session->open_cursor(
+        session, uri, NULL, NULL, &cursor)) != 0) {
+        fprintf(stderr, "WT_SESSION.open_cursor: %s\n",
+            wiredtiger_strerror(ret));
+        return (ret);
+    }
+    if ((ret = sess_copy->open_cursor(
+        sess_copy, uri, NULL, NULL, &curs_copy)) != 0) {
+        fprintf(stderr, "copy: WT_SESSION.open_cursor: %s\n",
+            wiredtiger_strerror(ret));
+        (void)cursor->close(cursor);
+        return (ret);
+    }
+
+    mismatch = 0;
+    for (;;) {
+        if ((ret = cursor->next(cursor)) != 0) {
+            if (ret != WT_NOTFOUND) {
+                fprintf(stderr, "WT_CURSOR.next: %s\n",
+                    wiredtiger_strerror(ret));
+                break;
+            }
+
+            /* The original is exhausted: the copy must be too. */
+            ret = curs_copy->next(curs_copy);
+            if (ret == WT_NOTFOUND)
+                ret = 0;
+            else if (ret == 0) {
+                fprintf(stderr,
+                    "copy: unexpected extra record\n");
+                mismatch = 1;
+            } else
+                fprintf(stderr, "copy: WT_CURSOR.next: %s\n",
+                    wiredtiger_strerror(ret));
+            break;
+        }
+
+        /* A copy that ends early leaves ret set to WT_NOTFOUND. */
+        if ((ret = curs_copy->next(curs_copy)) != 0) {
+            fprintf(stderr, "copy: WT_CURSOR.next: %s\n",
+                wiredtiger_strerror(ret));
+            break;
+        }
+
+        /* Only compare strings that were successfully retrieved. */
+        if ((ret = cursor->get_key(cursor, &key)) != 0 ||
+            (ret = cursor->get_value(cursor, &value)) != 0 ||
+            (ret = curs_copy->get_key(curs_copy, &key_copy)) != 0 ||
+            (ret = curs_copy->get_value(
+            curs_copy, &value_copy)) != 0) {
+            fprintf(stderr, "WT_CURSOR.get_key/get_value: %s\n",
+                wiredtiger_strerror(ret));
+            break;
+        }
+
+        if (strcmp(key, key_copy) != 0 ||
+            strcmp(value, value_copy) != 0) {
+            fprintf(stderr,
+                "Mismatched: key %s, key_copy %s "
+                "value %s value_copy %s\n",
+                key, key_copy, value, value_copy);
+            mismatch = 1;
+            break;
+        }
+    }
+
+    /* Close both cursors, keeping the first error seen. */
+    if ((tret = cursor->close(cursor)) != 0 && ret == 0)
+        ret = tret;
+    if ((tret = curs_copy->close(curs_copy)) != 0 && ret == 0)
+        ret = tret;
+
+    return (mismatch ? 1 : ret);
+}
+
+/*! [log cursor walk] */
+/*
+ * print_record --
+ *     Print one log record to stdout: its LSN and operation count, the
+ *     record and operation types, transaction and file IDs, and the key
+ *     and value sizes. For message records the value is also printed as a
+ *     string.
+ */
+static void
+print_record(WT_LSN *lsn, uint32_t opcount,
+    uint32_t rectype, uint32_t optype, uint64_t txnid, uint32_t fileid,
+    WT_ITEM *key, WT_ITEM *value)
+{
+    /* Position and classification of the record. */
+    printf(
+        "LSN [%" PRIu32 "][%" PRIu64 "].%" PRIu32
+        ": record type %" PRIu32 " optype %" PRIu32
+        " txnid %" PRIu64 " fileid %" PRIu32,
+        lsn->file, (uint64_t)lsn->offset, opcount,
+        rectype, optype, txnid, fileid);
+
+    /* Payload sizes end the line. */
+    printf(" key size %zu value size %zu\n", key->size, value->size);
+
+    /* Only message records carry text worth showing. */
+    if (rectype != WT_LOGREC_MESSAGE)
+        return;
+    printf("Application Record: %s\n", (char *)value->data);
+}
+
+/*
+ * simple_walk_log --
+ *     A simple walk of the log: open a log cursor, print every record and
+ *     close the cursor.
+ *
+ *     Returns 0 when the whole log was walked, otherwise the first error
+ *     from opening, iterating, reading or closing the cursor.
+ */
+static int
+simple_walk_log(WT_SESSION *session)
+{
+    WT_CURSOR *cursor;
+    WT_LSN lsn;
+    WT_ITEM logrec_key, logrec_value;
+    uint64_t txnid;
+    uint32_t fileid, opcount, optype, rectype;
+    int ret, tret;
+
+    /*! [log cursor open] */
+    ret = session->open_cursor(session, "log:", NULL, NULL, &cursor);
+    /*! [log cursor open] */
+    /* cursor is only valid if the open succeeded. */
+    if (ret != 0)
+        return (ret);
+
+    while ((ret = cursor->next(cursor)) == 0) {
+        /*! [log cursor get_key] */
+        ret = cursor->get_key(cursor, &lsn.file, &lsn.offset, &opcount);
+        /*! [log cursor get_key] */
+        if (ret != 0)
+            break;
+        /*! [log cursor get_value] */
+        ret = cursor->get_value(cursor, &txnid,
+            &rectype, &optype, &fileid, &logrec_key, &logrec_value);
+        /*! [log cursor get_value] */
+        if (ret != 0)
+            break;
+
+        print_record(&lsn, opcount,
+            rectype, optype, txnid, fileid, &logrec_key, &logrec_value);
+    }
+    /* Running off the end of the log is the expected way to finish. */
+    if (ret == WT_NOTFOUND)
+        ret = 0;
+    /* Always close the cursor, but keep an earlier error if there was one. */
+    if ((tret = cursor->close(cursor)) != 0 && ret == 0)
+        ret = tret;
+    return (ret);
+}
+/*! [log cursor walk] */
+
+/*
+ * walk_log --
+ * Walk the log of the given session, printing every record and replaying
+ * the row-put operations of commit records for fileid 1 into a second
+ * database opened by setup_copy. The two tables are then compared, and
+ * the log is walked a second time starting from a saved LSN located with
+ * a cursor search.
+ *
+ * Returns the result of the final cursor close; the results of the
+ * intermediate calls are stored in ret but overwritten.
+ */
+static int
+walk_log(WT_SESSION *session)
+{
+ WT_CONNECTION *wt_conn2;
+ WT_CURSOR *cursor, *cursor2;
+ WT_LSN lsn, lsnsave;
+ WT_ITEM logrec_key, logrec_value;
+ WT_SESSION *session2;
+ uint64_t txnid;
+ uint32_t fileid, opcount, optype, rectype;
+ int first, i, in_txn, ret;
+
+ /*
+ * NOTE(review): the results of setup_copy and of both open_cursor calls
+ * are overwritten without being checked, so a failure here leaves the
+ * handles used below unset.
+ */
+ ret = setup_copy(&wt_conn2, &session2);
+ ret = session->open_cursor(session, "log:", NULL, NULL, &cursor);
+ ret = session2->open_cursor(session2, uri, NULL, "raw=true", &cursor2);
+ i = 0;
+ in_txn = 0;
+ txnid = 0;
+ memset(&lsnsave, 0, sizeof(lsnsave));
+ while ((ret = cursor->next(cursor)) == 0) {
+ ret = cursor->get_key(cursor, &lsn.file, &lsn.offset, &opcount);
+ /*
+ * Save one of the LSNs we get back to search for it
+ * later. Pick a later one because we want to walk from
+ * that LSN to the end (where the multi-step transaction
+ * was performed). Just choose the record that is MAX_KEYS.
+ */
+ if (++i == MAX_KEYS)
+ lsnsave = lsn;
+ ret = cursor->get_value(cursor, &txnid, &rectype,
+ &optype, &fileid, &logrec_key, &logrec_value);
+
+ print_record(&lsn, opcount,
+ rectype, optype, txnid, fileid, &logrec_key, &logrec_value);
+
+ /*
+ * If we are in a transaction and this is a new one, end
+ * the previous one.
+ */
+ if (in_txn && opcount == 0) {
+ ret = session2->commit_transaction(session2, NULL);
+ in_txn = 0;
+ }
+
+ /*
+ * If the operation is a put, replay it here on the backup
+ * connection. Note, we cheat by looking only for fileid 1
+ * in this example. The metadata is fileid 0.
+ */
+ if (fileid == 1 && rectype == WT_LOGREC_COMMIT &&
+ optype == WT_LOGOP_ROW_PUT) {
+ if (!in_txn) {
+ ret = session2->begin_transaction(session2,
+ NULL);
+ in_txn = 1;
+ }
+ cursor2->set_key(cursor2, &logrec_key);
+ cursor2->set_value(cursor2, &logrec_value);
+ ret = cursor2->insert(cursor2);
+ }
+ }
+ /* Commit the replay transaction still open when the log ended. */
+ if (in_txn)
+ ret = session2->commit_transaction(session2, NULL);
+
+ ret = cursor2->close(cursor2);
+ /*
+ * Compare the tables after replay. They should be identical.
+ */
+ if (compare_tables(session, session2))
+ printf("compare failed\n");
+ ret = session2->close(session2, NULL);
+ ret = wt_conn2->close(wt_conn2, NULL);
+
+ /* Reposition the log cursor on the LSN saved during the first walk. */
+ ret = cursor->reset(cursor);
+ /*! [log cursor set_key] */
+ cursor->set_key(cursor, lsnsave.file, lsnsave.offset, 0);
+ /*! [log cursor set_key] */
+ /*! [log cursor search] */
+ ret = cursor->search(cursor);
+ /*! [log cursor search] */
+ printf("Reset to saved...\n");
+ /*
+ * Walk all records starting with this key. The loop ends when get_key
+ * fails or when cursor->next returns non-zero.
+ */
+ first = 1;
+ while ((ret = cursor->get_key(cursor,
+ &lsn.file, &lsn.offset, &opcount)) == 0) {
+ /* The first record returned must be the one searched for. */
+ if (first) {
+ first = 0;
+ if (lsnsave.file != lsn.file ||
+ lsnsave.offset != lsn.offset) {
+ fprintf(stderr,
+ "search returned the wrong LSN\n");
+ exit (1);
+ }
+ }
+ ret = cursor->get_value(cursor, &txnid, &rectype,
+ &optype, &fileid, &logrec_key, &logrec_value);
+
+ print_record(&lsn, opcount,
+ rectype, optype, txnid, fileid, &logrec_key, &logrec_value);
+
+ ret = cursor->next(cursor);
+ if (ret != 0)
+ break;
+ }
+ ret = cursor->close(cursor);
+ return (ret);
+}
+
+/*
+ * main --
+ *     Populate a logged table in home1 (some records auto-committed, some
+ *     inside one explicit transaction), write an application log record,
+ *     reopen the connection and then walk the log twice.
+ */
+int
+main(void)
+{
+    WT_CONNECTION *wt_conn;
+    WT_CURSOR *cursor;
+    WT_SESSION *session;
+    int record_count, ret;
+    char cmd_buf[256], k[16], v[16];
+
+    /* Start from two empty home directories. */
+    snprintf(cmd_buf, sizeof(cmd_buf), "rm -rf %s %s && mkdir %s %s",
+        home1, home2, home1, home2);
+    ret = system(cmd_buf);
+    if (ret != 0) {
+        fprintf(stderr, "%s: failed ret %d\n", cmd_buf, ret);
+        return (ret);
+    }
+
+    ret = wiredtiger_open(home1, NULL, CONN_CONFIG, &wt_conn);
+    if (ret != 0) {
+        fprintf(stderr, "Error connecting to %s: %s\n",
+            home1, wiredtiger_strerror(ret));
+        return (ret);
+    }
+
+    ret = wt_conn->open_session(wt_conn, NULL, NULL, &session);
+    ret = session->create(session, uri, "key_format=S,value_format=S");
+    ret = session->open_cursor(session, uri, NULL, NULL, &cursor);
+
+    /*
+     * First batch: each insert is its own auto-commit transaction. The
+     * record counter doubles as the key number.
+     */
+    for (record_count = 0; record_count < MAX_KEYS; record_count++) {
+        snprintf(k, sizeof(k), "key%d", record_count);
+        snprintf(v, sizeof(v), "value%d", record_count);
+        cursor->set_key(cursor, k);
+        cursor->set_value(cursor, v);
+        ret = cursor->insert(cursor);
+    }
+
+    /* Second batch: five more inserts inside one explicit transaction. */
+    ret = session->begin_transaction(session, NULL);
+    for (; record_count < MAX_KEYS + 5; record_count++) {
+        snprintf(k, sizeof(k), "key%d", record_count);
+        snprintf(v, sizeof(v), "value%d", record_count);
+        cursor->set_key(cursor, k);
+        cursor->set_value(cursor, v);
+        ret = cursor->insert(cursor);
+    }
+    ret = session->commit_transaction(session, NULL);
+    ret = cursor->close(cursor);
+
+    /*! [log cursor printf] */
+    ret = session->log_printf(session, "Wrote %d records", record_count);
+    /*! [log cursor printf] */
+
+    /*
+     * Cycle the connection so the log also picks up records such as file
+     * sync and checkpoint. Archiving is turned off in CONN_CONFIG.
+     */
+    ret = wt_conn->close(wt_conn, NULL);
+    ret = wiredtiger_open(home1, NULL, CONN_CONFIG, &wt_conn);
+    if (ret != 0) {
+        fprintf(stderr, "Error connecting to %s: %s\n",
+            home1, wiredtiger_strerror(ret));
+        return (ret);
+    }
+
+    ret = wt_conn->open_session(wt_conn, NULL, NULL, &session);
+    ret = simple_walk_log(session);
+    ret = walk_log(session);
+    ret = wt_conn->close(wt_conn, NULL);
+    return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_pack.c b/src/third_party/wiredtiger/examples/c/ex_pack.c
new file mode 100644
index 00000000000..29d645f6cfc
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_pack.c
@@ -0,0 +1,85 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_pack.c
+ * This is an example demonstrating basic packing and unpacking of fields.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+static const char *home;
+
+/*
+ * main --
+ *     Open a database and demonstrate sizing, packing and unpacking three
+ *     integers with the wiredtiger_struct_* functions.
+ *
+ *     Returns 0 on success, or the WiredTiger error of the open or close
+ *     call that failed (the results of the packing calls are overwritten).
+ */
+int
+main(void)
+{
+    WT_CONNECTION *conn;
+    WT_SESSION *session;
+    const char *dir, *envhome;
+    char buf[50];
+    size_t size;
+    int i, j, k, ret;
+
+    /*
+     * Create a clean test directory for this run of the test program if the
+     * environment variable isn't already set (as is done by make check).
+     */
+    if ((envhome = getenv("WIREDTIGER_HOME")) == NULL) {
+        home = "WT_HOME";
+        ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+    } else
+        home = NULL;
+
+    /* home may be NULL; never hand a NULL pointer to a %s conversion. */
+    dir = home == NULL ? envhome : home;
+
+    /*
+     * Open a connection to the database, creating it if necessary. Stop
+     * on failure: conn is only valid once the open has succeeded.
+     */
+    if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) {
+        fprintf(stderr, "Error connecting to %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+        return (ret);
+    }
+
+    /* Open a session for the current thread's work. */
+    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+        fprintf(stderr, "Error opening a session on %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+        /* Report the session failure, not the result of the close. */
+        (void)conn->close(conn, NULL);
+        return (ret);
+    }
+
+    /*! [packing] */
+    ret = wiredtiger_struct_size(session, &size, "iii", 42, 1000, -9);
+    if (size > sizeof(buf)) {
+        /* Allocate a bigger buffer. */
+    }
+
+    ret = wiredtiger_struct_pack(session, buf, size, "iii", 42, 1000, -9);
+
+    ret = wiredtiger_struct_unpack(session, buf, size, "iii", &i, &j, &k);
+    /*! [packing] */
+
+    /* Note: closing the connection implicitly closes open session(s). */
+    if ((ret = conn->close(conn, NULL)) != 0)
+        fprintf(stderr, "Error closing %s: %s\n",
+            dir, wiredtiger_strerror(ret));
+
+    return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_process.c b/src/third_party/wiredtiger/examples/c/ex_process.c
new file mode 100644
index 00000000000..a25d9084965
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_process.c
@@ -0,0 +1,78 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_process.c
+ * This is an example demonstrating how to connect to a database from
+ * multiple processes.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+static const char *home;
+
+/*
+ * main --
+ *	Open a database configured for access from multiple processes, open
+ *	a session and close the database again.  Returns 0 on success,
+ *	otherwise the first error seen.
+ */
+int
+main(void)
+{
+	WT_CONNECTION *conn;
+	WT_SESSION *session;
+	const char *home_name;
+	int ret, tret;
+
+	/*
+	 * Create a clean test directory for this run of the test program if the
+	 * environment variable isn't already set (as is done by make check).
+	 * home_name is only used in messages: home itself may be NULL, which
+	 * must not be handed to a "%s" conversion.
+	 */
+	if ((home_name = getenv("WIREDTIGER_HOME")) == NULL) {
+		home = home_name = "WT_HOME";
+		ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+	} else
+		home = NULL;
+
+	/*! [processes] */
+	/* Open a connection to the database, creating it if necessary. */
+	if ((ret =
+	    wiredtiger_open(home, NULL, "create,multiprocess", &conn)) != 0) {
+		fprintf(stderr, "Error connecting to %s: %s\n",
+		    home_name, wiredtiger_strerror(ret));
+		/* conn was not set, there is nothing to close. */
+		return (ret);
+	}
+
+	/* Open a session for the current thread's work. */
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+		fprintf(stderr, "Error opening a session on %s: %s\n",
+		    home_name, wiredtiger_strerror(ret));
+
+	/* XXX Do some work... */
+
+	/*
+	 * Note: closing the connection implicitly closes open session(s).
+	 * Keep an earlier error if the close itself succeeds.
+	 */
+	if ((tret = conn->close(conn, NULL)) != 0) {
+		fprintf(stderr, "Error closing %s: %s\n",
+		    home_name, wiredtiger_strerror(tret));
+		if (ret == 0)
+			ret = tret;
+	}
+	/*! [processes] */
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_schema.c b/src/third_party/wiredtiger/examples/c/ex_schema.c
new file mode 100644
index 00000000000..bc2f3b6c2d9
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_schema.c
@@ -0,0 +1,309 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_schema.c
+ * This is an example application demonstrating how to create and access
+ * tables using a schema.
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+static const char *home;
+
+/*! [schema declaration] */
+/* The C struct for the data we are storing in a WiredTiger table. */
+typedef struct {
+ char country[5]; /* Country code, "5s" in the table's value_format. */
+ uint16_t year; /* Census year, "H" in the table's value_format. */
+ uint64_t population; /* Head count, "Q" in the table's value_format. */
+} POP_RECORD;
+
+/*
+ * Sample rows loaded into the table by main; the final entry with a year
+ * of 0 terminates the array.
+ */
+static POP_RECORD pop_data[] = {
+ { "AU", 1900, 4000000 },
+ { "AU", 2000, 19053186 },
+ { "CAN", 1900, 5500000 },
+ { "CAN", 2000, 31099561 },
+ { "UK", 1900, 369000000 },
+ { "UK", 2000, 59522468 },
+ { "USA", 1900, 76212168 },
+ { "USA", 2000, 301279593 },
+ { "", 0, 0 }
+};
+/*! [schema declaration] */
+
+/*
+ * main --
+ * Create the population table with two column groups and two indices,
+ * load it from pop_data, then read it back through table, raw, column
+ * group, index and projection cursors. Apart from the initial open,
+ * return values are stored in ret but not examined; the value returned
+ * is the result of closing the connection.
+ */
+int
+main(void)
+{
+ POP_RECORD *p;
+ WT_CONNECTION *conn;
+ WT_CURSOR *cursor;
+ WT_SESSION *session;
+ const char *country;
+ uint64_t recno, population;
+ uint16_t year;
+ int ret;
+
+ /*
+ * Create a clean test directory for this run of the test program if the
+ * environment variable isn't already set (as is done by make check).
+ */
+ if (getenv("WIREDTIGER_HOME") == NULL) {
+ home = "WT_HOME";
+ ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+ } else
+ home = NULL;
+
+ /* NOTE(review): home can be NULL here and is printed with %s below. */
+ if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) {
+ fprintf(stderr, "Error connecting to %s: %s\n",
+ home, wiredtiger_strerror(ret));
+ return (ret);
+ }
+ /* Note: error checking omitted for clarity. */
+
+ ret = conn->open_session(conn, NULL, NULL, &session);
+
+ /*! [Create a table with column groups] */
+ /*
+ * Create the population table.
+ * Keys are record numbers, the format for values is (5-byte string,
+ * uint16_t, uint64_t).
+ * See ::wiredtiger_struct_pack for details of the format strings.
+ */
+ ret = session->create(session, "table:poptable",
+ "key_format=r,"
+ "value_format=5sHQ,"
+ "columns=(id,country,year,population),"
+ "colgroups=(main,population)");
+
+ /*
+ * Create two column groups: a primary column group with the country
+ * code, year and population (named "main"), and a population column
+ * group with the population by itself (named "population").
+ */
+ ret = session->create(session,
+ "colgroup:poptable:main", "columns=(country,year,population)");
+ ret = session->create(session,
+ "colgroup:poptable:population", "columns=(population)");
+ /*! [Create a table with column groups] */
+
+ /*! [Create an index] */
+ /* Create an index with a simple key. */
+ ret = session->create(session,
+ "index:poptable:country", "columns=(country)");
+ /*! [Create an index] */
+
+ /*! [Create an index with a composite key] */
+ /* Create an index with a composite key (country,year). */
+ ret = session->create(session,
+ "index:poptable:country_plus_year", "columns=(country,year)");
+ /*! [Create an index with a composite key] */
+
+ /*
+ * Insert the records into the table. The cursor is opened with the
+ * "append" configuration and no key is set before each insert; the
+ * loop stops at the pop_data entry whose year is 0.
+ */
+ ret = session->open_cursor(
+ session, "table:poptable", NULL, "append", &cursor);
+ for (p = pop_data; p->year != 0; p++) {
+ cursor->set_value(cursor, p->country, p->year, p->population);
+ ret = cursor->insert(cursor);
+ }
+ ret = cursor->close(cursor);
+
+ /* List the records in the table. */
+ ret = session->open_cursor(session,
+ "table:poptable", NULL, NULL, &cursor);
+ while ((ret = cursor->next(cursor)) == 0) {
+ ret = cursor->get_key(cursor, &recno);
+ ret = cursor->get_value(cursor, &country, &year, &population);
+ printf("ID %" PRIu64, recno);
+ printf(": country %s, year %u, population %" PRIu64 "\n",
+ country, year, population);
+ }
+ ret = cursor->close(cursor);
+
+ /*! [List the records in the table using raw mode.] */
+ /*
+ * List the records in the table using raw mode: keys and values come
+ * back as WT_ITEMs and are decoded with wiredtiger_struct_unpack using
+ * the same format strings the table was created with.
+ */
+ ret = session->open_cursor(session,
+ "table:poptable", NULL, "raw", &cursor);
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ITEM key, value;
+
+ ret = cursor->get_key(cursor, &key);
+ ret = wiredtiger_struct_unpack(session,
+ key.data, key.size, "r", &recno);
+ printf("ID %" PRIu64, recno);
+
+ ret = cursor->get_value(cursor, &value);
+ ret = wiredtiger_struct_unpack(session,
+ value.data, value.size,
+ "5sHQ", &country, &year, &population);
+ printf(": country %s, year %u, population %" PRIu64 "\n",
+ country, year, population);
+ }
+ /*! [List the records in the table using raw mode.] */
+ ret = cursor->close(cursor);
+
+ /*! [Read population from the primary column group] */
+ /*
+ * Open a cursor on the main column group, and return the information
+ * for a particular country.
+ */
+ ret = session->open_cursor(
+ session, "colgroup:poptable:main", NULL, NULL, &cursor);
+ cursor->set_key(cursor, 2);
+ if ((ret = cursor->search(cursor)) == 0) {
+ ret = cursor->get_value(cursor, &country, &year, &population);
+ printf("ID 2: country %s, year %u, population %" PRIu64 "\n",
+ country, year, population);
+ }
+ /*! [Read population from the primary column group] */
+ ret = cursor->close(cursor);
+
+ /*! [Read population from the standalone column group] */
+ /*
+ * Open a cursor on the population column group, and return the
+ * population of a particular country.
+ */
+ ret = session->open_cursor(session,
+ "colgroup:poptable:population", NULL, NULL, &cursor);
+ cursor->set_key(cursor, 2);
+ if ((ret = cursor->search(cursor)) == 0) {
+ ret = cursor->get_value(cursor, &population);
+ printf("ID 2: population %" PRIu64 "\n", population);
+ }
+ /*! [Read population from the standalone column group] */
+ ret = cursor->close(cursor);
+
+ /*! [Search in a simple index] */
+ /*
+ * Search in a simple index. The key literal is padded with NUL bytes;
+ * presumably to fill the 5-byte country field -- TODO confirm.
+ * NOTE(review): the search result is not checked before get_value.
+ */
+ ret = session->open_cursor(session,
+ "index:poptable:country", NULL, NULL, &cursor);
+ cursor->set_key(cursor, "AU\0\0\0");
+ ret = cursor->search(cursor);
+ ret = cursor->get_value(cursor, &country, &year, &population);
+ printf("AU: country %s, year %u, population %" PRIu64 "\n",
+ country, (unsigned int)year, population);
+ /*! [Search in a simple index] */
+ ret = cursor->close(cursor);
+
+ /*! [Search in a composite index] */
+ /*
+ * Search in a composite index.
+ * NOTE(review): the search result is not checked before get_value.
+ */
+ ret = session->open_cursor(session,
+ "index:poptable:country_plus_year", NULL, NULL, &cursor);
+ cursor->set_key(cursor, "USA\0\0", (uint16_t)1900);
+ ret = cursor->search(cursor);
+ ret = cursor->get_value(cursor, &country, &year, &population);
+ printf("US 1900: country %s, year %u, population %" PRIu64 "\n",
+ country, (unsigned int)year, population);
+ /*! [Search in a composite index] */
+ ret = cursor->close(cursor);
+
+ /*! [Return a subset of values from the table] */
+ /*
+ * Use a projection to return just the table's country and year
+ * columns.
+ */
+ ret = session->open_cursor(session,
+ "table:poptable(country,year)", NULL, NULL, &cursor);
+ while ((ret = cursor->next(cursor)) == 0) {
+ ret = cursor->get_value(cursor, &country, &year);
+ printf("country %s, year %u\n", country, year);
+ }
+ /*! [Return a subset of values from the table] */
+ ret = cursor->close(cursor);
+
+ /*! [Return a subset of values from the table using raw mode] */
+ /*
+ * Use a projection to return just the table's country and year
+ * columns, using raw mode.
+ */
+ ret = session->open_cursor(session,
+ "table:poptable(country,year)", NULL, "raw", &cursor);
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ITEM value;
+
+ ret = cursor->get_value(cursor, &value);
+ ret = wiredtiger_struct_unpack(
+ session, value.data, value.size, "5sH", &country, &year);
+ printf("country %s, year %u\n", country, year);
+ }
+ /*! [Return a subset of values from the table using raw mode] */
+ ret = cursor->close(cursor);
+
+ /*! [Return the table's record number key using an index] */
+ /*
+ * Use a projection to return just the table's record number key
+ * from an index.
+ */
+ ret = session->open_cursor(session,
+ "index:poptable:country_plus_year(id)", NULL, NULL, &cursor);
+ while ((ret = cursor->next(cursor)) == 0) {
+ ret = cursor->get_key(cursor, &country, &year);
+ ret = cursor->get_value(cursor, &recno);
+ printf("row ID %" PRIu64 ": country %s, year %u\n",
+ recno, country, year);
+ }
+ /*! [Return the table's record number key using an index] */
+ ret = cursor->close(cursor);
+
+ /*! [Return a subset of the value columns from an index] */
+ /*
+ * Use a projection to return just the population column from an
+ * index.
+ */
+ ret = session->open_cursor(session,
+ "index:poptable:country_plus_year(population)",
+ NULL, NULL, &cursor);
+ while ((ret = cursor->next(cursor)) == 0) {
+ ret = cursor->get_key(cursor, &country, &year);
+ ret = cursor->get_value(cursor, &population);
+ printf("population %" PRIu64 ": country %s, year %u\n",
+ population, country, year);
+ }
+ /*! [Return a subset of the value columns from an index] */
+ ret = cursor->close(cursor);
+
+ /*! [Access only the index] */
+ /*
+ * Use a projection to avoid accessing any other column groups when
+ * using an index: supply an empty list of value columns.
+ */
+ ret = session->open_cursor(session,
+ "index:poptable:country_plus_year()", NULL, NULL, &cursor);
+ while ((ret = cursor->next(cursor)) == 0) {
+ ret = cursor->get_key(cursor, &country, &year);
+ printf("country %s, year %u\n", country, year);
+ }
+ /*! [Access only the index] */
+ ret = cursor->close(cursor);
+
+ ret = conn->close(conn, NULL);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_scope.c b/src/third_party/wiredtiger/examples/c/ex_scope.c
new file mode 100644
index 00000000000..334745f7e37
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_scope.c
@@ -0,0 +1,174 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_scope.c
+ * demonstrates the scope of buffers holding cursor keys and values.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+#ifdef _WIN32
+/* snprintf is not supported on <= VS2013 */
+#define snprintf _snprintf
+#endif
+
+static const char *home;
+
+/*
+ * cursor_scope_ops --
+ *	Run insert, update, search and remove through the cursor in turn and,
+ *	after each one, verify which memory the cursor's key and value refer
+ *	to.  Returns 0 on success, a WiredTiger error from a failed call, or
+ *	EINVAL when a check fails.
+ */
+static int
+cursor_scope_ops(WT_CURSOR *cursor)
+{
+	struct {
+		const char *key;
+		const char *value;
+		int (*apply)(WT_CURSOR *);
+	} *step, steps[] = {
+		{ "key1", "value1", cursor->insert, },
+		{ "key1", "value2", cursor->update, },
+		{ "key1", "value2", cursor->search, },
+		{ "key1", "value2", cursor->remove, },
+		{ NULL, NULL, NULL }
+	};
+	const char *got_key, *got_value;
+	char keybuf[10], valuebuf[10];
+	int inserted, removed, ret;
+
+	for (step = steps; step->key != NULL; step++) {
+		got_key = got_value = NULL;
+
+		/*! [cursor scope operation] */
+		(void)snprintf(keybuf, sizeof(keybuf), "%s", step->key);
+		cursor->set_key(cursor, keybuf);
+		(void)snprintf(valuebuf, sizeof(valuebuf), "%s", step->value);
+		cursor->set_value(cursor, valuebuf);
+
+		/*
+		 * From here until the next operation that positions the
+		 * cursor, the key and value buffers must stay valid and must
+		 * not be modified by the application.
+		 */
+
+		/* Run this step's operation: insert, update, search, remove. */
+		ret = step->apply(cursor);
+		if (ret != 0) {
+			fprintf(stderr, "Error performing the operation: %s\n",
+			    wiredtiger_strerror(ret));
+			return (ret);
+		}
+		inserted = step->apply == cursor->insert;
+		removed = step->apply == cursor->remove;
+
+		/*
+		 * Every operation but WT_CURSOR::insert positions the cursor,
+		 * which then stops referencing application memory; scribbling
+		 * over the buffers is safe in that case.
+		 */
+		if (!inserted) {
+			strcpy(keybuf, "no key");
+			strcpy(valuebuf, "no value");
+		}
+
+		/*
+		 * Fetch the key, and the value unless the record was just
+		 * removed, to see what the cursor hands back.
+		 */
+		ret = cursor->get_key(cursor, &got_key);
+		if (ret == 0 && !removed)
+			ret = cursor->get_value(cursor, &got_value);
+		if (ret != 0) {
+			fprintf(stderr, "Error in get_key/value: %s\n",
+			    wiredtiger_strerror(ret));
+			return (ret);
+		}
+
+		/*
+		 * WT_CURSOR::insert does not position the cursor; after any
+		 * other operation the returned pointers refer to memory owned
+		 * by the cursor, which the application must not modify.
+		 */
+
+		/* A positioned cursor must not hand back our own buffers. */
+		if (!inserted && (got_key == keybuf ||
+		    (!removed && got_value == valuebuf))) {
+			fprintf(stderr,
+			    "Cursor points at application memory!\n");
+			return (EINVAL);
+		}
+
+		/* The contents must match what this step supplied. */
+		if (strcmp(got_key, step->key) != 0 ||
+		    (!removed && strcmp(got_value, step->value) != 0)) {
+			fprintf(stderr, "Unexpected key / value!\n");
+			return (EINVAL);
+		}
+		/*! [cursor scope operation] */
+	}
+
+	return (0);
+}
+
+/*
+ * main --
+ *	Open a database, create table:scope, run cursor_scope_ops on a cursor
+ *	over it and close the database.  Returns 0 on success, otherwise the
+ *	first error seen.
+ */
+int
+main(void)
+{
+	WT_CONNECTION *conn;
+	WT_CURSOR *cursor;
+	WT_SESSION *session;
+	const char *home_name;
+	int ret, tret;
+
+	/*
+	 * Create a clean test directory for this run of the test program if the
+	 * environment variable isn't already set (as is done by make check).
+	 * home_name is only used in messages: home itself may be NULL, which
+	 * must not be handed to a "%s" conversion.
+	 */
+	if ((home_name = getenv("WIREDTIGER_HOME")) == NULL) {
+		home = home_name = "WT_HOME";
+		ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+	} else
+		home = NULL;
+
+	/* Open a connection; without one there is nothing to clean up. */
+	if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) {
+		fprintf(stderr, "Error connecting to %s: %s\n",
+		    home_name, wiredtiger_strerror(ret));
+		return (ret);
+	}
+
+	/*
+	 * Open a session, create a simple table, open a cursor and run the
+	 * operations.  Stop at the first failure so an unset session or
+	 * cursor handle is never used.
+	 */
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+		fprintf(stderr, "Error opening a session on %s: %s\n",
+		    home_name, wiredtiger_strerror(ret));
+	else if ((ret = session->create(session, "table:scope",
+	    "key_format=S,value_format=S,columns=(k,v)")) != 0 ||
+	    (ret = session->open_cursor(session,
+	    "table:scope", NULL, NULL, &cursor)) != 0)
+		fprintf(stderr, "Error setting up table:scope: %s\n",
+		    wiredtiger_strerror(ret));
+	else
+		ret = cursor_scope_ops(cursor);
+
+	/* Close the connection and clean up, keeping the first error. */
+	if ((tret = conn->close(conn, NULL)) != 0 && ret == 0)
+		ret = tret;
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_stat.c b/src/third_party/wiredtiger/examples/c/ex_stat.c
new file mode 100644
index 00000000000..0614a1de234
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_stat.c
@@ -0,0 +1,223 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_stat.c
+ * This is an example demonstrating how to query database statistics.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+
+int print_cursor(WT_CURSOR *);
+int print_database_stats(WT_SESSION *);
+int print_file_stats(WT_SESSION *);
+int print_overflow_pages(WT_SESSION *);
+int get_stat(WT_CURSOR *cursor, int stat_field, uint64_t *valuep);
+int print_derived_stats(WT_SESSION *);
+
+static const char *home;
+
+/*! [statistics display function] */
+/*
+ * print_cursor --
+ *	Walk a statistics cursor and print "description=value" for every
+ *	entry whose numeric value is not zero.  Running off the end of the
+ *	cursor (WT_NOTFOUND) counts as success.
+ */
+int
+print_cursor(WT_CURSOR *cursor)
+{
+	const char *description, *printable;
+	uint64_t count;
+	int ret;
+
+	for (;;) {
+		if ((ret = cursor->next(cursor)) != 0)
+			break;
+		ret = cursor->get_value(
+		    cursor, &description, &printable, &count);
+		if (ret != 0)
+			break;
+		if (count == 0)
+			continue;
+		printf("%s=%s\n", description, printable);
+	}
+
+	if (ret == WT_NOTFOUND)
+		return (0);
+	return (ret);
+}
+/*! [statistics display function] */
+
+/*
+ * print_database_stats --
+ *	Print the statistics returned by a cursor opened on "statistics:".
+ *	Returns 0 on success, otherwise the first error seen.
+ */
+int
+print_database_stats(WT_SESSION *session)
+{
+	WT_CURSOR *cursor;
+	int ret, tret;
+
+	/*! [statistics database function] */
+	if ((ret = session->open_cursor(session,
+	    "statistics:", NULL, NULL, &cursor)) != 0)
+		return (ret);
+
+	ret = print_cursor(cursor);
+	/* Always close the cursor, but don't lose a print_cursor error. */
+	if ((tret = cursor->close(cursor)) != 0 && ret == 0)
+		ret = tret;
+	/*! [statistics database function] */
+
+	return (ret);
+}
+
+/*
+ * print_file_stats --
+ *	Print the statistics returned by a cursor opened on
+ *	"statistics:table:access".  Returns 0 on success, otherwise the
+ *	first error seen.
+ */
+int
+print_file_stats(WT_SESSION *session)
+{
+	WT_CURSOR *cursor;
+	int ret, tret;
+
+	/*! [statistics table function] */
+	if ((ret = session->open_cursor(session,
+	    "statistics:table:access", NULL, NULL, &cursor)) != 0)
+		return (ret);
+
+	ret = print_cursor(cursor);
+	/* Always close the cursor, but don't lose a print_cursor error. */
+	if ((tret = cursor->close(cursor)) != 0 && ret == 0)
+		ret = tret;
+	/*! [statistics table function] */
+
+	return (ret);
+}
+
+/*
+ * print_overflow_pages --
+ *	Look up the WT_STAT_DSRC_BTREE_OVERFLOW statistic for table:access
+ *	and print it as "description=value".  Returns 0 on success, otherwise
+ *	the first error seen.
+ */
+int
+print_overflow_pages(WT_SESSION *session)
+{
+	/*! [statistics retrieve by key] */
+	WT_CURSOR *cursor;
+	const char *desc, *pvalue;
+	uint64_t value;
+	int ret, tret;
+
+	if ((ret = session->open_cursor(session,
+	    "statistics:table:access", NULL, NULL, &cursor)) != 0)
+		return (ret);
+
+	/*
+	 * Only print when both the search and the fetch succeeded: desc and
+	 * pvalue are not set otherwise.
+	 */
+	cursor->set_key(cursor, WT_STAT_DSRC_BTREE_OVERFLOW);
+	if ((ret = cursor->search(cursor)) == 0 &&
+	    (ret = cursor->get_value(cursor, &desc, &pvalue, &value)) == 0)
+		printf("%s=%s\n", desc, pvalue);
+
+	/* Always close the cursor, keeping the first error. */
+	if ((tret = cursor->close(cursor)) != 0 && ret == 0)
+		ret = tret;
+	/*! [statistics retrieve by key] */
+
+	return (ret);
+}
+
+/*! [statistics calculation helper function] */
+/*
+ * get_stat --
+ *	Search the statistics cursor for stat_field and store the statistic's
+ *	numeric value through valuep.  Returns the status of the search if it
+ *	fails, otherwise the status of fetching the value.
+ */
+int
+get_stat(WT_CURSOR *cursor, int stat_field, uint64_t *valuep)
+{
+	const char *description, *printable;
+	int ret;
+
+	cursor->set_key(cursor, stat_field);
+	ret = cursor->search(cursor);
+	if (ret == 0)
+		ret = cursor->get_value(
+		    cursor, &description, &printable, valuep);
+
+	return (ret);
+}
+/*! [statistics calculation helper function] */
+
+/*
+ * print_derived_stats --
+ *	Print two figures computed from the table:access statistics: the
+ *	percentage of the file not covered by the checkpoint, and the ratio
+ *	of bytes written from the cache to bytes inserted, removed and
+ *	updated through cursors.  Returns 0 on success, otherwise the first
+ *	error seen.
+ */
+int
+print_derived_stats(WT_SESSION *session)
+{
+	WT_CURSOR *cursor;
+	int ret, tret;
+
+	/*! [statistics calculate open table stats] */
+	if ((ret = session->open_cursor(session,
+	    "statistics:table:access", NULL, NULL, &cursor)) != 0)
+		return (ret);
+	/*! [statistics calculate open table stats] */
+
+	{
+	/*! [statistics calculate table fragmentation] */
+	uint64_t ckpt_size, file_size;
+
+	/*
+	 * Skip the calculation when a statistic could not be read or the file
+	 * is empty (the division would be by zero), and treat a checkpoint
+	 * larger than the file as 0% rather than letting the unsigned
+	 * subtraction wrap.
+	 */
+	if ((ret = get_stat(cursor,
+	    WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE, &ckpt_size)) == 0 &&
+	    (ret = get_stat(cursor,
+	    WT_STAT_DSRC_BLOCK_SIZE, &file_size)) == 0 && file_size != 0)
+		printf("File is %d%% fragmented\n",
+		    file_size < ckpt_size ? 0 :
+		    (int)(100 * (file_size - ckpt_size) / file_size));
+	/*! [statistics calculate table fragmentation] */
+	}
+
+	if (ret == 0) {
+	/*! [statistics calculate write amplification] */
+	uint64_t app_insert, app_remove, app_update, fs_writes;
+
+	/* Only compute the ratio once every input has been read. */
+	if ((ret = get_stat(cursor,
+	    WT_STAT_DSRC_CURSOR_INSERT_BYTES, &app_insert)) == 0 &&
+	    (ret = get_stat(cursor,
+	    WT_STAT_DSRC_CURSOR_REMOVE_BYTES, &app_remove)) == 0 &&
+	    (ret = get_stat(cursor,
+	    WT_STAT_DSRC_CURSOR_UPDATE_BYTES, &app_update)) == 0 &&
+	    (ret = get_stat(cursor,
+	    WT_STAT_DSRC_CACHE_BYTES_WRITE, &fs_writes)) == 0 &&
+	    app_insert + app_remove + app_update != 0)
+		printf("Write amplification is %.2f\n",
+		    (double)fs_writes /
+		    (double)(app_insert + app_remove + app_update));
+	/*! [statistics calculate write amplification] */
+	}
+
+	/* Always close the cursor, keeping the first error. */
+	if ((tret = cursor->close(cursor)) != 0 && ret == 0)
+		ret = tret;
+
+	return (ret);
+}
+
+/*
+ * main --
+ *	Open a database with all statistics enabled, insert one record into
+ *	table:access, checkpoint, and print the statistics reports.  Stops at
+ *	the first failing step; returns 0 on success.
+ */
+int
+main(void)
+{
+	WT_CONNECTION *conn;
+	WT_CURSOR *cursor;
+	WT_SESSION *session;
+	int ret, tret;
+
+	/*
+	 * Create a clean test directory for this run of the test program if the
+	 * environment variable isn't already set (as is done by make check).
+	 */
+	if (getenv("WIREDTIGER_HOME") == NULL) {
+		home = "WT_HOME";
+		ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+	} else
+		home = NULL;
+
+	/* Without a connection nothing below can run, and conn is not set. */
+	if ((ret = wiredtiger_open(
+	    home, NULL, "create,statistics=(all)", &conn)) != 0) {
+		fprintf(stderr, "Error opening the database: %s\n",
+		    wiredtiger_strerror(ret));
+		return (EXIT_FAILURE);
+	}
+
+	/* Create the table and insert a single record. */
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) == 0 &&
+	    (ret = session->create(
+	    session, "table:access", "key_format=S,value_format=S")) == 0 &&
+	    (ret = session->open_cursor(
+	    session, "table:access", NULL, NULL, &cursor)) == 0) {
+		cursor->set_key(cursor, "key");
+		cursor->set_value(cursor, "value");
+		ret = cursor->insert(cursor);
+		if ((tret = cursor->close(cursor)) != 0 && ret == 0)
+			ret = tret;
+	}
+
+	/* Each report runs only if everything before it succeeded. */
+	if (ret == 0)
+		ret = session->checkpoint(session, NULL);
+	if (ret == 0)
+		ret = print_database_stats(session);
+	if (ret == 0)
+		ret = print_file_stats(session);
+	if (ret == 0)
+		ret = print_overflow_pages(session);
+	if (ret == 0)
+		ret = print_derived_stats(session);
+
+	if (ret != 0)
+		fprintf(stderr, "ex_stat: %s\n", wiredtiger_strerror(ret));
+
+	return (conn->close(conn, NULL) == 0 ? ret : EXIT_FAILURE);
+}
diff --git a/src/third_party/wiredtiger/examples/c/ex_thread.c b/src/third_party/wiredtiger/examples/c/ex_thread.c
new file mode 100644
index 00000000000..5ea3a0a4894
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/c/ex_thread.c
@@ -0,0 +1,126 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_thread.c
+ * This is an example demonstrating how to create and access a simple
+ * table from multiple threads.
+ */
+
+#ifndef _WIN32
+#include <pthread.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+#include "windows_shim.h"
+#endif
+
+#include <wiredtiger.h>
+
+static const char *home;
+
+void *scan_thread(void *arg);
+
+#define NUM_THREADS 10
+
+/*! [thread scan] */
+/*
+ * scan_thread --
+ *	Thread start routine: open a session on the connection passed as the
+ *	argument, print every record in table:access and close the session.
+ *	Errors are reported on stderr; the return value is always NULL.
+ */
+void *
+scan_thread(void *conn_arg)
+{
+	WT_CONNECTION *conn;
+	WT_SESSION *session;
+	WT_CURSOR *cursor;
+	const char *key, *value;
+	int ret;
+
+	conn = conn_arg;
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+		fprintf(stderr, "WT_CONNECTION.open_session: %s\n",
+		    wiredtiger_strerror(ret));
+		return (NULL);
+	}
+	if ((ret = session->open_cursor(
+	    session, "table:access", NULL, NULL, &cursor)) != 0) {
+		fprintf(stderr, "WT_SESSION.open_cursor: %s\n",
+		    wiredtiger_strerror(ret));
+		(void)session->close(session, NULL);
+		return (NULL);
+	}
+
+	/* Show all records, stopping at the first failed call. */
+	while ((ret = cursor->next(cursor)) == 0) {
+		if ((ret = cursor->get_key(cursor, &key)) != 0 ||
+		    (ret = cursor->get_value(cursor, &value)) != 0)
+			break;
+
+		printf("Got record: %s : %s\n", key, value);
+	}
+	/* WT_NOTFOUND is the normal end of the scan. */
+	if (ret != WT_NOTFOUND)
+		fprintf(stderr,
+		    "scan_thread: %s\n", wiredtiger_strerror(ret));
+
+	/*
+	 * Release this thread's session rather than leaving it open until
+	 * the connection is closed.
+	 */
+	if ((ret = session->close(session, NULL)) != 0)
+		fprintf(stderr,
+		    "WT_SESSION.close: %s\n", wiredtiger_strerror(ret));
+
+	return (NULL);
+}
+/*! [thread scan] */
+
+/*! [thread main] */
+/*
+ * main --
+ *	Open a database, insert one record into table:access, then start
+ *	NUM_THREADS threads running scan_thread and wait for them.  Returns 0
+ *	on success, otherwise the first error seen.
+ */
+int
+main(void)
+{
+	WT_CONNECTION *conn;
+	WT_SESSION *session;
+	WT_CURSOR *cursor;
+	pthread_t threads[NUM_THREADS];
+	const char *home_name;
+	int i, nthreads, ret, tret;
+
+	/*
+	 * Create a clean test directory for this run of the test program if the
+	 * environment variable isn't already set (as is done by make check).
+	 * home_name is only used in messages: home itself may be NULL, which
+	 * must not be handed to a "%s" conversion.
+	 */
+	if ((home_name = getenv("WIREDTIGER_HOME")) == NULL) {
+		home = home_name = "WT_HOME";
+		ret = system("rm -rf WT_HOME && mkdir WT_HOME");
+	} else
+		home = NULL;
+
+	/* Stop on failure: conn was not set and must not be dereferenced. */
+	if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0) {
+		fprintf(stderr, "Error connecting to %s: %s\n",
+		    home_name, wiredtiger_strerror(ret));
+		return (ret);
+	}
+
+	/* Create the table and open a cursor for the record the threads read. */
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0 ||
+	    (ret = session->create(session, "table:access",
+	    "key_format=S,value_format=S")) != 0 ||
+	    (ret = session->open_cursor(session, "table:access", NULL,
+	    "overwrite", &cursor)) != 0) {
+		fprintf(stderr, "Error setting up table:access: %s\n",
+		    wiredtiger_strerror(ret));
+		(void)conn->close(conn, NULL);
+		return (ret);
+	}
+	cursor->set_key(cursor, "key1");
+	cursor->set_value(cursor, "value1");
+	ret = cursor->insert(cursor);
+	if ((tret = session->close(session, NULL)) != 0 && ret == 0)
+		ret = tret;
+
+	/*
+	 * Count the threads that actually started so that only valid thread
+	 * handles are joined.
+	 */
+	for (nthreads = 0; nthreads < NUM_THREADS; nthreads++)
+		if ((tret = pthread_create(
+		    &threads[nthreads], NULL, scan_thread, conn)) != 0) {
+			fprintf(stderr,
+			    "pthread_create: %s\n", strerror(tret));
+			if (ret == 0)
+				ret = tret;
+			break;
+		}
+
+	for (i = 0; i < nthreads; i++)
+		if ((tret = pthread_join(threads[i], NULL)) != 0 && ret == 0)
+			ret = tret;
+
+	if ((tret = conn->close(conn, NULL)) != 0 && ret == 0)
+		ret = tret;
+
+	return (ret);
+}
+/*! [thread main] */
diff --git a/src/third_party/wiredtiger/examples/java/Makefile.am b/src/third_party/wiredtiger/examples/java/Makefile.am
new file mode 100644
index 00000000000..c7fbfffa48c
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/Makefile.am
@@ -0,0 +1,21 @@
+# Add the top of the build tree to the preprocessor search path.
+AM_CPPFLAGS = -I$(abs_top_builddir)
+
+# Directory holding the Java example sources.
+JAVAEXAMPLES = $(top_srcdir)/examples/java/com/wiredtiger/examples
+
+# TODO: How to add to existing Javadoc from main API?
+# JDOCDIR = $(top_srcdir)/docs/java
+# java_DATA = $(JDOCDIR)/index.html
+
+# Java sources handled by this Makefile.
+javadir = $(datadir)/java
+dist_java_JAVA = \
+ $(JAVAEXAMPLES)/ex_access.java
+
+# The default build also produces wiredtiger.jar.
+all-local: wiredtiger.jar
+
+# NOTE(review): JDOCDIR is only assigned in the commented-out lines above, so
+# unless it is set elsewhere this rule expands with an empty JDOCDIR.
+$(JDOCDIR)/index.html: $(dist_java_JAVA)
+ mkdir -p $(JDOCDIR)
+ javadoc -public -d $(JDOCDIR) -link http://docs.oracle.com/javase/6/docs/api $(JAVAEXAMPLES)/[A-Z]*.java
+
+# Package the com/ tree found in the top build directory into wiredtiger.jar.
+wiredtiger.jar: $(dist_java_JAVA)
+ (cd $(top_builddir) && \
+ $(JAR) -cf wiredtiger.jar com/)
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_access.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_access.java
new file mode 100644
index 00000000000..d4046495df5
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_access.java
@@ -0,0 +1,93 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_access.java
+ * demonstrates how to create and access a simple table.
+ */
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+
+public class ex_access { /* Minimal walk-through: open a connection and session, create table:t, insert one record, list the keys, close the connection. */
+ public static void main(String[] args) {
+ /*! [access example connection] */
+ Connection conn;
+ Session s;
+ Cursor c;
+
+ try {
+ conn = wiredtiger.open("WT_HOME", "create");
+ s = conn.open_session(null);
+ } catch (WiredTigerException wte) {
+ System.err.println("WiredTigerException: " + wte);
+ return;
+ }
+ /*! [access example connection] */
+ try {
+ /*! [access example table create] */
+ s.create("table:t", "key_format=S,value_format=u");
+ /*! [access example table create] */
+ /*! [access example cursor open] */
+ c = s.open_cursor("table:t", null, null);
+ /*! [access example cursor open] */
+ } catch (WiredTigerException wte) {
+ System.err.println("WiredTigerException: " + wte);
+ return; /* conn is left open on this path; only the normal path at the end calls conn.close(). */
+ }
+ System.out.println("Key format: " + c.getKeyFormat());
+ System.out.println("Value format: " + c.getValueFormat());
+ /*! [access example cursor insert] */
+ try {
+ c.putKeyString("foo");
+ c.putValueByteArray("bar".getBytes());
+ c.insert();
+ } catch (WiredTigerPackingException wtpe) {
+ System.err.println("WiredTigerPackingException: " + wtpe);
+ } catch (WiredTigerException wte) {
+ System.err.println("WiredTigerException: " + wte);
+ }
+ /*! [access example cursor insert] */
+ /*! [access example cursor list] */
+ try {
+ c.reset();
+ while (c.next() == 0) {
+ System.out.println("Got: " + c.getKeyString());
+ }
+ } catch (WiredTigerPackingException wtpe) {
+ System.err.println("WiredTigerPackingException: " + wtpe);
+ } catch (WiredTigerException wte) {
+ System.err.println("WiredTigerException: " + wte);
+ }
+ /*! [access example cursor list] */
+
+ /*! [access example close] */
+ try {
+ conn.close(null);
+ } catch (WiredTigerException wte) {
+ System.err.println("WiredTigerException: " + wte);
+ }
+ /*! [access example close] */
+ }
+}
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_all.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_all.java
new file mode 100644
index 00000000000..8648d95a185
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_all.java
@@ -0,0 +1,1009 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_all.java
+ * Containing a call to every method in the WiredTiger API.
+ *
+ * It doesn't do anything very useful, just demonstrates how to call each
+ * method. This file is used to populate the API reference with code
+ * fragments.
+ */
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+import java.io.*;
+import java.nio.*;
+
+/* Note: indentation is non-standard so it will display nicely in doc. */
+public class ex_all {
+
+public static String progname = "ex_all";
+
+public static int cursor_ops(Session session)
+ throws WiredTigerException
+{ /* Runs each Cursor API fragment in turn; the [name] marker comments bracket the fragments this file supplies to the API reference. */
+ Cursor cursor; /* Rebound by most fragments below, usually without closing the previous cursor. */
+ int ret;
+
+ /*! [Open a cursor] */
+ cursor = session.open_cursor("table:mytable", null, null);
+ /*! [Open a cursor] */
+
+ /*! [Open a cursor on the metadata] */
+ cursor = session.open_cursor("metadata:", null, null);
+ /*! [Open a cursor on the metadata] */
+
+ {
+ Cursor duplicate;
+ String key = "some key";
+ /*! [Duplicate a cursor] */
+ cursor = session.open_cursor("table:mytable", null, null);
+ cursor.putKeyString(key);
+ ret = cursor.search();
+
+ /* Duplicate the cursor. */
+ duplicate = session.open_cursor(null, cursor, null);
+ /*! [Duplicate a cursor] */
+ }
+
+ {
+ Cursor overwrite_cursor;
+ String key = "some key", value = "some value";
+ /*! [Reconfigure a cursor] */
+ cursor = session.open_cursor("table:mytable", null, null);
+ cursor.putKeyString(key);
+
+ /* Reconfigure the cursor to overwrite the record. */
+ overwrite_cursor = session.open_cursor(null, cursor, "overwrite");
+ ret = cursor.close();
+
+ overwrite_cursor.putValueString(value);
+ ret = overwrite_cursor.insert();
+ /*! [Reconfigure a cursor] */
+ }
+
+ {
+ /*! [boolean configuration string example] */
+ cursor = session.open_cursor("table:mytable", null, "overwrite");
+ cursor = session.open_cursor("table:mytable", null, "overwrite=true");
+ cursor = session.open_cursor("table:mytable", null, "overwrite=1");
+ /*! [boolean configuration string example] */
+ }
+
+ {
+ /*! [open a named checkpoint] */
+ cursor = session.open_cursor("table:mytable", null, "checkpoint=midnight");
+ /*! [open a named checkpoint] */
+ }
+
+ {
+ /*! [open the default checkpoint] */
+ cursor = session.open_cursor("table:mytable", null,
+ "checkpoint=WiredTigerCheckpoint");
+ /*! [open the default checkpoint] */
+ }
+
+ {
+ /*! [Get the cursor's string key] */
+ String key; /* Get the cursor's string key. */
+ key = cursor.getKeyString();
+ /*! [Get the cursor's string key] */
+ }
+
+ {
+ /*! [Set the cursor's string key] */
+ /* Set the cursor's string key. */
+ String key = "another key";
+ cursor.putKeyString(key);
+ /*! [Set the cursor's string key] */
+ }
+
+ {
+ /*! [Get the cursor's record number key] */
+ long recno; /* Get the cursor's record number key. */
+ recno = cursor.getKeyLong();
+ /*! [Get the cursor's record number key] */
+ }
+
+ {
+ /*! [Set the cursor's record number key] */
+ long recno = 37; /* Set the cursor's record number key. */
+ cursor.putKeyLong(recno);
+ /*! [Set the cursor's record number key] */
+ }
+
+ {
+ /*! [Get the cursor's composite key] */
+ /* Get the cursor's "SiH" format composite key. */
+ String first;
+ int second;
+ short third;
+
+ first = cursor.getKeyString();
+ second = cursor.getKeyInt();
+ third = cursor.getKeyShort();
+ /*! [Get the cursor's composite key] */
+ }
+
+ {
+ /*! [Set the cursor's composite key] */
+ /* Set the cursor's "SiH" format composite key. */
+ cursor.putKeyString("first");
+ cursor.putKeyInt(5);
+ cursor.putKeyShort((short)7);
+ /*! [Set the cursor's composite key] */
+ }
+
+ {
+ /*! [Get the cursor's string value] */
+ String value; /* Get the cursor's string value. */
+ value = cursor.getValueString();
+ /*! [Get the cursor's string value] */
+ }
+
+ {
+ /*! [Set the cursor's string value] */
+ /* Set the cursor's string value. */
+ String value = "another value";
+ cursor.putValueString(value);
+ /*! [Set the cursor's string value] */
+ }
+
+ {
+ /*! [Get the cursor's raw value] */
+ byte[] value; /* Get the cursor's raw value. */
+ value = cursor.getValueByteArray();
+ /*! [Get the cursor's raw value] */
+ }
+
+ {
+ /*! [Set the cursor's raw value] */
+ byte[] value; /* Set the cursor's raw value. */
+ value = "another value".getBytes();
+ cursor.putValueByteArray(value);
+ /*! [Set the cursor's raw value] */
+ }
+
+ /*! [Return the next record] */
+ ret = cursor.next();
+ /*! [Return the next record] */
+
+ /*! [Return the previous record] */
+ ret = cursor.prev();
+ /*! [Return the previous record] */
+
+ /*! [Reset the cursor] */
+ ret = cursor.reset();
+ /*! [Reset the cursor] */
+
+ {
+ Cursor other = null; /* NOTE(review): compare() below is handed this null reference -- confirm the binding tolerates it. */
+ /*! [Cursor comparison] */
+ int compare;
+ compare = cursor.compare(other);
+ if (compare == 0) {
+ /* Cursors reference the same key */
+ } else if (compare < 0) {
+ /* Cursor key less than other key */
+ } else if (compare > 0) {
+ /* Cursor key greater than other key */
+ }
+ /*! [Cursor comparison] */
+ }
+
+ {
+ /*! [Search for an exact match] */
+ String key = "some key";
+ cursor.putKeyString(key);
+ ret = cursor.search();
+ /*! [Search for an exact match] */
+ }
+
+ cursor_search_near(cursor); /* Return value is ignored. */
+
+ {
+ /*! [Insert a new record or overwrite an existing record] */
+ /* Insert a new record or overwrite an existing record. */
+ String key = "some key", value = "some value";
+ cursor = session.open_cursor("table:mytable", null, null);
+ cursor.putKeyString(key);
+ cursor.putValueString(value);
+ ret = cursor.insert();
+ /*! [Insert a new record or overwrite an existing record] */
+ }
+
+ {
+ /*! [Insert a new record and fail if the record exists] */
+ /* Insert a new record and fail if the record exists. */
+ String key = "some key", value = "some value";
+ cursor = session.open_cursor("table:mytable", null, "overwrite=false");
+ cursor.putKeyString(key);
+ cursor.putValueString(value);
+ ret = cursor.insert();
+ /*! [Insert a new record and fail if the record exists] */
+ }
+
+ {
+ /*! [Insert a new record and assign a record number] */
+ /* Insert a new record and assign a record number. */
+ long recno;
+ String value = "some value";
+ cursor = session.open_cursor("table:mytable", null, "append");
+ cursor.putValueString(value);
+ ret = cursor.insert();
+ if (ret == 0)
+ recno = cursor.getKeyLong();
+ /*! [Insert a new record and assign a record number] */
+ }
+
+ {
+ /*! [Update an existing record or insert a new record] */
+ String key = "some key", value = "some value";
+ cursor = session.open_cursor("table:mytable", null, null);
+ cursor.putKeyString(key);
+ cursor.putValueString(value);
+ ret = cursor.update();
+ /*! [Update an existing record or insert a new record] */
+ }
+
+ {
+ /*! [Update an existing record and fail if DNE] */
+ String key = "some key", value = "some value";
+ cursor = session.open_cursor("table:mytable", null, "overwrite=false");
+ cursor.putKeyString(key);
+ cursor.putValueString(value);
+ ret = cursor.update();
+ /*! [Update an existing record and fail if DNE] */
+ }
+
+ {
+ /*! [Remove a record] */
+ String key = "some key";
+ cursor = session.open_cursor("table:mytable", null, null);
+ cursor.putKeyString(key);
+ ret = cursor.remove();
+ /*! [Remove a record] */
+ }
+
+ {
+ /*! [Remove a record and fail if DNE] */
+ String key = "some key";
+ cursor = session.open_cursor("table:mytable", null, "overwrite=false");
+ cursor.putKeyString(key);
+ ret = cursor.remove();
+ /*! [Remove a record and fail if DNE] */
+ }
+
+ {
+ /*! [Display an error] */
+ try {
+ String key = "non-existent key";
+ cursor.putKeyString(key);
+ if ((ret = cursor.remove()) != 0) {
+ System.err.println(
+ "cursor.remove: " + wiredtiger.wiredtiger_strerror(ret));
+ return (ret);
+ }
+ } catch (WiredTigerException wte) { /* Catch severe errors. */
+ System.err.println("cursor.remove exception: " + wte);
+ }
+ /*! [Display an error] */
+ }
+
+ /*! [Close the cursor] */
+ ret = cursor.close();
+ /*! [Close the cursor] */
+
+ return (ret); /* Result of the final cursor.close(). */
+}
+
+static int
+cursor_search_near(Cursor cursor)
+ throws WiredTigerException
+{ /* Shows search_near() and the forward/backward scans built on it, all starting from the same key. */
+ int ret;
+ String key = "some key";
+ SearchStatus status;
+
+ /*! [Search for an exact or adjacent match] */
+ cursor.putKeyString(key);
+ status = cursor.search_near();
+ if (status == SearchStatus.FOUND) {
+ /* an exact match */
+ } else if (status == SearchStatus.SMALLER) {
+ /* returned smaller key */
+ } else if (status == SearchStatus.LARGER) {
+ /* returned larger key */
+ } else if (status == SearchStatus.NOTFOUND) {
+ /* no match found */
+ }
+ /*! [Search for an exact or adjacent match] */
+
+ /*! [Forward scan greater than or equal] */
+ cursor.putKeyString(key);
+ status = cursor.search_near();
+ if (status == SearchStatus.FOUND || status == SearchStatus.LARGER) {
+ /* include first key returned in the scan */
+ }
+
+ while ((ret = cursor.next()) == 0) {
+ /* the rest of the scan */
+ }
+ /*! [Forward scan greater than or equal] */
+
+ /*! [Backward scan less than] */
+ cursor.putKeyString(key);
+ status = cursor.search_near();
+ if (status == SearchStatus.SMALLER) {
+ /* include first key returned in the scan */
+ }
+
+ while ((ret = cursor.prev()) == 0) {
+ /* the rest of the scan */
+ }
+ /*! [Backward scan less than] */
+
+ return (ret); /* Always non-zero: the loop above only ends once cursor.prev() stops returning 0. */
+}
+
+static int
+checkpoint_ops(Session session)
+ throws WiredTigerException
+{ /* Issues session.checkpoint() with a range of configuration strings; only the last call's result is returned. */
+ int ret;
+
+ /*! [Checkpoint examples] */
+ /* Checkpoint the database. */
+ ret = session.checkpoint(null);
+
+ /* Checkpoint of the database, creating a named snapshot. */
+ ret = session.checkpoint("name=June01");
+
+ /*
+ * Checkpoint a list of objects.
+ * JSON parsing requires quoting the list of target URIs.
+ */
+ ret = session.
+ checkpoint("target=(\"table:table1\",\"table:table2\")");
+
+ /*
+ * Checkpoint a list of objects, creating a named snapshot.
+ * JSON parsing requires quoting the list of target URIs.
+ */
+ ret = session.
+ checkpoint("target=(\"table:mytable\"),name=midnight");
+
+ /* Checkpoint the database, discarding all previous snapshots. */
+ ret = session.checkpoint("drop=(from=all)");
+
+ /* Checkpoint the database, discarding the "midnight" snapshot. */
+ ret = session.checkpoint("drop=(midnight)");
+
+ /*
+ * Checkpoint the database, discarding all snapshots after and
+ * including "noon".
+ */
+ ret = session.checkpoint("drop=(from=noon)");
+
+ /*
+ * Checkpoint the database, discarding all snapshots before and
+ * including "midnight".
+ */
+ ret = session.checkpoint("drop=(to=midnight)");
+
+ /*
+ * Create a checkpoint of a table, creating the "July01" snapshot and
+ * discarding the "May01" and "June01" snapshots.
+ * JSON parsing requires quoting the list of target URIs.
+ */
+ ret = session.checkpoint("target=(\"table:mytable\"),name=July01,drop=(May01,June01)");
+ /*! [Checkpoint examples] */
+
+ /*! [JSON quoting example] */
+ /*
+ * Checkpoint a list of objects.
+ * JSON parsing requires quoting the list of target URIs.
+ */
+ ret = session.
+ checkpoint("target=(\"table:table1\",\"table:table2\")");
+ /*! [JSON quoting example] */
+
+ return (ret); /* Result of the final checkpoint call only. */
+}
+
+static boolean
+cursor_statistics(Session session)
+ throws WiredTigerException
+{ /* Opens statistics cursors with several URIs and configurations; none of them is closed here. */
+ Cursor cursor;
+
+ /*! [Statistics cursor database] */
+ cursor = session.open_cursor(
+ "statistics:", null, null);
+ /*! [Statistics cursor database] */
+
+ /*! [Statistics cursor table] */
+ cursor = session.open_cursor(
+ "statistics:table:mytable", null, null);
+ /*! [Statistics cursor table] */
+
+ /*! [Statistics cursor table fast] */
+ cursor = session.open_cursor("statistics:table:mytable", null, "statistics=(fast)");
+ /*! [Statistics cursor table fast] */
+
+ /*! [Statistics clear configuration] */
+ cursor = session.open_cursor("statistics:", null, "statistics=(fast,clear)");
+ /*! [Statistics clear configuration] */
+
+ /*! [Statistics cursor clear configuration] */
+ cursor = session.open_cursor("statistics:table:mytable",
+ null, "statistics=(all,clear)");
+ /*! [Statistics cursor clear configuration] */
+
+ return (true); /* Unconditional; a failure can only surface as an exception. */
+}
+
+static int
+session_ops(Session session)
+ throws WiredTigerException
+{ /* Exercises the Session methods against table:mytable (create/drop with various configurations, compact, rename, salvage, truncate, upgrade, verify), then closes the session it was given. */
+ int ret;
+
+ /*! [Reconfigure a session] */
+ ret = session.reconfigure("isolation=snapshot");
+ /*! [Reconfigure a session] */
+
+ /*! [Create a table] */
+ ret = session.create("table:mytable", "key_format=S,value_format=S");
+ /*! [Create a table] */
+ ret = session.drop("table:mytable", null);
+
+ /*! [Create a column-store table] */
+ ret = session.create("table:mytable", "key_format=r,value_format=S");
+ /*! [Create a column-store table] */
+ ret = session.drop("table:mytable", null);
+
+ /*! [Create a table with columns] */
+ /*
+ * Create a table with columns: keys are record numbers, values are
+ * (string, signed 32-bit integer, unsigned 16-bit integer).
+ */
+ ret = session.create("table:mytable",
+ "key_format=r,value_format=SiH," +
+ "columns=(id,department,salary,year-started)");
+ /*! [Create a table with columns] */
+ ret = session.drop("table:mytable", null);
+
+ /*
+ * This example code gets run, and the compression libraries might not
+ * be loaded, causing the create to fail. The documentation requires
+ * the code snippets, use if (false) to avoid running it.
+ */
+ if (false) { // MIGHT_NOT_RUN
+ /*! [Create a bzip2 compressed table] */
+ ret = session.create("table:mytable",
+ "block_compressor=bzip2,key_format=S,value_format=S");
+ /*! [Create a bzip2 compressed table] */
+ ret = session.drop("table:mytable", null);
+
+ /*! [Create a snappy compressed table] */
+ ret = session.create("table:mytable",
+ "block_compressor=snappy,key_format=S,value_format=S");
+ /*! [Create a snappy compressed table] */
+ ret = session.drop("table:mytable", null);
+
+ /*! [Create a zlib compressed table] */
+ ret = session.create("table:mytable",
+ "block_compressor=zlib,key_format=S,value_format=S");
+ /*! [Create a zlib compressed table] */
+ ret = session.drop("table:mytable", null);
+ } // if (false)
+
+ /*! [Configure checksums to uncompressed] */
+ ret = session.create("table:mytable",
+ "key_format=S,value_format=S,checksum=uncompressed");
+ /*! [Configure checksums to uncompressed] */
+ ret = session.drop("table:mytable", null);
+
+ /*! [Configure dictionary compression on] */
+ ret = session.create("table:mytable",
+ "key_format=S,value_format=S,dictionary=1000");
+ /*! [Configure dictionary compression on] */
+ ret = session.drop("table:mytable", null);
+
+ /*! [Configure key prefix compression on] */
+ ret = session.create("table:mytable",
+ "key_format=S,value_format=S,prefix_compression=true");
+ /*! [Configure key prefix compression on] */
+ ret = session.drop("table:mytable", null);
+
+ if (false) { // MIGHT_NOT_RUN
+ /* Requires sync_file_range */
+ /*! [os_cache_dirty_max configuration] */
+ ret = session.create(
+ "table:mytable", "os_cache_dirty_max=500MB");
+ /*! [os_cache_dirty_max configuration] */
+ ret = session.drop("table:mytable", null);
+
+ /* Requires posix_fadvise */
+ /*! [os_cache_max configuration] */
+ ret = session.create("table:mytable", "os_cache_max=1GB");
+ /*! [os_cache_max configuration] */
+ ret = session.drop("table:mytable", null);
+ } // if (false)
+
+ /*! [Configure block_allocation] */
+ ret = session.create("table:mytable",
+ "key_format=S,value_format=S,block_allocation=first");
+ /*! [Configure block_allocation] */
+ ret = session.drop("table:mytable", null);
+
+ /*! [Create a cache-resident object] */
+ ret = session.create("table:mytable", "key_format=r,value_format=S,cache_resident=true");
+ /*! [Create a cache-resident object] */
+ ret = session.drop("table:mytable", null);
+
+ {
+ /* Create a table for the session operations. */
+ ret = session.create(
+ "table:mytable", "key_format=S,value_format=S");
+
+ /*! [Compact a table] */
+ ret = session.compact("table:mytable", null);
+ /*! [Compact a table] */
+
+ /*! [Rename a table] */
+ ret = session.rename("table:old", "table:new", null);
+ /*! [Rename a table] */
+
+ /*! [Salvage a table] */
+ ret = session.salvage("table:mytable", null);
+ /*! [Salvage a table] */
+
+ /*! [Truncate a table] */
+ ret = session.truncate("table:mytable", null, null, null);
+ /*! [Truncate a table] */
+
+ {
+ /*
+ * Insert a pair of keys so we can truncate a range.
+ */
+ Cursor cursor;
+ cursor = session.open_cursor(
+ "table:mytable", null, null);
+ cursor.putKeyString("June01");
+ cursor.putValueString("value");
+ ret = cursor.update();
+ cursor.putKeyString("June30");
+ cursor.putValueString("value");
+ ret = cursor.update();
+ cursor.close();
+
+ {
+ /*! [Truncate a range] */
+ Cursor start, stop;
+
+ start = session.open_cursor(
+ "table:mytable", null, null);
+ start.putKeyString("June01");
+ ret = start.search();
+
+ stop = session.open_cursor(
+ "table:mytable", null, null);
+ stop.putKeyString("June30");
+ ret = stop.search();
+
+ ret = session.truncate(null, start, stop, null);
+ /*! [Truncate a range] */
+ }
+ }
+
+ /*! [Upgrade a table] */
+ ret = session.upgrade("table:mytable", null);
+ /*! [Upgrade a table] */
+
+ /*! [Verify a table] */
+ ret = session.verify("table:mytable", null);
+ /*! [Verify a table] */
+
+ /*! [Drop a table] */
+ ret = session.drop("table:mytable", null);
+ /*! [Drop a table] */
+ }
+
+ /*! [Close a session] */
+ ret = session.close(null);
+ /*! [Close a session] */
+
+ return (ret); /* Result of session.close(); every earlier result has been overwritten. */
+}
+
+static int
+transaction_ops(Connection conn, Session session)
+ throws WiredTigerException
+{ /* Shows commit/rollback handling and isolation settings; note that session is rebound to a newly opened session near the end. */
+ Cursor cursor;
+ int ret;
+
+ /*! [transaction commit/rollback] */
+ cursor = session.open_cursor("table:mytable", null, null);
+ ret = session.begin_transaction(null);
+ /*
+ * Cursors may be opened before or after the transaction begins, and in
+ * either case, subsequent operations are included in the transaction.
+ * The begin_transaction call resets all open cursors.
+ */
+
+ cursor.putKeyString("key");
+ cursor.putValueString("value");
+ switch (ret = cursor.update()) {
+ case 0: /* Update success */
+ ret = session.commit_transaction(null);
+ /*
+ * The commit_transaction call resets all open cursors.
+ * If commit_transaction fails, the transaction was rolled-back.
+ */
+ break;
+ case wiredtiger.WT_ROLLBACK: /* Update conflict */
+ default: /* Other error */
+ ret = session.rollback_transaction(null);
+ /* The rollback_transaction call resets all open cursors. */
+ break;
+ }
+
+ /* Cursors remain open and may be used for multiple transactions. */
+ /*! [transaction commit/rollback] */
+ ret = cursor.close();
+
+ /*! [transaction isolation] */
+ /* A single transaction configured for snapshot isolation. */
+ cursor = session.open_cursor("table:mytable", null, null);
+ ret = session.begin_transaction("isolation=snapshot");
+ cursor.putKeyString("some-key");
+ cursor.putValueString("some-value");
+ ret = cursor.update();
+ ret = session.commit_transaction(null);
+ /*! [transaction isolation] */
+
+ /*! [session isolation configuration] */
+ /* Open a session configured for read-uncommitted isolation. */
+ session = conn.open_session(
+ "isolation=read_uncommitted");
+ /*! [session isolation configuration] */
+
+ /*! [session isolation re-configuration] */
+ /* Re-configure a session for snapshot isolation. */
+ ret = session.reconfigure("isolation=snapshot");
+ /*! [session isolation re-configuration] */
+
+ return (ret); /* Result of reconfigure() on the session opened just above, not on the caller's session. */
+}
+
+/*! [Implement WT_COLLATOR] */
+/* Not available for java */
+/*! [Implement WT_COLLATOR] */
+
+/*! [WT_EXTRACTOR] */
+/* Not available for java */
+/*! [WT_EXTRACTOR] */
+
+static int
+connection_ops(Connection conn)
+ throws WiredTigerException
+{ /* Runs the Connection fragments, opens a session for session_ops(), and finally closes conn. */
+ int ret;
+
+ if (false) { // Might not run.
+ /*! [Load an extension] */
+ ret = conn.load_extension("my_extension.dll", null);
+
+ ret = conn.load_extension(
+ "datasource/libdatasource.so",
+ "config=[device=/dev/sd1,alignment=64]");
+ /*! [Load an extension] */
+ } // if (false)
+
+ /*! [Reconfigure a connection] */
+ ret = conn.reconfigure("eviction_target=75");
+ /*! [Reconfigure a connection] */
+
+ /*! [Get the database home directory] */
+ System.out.println("The database home is " + conn.get_home());
+ /*! [Get the database home directory] */
+
+ /*! [Check if the database is newly created] */
+ if (conn.is_new() != 0) {
+ /* First time initialization. */
+ }
+ /*! [Check if the database is newly created] */
+
+ {
+ /*! [Open a session] */
+ Session session;
+ session = conn.open_session(null);
+ /*! [Open a session] */
+
+ session_ops(session); /* session_ops() ends by closing this session; its return value is ignored. */
+ }
+
+ /*! [Configure method configuration] */
+ /*
+ * Applications opening a cursor for the data-source object "my_data"
+ * have an additional configuration option "entries", which is an
+ * integer type, defaults to 5, and must be an integer between 1 and 10.
+ */
+ ret = conn.configure_method(
+ "session.open_cursor",
+ "my_data:", "entries=5", "int", "min=1,max=10");
+
+ /*
+ * Applications opening a cursor for the data-source object "my_data"
+ * have an additional configuration option "devices", which is a list
+ * of strings.
+ */
+ ret = conn.configure_method(
+ "session.open_cursor", "my_data:", "devices", "list", null);
+ /*! [Configure method configuration] */
+
+ /*! [Close a connection] */
+ ret = conn.close(null);
+ /*! [Close a connection] */
+
+ return (ret); /* Result of conn.close(); conn is closed by the time this returns. */
+}
+
+static int
+pack_ops(Session session)
+{ /* Placeholder for the packing fragments, which are marked as not available for Java; session is unused. */
+ {
+ /*! [Get the packed size] */
+ /* Not available for java */
+ /*! [Get the packed size] */
+ }
+
+ {
+ /*! [Pack fields into a buffer] */
+ /* Not available for java */
+ /*! [Pack fields into a buffer] */
+ }
+
+ {
+ /*! [Unpack fields from a buffer] */
+ /* Not available for java */
+ /*! [Unpack fields from a buffer] */
+ }
+
+ return (0); /* Nothing is executed, so the result is always 0. */
+}
+
+static boolean
+backup(Session session)
+ throws WiredTigerException
+{
+ /*
+ * Copy every file named by a "backup:" cursor from databasedir into
+ * backdir, then take the named checkpoint "June01".
+ *
+ * Returns true only when the backup directory was created, every file
+ * was copied, the backup cursor closed with a zero result and the
+ * checkpoint returned 0. A copy failure is reported on stderr and
+ * makes the result false. A WiredTigerException raised while closing
+ * the cursor or taking the checkpoint propagates to the caller.
+ */
+
+ /*! [backup]*/
+ Cursor cursor = null;
+ String filename;
+ int ret = 0;
+ boolean copied = false;
+ String databasedir = "/path/database";
+ String backdir = "/path/database.backup";
+ final String sep = File.separator;
+
+ try {
+ /* Create the backup directory. */
+ if (!(new File(backdir)).mkdir()) {
+ System.err.println(progname + ": cannot create backup dir: " +
+ backdir);
+ return false;
+ }
+
+ /* Open the backup data source. */
+ cursor = session.open_cursor("backup:", null, null);
+
+ /* Copy the list of files. */
+ while ((ret = cursor.next()) == 0 &&
+ (filename = cursor.getKeyString()) != null) {
+ String src = databasedir + sep + filename;
+ String dest = backdir + sep + filename;
+ java.nio.file.Files.copy(
+ new java.io.File(src).toPath(),
+ new java.io.File(dest).toPath(),
+ java.nio.file.StandardCopyOption.REPLACE_EXISTING,
+ java.nio.file.StandardCopyOption.COPY_ATTRIBUTES);
+ }
+ if (ret == wiredtiger.WT_NOTFOUND)
+ ret = 0;
+ if (ret != 0)
+ System.err.println(progname +
+ ": cursor next(backup:) failed: " +
+ wiredtiger.wiredtiger_strerror(ret));
+
+ /* Remember the outcome of the scan before anything overwrites ret. */
+ copied = (ret == 0);
+ }
+ catch (Exception ex) {
+ System.err.println(progname +
+ ": backup failed: " + ex.toString());
+ }
+ finally {
+ /* Release the backup cursor even if a copy failed part way. */
+ if (cursor != null && cursor.close() != 0)
+ copied = false;
+ }
+ /*! [backup]*/
+
+ /*! [backup of a checkpoint]*/
+ ret = session.checkpoint("drop=(from=June01),name=June01");
+ /*! [backup of a checkpoint]*/
+
+ return (copied && ret == 0);
+}
+
+public static int
+allExample()
+ throws WiredTigerException
+{ /* Opens a connection on home once per configuration fragment and closes it again; the if (false) blocks exist for the documentation only and never execute. */
+ Connection conn;
+ int ret = 0; /* Only assigned inside the second disabled block below. */
+ String home = "/home/example/WT_TEST";
+
+ /*! [Open a connection] */
+ conn = wiredtiger.open(home, "create,cache_size=500M");
+ /*! [Open a connection] */
+
+ connection_ops(conn);
+ /*
+ * The connection has been closed.
+ */
+
+ if (false) { // MIGHT_NOT_RUN
+ /*
+ * This example code gets run, and the compression libraries might not
+ * be installed, causing the open to fail. The documentation requires
+ * the code snippets, use if (false) to avoid running it.
+ */
+ /*! [Configure bzip2 extension] */
+ conn = wiredtiger.open(home,
+ "create," +
+ "extensions=[/usr/local/lib/libwiredtiger_bzip2.so]");
+ /*! [Configure bzip2 extension] */
+ conn.close(null);
+
+ /*! [Configure snappy extension] */
+ conn = wiredtiger.open(home,
+ "create," +
+ "extensions=[/usr/local/lib/libwiredtiger_snappy.so]");
+ /*! [Configure snappy extension] */
+ conn.close(null);
+
+ /*! [Configure zlib extension] */
+ conn = wiredtiger.open(home,
+ "create," +
+ "extensions=[/usr/local/lib/libwiredtiger_zlib.so]");
+ /*! [Configure zlib extension] */
+ conn.close(null);
+
+ /*
+ * This example code gets run, and direct I/O might not be available,
+ * causing the open to fail. The documentation requires code snippets,
+ * use if (false) to avoid running it.
+ */
+ /* Might Not Run: direct I/O may not be available. */
+ /*! [Configure direct_io for data files] */
+ conn = wiredtiger.open(home, "create,direct_io=[data]");
+ /*! [Configure direct_io for data files] */
+ conn.close(null);
+ } // if (false)
+
+ /*! [Configure file_extend] */
+ conn = wiredtiger.open(
+ home, "create,file_extend=(data=16MB)");
+ /*! [Configure file_extend] */
+ conn.close(null);
+
+ /*! [Eviction configuration] */
+ /*
+ * Configure eviction to begin at 90% full, and run until the cache
+ * is only 75% dirty.
+ */
+ conn = wiredtiger.open(home,
+ "create,eviction_trigger=90,eviction_dirty_target=75");
+ /*! [Eviction configuration] */
+ conn.close(null);
+
+ /*! [Eviction worker configuration] */
+ /* Configure up to four eviction threads */
+ conn = wiredtiger.open(home,
+ "create,eviction_trigger=90,eviction=(threads_max=4)");
+ /*! [Eviction worker configuration] */
+ conn.close(null);
+
+ /*! [Statistics configuration] */
+ conn = wiredtiger.open(home, "create,statistics=(all)");
+ /*! [Statistics configuration] */
+ conn.close(null);
+
+ /*! [Statistics logging] */
+ conn = wiredtiger.open(
+ home, "create,statistics_log=(wait=30)");
+ /*! [Statistics logging] */
+ conn.close(null);
+
+ /*! [Statistics logging with a table] */
+ conn = wiredtiger.open(home,
+ "create," +
+ "statistics_log=(sources=(\"table:table1\",\"table:table2\"))");
+ /*! [Statistics logging with a table] */
+ conn.close(null);
+
+ /*! [Statistics logging with all tables] */
+ conn = wiredtiger.open(home,
+ "create,statistics_log=(sources=(\"table:\"))");
+ /*! [Statistics logging with all tables] */
+ conn.close(null);
+
+ if (false) { // MIGHT_NOT_RUN
+ /*
+ * This example code gets run, and a non-existent log file path might
+ * cause the open to fail. The documentation requires code snippets,
+ * use if (false) to avoid running it.
+ */
+ /*! [Statistics logging with path] */
+ conn = wiredtiger.open(home,
+ "create," +
+ "statistics_log=(wait=120,path=/log/log.%m.%d.%y)");
+ /*! [Statistics logging with path] */
+ conn.close(null);
+
+ /*
+ * Don't run this code, because memory checkers get very upset when we
+ * leak memory.
+ */
+ conn = wiredtiger.open(home, "create");
+ /*! [Connection close leaking memory] */
+ ret = conn.close("leak_memory=true");
+ /*! [Connection close leaking memory] */
+ } // if (false)
+
+ /*! [Get the WiredTiger library version #1] */
+ /* Not available for java */
+ /*! [Get the WiredTiger library version #1] */
+
+ {
+ /*! [Get the WiredTiger library version #2] */
+ /* Not available for java */
+ /*! [Get the WiredTiger library version #2] */
+ }
+
+ return (0); /* Always 0; the conn.close() results above are not checked. */
+}
+
+public static int
+main(String[] argv)
+{ /* NOTE(review): declared to return int, but the Java launcher only accepts a public static void main(String[]) entry point -- confirm how this is meant to be invoked. */
+ try {
+ return (allExample());
+ }
+ catch (WiredTigerException wte) {
+ System.err.println("Exception: " + wte);
+ return (-1); /* Any WiredTigerException from allExample() is reported as -1. */
+ }
+}
+} /* Non-standard indentation */
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_async.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_async.java
new file mode 100644
index 00000000000..c6cb0550571
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_async.java
@@ -0,0 +1,222 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_async.java
+ * demonstrates how to use the asynchronous API.
+ */
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+import java.io.*;
+import java.util.*;
+
+/*! [async example callback implementation] */
+class AsyncKeys implements AsyncCallback {
+
+    /* Number of search results seen so far; incremented under this object's lock. */
+    public int numKeys = 0;
+
+    public AsyncKeys() {}
+
+    /* Report a callback problem on standard error. */
+    public void notifyError(String desc) {
+        System.err.println("ERROR: notify: " + desc);
+    }
+
+    /*
+     * Completion callback for an asynchronous operation.  Search results
+     * are counted and printed; any other operation type is reported as an
+     * error.  Returns 0 on success and 1 on any failure.
+     */
+    public int notify(AsyncOp op, int opReturn, int flags) {
+        /*
+         * Note: nothing is allowed to escape from this method.  Any
+         * exception would be swallowed by a native worker thread.
+         */
+        try {
+            /*! [async get type] */
+            /* Retrieve the operation's type. */
+            AsyncOpType optype = op.getType();
+            /*! [async get type] */
+            /*! [async get identifier] */
+            /* Retrieve the operation's 64-bit identifier. */
+            long id = op.getId();
+            /*! [async get identifier] */
+
+            /* Only search completions are expected here. */
+            if (optype != AsyncOpType.WT_AOP_SEARCH) {
+                notifyError("unexpected optype");
+                return (1);
+            }
+
+            /*! [async get the operation's string key] */
+            String key = op.getKeyString();
+            /*! [async get the operation's string key] */
+            /*! [async get the operation's string value] */
+            String value = op.getValueString();
+            /*! [async get the operation's string value] */
+            synchronized (this) {
+                numKeys += 1;
+            }
+            System.out.println("Id " + id + " got record: " + key +
+                " : " + value);
+            return (0);
+        } catch (Exception e) {
+            System.err.println("ERROR: exception in notify: " + e.toString() +
+                ", opreturn=" + opReturn);
+            return (1);
+        }
+    }
+}
+/*! [async example callback implementation] */
+
+public class ex_async {
+
+ public static String home;
+
+ public static final int MAX_KEYS = 15;
+
+ /*
+  * tryAsyncNewOp --
+  *	Allocate an asynchronous operation handle, making up to 10 attempts.
+  *	Each failed attempt is logged and followed by a 1ms pause.  If every
+  *	attempt fails, the exception from the last attempt is re-thrown.
+  */
+ public static AsyncOp tryAsyncNewOp(Connection conn, String uri,
+     String config, AsyncCallback cb) throws WiredTigerException
+ {
+     WiredTigerException lastFailure = null;
+     int attempt = 0;
+
+     while (attempt < 10) {
+         try {
+             return conn.async_new_op(uri, config, cb);
+         } catch (WiredTigerException wte) {
+             /*
+              * If we used up all the handles, pause and retry to
+              * give the workers a chance to catch up.
+              */
+             System.err.println(
+                 "asynchronous operation handle not available: " + wte);
+             lastFailure = wte;
+         }
+
+         /* Only reached after a failed attempt. */
+         try {
+             Thread.sleep(1);
+         } catch (InterruptedException ie) {
+             /* not a big problem, continue to retry */
+         }
+         attempt++;
+     }
+
+     throw lastFailure;
+ }
+
+ /*
+ * asyncExample --
+ * Open a connection with asynchronous operations enabled, create
+ * "table:async", queue MAX_KEYS asynchronous inserts, flush them, queue
+ * an asynchronous compaction, queue MAX_KEYS asynchronous searches and
+ * close the connection. Every operation is given the same AsyncKeys
+ * callback object. The status of each call is stored in ret and then
+ * overwritten; the value returned is the status of the final
+ * conn.close() call.
+ */
+ public static int
+ asyncExample()
+ throws WiredTigerException
+ {
+ AsyncOp op;
+ Connection conn;
+ Session session;
+ int i, ret;
+ String k[] = new String[MAX_KEYS];
+ String v[] = new String[MAX_KEYS];
+
+ /*! [async example callback implementation part 2] */
+ AsyncKeys asynciface = new AsyncKeys();
+ /*! [async example callback implementation part 2] */
+
+ /*! [async example connection] */
+ conn = wiredtiger.open(home, "create,cache_size=100MB," +
+ "async=(enabled=true,ops_max=20,threads=2)");
+ /*! [async example connection] */
+
+ /*! [async example table create] */
+ session = conn.open_session(null);
+ ret = session.create("table:async", "key_format=S,value_format=S");
+ /*! [async example table create] */
+
+ /* Insert a set of keys asynchronously. */
+ for (i = 0; i < MAX_KEYS; i++) {
+ /*! [async handle allocation] */
+ op = tryAsyncNewOp(conn, "table:async", null, asynciface);
+ /*! [async handle allocation] */
+
+ /*! [async insert] */
+ /*
+ * Set the operation's string key and value, and then do
+ * an asynchronous insert.
+ */
+ /*! [async set the operation's string key] */
+ k[i] = "key" + i;
+ op.putKeyString(k[i]);
+ /*! [async set the operation's string key] */
+
+ /*! [async set the operation's string value] */
+ v[i] = "value" + i;
+ op.putValueString(v[i]);
+ /*! [async set the operation's string value] */
+
+ ret = op.insert();
+ /*! [async insert] */
+ }
+
+ /*! [async flush] */
+ /* Wait for all outstanding operations to complete. */
+ ret = conn.async_flush();
+ /*! [async flush] */
+
+ /*! [async compaction] */
+ /*
+ * Compact a table asynchronously, limiting the run-time to 5 minutes.
+ */
+ op = tryAsyncNewOp(conn, "table:async", "timeout=300", asynciface);
+ ret = op.compact();
+ /*! [async compaction] */
+
+ /* Search for the keys we just inserted, asynchronously. */
+ for (i = 0; i < MAX_KEYS; i++) {
+ op = tryAsyncNewOp(conn, "table:async", null, asynciface);
+ /*! [async search] */
+ /*
+ * Set the operation's string key, and then do an asynchronous
+ * search. (No value is set for a search.)
+ */
+ k[i] = "key" + i;
+ op.putKeyString(k[i]);
+ ret = op.search();
+ /*! [async search] */
+ }
+
+ /*
+ * Connection close automatically does an async_flush so it will wait
+ * for all queued search operations to complete.
+ */
+ ret = conn.close(null);
+
+ /* numKeys is the count kept by the AsyncKeys callback. */
+ System.out.println("Searched for " + asynciface.numKeys + " keys");
+
+ return (ret);
+ }
+
+ /*
+  * main --
+  *	Run asyncExample() and hand back its status.  A WiredTigerException
+  *	is reported on standard error and turned into a -1 status.
+  */
+ public static int
+ main(String[] argv)
+ {
+     int status;
+
+     try {
+         status = asyncExample();
+     } catch (WiredTigerException wte) {
+         System.err.println("Exception: " + wte);
+         status = -1;
+     }
+     return (status);
+ }
+}
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java
new file mode 100644
index 00000000000..553f63612bd
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_call_center.java
@@ -0,0 +1,299 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_call_center.java
+ * This is an example application that demonstrates how to map a
+ * moderately complex SQL application into WiredTiger.
+ */
+
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+import java.io.*;
+import java.util.*;
+
+/*! [call-center decl] */
+/*
+ * In SQL, the tables are described as follows:
+ *
+ * CREATE TABLE Customers(id INTEGER PRIMARY KEY,
+ * name VARCHAR(30), address VARCHAR(50), phone VARCHAR(15))
+ * CREATE INDEX CustomersPhone ON Customers(phone)
+ *
+ * CREATE TABLE Calls(id INTEGER PRIMARY KEY, call_date DATE,
+ * cust_id INTEGER, emp_id INTEGER, call_type VARCHAR(12),
+ * notes VARCHAR(25))
+ * CREATE INDEX CallsCustDate ON Calls(cust_id, call_date)
+ *
+ * In this example, both tables will use record numbers for their IDs, which
+ * will be the key. The Java classes for the records are as follows.
+ */
+
+/* One row of the Customers table. */
+class Customer {
+    public long id;
+    public String name;
+    public String address;
+    public String phone;
+
+    /* Build an empty record; every field keeps its Java default value. */
+    public Customer() {}
+
+    /* Build a fully populated record. */
+    public Customer(long id, String name, String address, String phone) {
+        this();
+        this.phone = phone;
+        this.address = address;
+        this.name = name;
+        this.id = id;
+    }
+}
+
+/* One row of the Calls table. */
+class Call {
+    public long id;
+    public long call_date;
+    public long cust_id;
+    public long emp_id;
+    public String call_type;
+    public String notes;
+
+    /* Build an empty record; every field keeps its Java default value. */
+    public Call() {}
+
+    /* Build a fully populated record. */
+    public Call(long id, long call_date, long cust_id, long emp_id,
+        String call_type, String notes) {
+        this();
+        this.notes = notes;
+        this.call_type = call_type;
+        this.emp_id = emp_id;
+        this.cust_id = cust_id;
+        this.call_date = call_date;
+        this.id = id;
+    }
+}
+/*! [call-center decl] */
+
+public class ex_call_center {
+
+ public static String home;
+
+ /*
+  * callCenterExample --
+  *	Create the customers and calls tables (with their column groups and
+  *	indices), load the sample rows and run the two example queries.
+  *	Returns 1 if the home directory cannot be prepared or the connection
+  *	cannot be opened; otherwise returns the status of closing the
+  *	connection.
+  */
+ public static int
+ callCenterExample()
+     throws WiredTigerException
+ {
+     Connection conn;
+     Cursor cursor;
+     Session session;
+     int count, ret;
+     SearchStatus nearstatus;
+     List<Customer> custSample = new ArrayList<Customer>();
+     List<Call> callSample = new ArrayList<Call>();
+
+     custSample.add(new Customer(0, "Professor Oak",
+         "LeafGreen Avenue", "123-456-7890"));
+     custSample.add(new Customer(0, "Lorelei",
+         "Sevii Islands", "098-765-4321"));
+     callSample.add(new Call(0, 32, 1, 2, "billing", "unavailable"));
+     callSample.add(new Call(0, 33, 1, 2, "billing", "available"));
+     callSample.add(new Call(0, 34, 1, 2, "reminder", "unavailable"));
+     callSample.add(new Call(0, 35, 1, 2, "reminder", "available"));
+
+     /*
+      * Create a clean test directory for this run of the test program if the
+      * environment variable isn't already set (as is done by make check).
+      */
+     if (System.getenv("WIREDTIGER_HOME") == null) {
+         home = "WT_HOME";
+         try {
+             Process proc = Runtime.getRuntime().exec("/bin/rm -rf WT_HOME");
+             BufferedReader br = new BufferedReader(
+                 new InputStreamReader(proc.getInputStream()));
+             String line;
+
+             /* Echo the command's output until it closes its stdout. */
+             while ((line = br.readLine()) != null)
+                 System.out.println(line);
+             br.close();
+
+             /*
+              * The removal runs in a separate process: wait for it to
+              * finish, otherwise it can race with the mkdir below.
+              */
+             proc.waitFor();
+             new File("WT_HOME").mkdir();
+         } catch (IOException ioe) {
+             System.err.println("IOException: WT_HOME: " + ioe);
+             return(1);
+         } catch (InterruptedException ie) {
+             /* Preserve the interrupt for whoever is running us. */
+             Thread.currentThread().interrupt();
+             System.err.println("InterruptedException: WT_HOME: " + ie);
+             return(1);
+         }
+     } else
+         home = null;
+
+     try {
+         conn = wiredtiger.open(home, "create");
+         session = conn.open_session(null);
+     } catch (WiredTigerException wte) {
+         System.err.println("WiredTigerException: " + wte);
+         return(1);
+     }
+     /* Note: further error checking omitted for clarity. */
+
+     /*! [call-center work] */
+     /*
+      * Create the customers table, give names and types to the columns.
+      * The columns will be stored in two groups: "main" and "address",
+      * created below.
+      */
+     ret = session.create("table:customers",
+         "key_format=r," +
+         "value_format=SSS," +
+         "columns=(id,name,address,phone)," +
+         "colgroups=(main,address)");
+
+     /* Create the main column group with value columns except address. */
+     ret = session.create(
+         "colgroup:customers:main", "columns=(name,phone)");
+
+     /* Create the address column group with just the address. */
+     ret = session.create(
+         "colgroup:customers:address", "columns=(address)");
+
+     /* Create an index on the customer table by phone number. */
+     ret = session.create(
+         "index:customers:phone", "columns=(phone)");
+
+     /* Populate the customers table with some data. */
+     cursor = session.open_cursor("table:customers", null, "append");
+     for (Customer cust : custSample) {
+         cursor.putValueString(cust.name);
+         cursor.putValueString(cust.address);
+         cursor.putValueString(cust.phone);
+         ret = cursor.insert();
+     }
+     ret = cursor.close();
+
+     /*
+      * Create the calls table, give names and types to the columns.  All the
+      * columns will be stored together, so no column groups are declared.
+      */
+     ret = session.create("table:calls",
+         "key_format=r," +
+         "value_format=qrrSS," +
+         "columns=(id,call_date,cust_id,emp_id,call_type,notes)");
+
+     /*
+      * Create an index on the calls table with a composite key of cust_id
+      * and call_date.
+      */
+     ret = session.create("index:calls:cust_date",
+         "columns=(cust_id,call_date)");
+
+     /* Populate the calls table with some data. */
+     cursor = session.open_cursor("table:calls", null, "append");
+     for (Call call : callSample) {
+         cursor.putValueLong(call.call_date);
+         cursor.putValueLong(call.cust_id);
+         cursor.putValueLong(call.emp_id);
+         cursor.putValueString(call.call_type);
+         cursor.putValueString(call.notes);
+         ret = cursor.insert();
+     }
+     ret = cursor.close();
+
+     /*
+      * First query: a call arrives.  In SQL:
+      *
+      * SELECT id, name FROM Customers WHERE phone=?
+      *
+      * Use the customers:phone index, lookup by phone number to fill the
+      * customer record.  The cursor will have a key format of "S" for a
+      * string because the customers:phone index has a single column
+      * ("phone"), which is of type "S".
+      *
+      * Specify the columns we want: the customer ID and the name.  This
+      * means the cursor's value format will be "rS".
+      */
+     cursor = session.open_cursor(
+         "index:customers:phone(id,name)", null, null);
+     cursor.putKeyString("123-456-7890");
+     ret = cursor.search();
+     if (ret == 0) {
+         Customer cust = new Customer();
+         cust.id = cursor.getValueLong();
+         cust.name = cursor.getValueString();
+         System.out.println("Read customer record for " + cust.name +
+             " (ID " + cust.id + ")");
+     }
+     ret = cursor.close();
+
+     /*
+      * Next query: get the recent order history.  In SQL:
+      *
+      * SELECT * FROM Calls WHERE cust_id=? ORDER BY call_date DESC LIMIT 3
+      *
+      * Use the calls:cust_date index to find the matching calls.  Since it
+      * is in increasing order by date for a given customer, we want to start
+      * with the last record for the customer and work backwards.
+      *
+      * Specify a subset of columns to be returned.  (Note that if these were
+      * all covered by the index, the primary would not have to be accessed.)
+      * Stop after getting 3 records.
+      */
+     cursor = session.open_cursor(
+         "index:calls:cust_date(cust_id,call_type,notes)",
+         null, null);
+
+     /*
+      * The keys in the index are (cust_id,call_date) -- we want the largest
+      * call date for a given cust_id.  Search for (cust_id+1,0), then work
+      * backwards.
+      */
+     long custid = 1;
+     cursor.putKeyLong(custid + 1);
+     cursor.putKeyLong(0);
+     nearstatus = cursor.search_near();
+
+     /*
+      * search_near reports its outcome through the returned status, not
+      * through ret, which still holds the result of the earlier close.
+      * Derive ret from the status so an unpositioned cursor (for example,
+      * an empty table) skips the loop below instead of reading from it.
+      */
+     if (nearstatus == SearchStatus.FOUND ||
+         nearstatus == SearchStatus.LARGER ||
+         nearstatus == SearchStatus.SMALLER)
+         ret = 0;
+     else
+         ret = wiredtiger.WT_NOTFOUND;
+
+     /*
+      * The cursor is now positioned on a matching key if one exists, or an
+      * adjacent key if one does not.  If the positioned key is equal to or
+      * larger than the search key, go back one.
+      */
+     if (ret == 0 && (nearstatus == SearchStatus.LARGER ||
+         nearstatus == SearchStatus.FOUND))
+         ret = cursor.prev();
+     for (count = 0; ret == 0 && count < 3; ++count) {
+         Call call = new Call();
+         call.cust_id = cursor.getValueLong();
+         call.call_type = cursor.getValueString();
+         call.notes = cursor.getValueString();
+         /* Stop once we walk off this customer's records. */
+         if (call.cust_id != custid)
+             break;
+         System.out.println("Call record: customer " + call.cust_id +
+             " (" + call.call_type +
+             ": " + call.notes + ")");
+         ret = cursor.prev();
+     }
+     /*! [call-center work] */
+
+     ret = conn.close(null);
+
+     return (ret);
+ }
+
+ /*
+  * main --
+  *	Run callCenterExample() and hand back its status.  A
+  *	WiredTigerException is reported on standard error and turned into a
+  *	-1 status.
+  */
+ public static int
+ main(String[] argv)
+ {
+     int status;
+
+     try {
+         status = callCenterExample();
+     } catch (WiredTigerException wte) {
+         System.err.println("Exception: " + wte);
+         status = -1;
+     }
+     return (status);
+ }
+}
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_cursor.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_cursor.java
new file mode 100644
index 00000000000..7b8de7739d2
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_cursor.java
@@ -0,0 +1,239 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_cursor.java
+ * This is an example demonstrating some cursor types and operations.
+ */
+
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+import java.io.*;
+import java.util.*;
+
+public class ex_cursor {
+
+ public static String home;
+
+ /*! [cursor next] */
+ /*
+  * Step the cursor forward with next(), reading each record's key and
+  * value, until next() returns non-zero; that status is returned.
+  */
+ public static int
+ cursor_forward_scan(Cursor cursor)
+     throws WiredTigerException
+ {
+     int status = cursor.next();
+
+     while (status == 0) {
+         String key = cursor.getKeyString();
+         String value = cursor.getValueString();
+
+         status = cursor.next();
+     }
+     return (status);
+ }
+ /*! [cursor next] */
+
+ /*! [cursor prev] */
+ /*
+  * Step the cursor backward with prev(), reading each record's key and
+  * value, until prev() returns non-zero; that status is returned.
+  */
+ public static int
+ cursor_reverse_scan(Cursor cursor)
+     throws WiredTigerException
+ {
+     int status = cursor.prev();
+
+     while (status == 0) {
+         String key = cursor.getKeyString();
+         String value = cursor.getValueString();
+
+         status = cursor.prev();
+     }
+     return (status);
+ }
+ /*! [cursor prev] */
+
+ /*! [cursor reset] */
+ /* Reset the cursor and return the status of the reset call. */
+ public static int
+ cursor_reset(Cursor cursor)
+     throws WiredTigerException
+ {
+     int status = cursor.reset();
+
+     return (status);
+ }
+ /*! [cursor reset] */
+
+ /*! [cursor search] */
+ /*
+  * Search for the key "foo" and, when it is found, read its value.
+  * Returns the status of the search call (0 on success).
+  */
+ public static int
+ cursor_search(Cursor cursor)
+     throws WiredTigerException
+ {
+     String value;
+     int ret;
+
+     cursor.putKeyString("foo");
+
+     /*
+      * A zero return means the search succeeded and the cursor is
+      * positioned on the record; only then is there a value to read.
+      */
+     if ((ret = cursor.search()) == 0)
+         value = cursor.getValueString();
+
+     return (ret);
+ }
+ /*! [cursor search] */
+
+ /*! [cursor search near] */
+ /*
+  * Position the cursor at or next to the key "foo", then read the key the
+  * cursor landed on (when it is not an exact match) and its value.
+  * Always returns 0.
+  */
+ public static int
+ cursor_search_near(Cursor cursor)
+     throws WiredTigerException
+ {
+     String key = "foo";
+     String value;
+
+     cursor.putKeyString(key);
+     SearchStatus exact = cursor.search_near();
+
+     /*
+      * The returned key is smaller or larger than the search key: fetch
+      * it.  Otherwise key already holds the search key.
+      */
+     if (exact == SearchStatus.SMALLER || exact == SearchStatus.LARGER)
+         key = cursor.getKeyString();
+
+     value = cursor.getValueString();
+
+     return (0);
+ }
+ /*! [cursor search near] */
+
+ /*! [cursor insert] */
+ /* Insert the record ("foo", "bar") and return the status of the insert. */
+ public static int
+ cursor_insert(Cursor cursor)
+     throws WiredTigerException
+ {
+     int status;
+
+     cursor.putKeyString("foo");
+     cursor.putValueString("bar");
+     status = cursor.insert();
+
+     return (status);
+ }
+ /*! [cursor insert] */
+
+ /*! [cursor update] */
+ /* Set the value for key "foo" to "newbar" and return the update status. */
+ public static int
+ cursor_update(Cursor cursor)
+     throws WiredTigerException
+ {
+     int status;
+
+     cursor.putKeyString("foo");
+     cursor.putValueString("newbar");
+     status = cursor.update();
+
+     return (status);
+ }
+ /*! [cursor update] */
+
+ /*! [cursor remove] */
+ /* Remove the record with key "foo" and return the status of the remove. */
+ public static int
+ cursor_remove(Cursor cursor)
+     throws WiredTigerException
+ {
+     int status;
+
+     cursor.putKeyString("foo");
+     status = cursor.remove();
+
+     return (status);
+ }
+ /*! [cursor remove] */
+
+ /*
+  * cursorExample --
+  *	Open a connection, demonstrate opening three kinds of cursor, then
+  *	run the basic cursor operations on a simple string table.  Returns 1
+  *	if the home directory cannot be prepared; otherwise returns the
+  *	status of closing the connection.
+  */
+ public static int
+ cursorExample()
+     throws WiredTigerException
+ {
+     Connection conn;
+     Cursor cursor;
+     Session session;
+     int ret;
+
+     /*
+      * Create a clean test directory for this run of the test program if the
+      * environment variable isn't already set (as is done by make check).
+      */
+     if (System.getenv("WIREDTIGER_HOME") == null) {
+         home = "WT_HOME";
+         try {
+             Process proc = Runtime.getRuntime().exec("/bin/rm -rf WT_HOME");
+             BufferedReader br = new BufferedReader(
+                 new InputStreamReader(proc.getInputStream()));
+             String line;
+
+             /* Echo the command's output until it closes its stdout. */
+             while ((line = br.readLine()) != null)
+                 System.out.println(line);
+             br.close();
+
+             /*
+              * The removal runs in a separate process: wait for it to
+              * finish, otherwise it can race with the mkdir below.
+              */
+             proc.waitFor();
+             new File("WT_HOME").mkdir();
+         } catch (IOException ioe) {
+             System.err.println("IOException: WT_HOME: " + ioe);
+             return(1);
+         } catch (InterruptedException ie) {
+             /* Preserve the interrupt for whoever is running us. */
+             Thread.currentThread().interrupt();
+             System.err.println("InterruptedException: WT_HOME: " + ie);
+             return(1);
+         }
+     } else
+         home = null;
+
+     conn = wiredtiger.open(home, "create,statistics=(fast)");
+     session = conn.open_session(null);
+
+     ret = session.create("table:world",
+         "key_format=r,value_format=5sii," +
+         "columns=(id,country,population,area)");
+
+     /*
+      * Each demonstration cursor is closed before the variable is reused,
+      * rather than being left open until the connection is closed.
+      */
+     /*! [open cursor #1] */
+     cursor = session.open_cursor("table:world", null, null);
+     /*! [open cursor #1] */
+     ret = cursor.close();
+
+     /*! [open cursor #2] */
+     cursor = session.open_cursor("table:world(country,population)", null, null);
+     /*! [open cursor #2] */
+     ret = cursor.close();
+
+     /*! [open cursor #3] */
+     cursor = session.open_cursor("statistics:", null, null);
+     /*! [open cursor #3] */
+     ret = cursor.close();
+
+     /* Create a simple string table to illustrate basic operations. */
+     ret = session.create("table:map", "key_format=S,value_format=S");
+     cursor = session.open_cursor("table:map", null, null);
+     ret = cursor_insert(cursor);
+     ret = cursor_reset(cursor);
+     ret = cursor_forward_scan(cursor);
+     ret = cursor_reset(cursor);
+     ret = cursor_reverse_scan(cursor);
+     ret = cursor_search_near(cursor);
+     ret = cursor_update(cursor);
+     ret = cursor_remove(cursor);
+     ret = cursor.close();
+
+     /* Note: closing the connection implicitly closes open session(s). */
+     if ((ret = conn.close(null)) != 0)
+         System.err.println("Error closing connection to " + home + ": " +
+             wiredtiger.wiredtiger_strerror(ret));
+
+     return (ret);
+ }
+
+ /*
+  * main --
+  *	Run cursorExample() and hand back its status.  A WiredTigerException
+  *	is reported on standard error and turned into a -1 status.
+  */
+ public static int
+ main(String[] argv)
+ {
+     int status;
+
+     try {
+         status = cursorExample();
+     } catch (WiredTigerException wte) {
+         System.err.println("Exception: " + wte);
+         status = -1;
+     }
+     return (status);
+ }
+}
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_log.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_log.java
new file mode 100644
index 00000000000..d7bc6987878
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_log.java
@@ -0,0 +1,376 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_log.java
+ * demonstrates how to use logging and log cursors.
+ */
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+import java.io.*;
+import java.util.*;
+
+/* A log sequence number as read from a log cursor key: file plus offset. */
+class Lsn {
+    long offset = 0L;
+    int file = 0;
+}
+
+public class ex_log {
+
+ public static String home1 = "WT_HOME_LOG_1";
+ public static String home2 = "WT_HOME_LOG_2";
+ public static String uri = "table:logtest";
+
+ public static final String CONN_CONFIG =
+ "create,cache_size=100MB,log=(archive=false,enabled=true)";
+ public static final int MAX_KEYS = 10;
+
+ /*
+  * setup_copy --
+  *	Open a connection on the second home directory, create the test
+  *	table in it and return the session that was opened.
+  */
+ static Session
+ setup_copy()
+     throws WiredTigerException
+ {
+     Connection copyConn = wiredtiger.open(home2, CONN_CONFIG);
+     Session copySession = copyConn.open_session(null);
+
+     copySession.create(uri, "key_format=S,value_format=S");
+     return (copySession);
+ }
+
+ /*
+  * compare_tables --
+  *	Walk the test table through both sessions in step and check that the
+  *	records match.  Returns 0 when the tables hold the same records in
+  *	the same order, non-zero when they differ or a cursor call fails.
+  *	Both cursors are closed before returning.
+  */
+ static int
+ compare_tables(Session session, Session sess_copy)
+     throws WiredTigerException
+ {
+     int ret, result;
+
+     Cursor cursor = session.open_cursor(uri, null, null);
+     Cursor curs_copy = sess_copy.open_cursor(uri, null, null);
+
+     result = 0;
+     while ((ret = cursor.next()) == 0) {
+         /* The copy must have a record for every original record. */
+         if ((ret = curs_copy.next()) != 0) {
+             System.err.println("WT_CURSOR.next: " +
+                 wiredtiger.wiredtiger_strerror(ret));
+             result = 1;
+             break;
+         }
+         String key = cursor.getKeyString();
+         String value = cursor.getValueString();
+         String key_copy = curs_copy.getKeyString();
+         String value_copy = curs_copy.getValueString();
+         if (!key.equals(key_copy) || !value.equals(value_copy)) {
+             System.err.println(
+                 "Mismatched: key " + key +
+                 ", key_copy " + key_copy +
+                 " value " + value +
+                 " value_copy " + value_copy);
+             result = 1;
+             break;
+         }
+     }
+
+     /* The walk of the original should have ended by running out of records. */
+     if (result == 0 && ret != wiredtiger.WT_NOTFOUND) {
+         System.err.println("WT_CURSOR.next: " +
+             wiredtiger.wiredtiger_strerror(ret));
+         result = 1;
+     }
+
+     /* The copy must not hold anything beyond the original's last record. */
+     if (result == 0) {
+         ret = curs_copy.next();
+         if (ret != wiredtiger.WT_NOTFOUND) {
+             System.err.println("WT_CURSOR.next: " +
+                 wiredtiger.wiredtiger_strerror(ret));
+             result = 1;
+         }
+     }
+
+     /* Close both cursors on every path; keep the first failure seen. */
+     ret = cursor.close();
+     if (result == 0)
+         result = ret;
+     ret = curs_copy.close();
+     if (result == 0)
+         result = ret;
+
+     return (result);
+ }
+
+ /*! [log cursor walk] */
+ /*
+  * print_record --
+  *	Print one log record on standard output: its LSN and operation
+  *	count, record and operation types, transaction and file IDs, and the
+  *	sizes of its key and value.  For a WT_LOGREC_MESSAGE record the value
+  *	is also printed as a string.
+  */
+ static void
+ print_record(Lsn lsn, int opcount,
+     int rectype, int optype, long txnid, int fileid,
+     byte[] key, byte[] value)
+ {
+     System.out.print(
+         "LSN [" + lsn.file + "][" + lsn.offset + "]." + opcount +
+         ": record type " + rectype + " optype " + optype +
+         " txnid " + txnid + " fileid " + fileid);
+     /* Leading space on " value size" keeps it apart from the key size. */
+     System.out.println(" key size " + key.length +
+         " value size " + value.length);
+     if (rectype == wiredtiger.WT_LOGREC_MESSAGE)
+         System.out.println("Application Record: " + new String(value));
+ }
+
+ /*
+  * simple_walk_log --
+  *	Open a log cursor, print every record it returns and close it.
+  *	Returns the status of closing the cursor.
+  */
+ static int
+ simple_walk_log(Session session)
+     throws WiredTigerException
+ {
+     int ret;
+     int fileid, opcount, optype, rectype;
+     long txnid;
+     byte[] logrec_key, logrec_value;
+     Lsn lsn = new Lsn();
+     Cursor cursor;
+
+     /*! [log cursor open] */
+     cursor = session.open_cursor("log:", null, null);
+     /*! [log cursor open] */
+
+     for (ret = cursor.next(); ret == 0; ret = cursor.next()) {
+         /*! [log cursor get_key] */
+         lsn.file = cursor.getKeyInt();
+         lsn.offset = cursor.getKeyLong();
+         opcount = cursor.getKeyInt();
+         /*! [log cursor get_key] */
+         /*! [log cursor get_value] */
+         txnid = cursor.getValueLong();
+         rectype = cursor.getValueInt();
+         optype = cursor.getValueInt();
+         fileid = cursor.getValueInt();
+         logrec_key = cursor.getValueByteArray();
+         logrec_value = cursor.getValueByteArray();
+         /*! [log cursor get_value] */
+
+         print_record(lsn, opcount,
+             rectype, optype, txnid, fileid, logrec_key, logrec_value);
+     }
+
+     /* Running out of records is the normal way for the walk to end. */
+     if (ret == wiredtiger.WT_NOTFOUND)
+         ret = 0;
+     ret = cursor.close();
+     return (ret);
+ }
+ /*! [log cursor walk] */
+
+ /*
+  * walk_log --
+  *	Walk the log, printing every record and replaying the row-put
+  *	operations into a copy of the table held in a second connection, then
+  *	compare the two tables.  Afterwards, search the log for an LSN saved
+  *	during the walk and print every record from there to the end.
+  *	Returns the status of closing the log cursor.
+  */
+ static int
+ walk_log(Session session)
+     throws WiredTigerException
+ {
+     Connection wt_conn2;
+     Cursor cursor, cursor2;
+     Lsn lsn, lsnsave;
+     byte[] logrec_key, logrec_value;
+     Session session2;
+     long txnid;
+     int fileid, opcount, optype, rectype;
+     int i, ret;
+     boolean in_txn, first;
+
+     session2 = setup_copy();
+     wt_conn2 = session2.getConnection();
+     cursor = session.open_cursor("log:", null, null);
+     cursor2 = session2.open_cursor(uri, null, "raw=true");
+     i = 0;
+     in_txn = false;
+     txnid = 0;
+     lsn = new Lsn();
+     lsnsave = new Lsn();
+     while ((ret = cursor.next()) == 0) {
+         lsn.file = cursor.getKeyInt();
+         lsn.offset = cursor.getKeyLong();
+         opcount = cursor.getKeyInt();
+
+         /*
+          * Save one of the LSNs we get back to search for it
+          * later.  Pick a later one because we want to walk from
+          * that LSN to the end (where the multi-step transaction
+          * was performed).  Just choose the record that is MAX_KEYS.
+          *
+          * Copy the fields rather than the reference: lsn is a single
+          * object that is overwritten on every iteration, so an alias
+          * would end up holding the last LSN read, not this one.
+          */
+         if (++i == MAX_KEYS) {
+             lsnsave.file = lsn.file;
+             lsnsave.offset = lsn.offset;
+         }
+         txnid = cursor.getValueLong();
+         rectype = cursor.getValueInt();
+         optype = cursor.getValueInt();
+         fileid = cursor.getValueInt();
+         logrec_key = cursor.getValueByteArray();
+         logrec_value = cursor.getValueByteArray();
+
+         print_record(lsn, opcount,
+             rectype, optype, txnid, fileid, logrec_key, logrec_value);
+
+         /*
+          * If we are in a transaction and this is a new one, end
+          * the previous one.
+          */
+         if (in_txn && opcount == 0) {
+             ret = session2.commit_transaction(null);
+             in_txn = false;
+         }
+
+         /*
+          * If the operation is a put, replay it here on the backup
+          * connection.  Note, we cheat by looking only for fileid 1
+          * in this example.  The metadata is fileid 0.
+          */
+         if (fileid == 1 && rectype == wiredtiger.WT_LOGREC_COMMIT &&
+             optype == wiredtiger.WT_LOGOP_ROW_PUT) {
+             if (!in_txn) {
+                 ret = session2.begin_transaction(null);
+                 in_txn = true;
+             }
+             cursor2.putKeyByteArray(logrec_key);
+             cursor2.putValueByteArray(logrec_value);
+             ret = cursor2.insert();
+         }
+     }
+     /* Commit whatever replayed transaction is still open. */
+     if (in_txn)
+         ret = session2.commit_transaction(null);
+
+     ret = cursor2.close();
+     /*
+      * Compare the tables after replay.  They should be identical.
+      */
+     if (compare_tables(session, session2) != 0)
+         System.out.println("compare failed");
+     ret = session2.close(null);
+     ret = wt_conn2.close(null);
+
+     ret = cursor.reset();
+     /*! [log cursor set_key] */
+     cursor.putKeyInt(lsnsave.file);
+     cursor.putKeyLong(lsnsave.offset);
+     /*! [log cursor set_key] */
+     /*! [log cursor search] */
+     ret = cursor.search();
+     /*! [log cursor search] */
+     System.out.println("Reset to saved...");
+     /*
+      * Walk all records starting with this key.
+      */
+     first = true;
+     while (ret == 0) { /*TODO: not quite right*/
+         lsn.file = cursor.getKeyInt();
+         lsn.offset = cursor.getKeyLong();
+         opcount = cursor.getKeyInt();
+         /* The first record read must be the one we searched for. */
+         if (first) {
+             first = false;
+             if (lsnsave.file != lsn.file ||
+                 lsnsave.offset != lsn.offset) {
+                 System.err.println("search returned the wrong LSN");
+                 System.exit(1);
+             }
+         }
+         txnid = cursor.getValueLong();
+         rectype = cursor.getValueInt();
+         optype = cursor.getValueInt();
+         fileid = cursor.getValueInt();
+         logrec_key = cursor.getValueByteArray();
+         logrec_value = cursor.getValueByteArray();
+
+         print_record(lsn, opcount, rectype, optype, txnid,
+             fileid, logrec_key, logrec_value);
+
+         ret = cursor.next();
+         if (ret != 0)
+             break;
+     }
+     ret = cursor.close();
+     return (ret);
+ }
+
+ /*
+  * logExample --
+  *	Create two clean home directories, write MAX_KEYS auto-commit records
+  *	and five more in a single transaction to a logged table, add an
+  *	application message to the log, then reopen the connection and walk
+  *	the log.  Returns 1 if the directories cannot be prepared or a
+  *	connection cannot be opened; otherwise returns the status of the
+  *	final connection close.
+  */
+ public static int
+ logExample()
+     throws WiredTigerException
+ {
+     Connection wt_conn;
+     Cursor cursor;
+     Session session;
+     int i, record_count, ret;
+
+     try {
+         String command = "/bin/rm -rf " + home1 + " " + home2;
+         Process proc = Runtime.getRuntime().exec(command);
+         BufferedReader br = new BufferedReader(
+             new InputStreamReader(proc.getInputStream()));
+         String line;
+
+         /* Echo the command's output until it closes its stdout. */
+         while ((line = br.readLine()) != null)
+             System.out.println(line);
+         br.close();
+
+         /*
+          * The removal runs in a separate process: wait for it to
+          * finish, otherwise it can race with the mkdir calls below.
+          */
+         proc.waitFor();
+         new File(home1).mkdir();
+         new File(home2).mkdir();
+     } catch (IOException ioe) {
+         System.err.println("IOException: " + ioe);
+         return (1);
+     } catch (InterruptedException ie) {
+         /* Preserve the interrupt for whoever is running us. */
+         Thread.currentThread().interrupt();
+         System.err.println("InterruptedException: " + ie);
+         return (1);
+     }
+     if ((wt_conn = wiredtiger.open(home1, CONN_CONFIG)) == null) {
+         System.err.println("Error connecting to " + home1);
+         return (1);
+     }
+
+     session = wt_conn.open_session(null);
+     ret = session.create(uri, "key_format=S,value_format=S");
+
+     cursor = session.open_cursor(uri, null, null);
+     /*
+      * Perform some operations with individual auto-commit transactions.
+      */
+     for (record_count = 0, i = 0; i < MAX_KEYS; i++, record_count++) {
+         String k = "key" + i;
+         String v = "value" + i;
+         cursor.putKeyString(k);
+         cursor.putValueString(v);
+         ret = cursor.insert();
+     }
+     ret = session.begin_transaction(null);
+     /*
+      * Perform some operations within a single transaction.
+      */
+     for (i = MAX_KEYS; i < MAX_KEYS+5; i++, record_count++) {
+         String k = "key" + i;
+         String v = "value" + i;
+         cursor.putKeyString(k);
+         cursor.putValueString(v);
+         ret = cursor.insert();
+     }
+     ret = session.commit_transaction(null);
+     ret = cursor.close();
+
+     /*! [log cursor printf] */
+     ret = session.log_printf("Wrote " + record_count + " records");
+     /*! [log cursor printf] */
+
+     /*
+      * Close and reopen the connection so that the log ends up with
+      * a variety of records such as file sync and checkpoint.  We
+      * have archiving turned off.
+      */
+     ret = wt_conn.close(null);
+     if ((wt_conn = wiredtiger.open(home1, CONN_CONFIG)) == null) {
+         System.err.println("Error connecting to " + home1);
+         /*
+          * ret holds the status of the close above, so returning it
+          * could report success; report the failure explicitly.
+          */
+         return (1);
+     }
+
+     session = wt_conn.open_session(null);
+     ret = simple_walk_log(session);
+     ret = walk_log(session);
+     ret = wt_conn.close(null);
+     return (ret);
+ }
+
+ /*
+  * main --
+  *	Run logExample() and hand back its status.  A WiredTigerException is
+  *	reported on standard error and turned into a -1 status.
+  *	NOTE(review): unlike the other examples' main, this one takes no
+  *	argument array -- confirm whether that is intended.
+  */
+ public static int
+ main()
+ {
+     int status;
+
+     try {
+         status = logExample();
+     } catch (WiredTigerException wte) {
+         System.err.println("Exception: " + wte);
+         status = -1;
+     }
+     return (status);
+ }
+}
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_schema.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_schema.java
new file mode 100644
index 00000000000..9b84912e0f0
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_schema.java
@@ -0,0 +1,333 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_schema.java
+ * This is an example application demonstrating how to create and access
+ * tables using a schema.
+ */
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+import java.io.*;
+import java.util.*;
+
+public class ex_schema {
+
+ public static String home;
+
+ /*! [schema declaration] */
+ /* The class for the data we are storing in a WiredTiger table. */
+ /*
+  * Each instance holds one row of sample data: a country code, a year and
+  * the population for that country in that year.
+  */
+ static class PopRecord {
+ public String country; // Stored in database as fixed size char[5];
+ public short year;
+ public long population;
+ /* Build a record from its three column values. */
+ public PopRecord(String country, short year, long population) {
+ this.country = country;
+ this.year = year;
+ this.population = population;
+ }
+ }
+
+ /*
+  * Sample rows inserted into the population table by schemaExample():
+  * one record per country for each of the years 1900 and 2000.
+  */
+ static List<PopRecord> popData = new ArrayList<PopRecord>();
+
+ static {
+     popData.add(new PopRecord("AU", (short)1900, 4000000 ));
+     popData.add(new PopRecord("AU", (short)2000, 19053186 ));
+     popData.add(new PopRecord("CAN", (short)1900, 5500000 ));
+     popData.add(new PopRecord("CAN", (short)2000, 31099561 ));
+     popData.add(new PopRecord("UK", (short)1900, 369000000 ));
+     popData.add(new PopRecord("UK", (short)2000, 59522468 ));
+     popData.add(new PopRecord("USA", (short)1900, 76212168 ));
+     popData.add(new PopRecord("USA", (short)2000, 301279593 ));
+ }
+ /*! [schema declaration] */
+
+ /**
+  * Create the population table with its column groups and indices, load
+  * popData into it, and print the stored records through a series of
+  * cursors (table, raw, column group, index and projection cursors).
+  *
+  * @return 1 if the test directory or the connection could not be set up,
+  *         otherwise the value returned by closing the connection
+  * @throws WiredTigerException propagated from the WiredTiger calls made
+  *         once the session has been opened
+  */
+ public static int
+ schemaExample()
+     throws WiredTigerException
+ {
+     Connection conn;
+     Cursor cursor;
+     Session session;
+     String country;
+     long recno, population;
+     short year;
+     int ret;
+
+     /*
+      * Create a clean test directory for this run of the test program if the
+      * environment variable isn't already set (as is done by make check).
+      */
+     if (System.getenv("WIREDTIGER_HOME") == null) {
+         home = "WT_HOME";
+         try {
+             Process proc = Runtime.getRuntime().exec("/bin/rm -rf WT_HOME");
+             BufferedReader br = new BufferedReader(
+                 new InputStreamReader(proc.getInputStream()));
+             while(br.ready())
+                 System.out.println(br.readLine());
+             br.close();
+             /*
+              * exec() only starts the process: wait for the removal to
+              * finish so it cannot race with re-creating the directory.
+              */
+             proc.waitFor();
+             new File("WT_HOME").mkdir();
+         } catch (IOException ioe) {
+             System.err.println("IOException: WT_HOME: " + ioe);
+             return(1);
+         } catch (InterruptedException ie) {
+             /* Preserve the interrupt for whoever called us. */
+             Thread.currentThread().interrupt();
+             System.err.println("InterruptedException: WT_HOME: " + ie);
+             return(1);
+         }
+     } else
+         home = null;
+
+     try {
+         conn = wiredtiger.open(home, "create");
+         session = conn.open_session(null);
+     } catch (WiredTigerException wte) {
+         System.err.println("WiredTigerException: " + wte);
+         return(1);
+     }
+
+     /*! [Create a table with column groups] */
+     /*
+      * Create the population table.
+      * Keys are record numbers, the format for values is (5-byte string,
+      * unsigned 16-bit integer, unsigned 64-bit integer), read and written
+      * from Java as (String, short, long).
+      * See ::wiredtiger_struct_pack for details of the format strings.
+      */
+     ret = session.create("table:poptable",
+         "key_format=r,value_format=5sHQ," +
+         "columns=(id,country,year,population),colgroups=(main,population)");
+
+     /*
+      * Create two column groups: a primary column group with the country
+      * code, year and population (named "main"), and a population column
+      * group with the population by itself (named "population").
+      */
+     ret = session.create("colgroup:poptable:main",
+         "columns=(country,year,population)");
+     ret = session.create("colgroup:poptable:population",
+         "columns=(population)");
+     /*! [Create a table with column groups] */
+
+     /*! [Create an index] */
+     /* Create an index with a simple key. */
+     ret = session.create("index:poptable:country",
+         "columns=(country)");
+     /*! [Create an index] */
+
+     /*! [Create an index with a composite key] */
+     /* Create an index with a composite key (country,year). */
+     ret = session.create("index:poptable:country_plus_year",
+         "columns=(country,year)");
+     /*! [Create an index with a composite key] */
+
+     /*! [Insert and list records] */
+     /* Insert the records into the table. */
+     cursor = session.open_cursor("table:poptable", null, "append");
+     for (PopRecord p : popData) {
+         cursor.putValueString(p.country);
+         cursor.putValueShort(p.year);
+         cursor.putValueLong(p.population);
+         ret = cursor.insert();
+     }
+     ret = cursor.close();
+
+     /* List the records in the table. */
+     cursor = session.open_cursor("table:poptable", null, null);
+     while ((ret = cursor.next()) == 0) {
+         recno = cursor.getKeyLong();
+         country = cursor.getValueString();
+         year = cursor.getValueShort();
+         population = cursor.getValueLong();
+         System.out.print("ID " + recno);
+         System.out.println(": country " + country + ", year " + year +
+             ", population " + population);
+     }
+     ret = cursor.close();
+     /*! [Insert and list records] */
+
+     /*! [List the records in the table using raw mode.] */
+     cursor = session.open_cursor("table:poptable", null, "raw");
+     while ((ret = cursor.next()) == 0) {
+         byte[] key, value;
+
+         key = cursor.getKeyByteArray();
+         System.out.println(Arrays.toString(key));
+         value = cursor.getValueByteArray();
+         System.out.println("raw key: " + Arrays.toString(key) +
+             ", raw value: " + Arrays.toString(value));
+     }
+     /*! [List the records in the table using raw mode.] */
+     /* Close the raw cursor before the variable is reused below. */
+     ret = cursor.close();
+
+     /*! [Read population from the primary column group] */
+     /*
+      * Open a cursor on the main column group, and return the information
+      * for a particular country.
+      */
+     cursor = session.open_cursor("colgroup:poptable:main", null, null);
+     cursor.putKeyLong(2);
+     if ((ret = cursor.search()) == 0) {
+         country = cursor.getValueString();
+         year = cursor.getValueShort();
+         population = cursor.getValueLong();
+         System.out.println("ID 2: country " + country +
+             ", year " + year + ", population " + population);
+     }
+     /*! [Read population from the primary column group] */
+     ret = cursor.close();
+
+     /*! [Read population from the standalone column group] */
+     /*
+      * Open a cursor on the population column group, and return the
+      * population of a particular country.
+      */
+     cursor = session.open_cursor("colgroup:poptable:population", null, null);
+     cursor.putKeyLong(2);
+     if ((ret = cursor.search()) == 0) {
+         population = cursor.getValueLong();
+         System.out.println("ID 2: population " + population);
+     }
+     /*! [Read population from the standalone column group] */
+     ret = cursor.close();
+
+     /*! [Search in a simple index] */
+     /* Search in a simple index. */
+     cursor = session.open_cursor("index:poptable:country", null, null);
+     cursor.putKeyString("AU");
+     ret = cursor.search();
+     country = cursor.getValueString();
+     year = cursor.getValueShort();
+     population = cursor.getValueLong();
+     System.out.println("AU: country " + country + ", year " + year +
+         ", population " + population);
+     /*! [Search in a simple index] */
+     ret = cursor.close();
+
+     /*! [Search in a composite index] */
+     /* Search in a composite index. */
+     cursor = session.open_cursor(
+         "index:poptable:country_plus_year", null, null);
+     cursor.putKeyString("USA");
+     cursor.putKeyShort((short)1900);
+     ret = cursor.search();
+     country = cursor.getValueString();
+     year = cursor.getValueShort();
+     population = cursor.getValueLong();
+     System.out.println("US 1900: country " + country +
+         ", year " + year + ", population " + population);
+     /*! [Search in a composite index] */
+     ret = cursor.close();
+
+     /*! [Return a subset of values from the table] */
+     /*
+      * Use a projection to return just the table's country and year
+      * columns.
+      */
+     cursor = session.open_cursor("table:poptable(country,year)", null, null);
+     while ((ret = cursor.next()) == 0) {
+         country = cursor.getValueString();
+         year = cursor.getValueShort();
+         System.out.println("country " + country + ", year " + year);
+     }
+     /*! [Return a subset of values from the table] */
+     ret = cursor.close();
+
+     /*! [Return a subset of values from the table using raw mode] */
+     /*
+      * Use a projection to return just the table's country and year
+      * columns.
+      *
+      * NOTE(review): despite the snippet name, this cursor is opened
+      * without the "raw" configuration, so it repeats the previous
+      * section; confirm whether a raw-mode variant was intended.
+      */
+     cursor = session.open_cursor("table:poptable(country,year)", null, null);
+     while ((ret = cursor.next()) == 0) {
+         country = cursor.getValueString();
+         year = cursor.getValueShort();
+         System.out.println("country " + country + ", year " + year);
+     }
+     /*! [Return a subset of values from the table using raw mode] */
+     ret = cursor.close();
+
+     /*! [Return the table's record number key using an index] */
+     /*
+      * Use a projection to return just the table's record number key
+      * from an index.
+      */
+     cursor = session.open_cursor("index:poptable:country_plus_year(id)", null, null);
+     while ((ret = cursor.next()) == 0) {
+         country = cursor.getKeyString();
+         year = cursor.getKeyShort();
+         recno = cursor.getValueLong();
+         System.out.println("row ID " + recno + ": country " + country +
+             ", year " + year);
+     }
+     /*! [Return the table's record number key using an index] */
+     ret = cursor.close();
+
+     /*! [Return a subset of the value columns from an index] */
+     /*
+      * Use a projection to return just the population column from an
+      * index.
+      */
+     cursor = session.open_cursor(
+         "index:poptable:country_plus_year(population)", null, null);
+     while ((ret = cursor.next()) == 0) {
+         country = cursor.getKeyString();
+         year = cursor.getKeyShort();
+         population = cursor.getValueLong();
+         System.out.println("population " + population +
+             ": country " + country + ", year " + year);
+     }
+     /*! [Return a subset of the value columns from an index] */
+     ret = cursor.close();
+
+     /*! [Access only the index] */
+     /*
+      * Use a projection to avoid accessing any other column groups when
+      * using an index: supply an empty list of value columns.
+      */
+     cursor = session.open_cursor(
+         "index:poptable:country_plus_year()", null, null);
+     while ((ret = cursor.next()) == 0) {
+         country = cursor.getKeyString();
+         year = cursor.getKeyShort();
+         System.out.println("country " + country + ", year " + year);
+     }
+     /*! [Access only the index] */
+     ret = cursor.close();
+
+     ret = conn.close(null);
+
+     return (ret);
+ }
+
+ /**
+  * Run schemaExample() and turn a WiredTigerException into a status code.
+  *
+  * NOTE(review): the Java launcher requires main to return void, so this
+  * int-returning form can only be invoked from other code.
+  *
+  * @param argv command-line arguments (unused)
+  * @return the value returned by schemaExample(), or -1 if it threw
+  */
+ public static int
+ main(String[] argv)
+ {
+     int status;
+
+     try {
+         status = schemaExample();
+     } catch (WiredTigerException wte) {
+         System.err.println("Exception: " + wte);
+         status = -1;
+     }
+     return (status);
+ }
+}
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_stat.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_stat.java
new file mode 100644
index 00000000000..c81bb64c22a
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_stat.java
@@ -0,0 +1,252 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_stat.java
+ * This is an example demonstrating how to query database statistics.
+ */
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+import java.io.*;
+import java.util.*;
+
+public class ex_stat {
+
+ public static String home;
+
+ /*! [statistics display function] */
+ /**
+  * Walk a statistics cursor to its end, printing "description=value" for
+  * every entry whose numeric value is nonzero.
+  *
+  * @param cursor the cursor to iterate; it is not closed here
+  * @return 0 if the walk ended with WT_NOTFOUND, otherwise the value
+  *         returned by the failing Cursor.next call
+  */
+ int
+ print_cursor(Cursor cursor)
+     throws WiredTigerException
+ {
+     int ret;
+
+     for (ret = cursor.next(); ret == 0; ret = cursor.next()) {
+         /* The three value fields are read in the order they are stored. */
+         String desc = cursor.getValueString();
+         String pvalue = cursor.getValueString();
+         long value = cursor.getValueLong();
+
+         if (value == 0)
+             continue;
+         System.out.println(desc + "=" + pvalue);
+     }
+
+     if (ret == wiredtiger.WT_NOTFOUND)
+         return (0);
+     return (ret);
+ }
+ /*! [statistics display function] */
+
+ /**
+  * Print the nonzero statistics available from the "statistics:" cursor.
+  *
+  * @param session the session used to open the statistics cursor
+  * @return the status from print_cursor if it was nonzero, otherwise the
+  *         status from closing the cursor
+  */
+ int
+ print_database_stats(Session session)
+     throws WiredTigerException
+ {
+     Cursor cursor;
+     int ret, tret;
+
+     /*! [statistics database function] */
+     cursor = session.open_cursor("statistics:", null, null);
+
+     ret = print_cursor(cursor);
+     /* Keep the first failure instead of overwriting it with close's. */
+     tret = cursor.close();
+     if (ret == 0)
+         ret = tret;
+     /*! [statistics database function] */
+
+     return (ret);
+ }
+
+ /**
+  * Print the nonzero statistics for the table named "access".
+  *
+  * @param session the session used to open the statistics cursor
+  * @return the status from print_cursor if it was nonzero, otherwise the
+  *         status from closing the cursor
+  */
+ int
+ print_file_stats(Session session)
+     throws WiredTigerException
+ {
+     Cursor cursor;
+     int ret, tret;
+
+     /*! [statistics table function] */
+     cursor = session.open_cursor("statistics:table:access", null, null);
+     ret = print_cursor(cursor);
+     /* Keep the first failure instead of overwriting it with close's. */
+     tret = cursor.close();
+     if (ret == 0)
+         ret = tret;
+     /*! [statistics table function] */
+
+     return (ret);
+ }
+
+ /**
+  * Look up a single statistic (WT_STAT_DSRC_BTREE_OVERFLOW) for the table
+  * named "access" by key and print it as "description=value".
+  *
+  * @param session the session used to open the statistics cursor
+  * @return the status from closing the cursor
+  */
+ int
+ print_overflow_pages(Session session)
+     throws WiredTigerException
+ {
+     /*! [statistics retrieve by key] */
+     Cursor cursor =
+         session.open_cursor("statistics:table:access", null, null);
+
+     cursor.putKeyInt(wiredtiger.WT_STAT_DSRC_BTREE_OVERFLOW);
+     int ret = cursor.search();
+     String desc = cursor.getValueString();
+     String pvalue = cursor.getValueString();
+     /* The numeric form is fetched as well, though only pvalue is shown. */
+     long value = cursor.getValueLong();
+     System.out.println(desc + "=" + pvalue);
+
+     ret = cursor.close();
+     /*! [statistics retrieve by key] */
+
+     return (ret);
+ }
+
+ /*! [statistics calculation helper function] */
+ /**
+  * Fetch the numeric value of one statistic from an open statistics cursor.
+  *
+  * @param cursor the statistics cursor to search
+  * @param stat_field the statistic's key
+  * @return the statistic's numeric value, or 0 (after printing a message
+  *         to standard error) if the search did not return 0
+  */
+ long
+ get_stat(Cursor cursor, int stat_field)
+     throws WiredTigerException
+ {
+     cursor.putKeyInt(stat_field);
+     if (cursor.search() != 0) {
+         System.err.println("stat_field: " + stat_field + " not found");
+         return (0);
+     }
+
+     /* Step over the two string fields that precede the numeric value. */
+     cursor.getValueString();
+     cursor.getValueString();
+     return (cursor.getValueLong());
+ }
+ /*! [statistics calculation helper function] */
+
+ /**
+  * Print two figures computed from the statistics of the table named
+  * "access": the percentage of the file not covered by the checkpoint
+  * ("fragmented") and the write amplification.
+  *
+  * Each figure is printed only when its divisor is nonzero.
+  *
+  * @param session the session used to open the statistics cursor
+  * @return the status from closing the cursor
+  */
+ int
+ print_derived_stats(Session session)
+     throws WiredTigerException
+ {
+     Cursor cursor;
+     int ret;
+
+     /*! [statistics calculate open table stats] */
+     cursor = session.open_cursor("statistics:table:access", null, null);
+     /*! [statistics calculate open table stats] */
+
+     {
+     /*! [statistics calculate table fragmentation] */
+     long ckpt_size = get_stat(cursor,
+         wiredtiger.WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE);
+     long file_size = get_stat(cursor,
+         wiredtiger.WT_STAT_DSRC_BLOCK_SIZE);
+
+     /*
+      * get_stat() yields 0 when the statistic cannot be found; integer
+      * division by zero would throw ArithmeticException.
+      */
+     if (file_size != 0)
+         System.out.println("File is " +
+             (int)(100 * (file_size - ckpt_size) / file_size) +
+             "% fragmented\n");
+     /*! [statistics calculate table fragmentation] */
+     }
+
+     {
+     /*! [statistics calculate write amplification] */
+     long app_insert = get_stat(cursor,
+         wiredtiger.WT_STAT_DSRC_CURSOR_INSERT_BYTES);
+     long app_remove = get_stat(cursor,
+         wiredtiger.WT_STAT_DSRC_CURSOR_REMOVE_BYTES);
+     long app_update = get_stat(cursor,
+         wiredtiger.WT_STAT_DSRC_CURSOR_UPDATE_BYTES);
+
+     long fs_writes = get_stat(cursor,
+         wiredtiger.WT_STAT_DSRC_CACHE_BYTES_WRITE);
+
+     if (app_insert + app_remove + app_update != 0)
+         System.out.println("Write amplification is " +
+             (double)fs_writes / (app_insert + app_remove + app_update));
+     /*! [statistics calculate write amplification] */
+     }
+
+     ret = cursor.close();
+
+     return (ret);
+ }
+
+ /**
+  * Open a database with all statistics enabled, insert one record into the
+  * table named "access", checkpoint, and then run each of the statistics
+  * printing functions in this class.
+  *
+  * @return 1 if the test directory could not be set up; otherwise the
+  *         status of print_derived_stats, or -1 if closing the connection
+  *         did not return 0
+  */
+ public int
+ statExample()
+     throws WiredTigerException
+ {
+     Connection conn;
+     Cursor cursor;
+     Session session;
+     int ret;
+
+     /*
+      * Create a clean test directory for this run of the test program if the
+      * environment variable isn't already set (as is done by make check).
+      */
+     if (System.getenv("WIREDTIGER_HOME") == null) {
+         home = "WT_HOME";
+         try {
+             Process proc = Runtime.getRuntime().exec("/bin/rm -rf WT_HOME");
+             BufferedReader br = new BufferedReader(
+                 new InputStreamReader(proc.getInputStream()));
+             while(br.ready())
+                 System.out.println(br.readLine());
+             br.close();
+             /*
+              * exec() only starts the process: wait for the removal to
+              * finish so it cannot race with re-creating the directory.
+              */
+             proc.waitFor();
+             new File("WT_HOME").mkdir();
+         } catch (IOException ioe) {
+             System.err.println("IOException: WT_HOME: " + ioe);
+             return(1);
+         } catch (InterruptedException ie) {
+             /* Preserve the interrupt for whoever called us. */
+             Thread.currentThread().interrupt();
+             System.err.println("InterruptedException: WT_HOME: " + ie);
+             return(1);
+         }
+     } else
+         home = null;
+
+     conn = wiredtiger.open(home, "create,statistics=(all)");
+     session = conn.open_session(null);
+
+     ret = session.create("table:access", "key_format=S,value_format=S");
+
+     cursor = session.open_cursor("table:access", null, null);
+     cursor.putKeyString("key");
+     cursor.putValueString("value");
+     ret = cursor.insert();
+     ret = cursor.close();
+
+     ret = session.checkpoint(null);
+
+     ret = print_database_stats(session);
+
+     ret = print_file_stats(session);
+
+     ret = print_overflow_pages(session);
+
+     ret = print_derived_stats(session);
+
+     return (conn.close(null) == 0 ? ret : -1);
+ }
+
+ /**
+  * Run statExample() on a fresh ex_stat instance and turn a
+  * WiredTigerException into a status code.
+  *
+  * @param argv command-line arguments (unused)
+  * @return the value returned by statExample(), or -1 if it threw
+  */
+ public static int
+ main(String[] argv)
+ {
+     ex_stat example = new ex_stat();
+     int status;
+
+     try {
+         status = example.statExample();
+     } catch (WiredTigerException wte) {
+         System.err.println("Exception: " + wte);
+         status = -1;
+     }
+     return (status);
+ }
+}
diff --git a/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_thread.java b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_thread.java
new file mode 100644
index 00000000000..c6b9a5479a9
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/java/com/wiredtiger/examples/ex_thread.java
@@ -0,0 +1,142 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_thread.java
+ * This is an example demonstrating how to create and access a simple
+ * table from multiple threads.
+ */
+
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+import java.io.*;
+import java.util.*;
+
+/*! [thread scan] */
+/*
+ * A thread that opens its own session on a shared connection and prints
+ * every record in the table named "access".
+ */
+class ScanThread extends Thread {
+    private Connection conn;
+
+    /* Remember the connection this thread will open its session on. */
+    public ScanThread(Connection conn) {
+        this.conn = conn;
+    }
+
+    /*
+     * Scan the table, printing each key/value pair, then release the
+     * cursor and session this thread opened.  Failures are reported on
+     * standard error.
+     */
+    public void run()
+    {
+        try {
+            int ret;
+
+            Session session = conn.open_session(null);
+            Cursor cursor = session.open_cursor("table:access", null, null);
+
+            /* Show all records. */
+            while ((ret = cursor.next()) == 0) {
+                String key = cursor.getKeyString();
+                String value = cursor.getValueString();
+                System.out.println("Got record: " + key + " : " + value);
+            }
+            if (ret != wiredtiger.WT_NOTFOUND)
+                System.err.println("Cursor.next: " +
+                    wiredtiger.wiredtiger_strerror(ret));
+
+            /* Don't leave this thread's handles open until the connection closes. */
+            ret = cursor.close();
+            ret = session.close(null);
+        } catch (WiredTigerException wte) {
+            System.err.println("Exception " + wte);
+        }
+    }
+}
+/*! [thread scan] */
+
+public class ex_thread {
+
+ public static String home;
+
+ public static final int NUM_THREADS = 10;
+
+ /*! [thread main] */
+ /*
+  * Create the table named "access" with a single record, start
+  * NUM_THREADS ScanThread instances on the shared connection, wait for
+  * them, and close the connection.
+  *
+  * Returns 1 if the test directory or the connection could not be set
+  * up, -1 if a WiredTigerException was thrown, otherwise the status from
+  * closing the connection.
+  */
+ static int main(String[] argv)
+ {
+     try {
+         Thread[] threads = new Thread[NUM_THREADS];
+         int i, ret;
+         Connection conn;
+
+         /*
+          * Create a clean test directory for this run of the test program if the
+          * environment variable isn't already set (as is done by make check).
+          */
+         if (System.getenv("WIREDTIGER_HOME") == null) {
+             home = "WT_HOME";
+             try {
+                 Process proc = Runtime.getRuntime().exec("/bin/rm -rf " + home);
+                 BufferedReader br = new BufferedReader(
+                     new InputStreamReader(proc.getInputStream()));
+                 while(br.ready())
+                     System.out.println(br.readLine());
+                 br.close();
+                 /*
+                  * exec() only starts the process: wait for the removal to
+                  * finish so it cannot race with re-creating the directory.
+                  */
+                 proc.waitFor();
+                 new File(home).mkdir();
+             } catch (IOException ioe) {
+                 System.err.println("IOException: " + home + ": " + ioe);
+                 return(1);
+             } catch (InterruptedException ie) {
+                 /* Preserve the interrupt for whoever called us. */
+                 Thread.currentThread().interrupt();
+                 System.err.println(
+                     "InterruptedException: " + home + ": " + ie);
+                 return(1);
+             }
+         } else
+             home = null;
+
+         if ((conn = wiredtiger.open(home, "create")) == null) {
+             System.err.println("Error connecting to " + home);
+             return(1);
+         }
+
+         /* Note: further error checking omitted for clarity. */
+
+         Session session = conn.open_session(null);
+         ret = session.create("table:access", "key_format=S,value_format=S");
+         Cursor cursor = session.open_cursor("table:access", null, "overwrite");
+         cursor.putKeyString("key1");
+         cursor.putValueString("value1");
+         ret = cursor.insert();
+         ret = session.close(null);
+
+         for (i = 0; i < NUM_THREADS; i++) {
+             threads[i] = new ScanThread(conn);
+             threads[i].start();
+         }
+
+         for (i = 0; i < NUM_THREADS; i++)
+             try {
+                 threads[i].join();
+             }
+             catch (InterruptedException ie) {
+                 /* Deliberately ignored: carry on joining the others. */
+             }
+
+         ret = conn.close(null);
+         return (ret);
+     }
+     catch (WiredTigerException wte) {
+         System.err.println("Exception: " + wte);
+         return (-1);
+     }
+ }
+ /*! [thread main] */
+
+}
diff --git a/src/third_party/wiredtiger/examples/python/ex_access.py b/src/third_party/wiredtiger/examples/python/ex_access.py
new file mode 100755
index 00000000000..addc7386f03
--- /dev/null
+++ b/src/third_party/wiredtiger/examples/python/ex_access.py
@@ -0,0 +1,47 @@
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from wiredtiger import wiredtiger_open
+
+# Connect to the database and open a session
+conn = wiredtiger_open('WT_TEST', 'create')
+session = conn.open_session()
+
+# Create a simple table
+session.create('table:T', 'key_format=S,value_format=S')
+
+# Open a cursor and insert a record
+cursor = session.open_cursor('table:T', None)
+
+cursor.set_key('key1')
+cursor.set_value('value1')
+cursor.insert()
+
+# Iterate through the records
+cursor.reset()
+for key, value in cursor:
+ print('Got record: ' + key + ' : ' + value)
+
+conn.close()
diff --git a/src/third_party/wiredtiger/ext/collators/reverse/Makefile.am b/src/third_party/wiredtiger/ext/collators/reverse/Makefile.am
new file mode 100644
index 00000000000..5cfde94d847
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/collators/reverse/Makefile.am
@@ -0,0 +1,10 @@
+# Build the reverse collator extension from reverse_collator.c as a libtool
+# module that is not installed (noinst_).
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+noinst_LTLIBRARIES = libwiredtiger_reverse_collator.la
+libwiredtiger_reverse_collator_la_SOURCES = reverse_collator.c
+
+# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well
+# as installation, it will only build static libraries. As far as I can tell,
+# the "approved" libtool way to turn them back on is by adding -rpath.
+libwiredtiger_reverse_collator_la_LDFLAGS = \
+ -avoid-version -module -rpath /nowhere
diff --git a/src/third_party/wiredtiger/ext/collators/reverse/reverse_collator.c b/src/third_party/wiredtiger/ext/collators/reverse/reverse_collator.c
new file mode 100644
index 00000000000..0ccebba7919
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/collators/reverse/reverse_collator.c
@@ -0,0 +1,74 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <string.h>
+
+#include <wiredtiger_ext.h>
+
+/*
+ * collate_reverse --
+ *	Compare two keys bytewise and report the opposite ordering: *ret is
+ *	set to 1 when k1 sorts before k2 in byte order, -1 when it sorts
+ *	after, and 0 when the keys are identical.  When one key is a prefix
+ *	of the other, the shorter key counts as sorting first in byte order.
+ *	Always returns 0.
+ */
+static int
+collate_reverse(WT_COLLATOR *collator,
+    WT_SESSION *session, const WT_ITEM *k1, const WT_ITEM *k2, int *ret)
+{
+	size_t common;
+	int forward;
+
+	(void)collator;				/* Unused */
+	(void)session;
+
+	/* Compare the bytes the two keys have in common. */
+	common = (k2->size > k1->size) ? k1->size : k2->size;
+	forward = memcmp(k1->data, k2->data, common);
+
+	/* Fall back to the lengths when the common bytes are equal. */
+	if (forward == 0 && k1->size != k2->size)
+		forward = (k1->size < k2->size) ? -1 : 1;
+
+	/* Invert the natural ordering. */
+	if (forward == 0)
+		*ret = 0;
+	else
+		*ret = (forward < 0) ? 1 : -1;
+	return (0);
+}
+
+/* Collator handle: only the first callback slot is populated. */
+static WT_COLLATOR reverse_collator = { collate_reverse, NULL, NULL };
+
+/*
+ * wiredtiger_extension_init --
+ *	Extension entry point: register the reverse collator with the
+ *	connection under the name "reverse" and return the result of that
+ *	registration.
+ */
+int
+wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+	int status;
+
+	(void)config;				/* Unused parameters */
+
+	status = connection->add_collator(
+	    connection, "reverse", &reverse_collator, NULL);
+	return (status);
+}
diff --git a/src/third_party/wiredtiger/ext/compressors/bzip2/Makefile.am b/src/third_party/wiredtiger/ext/compressors/bzip2/Makefile.am
new file mode 100644
index 00000000000..0aedc2efd80
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/compressors/bzip2/Makefile.am
@@ -0,0 +1,6 @@
+# Build the bzip2 compression extension from bzip2_compress.c as a libtool
+# module, linked against libbz2.
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+lib_LTLIBRARIES = libwiredtiger_bzip2.la
+libwiredtiger_bzip2_la_SOURCES = bzip2_compress.c
+libwiredtiger_bzip2_la_LDFLAGS = -avoid-version -module
+libwiredtiger_bzip2_la_LIBADD = -lbz2
diff --git a/src/third_party/wiredtiger/ext/compressors/bzip2/bzip2_compress.c b/src/third_party/wiredtiger/ext/compressors/bzip2/bzip2_compress.c
new file mode 100644
index 00000000000..cd73b237387
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/compressors/bzip2/bzip2_compress.c
@@ -0,0 +1,407 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <bzlib.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+#include <wiredtiger_ext.h>
+
+/*
+ * Local compressor structure.
+ *
+ * The WT_COMPRESSOR member comes first so that a pointer to this structure can
+ * be cast to and from a WT_COMPRESSOR pointer, which is how the callbacks in
+ * this file recover their state.  The bz_* members are the settings handed to
+ * BZ2_bzCompressInit and BZ2_bzDecompressInit.
+ */
+typedef struct {
+ WT_COMPRESSOR compressor; /* Must come first */
+
+ WT_EXTENSION_API *wt_api; /* Extension API */
+
+ int bz_verbosity; /* Configuration */
+ int bz_blocksize100k; /* Block size in 100k units (compression only) */
+ int bz_workfactor; /* bzip2 workFactor (compression only) */
+ int bz_small; /* Nonzero: low-memory decompression */
+} BZIP_COMPRESSOR;
+
+/*
+ * Bzip gives us a single cookie to pass to the underlying allocation
+ * functions; we need two handles, so package them up.
+ */
+typedef struct {
+ WT_COMPRESSOR *compressor; /* Used to reach the extension API */
+ WT_SESSION *session; /* Session passed to scr_alloc/scr_free */
+} BZIP_OPAQUE;
+
+/*
+ * bzip2_error --
+ *	Output an error message, and return a standard error code.
+ *
+ *	call names the bzip2 function that failed and bzret is the status it
+ *	returned.  The message is written with the extension API's err_printf;
+ *	statuses missing from the table are reported as "unknown error".  The
+ *	return value is always WT_ERROR.
+ */
+static int
+bzip2_error(
+    WT_COMPRESSOR *compressor, WT_SESSION *session, const char *call, int bzret)
+{
+	/* Symbolic names for the bzip2 statuses we know how to describe. */
+	static const struct {
+		int code;
+		const char *name;
+	} names[] = {
+		{ BZ_MEM_ERROR, "BZ_MEM_ERROR" },
+		{ BZ_OUTBUFF_FULL, "BZ_OUTBUFF_FULL" },
+		{ BZ_SEQUENCE_ERROR, "BZ_SEQUENCE_ERROR" },
+		{ BZ_PARAM_ERROR, "BZ_PARAM_ERROR" },
+		{ BZ_DATA_ERROR, "BZ_DATA_ERROR" },
+		{ BZ_DATA_ERROR_MAGIC, "BZ_DATA_ERROR_MAGIC" },
+		{ BZ_IO_ERROR, "BZ_IO_ERROR" },
+		{ BZ_UNEXPECTED_EOF, "BZ_UNEXPECTED_EOF" },
+		{ BZ_CONFIG_ERROR, "BZ_CONFIG_ERROR" }
+	};
+	WT_EXTENSION_API *wt_api;
+	const char *msg;
+	size_t i;
+
+	wt_api = ((BZIP_COMPRESSOR *)compressor)->wt_api;
+
+	msg = "unknown error";
+	for (i = 0; i < sizeof(names) / sizeof(names[0]); ++i)
+		if (names[i].code == bzret) {
+			msg = names[i].name;
+			break;
+		}
+
+	(void)wt_api->err_printf(wt_api, session,
+	    "bzip2 error: %s: %s: %d", call, msg, bzret);
+	return (WT_ERROR);
+}
+
+/*
+ * bzalloc --
+ *	Allocate scratch buffers.
+ *
+ *	Allocation callback installed in bz_stream.bzalloc.  cookie is the
+ *	BZIP_OPAQUE set up by the caller; the request is for number items of
+ *	size bytes each and is satisfied with the extension API's scr_alloc.
+ *	Returns NULL if the request is negative or its byte count does not fit
+ *	in a size_t.
+ */
+static void *
+bzalloc(void *cookie, int number, int size)
+{
+	BZIP_OPAQUE *opaque;
+	WT_EXTENSION_API *wt_api;
+	size_t count, item;
+
+	/*
+	 * Do the multiplication in size_t, not int: an int product can
+	 * overflow before it is ever converted.
+	 */
+	if (number < 0 || size < 0)
+		return (NULL);
+	count = (size_t)number;
+	item = (size_t)size;
+	if (item != 0 && count > SIZE_MAX / item)
+		return (NULL);
+
+	opaque = cookie;
+	wt_api = ((BZIP_COMPRESSOR *)opaque->compressor)->wt_api;
+	return (wt_api->scr_alloc(wt_api, opaque->session, count * item));
+}
+
+/*
+ * bzfree --
+ *	Free scratch buffers.
+ *
+ *	Release callback installed in bz_stream.bzfree: hands p back to the
+ *	extension API's scr_free using the session stored in the cookie.
+ */
+static void
+bzfree(void *cookie, void *p)
+{
+	BZIP_OPAQUE *handles = cookie;
+	WT_EXTENSION_API *wt_api =
+	    ((BZIP_COMPRESSOR *)handles->compressor)->wt_api;
+
+	wt_api->scr_free(wt_api, handles->session, p);
+}
+
+/*
+ * bzip2_compress --
+ *	WiredTiger bzip2 compression.
+ *
+ *	Compress src_len bytes at src into the dst_len-byte buffer at dst in a
+ *	single BZ2_bzCompress(BZ_FINISH) call.  On success *compression_failed
+ *	is set to 0 and the compressed size is stored through result_lenp.  If
+ *	the stream could not be finished in one call (for example the output
+ *	did not fit), *compression_failed is set to 1 and *result_lenp is left
+ *	untouched.  Returns 0 in both cases, and WT_ERROR if bzip2 setup or
+ *	teardown fails.
+ */
+static int
+bzip2_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+    uint8_t *src, size_t src_len,
+    uint8_t *dst, size_t dst_len,
+    size_t *result_lenp, int *compression_failed)
+{
+	BZIP_COMPRESSOR *bzip_compressor;
+	BZIP_OPAQUE opaque;
+	bz_stream bz;
+	uint32_t dst_avail;
+	int ret;
+
+	bzip_compressor = (BZIP_COMPRESSOR *)compressor;
+
+	/*
+	 * The stream's byte counts are set from 32-bit values below.  A source
+	 * that does not fit cannot be compressed in one call: report it as a
+	 * compression failure instead of silently compressing a truncated
+	 * length.  An oversized destination is simply capped.
+	 */
+	if (src_len > UINT32_MAX) {
+		*compression_failed = 1;
+		return (0);
+	}
+	dst_avail = dst_len > UINT32_MAX ? UINT32_MAX : (uint32_t)dst_len;
+
+	/* Route bzip2's allocations through bzalloc/bzfree. */
+	memset(&bz, 0, sizeof(bz));
+	bz.bzalloc = bzalloc;
+	bz.bzfree = bzfree;
+	opaque.compressor = compressor;
+	opaque.session = session;
+	bz.opaque = &opaque;
+
+	if ((ret = BZ2_bzCompressInit(&bz,
+	    bzip_compressor->bz_blocksize100k,
+	    bzip_compressor->bz_verbosity,
+	    bzip_compressor->bz_workfactor)) != BZ_OK)
+		return (bzip2_error(
+		    compressor, session, "BZ2_bzCompressInit", ret));
+
+	bz.next_in = (char *)src;
+	bz.avail_in = (uint32_t)src_len;
+	bz.next_out = (char *)dst;
+	bz.avail_out = dst_avail;
+	if ((ret = BZ2_bzCompress(&bz, BZ_FINISH)) == BZ_STREAM_END) {
+		*compression_failed = 0;
+		/* Bytes written = space offered minus space left over. */
+		*result_lenp = (size_t)(dst_avail - bz.avail_out);
+	} else
+		*compression_failed = 1;
+
+	if ((ret = BZ2_bzCompressEnd(&bz)) != BZ_OK)
+		return (
+		    bzip2_error(compressor, session, "BZ2_bzCompressEnd", ret));
+
+	return (0);
+}
+
+/*
+ * __bzip2_compress_raw_random --
+ * Return a 32-bit pseudo-random number.
+ *
+ * This is an implementation of George Marsaglia's multiply-with-carry pseudo-
+ * random number generator. Computationally fast, with reasonable randomness
+ * properties.
+ *
+ * The generator state lives in two function-local statics with fixed initial
+ * values, so every caller shares one sequence and the sequence starts from the
+ * same point each time the code is loaded. No locking is done here.
+ */
+static uint32_t
+__bzip2_compress_raw_random(void)
+{
+ static uint32_t m_w = 521288629;
+ static uint32_t m_z = 362436069;
+
+ /* Advance each 32-bit state word: multiplier * low half + high half. */
+ m_z = 36969 * (m_z & 65535) + (m_z >> 16);
+ m_w = 18000 * (m_w & 65535) + (m_w >> 16);
+ /* Combine: low half of m_z in the top 16 bits, plus low half of m_w. */
+ return (m_z << 16) + (m_w & 65535);
+}
+
+/*
+ * bzip2_compress_raw --
+ * Test function for the test/format utility.
+ *
+ * Chooses how many leading slots to take, compresses the first offsets[take]
+ * bytes of src with bzip2_compress, and stores the slot count through
+ * result_slotsp and the compressed size through result_lenp. Returns EAGAIN
+ * when no slots were taken (with *result_lenp set to 0), 0 when some were,
+ * or the error returned by bzip2_compress.
+ */
+static int
+bzip2_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ size_t page_max, int split_pct, size_t extra,
+ uint8_t *src, uint32_t *offsets, uint32_t slots,
+ uint8_t *dst, size_t dst_len, int final,
+ size_t *result_lenp, uint32_t *result_slotsp)
+{
+ uint32_t take, twenty_pct;
+ int compression_failed, ret;
+
+ (void)page_max; /* Unused parameters */
+ (void)split_pct;
+ (void)extra;
+ (void)final;
+
+ /*
+ * This function is used by the test/format utility to test the
+ * WT_COMPRESSOR::compress_raw functionality.
+ *
+ * I'm trying to mimic how a real application is likely to behave: if
+ * it's a small number of slots, we're not going to take them because
+ * they aren't worth compressing. In all likelihood, that's going to
+ * be because the btree is wrapping up a page, but that's OK, that is
+ * going to happen a lot. In addition, add a 2% chance of not taking
+ * anything at all just because we don't want to take it. Otherwise,
+ * select between 80 and 100% of the slots and compress them, stepping
+ * down by 5 slots at a time until something works.
+ */
+ take = slots;
+ if (take < 10 || __bzip2_compress_raw_random() % 100 < 2)
+ take = 0;
+ else {
+ /* slots >= 10 here, so twenty_pct is at least 2 (no modulo by zero). */
+ twenty_pct = (slots / 10) * 2;
+ if (twenty_pct < slots)
+ take -= __bzip2_compress_raw_random() % twenty_pct;
+
+ for (;;) {
+ /* offsets[take] is used as the byte length of the first take slots. */
+ if ((ret = bzip2_compress(compressor, session,
+ src, offsets[take],
+ dst, dst_len,
+ result_lenp, &compression_failed)) != 0)
+ return (ret);
+ if (!compression_failed)
+ break;
+ /* Fewer than 10 slots left: give up rather than step down again. */
+ if (take < 10) {
+ take = 0;
+ break;
+ }
+ take -= 5;
+ }
+ }
+
+ *result_slotsp = take;
+ if (take == 0)
+ *result_lenp = 0;
+
+#if 0
+ fprintf(stderr,
+ "bzip2_compress_raw (%s): page_max %" PRIuMAX
+ ", split_pct %u, extra %" PRIuMAX
+ ", slots %" PRIu32 ", take %" PRIu32 ": %" PRIu32 " -> %"
+ PRIuMAX "\n",
+ final ? "final" : "not final",
+ (uintmax_t)page_max, split_pct, (uintmax_t)extra,
+ slots, take, offsets[take], (uintmax_t)*result_lenp);
+#endif
+ return (take == 0 ? EAGAIN : 0);
+}
+
+/*
+ * bzip2_decompress --
+ *	WiredTiger bzip2 decompression.
+ *
+ *	Decompress src_len bytes at src into the dst_len-byte buffer at dst in
+ *	a single BZ2_bzDecompress call and store the number of bytes produced
+ *	through result_lenp.  Returns 0 on success; on any failure an error is
+ *	logged through bzip2_error, WT_ERROR is returned and *result_lenp is
+ *	left untouched.
+ */
+static int
+bzip2_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+    uint8_t *src, size_t src_len,
+    uint8_t *dst, size_t dst_len,
+    size_t *result_lenp)
+{
+	BZIP_COMPRESSOR *bzip_compressor;
+	BZIP_OPAQUE opaque;
+	bz_stream bz;
+	int ret, tret;
+
+	bzip_compressor = (BZIP_COMPRESSOR *)compressor;
+
+	/* Route bzip2's allocations through bzalloc/bzfree. */
+	memset(&bz, 0, sizeof(bz));
+	bz.bzalloc = bzalloc;
+	bz.bzfree = bzfree;
+	opaque.compressor = compressor;
+	opaque.session = session;
+	bz.opaque = &opaque;
+
+	if ((ret = BZ2_bzDecompressInit(&bz,
+	    bzip_compressor->bz_small, bzip_compressor->bz_verbosity)) != BZ_OK)
+		return (bzip2_error(
+		    compressor, session, "BZ2_bzDecompressInit", ret));
+
+	bz.next_in = (char *)src;
+	bz.avail_in = (uint32_t)src_len;
+	bz.next_out = (char *)dst;
+	bz.avail_out = (uint32_t)dst_len;
+
+	/*
+	 * The whole stream has to decode in this one call, so anything other
+	 * than BZ_STREAM_END is a failure.  BZ_OK is numerically zero, so the
+	 * raw status must not be tested against 0 later: convert it to the
+	 * function's own return code right here.  The failure is reported
+	 * once, under the name of the call that actually failed.
+	 */
+	if ((ret = BZ2_bzDecompress(&bz)) == BZ_STREAM_END) {
+		*result_lenp = dst_len - bz.avail_out;
+		ret = 0;
+	} else
+		ret = bzip2_error(
+		    compressor, session, "BZ2_bzDecompress", ret);
+
+	/* Always release the stream, even after a failed decompression. */
+	if ((tret = BZ2_bzDecompressEnd(&bz)) != BZ_OK)
+		return (bzip2_error(
+		    compressor, session, "BZ2_bzDecompressEnd", tret));
+
+	return (ret);
+}
+
+/*
+ * bzip2_terminate --
+ * WiredTiger bzip2 compression termination.
+ *
+ * Frees the compressor structure, which bzip2_add_compressor allocated with
+ * calloc, and always returns 0.
+ */
+static int
+bzip2_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session)
+{
+ (void)session; /* Unused parameters */
+
+ free(compressor);
+ return (0);
+}
+
+/*
+ * bzip2_add_compressor --
+ *	Add a bzip2 compressor.
+ *
+ *	Allocates a BZIP_COMPRESSOR, fills in its callbacks and default bzip2
+ *	settings, and registers it with the connection as name.  When raw is
+ *	nonzero the compress_raw callback is installed as well.  Returns 0 on
+ *	success, errno if the allocation fails, or add_compressor's error (in
+ *	which case the structure is freed again).
+ */
+static int
+bzip2_add_compressor(WT_CONNECTION *connection, int raw, const char *name)
+{
+	BZIP_COMPRESSOR *bzip_compressor;
+	int ret;
+
+	/*
+	 * There are two almost identical bzip2 compressors: one supporting raw
+	 * compression (used by test/format to test raw compression), the other
+	 * without raw compression, that might be useful for real applications.
+	 */
+	if ((bzip_compressor = calloc(1, sizeof(BZIP_COMPRESSOR))) == NULL)
+		return (errno);
+
+	bzip_compressor->compressor.compress = bzip2_compress;
+	bzip_compressor->
+	    compressor.compress_raw = raw ? bzip2_compress_raw : NULL;
+	bzip_compressor->compressor.decompress = bzip2_decompress;
+	bzip_compressor->compressor.pre_size = NULL;
+	bzip_compressor->compressor.terminate = bzip2_terminate;
+
+	bzip_compressor->wt_api = connection->get_extension_api(connection);
+
+	/* between 0-4: set the amount of verbosity to stderr */
+	bzip_compressor->bz_verbosity = 0;
+
+	/*
+	 * between 1-9: set the block size to 100k x this number (compression
+	 * only)
+	 */
+	bzip_compressor->bz_blocksize100k = 1;
+
+	/*
+	 * between 0-250: workFactor: see bzip2 manual.  0 is a reasonable
+	 * default (compression only)
+	 */
+	bzip_compressor->bz_workfactor = 0;
+
+	/*
+	 * if nonzero, decompress using less memory, but slower (decompression
+	 * only)
+	 */
+	bzip_compressor->bz_small = 0;
+
+	/* Load the compressor */
+	if ((ret = connection->add_compressor(
+	    connection, name, (WT_COMPRESSOR *)bzip_compressor, NULL)) == 0)
+		return (0);
+
+	/*
+	 * The compressor was not registered, so nothing else will free the
+	 * structure: release it here instead of leaking it.
+	 */
+	free(bzip_compressor);
+	return (ret);
+}
+
+/*
+ * wiredtiger_extension_init --
+ *	WiredTiger bzip2 compression extension.
+ *
+ *	Registers the plain "bzip2" compressor and then the "bzip2-raw-test"
+ *	variant, stopping at (and returning) the first failure.
+ */
+int
+wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+	int ret;
+
+	(void)config;				/* Unused parameters */
+
+	ret = bzip2_add_compressor(connection, 0, "bzip2");
+	if (ret == 0)
+		ret = bzip2_add_compressor(connection, 1, "bzip2-raw-test");
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/ext/compressors/nop/Makefile.am b/src/third_party/wiredtiger/ext/compressors/nop/Makefile.am
new file mode 100644
index 00000000000..87dbf18cb22
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/compressors/nop/Makefile.am
@@ -0,0 +1,9 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+noinst_LTLIBRARIES = libwiredtiger_nop.la
+libwiredtiger_nop_la_SOURCES = nop_compress.c
+
+# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well
+# as installation, it will only build static libraries. As far as I can tell,
+# the "approved" libtool way to turn them back on is by adding -rpath.
+libwiredtiger_nop_la_LDFLAGS = -avoid-version -module -rpath /nowhere
diff --git a/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c b/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
new file mode 100644
index 00000000000..e536c8fefd8
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/compressors/nop/nop_compress.c
@@ -0,0 +1,187 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+#include <wiredtiger_ext.h>
+
+/*! [WT_COMPRESSOR initialization structure] */
+/*
+ * Local compressor structure.
+ *
+ * The WT_COMPRESSOR member comes first so a pointer to this structure can be
+ * cast to and from a WT_COMPRESSOR pointer; each callback in this file does so
+ * to reach nop_calls, which it increments once per invocation.
+ */
+typedef struct {
+ WT_COMPRESSOR compressor; /* Must come first */
+
+ WT_EXTENSION_API *wt_api; /* Extension API */
+
+ unsigned long nop_calls; /* Count of calls */
+
+} NOP_COMPRESSOR;
+/*! [WT_COMPRESSOR initialization structure] */
+
+/*! [WT_COMPRESSOR compress] */
+/*
+ * nop_compress --
+ *	A simple compression example that passes data through unchanged.
+ *
+ *	If the destination can hold the source, the bytes are copied verbatim,
+ *	*result_lenp is set to src_len and *compression_failed to 0; otherwise
+ *	only *compression_failed is set (to 1).  Always returns 0.
+ */
+static int
+nop_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+    uint8_t *src, size_t src_len,
+    uint8_t *dst, size_t dst_len,
+    size_t *result_lenp, int *compression_failed)
+{
+	NOP_COMPRESSOR *nop_compressor;
+
+	(void)session;				/* Unused parameters */
+
+	nop_compressor = (NOP_COMPRESSOR *)compressor;
+	nop_compressor->nop_calls++;		/* Call count */
+
+	if (dst_len >= src_len) {
+		memcpy(dst, src, src_len);
+		*result_lenp = src_len;
+		*compression_failed = 0;
+	} else
+		*compression_failed = 1;
+
+	return (0);
+}
+/*! [WT_COMPRESSOR compress] */
+
+/*! [WT_COMPRESSOR decompress] */
+/*
+ * nop_decompress --
+ * A simple decompression example that passes data through unchanged.
+ *
+ * Copies dst_len bytes from src to dst, stores dst_len through result_lenp
+ * and returns 0.
+ *
+ * NOTE(review): src_len is ignored, so this assumes src holds at least
+ * dst_len bytes -- confirm that callers guarantee it.
+ */
+static int
+nop_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp)
+{
+ NOP_COMPRESSOR *nop_compressor = (NOP_COMPRESSOR *)compressor;
+
+ (void)session; /* Unused parameters */
+ (void)src_len;
+
+ ++nop_compressor->nop_calls; /* Call count */
+
+ /*
+ * The destination length is the number of uncompressed bytes we're
+ * expected to return.
+ */
+ memcpy(dst, src, dst_len);
+ *result_lenp = dst_len;
+ return (0);
+}
+/*! [WT_COMPRESSOR decompress] */
+
+/*! [WT_COMPRESSOR presize] */
+/*
+ * nop_pre_size --
+ *	A simple pre-size example that returns the source length.
+ *
+ *	The source length is stored through result_lenp unchanged.  Always
+ *	returns 0.
+ */
+static int
+nop_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session,
+    uint8_t *src, size_t src_len,
+    size_t *result_lenp)
+{
+	NOP_COMPRESSOR *nop_compressor;
+
+	(void)session;				/* Unused parameters */
+	(void)src;
+
+	nop_compressor = (NOP_COMPRESSOR *)compressor;
+	nop_compressor->nop_calls++;		/* Call count */
+
+	*result_lenp = src_len;
+	return (0);
+}
+/*! [WT_COMPRESSOR presize] */
+
+/*! [WT_COMPRESSOR terminate] */
+/*
+ * nop_terminate --
+ * WiredTiger no-op compression termination.
+ *
+ * Bumps the call counter one last time, frees the compressor structure
+ * (allocated with calloc in wiredtiger_extension_init) and returns 0.
+ */
+static int
+nop_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session)
+{
+ NOP_COMPRESSOR *nop_compressor = (NOP_COMPRESSOR *)compressor;
+
+ (void)session; /* Unused parameters */
+
+ ++nop_compressor->nop_calls; /* Call count */
+
+ /* Free the allocated memory. */
+ free(compressor);
+
+ return (0);
+}
+/*! [WT_COMPRESSOR terminate] */
+
+/*! [WT_COMPRESSOR initialization function] */
+/*
+ * wiredtiger_extension_init --
+ *	A simple shared library compression example.
+ *
+ *	Allocates a NOP_COMPRESSOR, installs the nop callbacks and registers it
+ *	with the connection as "nop".  Returns 0 on success, errno if the
+ *	allocation fails, or add_compressor's error (in which case the
+ *	structure is freed again).
+ */
+int
+wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+	NOP_COMPRESSOR *nop_compressor;
+	int ret;
+
+	(void)config;				/* Unused parameters */
+
+	/*
+	 * Allocate a local compressor structure, with a WT_COMPRESSOR structure
+	 * as the first field, allowing us to treat references to either type of
+	 * structure as a reference to the other type.
+	 *
+	 * This could be simplified if only a single database is opened in the
+	 * application, we could use a static WT_COMPRESSOR structure, and a
+	 * static reference to the WT_EXTENSION_API methods, then we don't need
+	 * to allocate memory when the compressor is initialized or free it when
+	 * the compressor is terminated.  However, this approach is more general
+	 * purpose and supports multiple databases per application.
+	 */
+	if ((nop_compressor = calloc(1, sizeof(NOP_COMPRESSOR))) == NULL)
+		return (errno);
+
+	nop_compressor->compressor.compress = nop_compress;
+	nop_compressor->compressor.compress_raw = NULL;
+	nop_compressor->compressor.decompress = nop_decompress;
+	nop_compressor->compressor.pre_size = nop_pre_size;
+	nop_compressor->compressor.terminate = nop_terminate;
+
+	nop_compressor->wt_api = connection->get_extension_api(connection);
+
+	/* Load the compressor */
+	if ((ret = connection->add_compressor(
+	    connection, "nop", (WT_COMPRESSOR *)nop_compressor, NULL)) == 0)
+		return (0);
+
+	/*
+	 * The compressor was not registered, so nothing else will free the
+	 * structure: release it here instead of leaking it.
+	 */
+	free(nop_compressor);
+	return (ret);
+}
+/*! [WT_COMPRESSOR initialization function] */
diff --git a/src/third_party/wiredtiger/ext/compressors/snappy/Makefile.am b/src/third_party/wiredtiger/ext/compressors/snappy/Makefile.am
new file mode 100644
index 00000000000..78317234ba0
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/compressors/snappy/Makefile.am
@@ -0,0 +1,10 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+if HAVE_BUILTIN_EXTENSION_SNAPPY
+noinst_LTLIBRARIES = libwiredtiger_snappy.la
+else
+lib_LTLIBRARIES = libwiredtiger_snappy.la
+libwiredtiger_snappy_la_LDFLAGS = -avoid-version -module
+endif
+libwiredtiger_snappy_la_SOURCES = snappy_compress.c
+libwiredtiger_snappy_la_LIBADD = -lsnappy
diff --git a/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
new file mode 100644
index 00000000000..7ed759e6807
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/compressors/snappy/snappy_compress.c
@@ -0,0 +1,244 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <snappy-c.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+#include <wiredtiger_ext.h>
+
+/*
+ * We need to include the configuration file to detect whether this extension
+ * is being built into the WiredTiger library.
+ */
+#include "wiredtiger_config.h"
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+/*
+ * Local compressor structure.
+ *
+ * The WT_COMPRESSOR member comes first so a pointer to this structure can be
+ * cast to and from a WT_COMPRESSOR pointer; the callbacks in this file use
+ * that cast to reach wt_api for error reporting.
+ */
+typedef struct {
+ WT_COMPRESSOR compressor; /* Must come first */
+
+ WT_EXTENSION_API *wt_api; /* Extension API */
+} SNAPPY_COMPRESSOR;
+
+/*
+ * wt_snappy_error --
+ *	Output an error message, and return a standard error code.
+ *
+ *	call names the snappy function that failed and snret is the status it
+ *	returned.  The message is written with the extension API's err_printf;
+ *	the return value is always WT_ERROR.
+ */
+static int
+wt_snappy_error(WT_COMPRESSOR *compressor,
+    WT_SESSION *session, const char *call, snappy_status snret)
+{
+	WT_EXTENSION_API *wt_api;
+	const char *msg;
+
+	/* Translate the status into its symbolic name. */
+	if (snret == SNAPPY_BUFFER_TOO_SMALL)
+		msg = "SNAPPY_BUFFER_TOO_SMALL";
+	else if (snret == SNAPPY_INVALID_INPUT)
+		msg = "SNAPPY_INVALID_INPUT";
+	else
+		msg = "unknown error";
+
+	wt_api = ((SNAPPY_COMPRESSOR *)compressor)->wt_api;
+	(void)wt_api->err_printf(wt_api,
+	    session, "snappy error: %s: %s: %d", call, msg, snret);
+	return (WT_ERROR);
+}
+
+/*
+ * wt_snappy_compress --
+ *	WiredTiger snappy compression.
+ *
+ *	Compresses src_len bytes at src into dst.  The output is laid out as a
+ *	size_t holding the exact compressed byte count, followed by the snappy
+ *	data.  If the total is smaller than the source, *result_lenp receives
+ *	it and *compression_failed is set to 0; otherwise *compression_failed
+ *	is set to 1.  Returns 0 in both cases and WT_ERROR if snappy reports
+ *	an error.
+ */
+static int
+wt_snappy_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+    uint8_t *src, size_t src_len,
+    uint8_t *dst, size_t dst_len,
+    size_t *result_lenp, int *compression_failed)
+{
+	snappy_status snret;
+	size_t snaplen;
+	char *snapbuf;
+
+	/*
+	 * dst_len is expected to come from wt_snappy_pre_size, which reserves
+	 * room for the length prefix.  Don't rely on that blindly: if there
+	 * isn't even room for the prefix the subtraction below would wrap, so
+	 * report a compression failure instead.
+	 */
+	if (dst_len < sizeof(size_t)) {
+		*compression_failed = 1;
+		return (0);
+	}
+
+	/*
+	 * Skip past the space we'll use to store the final count of
+	 * compressed bytes.
+	 */
+	snaplen = dst_len - sizeof(size_t);
+	snapbuf = (char *)dst + sizeof(size_t);
+
+	/* snaplen is an input and an output arg. */
+	snret = snappy_compress((char *)src, src_len, snapbuf, &snaplen);
+	if (snret != SNAPPY_OK)
+		return (wt_snappy_error(
+		    compressor, session, "snappy_compress", snret));
+
+	/*
+	 * On decompression, snappy requires the exact compressed byte
+	 * count (the current value of snaplen).  WiredTiger does not
+	 * preserve that value, so save snaplen at the beginning of the
+	 * destination buffer.
+	 */
+	if (snaplen + sizeof(size_t) < src_len) {
+		/* memcpy: dst isn't known to be aligned for a size_t store. */
+		memcpy(dst, &snaplen, sizeof(snaplen));
+		*result_lenp = snaplen + sizeof(size_t);
+		*compression_failed = 0;
+	} else
+		/* The compressor failed to produce a smaller result. */
+		*compression_failed = 1;
+	return (0);
+}
+
+/*
+ * wt_snappy_decompress --
+ *	WiredTiger snappy decompression.
+ *
+ *	src starts with the size_t compressed byte count written by
+ *	wt_snappy_compress, followed by the snappy data.  The data is
+ *	uncompressed into the dst_len-byte buffer at dst and the number of
+ *	bytes produced is stored through result_lenp.  Returns 0 on success
+ *	and WT_ERROR if the stored length is inconsistent with src_len or
+ *	snappy reports an error.
+ */
+static int
+wt_snappy_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+    uint8_t *src, size_t src_len,
+    uint8_t *dst, size_t dst_len,
+    size_t *result_lenp)
+{
+	WT_EXTENSION_API *wt_api;
+	snappy_status snret;
+	size_t snaplen;
+
+	wt_api = ((SNAPPY_COMPRESSOR *)compressor)->wt_api;
+
+	/*
+	 * Retrieve the saved length, but only if the buffer is long enough to
+	 * contain it.  memcpy is used because src isn't known to be aligned
+	 * for a size_t load.
+	 */
+	snaplen = 0;
+	if (src_len >= sizeof(size_t))
+		memcpy(&snaplen, src, sizeof(snaplen));
+
+	/*
+	 * Compare against the space remaining after the prefix rather than
+	 * adding to snaplen: a corrupt, very large stored value would wrap the
+	 * addition and slip past the check.
+	 */
+	if (src_len < sizeof(size_t) || snaplen > src_len - sizeof(size_t)) {
+		(void)wt_api->err_printf(wt_api,
+		    session,
+		    "wt_snappy_decompress: stored size exceeds buffer size");
+		return (WT_ERROR);
+	}
+
+	/* dst_len is an input and an output arg. */
+	snret = snappy_uncompress(
+	    (char *)src + sizeof(size_t), snaplen, (char *)dst, &dst_len);
+
+	if (snret == SNAPPY_OK) {
+		*result_lenp = dst_len;
+		return (0);
+	}
+
+	return (
+	    wt_snappy_error(compressor, session, "snappy_decompress", snret));
+}
+
+/*
+ * wt_snappy_pre_size --
+ *	WiredTiger snappy destination buffer sizing.
+ *
+ *	Stores through result_lenp the destination size wt_snappy_compress
+ *	needs for a source of src_len bytes.  Always returns 0.
+ */
+static int
+wt_snappy_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session,
+    uint8_t *src, size_t src_len,
+    size_t *result_lenp)
+{
+	size_t worst_case;
+
+	(void)compressor;			/* Unused parameters */
+	(void)session;
+	(void)src;
+
+	/*
+	 * Snappy wants a destination somewhat larger than the source; asking
+	 * it for that bound is cheap and lets wt_snappy_compress compress
+	 * straight into the buffer.
+	 */
+	worst_case = snappy_max_compressed_length(src_len);
+
+	/* Extra room up front for the exact compressed size. */
+	*result_lenp = worst_case + sizeof(size_t);
+	return (0);
+}
+
+/*
+ * wt_snappy_terminate --
+ * WiredTiger snappy compression termination.
+ *
+ * Frees the compressor structure, which snappy_extension_init allocated with
+ * calloc, and always returns 0.
+ */
+static int
+wt_snappy_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session)
+{
+ (void)session; /* Unused parameters */
+
+ free(compressor);
+ return (0);
+}
+
+int snappy_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
+
+/*
+ * snappy_extension_init --
+ *	WiredTiger snappy compression extension - called directly when
+ *	Snappy support is built in, or via wiredtiger_extension_init when
+ *	snappy support is included via extension loading.
+ *
+ *	Allocates a SNAPPY_COMPRESSOR, installs the snappy callbacks and
+ *	registers it with the connection as "snappy".  Returns 0 on success,
+ *	errno if the allocation fails, or add_compressor's error (in which
+ *	case the structure is freed again).
+ */
+int
+snappy_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+	SNAPPY_COMPRESSOR *snappy_compressor;
+	int ret;
+
+	(void)config;				/* Unused parameters */
+
+	if ((snappy_compressor = calloc(1, sizeof(SNAPPY_COMPRESSOR))) == NULL)
+		return (errno);
+
+	snappy_compressor->compressor.compress = wt_snappy_compress;
+	snappy_compressor->compressor.compress_raw = NULL;
+	snappy_compressor->compressor.decompress = wt_snappy_decompress;
+	snappy_compressor->compressor.pre_size = wt_snappy_pre_size;
+	snappy_compressor->compressor.terminate = wt_snappy_terminate;
+
+	snappy_compressor->wt_api = connection->get_extension_api(connection);
+
+	/* Load the compressor */
+	if ((ret = connection->add_compressor(connection,
+	    "snappy", (WT_COMPRESSOR *)snappy_compressor, NULL)) == 0)
+		return (0);
+
+	/*
+	 * The compressor was not registered, so nothing else will free the
+	 * structure: release it here instead of leaking it.
+	 */
+	free(snappy_compressor);
+	return (ret);
+}
+
+/*
+ * We have to remove this symbol when building as a builtin extension otherwise
+ * it will conflict with other builtin libraries.
+ */
+#ifndef HAVE_BUILTIN_EXTENSION_SNAPPY
+/*
+ * wiredtiger_extension_init --
+ *	WiredTiger snappy compression extension: the loadable-extension entry
+ *	point, which simply forwards to snappy_extension_init.
+ */
+int
+wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+	int ret;
+
+	ret = snappy_extension_init(connection, config);
+	return (ret);
+}
+#endif
diff --git a/src/third_party/wiredtiger/ext/compressors/zlib/Makefile.am b/src/third_party/wiredtiger/ext/compressors/zlib/Makefile.am
new file mode 100644
index 00000000000..fb0ec306562
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/compressors/zlib/Makefile.am
@@ -0,0 +1,10 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+if HAVE_BUILTIN_EXTENSION_ZLIB
+noinst_LTLIBRARIES = libwiredtiger_zlib.la
+else
+lib_LTLIBRARIES = libwiredtiger_zlib.la
+libwiredtiger_zlib_la_LDFLAGS = -avoid-version -module
+endif
+libwiredtiger_zlib_la_SOURCES = zlib_compress.c
+libwiredtiger_zlib_la_LIBADD = -lz
diff --git a/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c b/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c
new file mode 100644
index 00000000000..8dd619d695c
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c
@@ -0,0 +1,426 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <zlib.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+#include <wiredtiger_ext.h>
+
+/*
+ * We need to include the configuration file to detect whether this extension
+ * is being built into the WiredTiger library.
+ */
+#include "wiredtiger_config.h"
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+/* Local compressor structure, cast to and from WT_COMPRESSOR pointers. */
+typedef struct {
+ WT_COMPRESSOR compressor; /* Must come first (see the casts) */
+
+ WT_EXTENSION_API *wt_api; /* Extension API */
+
+ int zlib_level; /* Level handed to deflateInit */
+} ZLIB_COMPRESSOR;
+
+/*
+ * zlib gives us a cookie to pass to the underlying allocation functions; we
+ * need two handles, package them up.
+ */
+typedef struct {
+ WT_COMPRESSOR *compressor; /* Source of the extension API */
+ WT_SESSION *session; /* Passed to scr_alloc/scr_free */
+} ZLIB_OPAQUE;
+
+/*
+ * zlib_error --
+ * Report a failed zlib call via the extension API, then return WT_ERROR.
+ */
+static int
+zlib_error(
+ WT_COMPRESSOR *compressor, WT_SESSION *session, const char *call, int zret)
+{
+ WT_EXTENSION_API *api = ((ZLIB_COMPRESSOR *)compressor)->wt_api;
+
+ /* The printf result is ignored: an error is being returned regardless. */
+ (void)api->err_printf(
+ api, session, "zlib error: %s: %s: %d", call, zError(zret), zret);
+
+ return (WT_ERROR);
+}
+
+/*
+ * zalloc --
+ * Allocate a number * size byte scratch buffer on zlib's behalf.
+ */
+static void *
+zalloc(void *cookie, uint32_t number, uint32_t size)
+{
+ ZLIB_OPAQUE *opaque;
+ WT_EXTENSION_API *wt_api;
+
+ opaque = cookie;
+ wt_api = ((ZLIB_COMPRESSOR *)opaque->compressor)->wt_api;
+ return (wt_api->scr_alloc( /* Multiply as size_t, not as uint32_t. */
+ wt_api, opaque->session, (size_t)number * (size_t)size));
+}
+
+/*
+ * zfree --
+ * Hand a scratch buffer back through the extension API.
+ */
+static void
+zfree(void *cookie, void *p)
+{
+ ZLIB_OPAQUE *handles = cookie;
+ WT_EXTENSION_API *api;
+
+ /* The API handle hangs off the compressor, not the cookie itself. */
+ api = ((ZLIB_COMPRESSOR *)handles->compressor)->wt_api;
+ api->scr_free(api, handles->session, p);
+}
+
+/*
+ * zlib_compress --
+ * One-shot deflate of src into dst; a stream that doesn't finish is
+ * reported through compression_failed, not as an error return.
+ */
+static int
+zlib_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp, int *compression_failed)
+{
+ ZLIB_OPAQUE handles;
+ z_stream strm;
+ int level, zret;
+
+ /* zlib allocates through zalloc/zfree, which need both handles. */
+ handles.compressor = compressor;
+ handles.session = session;
+ memset(&strm, 0, sizeof(strm));
+ strm.zalloc = zalloc;
+ strm.zfree = zfree;
+ strm.opaque = &handles;
+
+ level = ((ZLIB_COMPRESSOR *)compressor)->zlib_level;
+ if ((zret = deflateInit(&strm, level)) != Z_OK)
+ return (zlib_error(compressor, session, "deflateInit", zret));
+
+ strm.next_in = src;
+ strm.avail_in = (uint32_t)src_len;
+ strm.next_out = dst;
+ strm.avail_out = (uint32_t)dst_len;
+
+ /* Only a completed stream counts; the length is set on success only. */
+ *compression_failed = deflate(&strm, Z_FINISH) != Z_STREAM_END;
+ if (!*compression_failed)
+ *result_lenp = strm.total_out;
+
+ if ((zret = deflateEnd(&strm)) != Z_OK && zret != Z_DATA_ERROR)
+ return (zlib_error(compressor, session, "deflateEnd", zret));
+
+ return (0);
+}
+
+/*
+ * zlib_find_slot --
+ * Find the slot containing the target offset (binary search).
+ */
+static inline uint32_t
+zlib_find_slot(uint32_t target, uint32_t *offsets, uint32_t slots)
+{
+ uint32_t base, indx, limit;
+
+ indx = 1; /* Answer when target <= offsets[1] */
+
+ /* Figure out which slot we got to: binary search over offsets[2..]. */
+ if (target >= offsets[slots])
+ indx = slots; /* At or past the last offset */
+ else if (target > offsets[1])
+ for (base = 2, limit = slots - base; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ if (target < offsets[indx])
+ continue; /* Keep to the lower half */
+ base = indx + 1;
+ --limit;
+ }
+
+ return (indx);
+}
+
+/*
+ * zlib_compress_raw --
+ * Compress leading slots to fit page_max, reporting how many were taken.
+ */
+static int
+zlib_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ size_t page_max, int split_pct, size_t extra,
+ uint8_t *src, uint32_t *offsets, uint32_t slots,
+ uint8_t *dst, size_t dst_len, int final,
+ size_t *result_lenp, uint32_t *result_slotsp)
+{
+ ZLIB_COMPRESSOR *zlib_compressor;
+ ZLIB_OPAQUE opaque;
+ z_stream last_zs, zs;
+ uint32_t curr_slot, last_slot;
+ int ret;
+
+ curr_slot = last_slot = 0;
+ (void)split_pct; /* Unused parameters */
+ (void)dst_len;
+ (void)final; /* Only read by the disabled debug output */
+
+ zlib_compressor = (ZLIB_COMPRESSOR *)compressor;
+
+ memset(&zs, 0, sizeof(zs));
+ zs.zalloc = zalloc;
+ zs.zfree = zfree;
+ opaque.compressor = compressor;
+ opaque.session = session;
+ zs.opaque = &opaque;
+
+ if ((ret = deflateInit(&zs,
+ zlib_compressor->zlib_level)) != Z_OK)
+ return (zlib_error(compressor, session, "deflateInit", ret));
+
+ zs.next_in = src;
+ zs.next_out = dst;
+ /*
+ * Experimentally derived, reserve this many bytes for zlib to finish
+ * up a buffer. If this isn't sufficient, we don't fail but we will be
+ * inefficient.
+ */
+#define WT_ZLIB_RESERVED 24
+ zs.avail_out = (uint32_t)(page_max - extra - WT_ZLIB_RESERVED);
+ last_zs = zs;
+
+ /*
+ * Strategy: take the available output size and compress that much
+ * input. Continue until there is no input small enough or the
+ * compression fails to fit.
+ *
+ * Don't let the compression ratio become insanely good (which can
+ * happen with synthetic workloads). Once we hit a limit, stop so that
+ * the in-memory size of pages isn't totally different to the on-disk
+ * size. Otherwise we can get into trouble where every update to a
+ * page results in forced eviction based on in-memory size, even though
+ * the data fits into a single on-disk block.
+ */
+ while (zs.avail_out > 0 && zs.total_in <= zs.total_out * 20) {
+ /* Find the slot we will try to compress up to. */
+ if ((curr_slot = zlib_find_slot(
+ zs.total_in + zs.avail_out, offsets, slots)) <= last_slot)
+ break;
+
+ zs.avail_in = offsets[curr_slot] - offsets[last_slot];
+ /* Save the stream state in case the chosen data doesn't fit. */
+ last_zs = zs;
+
+ while (zs.avail_in > 0 && zs.avail_out > 0)
+ if ((ret = deflate(&zs, Z_SYNC_FLUSH)) != Z_OK)
+ return (zlib_error(
+ compressor, session, "deflate", ret));
+
+ /* Roll back if the last deflate didn't complete. */
+ if (zs.avail_in > 0) {
+ zs = last_zs;
+ break;
+ } else
+ last_slot = curr_slot;
+ }
+
+ zs.avail_out += WT_ZLIB_RESERVED; /* Hand back the reserved bytes */
+ ret = deflate(&zs, Z_FINISH);
+
+ /*
+ * If the end marker didn't fit, report that we got no work done. WT
+ * will compress the (possibly large) page image using ordinary
+ * compression instead.
+ */
+ if (ret == Z_OK || ret == Z_BUF_ERROR)
+ last_slot = 0;
+ else if (ret != Z_STREAM_END)
+ return (
+ zlib_error(compressor, session, "deflate end block", ret));
+
+ if ((ret = deflateEnd(&zs)) != Z_OK && ret != Z_DATA_ERROR)
+ return (zlib_error(compressor, session, "deflateEnd", ret));
+
+ if (last_slot > 0) {
+ *result_slotsp = last_slot;
+ *result_lenp = zs.total_out;
+ } else {
+ /* We didn't manage to compress anything: don't retry. */
+ *result_slotsp = 0;
+ *result_lenp = 1;
+ }
+
+#if 0
+ fprintf(stderr,
+ "zlib_compress_raw (%s): page_max %" PRIuMAX ", slots %" PRIu32
+ ", take %" PRIu32 ": %" PRIu32 " -> %" PRIuMAX "\n",
+ final ? "final" : "not final", (uintmax_t)page_max,
+ slots, last_slot, offsets[last_slot], (uintmax_t)*result_lenp);
+#endif
+ return (0);
+}
+
+/*
+ * zlib_decompress --
+ * Inflate src into dst, storing the decompressed length on success.
+ */
+static int
+zlib_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp)
+{
+ ZLIB_OPAQUE handles;
+ z_stream strm;
+ int end_ret, zret;
+
+ handles.compressor = compressor;
+ handles.session = session;
+ memset(&strm, 0, sizeof(strm));
+ strm.zalloc = zalloc;
+ strm.zfree = zfree;
+ strm.opaque = &handles;
+ if ((zret = inflateInit(&strm)) != Z_OK)
+ return (zlib_error(compressor, session, "inflateInit", zret));
+
+ strm.next_in = src;
+ strm.avail_in = (uint32_t)src_len;
+ strm.next_out = dst;
+ strm.avail_out = (uint32_t)dst_len;
+ do {
+ zret = inflate(&strm, Z_FINISH);
+ } while (zret == Z_OK);
+ if (zret == Z_STREAM_END)
+ *result_lenp = strm.total_out;
+
+ /* An inflate failure takes precedence over an inflateEnd failure. */
+ end_ret = inflateEnd(&strm);
+ if (zret == Z_STREAM_END)
+ zret = end_ret;
+ if (zret != Z_OK)
+ return (zlib_error(compressor, session, "inflate", zret));
+ return (0);
+}
+
+/*
+ * zlib_terminate --
+ * WiredTiger zlib compression termination: free the compressor structure.
+ */
+static int
+zlib_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session)
+{
+ (void)session; /* Unused parameters */
+
+ free(compressor);
+ return (0);
+}
+
+/*
+ * zlib_add_compressor --
+ * Allocate and register one zlib compressor, with or without raw support.
+ */
+static int
+zlib_add_compressor(WT_CONNECTION *connection, int raw, const char *name)
+{
+ ZLIB_COMPRESSOR *zlib_compressor;
+ int ret;
+
+ /*
+ * There are two almost identical zlib compressors: one supporting raw
+ * compression, and one without.
+ */
+ if ((zlib_compressor = calloc(1, sizeof(ZLIB_COMPRESSOR))) == NULL)
+ return (errno);
+
+ zlib_compressor->compressor.compress = zlib_compress;
+ zlib_compressor->compressor.compress_raw = raw ?
+ zlib_compress_raw : NULL;
+ zlib_compressor->compressor.decompress = zlib_decompress;
+ zlib_compressor->compressor.pre_size = NULL;
+ zlib_compressor->compressor.terminate = zlib_terminate;
+ zlib_compressor->wt_api = connection->get_extension_api(connection);
+
+ /* Level: 0-9 or Z_DEFAULT_COMPRESSION, see the zlib manual. */
+ zlib_compressor->zlib_level = Z_DEFAULT_COMPRESSION;
+
+ /* Load the compressor; if that fails it is still ours to free. */
+ if ((ret = connection->add_compressor(
+ connection, name, &zlib_compressor->compressor, NULL)) != 0)
+ free(zlib_compressor);
+ return (ret);
+}
+
+int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
+
+/*
+ * zlib_extension_init --
+ * Register the "zlib" (raw-capable) and "zlib-noraw" compressors. Called
+ * directly when zlib support is built in, or from wiredtiger_extension_init
+ * when zlib support is included via extension loading.
+ */
+int
+zlib_extension_init(
+ WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+ int ret;
+
+ (void)config; /* Unused parameters */
+
+ /* Stop at the first failure: "zlib-noraw" is only added after "zlib". */
+ ret = zlib_add_compressor(connection, 1, "zlib");
+ if (ret == 0)
+ ret = zlib_add_compressor(connection, 0, "zlib-noraw");
+ return (ret);
+}
+
+/*
+ * We have to remove this symbol when zlib is built as a builtin extension,
+ * otherwise it will conflict with other builtin libraries.
+ */
+#ifndef HAVE_BUILTIN_EXTENSION_ZLIB
+/*
+ * wiredtiger_extension_init --
+ * Extension-loading entry point; forwards to zlib_extension_init.
+ */
+int
+wiredtiger_extension_init(
+ WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+ return (zlib_extension_init(connection, config));
+}
+#endif
diff --git a/src/third_party/wiredtiger/ext/datasources/helium/Makefile.am b/src/third_party/wiredtiger/ext/datasources/helium/Makefile.am
new file mode 100644
index 00000000000..b4e6e67e2cd
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/datasources/helium/Makefile.am
@@ -0,0 +1,11 @@
+AM_CPPFLAGS = -I$(top_builddir) \
+ -I$(top_srcdir)/src/include -I$(HELIUM_PATH)
+
+noinst_LTLIBRARIES = libwiredtiger_helium.la
+libwiredtiger_helium_la_SOURCES = helium.c
+libwiredtiger_helium_la_LIBADD = -L$(HELIUM_PATH) -lhe
+
+# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well
+# as installation, it will only build static libraries. As far as I can tell,
+# the "approved" libtool way to turn them back on is by adding -rpath.
+libwiredtiger_helium_la_LDFLAGS = -avoid-version -module -rpath /nowhere
diff --git a/src/third_party/wiredtiger/ext/datasources/helium/README b/src/third_party/wiredtiger/ext/datasources/helium/README
new file mode 100644
index 00000000000..e78ba58c71d
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/datasources/helium/README
@@ -0,0 +1,125 @@
+Helium README.
+
+The data structures are "Helium sources" which map to one or more physical
+volumes; each Helium source supports any number of "WiredTiger sources",
+where a WiredTiger source is an object similar to a Btree "file:" object.
+Each WiredTiger source supports any number of WiredTiger cursors.
+
+Each Helium source is given a logical name when first referenced, and that
+logical name is subsequently used when a WiredTiger source is created. For
+example, the logical name for a Helium source might be "dev1", and it would
+map to the Helium volumes /dev/sd0 and /dev/sd1; subsequent WT_SESSION.create
+calls specify a URI like "table:dev1/my_table".
+
+For each WiredTiger source, we create two namespaces on the underlying device,
+a "cache" and a "primary".
+
+The cache contains key/value pairs based on updates or changes that have been
+made, and includes transactional information. So, for example, if transaction
+3 modifies key/value pair "foo/aaa", and then transaction 4 removes key "foo",
+then transaction 5 inserts key/value pair "foo/bbb", the entry in the cache
+will look something like:
+
+ Key: foo
+ Value: [transaction ID 3] [aaa]
+ [transaction ID 4] [remove]
+ [transaction ID 5] [bbb]
+
+Obviously, we have to marshall/unmarshall these values to/from the cache.
+
+In contrast, the primary contains only key/value pairs known to be committed
+and visible to any reader.
+
+When an insert, update or remove is done:
+ acquire a lock
+ read any matching key from the cache
+ check to see if the update can proceed
+ append a new value for this transaction
+ release the lock
+
+When a search is done:
+ if there's a matching key/value pair in the cache {
+ if there's an item visible to the reading transaction
+ return it
+ }
+ if there's a matching key/value pair in the primary {
+ return it
+ }
+
+When a next/prev is done:
+ move to the next/prev visible item in the cache
+ move to the next/prev visible item in the primary
+ return the one closest to the starting position
+
+Locks are not acquired for read operations, and no flushes are done for any of
+these operations.
+
+We also create one additional object, the transaction name space, which serves
+all of the WiredTiger and Helium objects in a WiredTiger connection. Whenever
+a transaction involving a Helium source commits, we insert a commit record into
+the transaction name space and flush the device. When a transaction rolls back,
+we insert an abort record into the txn name space, but don't flush the device.
+
+The visibility check is slightly different than the rest of WiredTiger: we do
+not reset anything when a transaction aborts, and so we have to check if the
+transaction has been aborted as well as check the transaction ID for visibility.
+
+We create a "cleanup" thread for every underlying Helium source. The job of
+this thread is to migrate rows from the cache object into the primary. Any
+committed, globally visible change in the cache can be copied into the primary
+and removed from the cache:
+
+ set BaseTxnID to the oldest transaction ID
+ not yet visible to a running transaction
+
+ for each row in the cache:
+ if all of the updates are less than BaseTxnID
+ copy the last update to the primary
+
+ flush the primary to stable storage
+
+ lock the cache
+ for each row in the cache:
+ if all of the updates are less than BaseTxnID
+ remove the row from the cache
+ unlock the cache
+
+ for each row in the transaction store:
+ if the transaction ID is less than BaseTxnID
+ remove the row
+
+We only need to lock the cache when removing rows, the initial copy to the
+primary does not require locks because only the cleanup thread ever writes
+to the primary.
+
+No lock is required when removing rows from the transaction store, once the
+transaction ID is less than the BaseTxnID, it will never be read.
+
+Helium recovery is almost identical to the cleanup thread, which migrates rows
+from the cache into the primary. For every cache/primary pair, migrate every
+commit to the primary (by definition, at recovery time it must be globally
+visible), and discard everything else (by definition, at recovery time anything
+not committed has been aborted).
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+Questions, problems, whatever:
+
+* The implementation is endian-specific, that is, the WiredTiger metadata
+stored on the Helium device is not portable to a big-endian machine.
+Helium's metadata is portable between different endian machines, so this
+should probably be fixed.
+
+* There's a problem with transactions in WiredTiger that span more than a
+single data source. For example, consider a transaction that modifies
+both a Helium object and a Btree object. If we commit and push the Helium
+commit record to stable storage, and then crash before committing the Btree
+change, the enclosing WiredTiger transaction will/should end up aborting,
+and there's no way for us to back out the change in Helium. I'm leaving
+this problem alone until WiredTiger fine-grained durability is complete,
+we're going to need WiredTiger support for some kind of 2PC to solve this.
+
+* If a record in the cache gets too busy, we could end up unable to remove
+it (there would always be an active transaction), and it would grow forever.
+I suspect the solution is to clean it up when we realize we can't remove it,
+that is, we can rebuild the record, discarding the no longer needed entries,
+even if the record can't be entirely discarded.
diff --git a/src/third_party/wiredtiger/ext/datasources/helium/helium.c b/src/third_party/wiredtiger/ext/datasources/helium/helium.c
new file mode 100644
index 00000000000..f5be26e9119
--- /dev/null
+++ b/src/third_party/wiredtiger/ext/datasources/helium/helium.c
@@ -0,0 +1,3449 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <sys/select.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <he.h>
+
+#include <wiredtiger.h>
+#include <wiredtiger_ext.h>
+
+typedef struct he_env HE_ENV;
+typedef struct he_item HE_ITEM;
+typedef struct he_stats HE_STATS;
+
+static int verbose = 0; /* Verbose messages */
+
+/*
+ * Macros to output error and verbose messages, and set or return an error.
+ * Error macros require local "ret" variable.
+ *
+ * ESET: update an error value, handling more/less important errors.
+ * ERET: output a message, return the error.
+ * EMSG: output a message, set the local error value.
+ * EMSG_ERR:
+ * output a message, set the local error value, jump to the err label.
+ * VMSG: verbose message.
+ */
+#undef ESET
+#define ESET(a) do { \
+ int __v; \
+ if ((__v = (a)) != 0) { \
+ /* \
+ * On error, check for a panic (it overrides all other \
+ * returns). Else, if there's no return value or the \
+ * return value is not strictly an error, override it \
+ * with the error. \
+ */ \
+ if (__v == WT_PANIC || \
+ ret == 0 || \
+ ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND) \
+ ret = __v; \
+ /* \
+ * Helium errors are negative but above -31800, where \
+ * WiredTiger's begin: map them to a generic error. \
+ */ \
+ if (ret < 0 && ret > -31800) \
+ ret = WT_ERROR; \
+ } \
+} while (0)
+#undef ERET
+#define ERET(wtext, session, v, ...) do { \
+ (void) \
+ wtext->err_printf(wtext, session, "helium: " __VA_ARGS__); \
+ ESET(v); \
+ return (ret); \
+} while (0)
+#undef EMSG
+#define EMSG(wtext, session, v, ...) do { \
+ (void) \
+ wtext->err_printf(wtext, session, "helium: " __VA_ARGS__); \
+ ESET(v); \
+} while (0)
+#undef EMSG_ERR
+#define EMSG_ERR(wtext, session, v, ...) do { \
+ (void) \
+ wtext->err_printf(wtext, session, "helium: " __VA_ARGS__); \
+ ESET(v); \
+ goto err; \
+} while (0)
+#undef VERBOSE_L1
+#define VERBOSE_L1 1
+#undef VERBOSE_L2
+#define VERBOSE_L2 2
+#undef VMSG
+#define VMSG(wtext, session, v, ...) do { \
+ if (verbose >= v) \
+ (void)wtext-> \
+ msg_printf(wtext, session, "helium: " __VA_ARGS__); \
+} while (0)
+
+/*
+ * OVERWRITE_AND_FREE --
+ * Make sure we don't re-use a structure after it's dead.
+ */
+#undef OVERWRITE_AND_FREE
+#define OVERWRITE_AND_FREE(p) do { \
+ memset(p, 0xab, sizeof(*(p))); \
+ free(p); \
+} while (0)
+
+/*
+ * Version each object, out of sheer raging paranoia.
+ */
+#define WIREDTIGER_HELIUM_MAJOR 1 /* Major, minor version */
+#define WIREDTIGER_HELIUM_MINOR 0
+
+/*
+ * WiredTiger name space on the Helium store: all objects are named with the
+ * WiredTiger prefix (we don't require the Helium store be exclusive to our
+ * files). Primary objects are named "WiredTiger.[name]", associated cache
+ * objects are "WiredTiger.[name].cache". The per-connection transaction
+ * object is "WiredTiger.WiredTigerTxn". When we first open a Helium volume,
+ * we open/close a file in order to apply flags for the first open of the
+ * volume, that's "WiredTiger.WiredTigerInit".
+ */
+#define WT_NAME_PREFIX "WiredTiger."
+#define WT_NAME_INIT "WiredTiger.WiredTigerInit"
+#define WT_NAME_TXN "WiredTiger.WiredTigerTxn"
+#define WT_NAME_CACHE ".cache"
+
+/*
+ * WT_SOURCE --
+ * A WiredTiger source, supporting one or more cursors.
+ */
+typedef struct __wt_source {
+ char *uri; /* Unique name */
+
+ pthread_rwlock_t lock; /* Lock */
+ int lockinit; /* Lock created */
+
+ int configured; /* If structure configured */
+ u_int ref; /* Active reference count */
+
+ uint64_t append_recno; /* Allocation record number */
+
+ int config_bitfield; /* config "value_format=#t" */
+ int config_compress; /* config "helium_o_compress" */
+ int config_recno; /* config "key_format=r" */
+
+ /*
+ * Each WiredTiger object has a "primary" namespace in a Helium store
+ * plus a "cache" namespace, which has not-yet-resolved updates. There
+ * is a dirty flag so read-only data sets can ignore the cache.
+ */
+ he_t he; /* Underlying Helium object */
+ he_t he_cache; /* Underlying Helium cache */
+ int he_cache_inuse;
+
+ struct __he_source *hs; /* Underlying Helium source */
+ struct __wt_source *next; /* List of WiredTiger objects */
+} WT_SOURCE;
+
+/*
+ * HELIUM_SOURCE --
+ * A Helium volume, supporting one or more WT_SOURCE objects.
+ */
+typedef struct __he_source {
+ /*
+ * XXX
+ * The transaction commit handler must appear first in the structure.
+ */
+ WT_TXN_NOTIFY txn_notify; /* Transaction commit handler */
+
+ WT_EXTENSION_API *wtext; /* Extension functions */
+
+ char *name; /* Unique WiredTiger name */
+ char *device; /* Unique Helium volume name */
+
+ /*
+ * Maintain a handle for each underlying Helium source so checkpoint is
+ * faster, we can "commit" a single handle per source, regardless of the
+ * number of objects.
+ */
+ he_t he_volume;
+
+ struct __wt_source *ws_head; /* List of WiredTiger sources */
+
+ /*
+ * Each Helium source has a cleaner thread to migrate WiredTiger source
+ * updates from the cache namespace to the primary namespace, based on
+ * the number of bytes or the number of operations. (There's a cleaner
+ * thread per Helium store so migration operations can overlap.) We
+ * read these fields without a lock, but serialize writes to minimize
+ * races (and because it costs us nothing).
+ */
+ pthread_t cleaner_id; /* Cleaner thread ID */
+ volatile int cleaner_stop; /* Cleaner thread quit flag */
+
+ /*
+ * Each WiredTiger connection has a transaction namespace which lists
+ * resolved transactions with their committed or aborted state as a
+ * value. That namespace appears in a single Helium store (the first
+ * one created, if it doesn't already exist), and then it's referenced
+ * from other Helium stores.
+ */
+#define TXN_ABORTED 'A'
+#define TXN_COMMITTED 'C'
+#define TXN_UNRESOLVED 0
+ he_t he_txn; /* Helium txn store */
+ int he_owner; /* Owns transaction store */
+
+ struct __he_source *next; /* List of Helium sources */
+} HELIUM_SOURCE;
+
+/*
+ * DATA_SOURCE --
+ * A WiredTiger data source, supporting one or more HELIUM_SOURCE objects.
+ */
+typedef struct __data_source {
+ WT_DATA_SOURCE wtds; /* Must come first */
+
+ WT_EXTENSION_API *wtext; /* Extension functions */
+
+ pthread_rwlock_t global_lock; /* Global lock */
+ int lockinit; /* Lock created */
+
+ struct __he_source *hs_head; /* List of Helium sources */
+} DATA_SOURCE;
+
+/*
+ * CACHE_RECORD --
+ * An array of updates from the cache object.
+ *
+ * Values in the cache store are marshalled/unmarshalled to/from the store,
+ * using a simple encoding:
+ * {N records: 4B}
+ * {record#1 TxnID: 8B}
+ * {record#1 remove tombstone: 1B}
+ * {record#1 data length: 4B}
+ * {record#1 data}
+ * ...
+ *
+ * Each cursor potentially has a single set of these values.
+ */
+typedef struct __cache_record {
+ uint8_t *v; /* Value */
+ uint32_t len; /* Value length */
+ uint64_t txnid; /* Transaction ID */
+#define REMOVE_TOMBSTONE 'R'
+ int remove; /* 1/0 remove flag */
+} CACHE_RECORD;
+
+/*
+ * CURSOR --
+ * A cursor, supporting a single WiredTiger cursor.
+ */
+typedef struct __cursor {
+ WT_CURSOR wtcursor; /* Must come first */
+
+ WT_EXTENSION_API *wtext; /* Extension functions */
+
+ WT_SOURCE *ws; /* Underlying source */
+
+ HE_ITEM record; /* Record */
+ uint8_t __key[HE_MAX_KEY_LEN]; /* Record.key, Record.value */
+ uint8_t *v; /* Value buffer, installed as record.val */
+ size_t len; /* Bytes of value held in v */
+ size_t mem_len; /* Allocated size of v */
+
+ struct {
+ uint8_t *v; /* Temporary buffers */
+ size_t len;
+ size_t mem_len;
+ } t1, t2, t3;
+
+ int config_append; /* config "append" */
+ int config_overwrite; /* config "overwrite" */
+
+ CACHE_RECORD *cache; /* unmarshalled cache records */
+ uint32_t cache_entries; /* cache records */
+ uint32_t cache_slots; /* cache total record slots */
+} CURSOR;
+
+/*
+ * prefix_match --
+ * Return non-zero if str begins with pfx.
+ */
+static inline int
+prefix_match(const char *str, const char *pfx)
+{
+ return (strncmp(pfx, str, strlen(pfx)) == 0);
+}
+
+/*
+ * string_match --
+ * Return non-zero if str is exactly the first len bytes of bytes.
+ */
+static inline int
+string_match(const char *str, const char *bytes, size_t len)
+{
+ return (strncmp(str, bytes, len) != 0 ? 0 : str[len] == '\0');
+}
+
+/*
+ * cursor_destroy --
+ * Free a cursor's buffers and the cursor itself; NULL is a no-op.
+ */
+static void
+cursor_destroy(CURSOR *cursor)
+{
+ if (cursor == NULL)
+ return;
+ free(cursor->cache);
+ free(cursor->t3.v);
+ free(cursor->t2.v);
+ free(cursor->t1.v);
+ free(cursor->v);
+ OVERWRITE_AND_FREE(cursor);
+}
+
+/*
+ * os_errno --
+ * Return errno; one wrapper keeps our errno use easy to find/remove.
+ */
+static int
+os_errno(void)
+{
+ return (errno);
+}
+
+/*
+ * lock_init --
+ * Initialize a lock; on failure, log the pthread error and ERET WT_PANIC.
+ */
+static int
+lock_init(WT_EXTENSION_API *wtext, WT_SESSION *session, pthread_rwlock_t *lockp)
+{
+ int ret = 0;
+
+ if ((ret = pthread_rwlock_init(lockp, NULL)) != 0)
+ ERET(wtext, session, WT_PANIC,
+ "pthread_rwlock_init: %s", strerror(ret));
+ return (0);
+}
+
+/*
+ * lock_destroy --
+ * Destroy a lock; on failure, log the pthread error and ERET WT_PANIC.
+ */
+static int
+lock_destroy(
+ WT_EXTENSION_API *wtext, WT_SESSION *session, pthread_rwlock_t *lockp)
+{
+ int ret = 0;
+
+ if ((ret = pthread_rwlock_destroy(lockp)) != 0)
+ ERET(wtext, session, WT_PANIC,
+ "pthread_rwlock_destroy: %s", strerror(ret));
+ return (0);
+}
+
+/*
+ * writelock --
+ * Acquire a write lock; on failure, log the error and ERET WT_PANIC.
+ */
+static inline int
+writelock(WT_EXTENSION_API *wtext, WT_SESSION *session, pthread_rwlock_t *lockp)
+{
+ int ret = 0;
+
+ if ((ret = pthread_rwlock_wrlock(lockp)) != 0)
+ ERET(wtext, session, WT_PANIC,
+ "pthread_rwlock_wrlock: %s", strerror(ret));
+ return (0);
+}
+
+/*
+ * unlock --
+ * Release a lock; on failure, log the pthread error and ERET WT_PANIC.
+ */
+static inline int
+unlock(WT_EXTENSION_API *wtext, WT_SESSION *session, pthread_rwlock_t *lockp)
+{
+ int ret = 0;
+
+ if ((ret = pthread_rwlock_unlock(lockp)) != 0)
+ ERET(wtext, session, WT_PANIC,
+ "pthread_rwlock_unlock: %s", strerror(ret));
+ return (0);
+}
+
+#if 0
+/*
+ * helium_dump_kv --
+ * Dump one key or value to fp: printable bytes as-is, the rest in hex.
+ */
+static void
+helium_dump_kv(const char *pfx, uint8_t *p, size_t len, FILE *fp)
+{
+ (void)fprintf(fp, "%s %3zu: ", pfx, len);
+ for (; len > 0; --len, ++p)
+ if (!isspace(*p) && isprint(*p))
+ (void)putc(*p, fp);
+ else if (len == 1 && *p == '\0') /* Skip string nuls. */
+ continue;
+ else
+ (void)fprintf(fp, "%#x", *p);
+ (void)putc('\n', fp);
+}
+
+/*
+ * helium_dump --
+ * Dump the records in a Helium store to stderr; returns 0 at the end.
+ */
+static int
+helium_dump(WT_EXTENSION_API *wtext, he_t he, const char *tag)
+{
+ HE_ITEM *r, _r;
+ uint8_t k[4 * 1024], v[4 * 1024];
+ int ret = 0;
+
+ r = &_r;
+ memset(r, 0, sizeof(*r));
+ r->key = k;
+ r->val = v;
+
+ (void)fprintf(stderr, "== %s\n", tag);
+ while ((ret = he_next(he, r, (size_t)0, sizeof(v))) == 0) {
+#if 0
+ uint64_t recno;
+ if ((ret = wtext->struct_unpack(wtext,
+ NULL, r->key, r->key_len, "r", &recno)) != 0)
+ return (ret);
+ fprintf(stderr, "K: %" PRIu64, recno);
+#else
+ helium_dump_kv("K: ", r->key, r->key_len, stderr);
+#endif
+ helium_dump_kv("V: ", r->val, r->val_len, stderr);
+ }
+ /* Running off the end of the store is the normal way out. */
+ if (ret == HE_ERR_ITEM_NOT_FOUND)
+ return (0);
+ fprintf(stderr, "he_next: %s\n", he_strerror(ret));
+ return (WT_ERROR);
+}
+
+/*
+ * helium_stats --
+ * Print a datastore's Helium statistics to stderr under a tag heading.
+ */
+static int
+helium_stats(
+ WT_EXTENSION_API *wtext, WT_SESSION *session, he_t he, const char *tag)
+{
+ HE_STATS stats;
+ int ret = 0;
+
+ if ((ret = he_stats(he, &stats)) != 0)
+ ERET(wtext, session, ret, "he_stats: %s", he_strerror(ret));
+ fprintf(stderr, "== %s\n", tag);
+ fprintf(stderr, "name=%s\n", stats.name);
+ fprintf(stderr, "deleted_items=%" PRIu64 "\n", stats.deleted_items);
+ fprintf(stderr, "locked_items=%" PRIu64 "\n", stats.locked_items);
+ fprintf(stderr, "valid_items=%" PRIu64 "\n", stats.valid_items);
+ fprintf(stderr, "capacity=%" PRIu64 "B\n", stats.capacity);
+ fprintf(stderr, "size=%" PRIu64 "B\n", stats.size);
+ return (0);
+}
+#endif
+
+/*
+ * helium_call --
+ * Call Helium retrieval function f, growing the value buffer on overflow.
+ */
+static inline int
+helium_call(WT_CURSOR *wtcursor, const char *fname,
+ he_t he, int (*f)(he_t, HE_ITEM *, size_t, size_t))
+{
+ CURSOR *cursor;
+ HE_ITEM *r;
+ WT_EXTENSION_API *wtext;
+ WT_SESSION *session;
+ int ret = 0;
+ char *p; /* Reallocated value buffer */
+
+ session = wtcursor->session;
+ cursor = (CURSOR *)wtcursor;
+ wtext = cursor->wtext;
+
+ r = &cursor->record;
+ r->val = cursor->v;
+
+restart:
+ if ((ret = f(he, r, (size_t)0, cursor->mem_len)) != 0) {
+ if (ret == HE_ERR_ITEM_NOT_FOUND)
+ return (WT_NOTFOUND);
+ ERET(wtext, session, ret, "%s: %s", fname, he_strerror(ret));
+ }
+
+ /*
+ * If the returned length is larger than our passed-in length, we didn't
+ * get the complete value. Grow the buffer and use he_lookup to do the
+ * retrieval (he_lookup because the call succeeded and the key was
+ * copied out, so calling he_next/he_prev again would skip key/value
+ * pairs).
+ *
+ * We have to loop, another thread of control might change the length of
+ * the value, requiring we grow our buffer multiple times.
+ *
+ * We have to potentially restart the entire call in case the underlying
+ * key/value disappears.
+ */
+ for (;;) {
+ if (cursor->mem_len >= r->val_len) {
+ cursor->len = r->val_len;
+ return (0);
+ }
+
+ /* Grow the value buffer, with 32 bytes of slop. */
+ if ((p = realloc(cursor->v, r->val_len + 32)) == NULL)
+ return (os_errno());
+ cursor->v = r->val = p;
+ cursor->mem_len = r->val_len + 32;
+
+ if ((ret = he_lookup(he, r, (size_t)0, cursor->mem_len)) != 0) {
+ if (ret == HE_ERR_ITEM_NOT_FOUND)
+ goto restart;
+ ERET(wtext,
+ session, ret, "he_lookup: %s", he_strerror(ret));
+ }
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * txn_state_set --
+ *	Record the resolution (commit or abort) of a transaction in the
+ *	transaction store.
+ *
+ *	The record's key is the transaction ID and its value is a single state
+ *	byte.  Commits must be durable, so a commit is followed by a flush of
+ *	the volume; aborts are not flushed.  Returns 0 on success, Helium
+ *	failures are reported through ERET.
+ *
+ *	XXX
+ *	Not endian-portable, the key is the transaction ID in native byte
+ *	order.
+ */
+static int
+txn_state_set(WT_EXTENSION_API *wtext,
+    WT_SESSION *session, HELIUM_SOURCE *hs, uint64_t txnid, int commit)
+{
+	HE_ITEM item;
+	uint8_t state;
+	int ret = 0;
+
+	state = TXN_ABORTED;
+	if (commit)
+		state = TXN_COMMITTED;
+
+	memset(&item, 0, sizeof(item));
+	item.key = &txnid;
+	item.key_len = sizeof(txnid);
+	item.val = &state;
+	item.val_len = sizeof(state);
+
+	ret = he_update(hs->he_txn, &item);
+	if (ret != 0)
+		ERET(wtext, session, ret, "he_update: %s", he_strerror(ret));
+
+	/* Only commits need to reach stable storage. */
+	if (!commit)
+		return (0);
+
+	ret = he_commit(hs->he_txn);
+	if (ret != 0)
+		ERET(wtext, session, ret, "he_commit: %s", he_strerror(ret));
+	return (0);
+}
+
+/*
+ * txn_notify --
+ *	Transaction resolution callback: forwards the commit/abort outcome of
+ *	a transaction to txn_state_set.  The handler is the HELIUM_SOURCE the
+ *	notification was registered for.
+ */
+static int
+txn_notify(WT_TXN_NOTIFY *handler,
+    WT_SESSION *session, uint64_t txnid, int committed)
+{
+	HELIUM_SOURCE *source = (HELIUM_SOURCE *)handler;
+
+	return (
+	    txn_state_set(source->wtext, session, source, txnid, committed));
+}
+
+/*
+ * txn_state --
+ *	Look up a transaction's state in the transaction store.
+ *
+ *	Returns the first byte of the stored value if the transaction ID is
+ *	found, TXN_UNRESOLVED if the lookup fails for any reason.
+ */
+static int
+txn_state(WT_CURSOR *wtcursor, uint64_t txnid)
+{
+	CURSOR *cursor = (CURSOR *)wtcursor;
+	HELIUM_SOURCE *hs = cursor->ws->hs;
+	HE_ITEM item;
+	uint8_t state[16];
+
+	memset(&item, 0, sizeof(item));
+	item.key = &txnid;
+	item.key_len = sizeof(txnid);
+	item.val = state;
+	item.val_len = sizeof(state);
+
+	if (he_lookup(hs->he_txn, &item, (size_t)0, sizeof(state)) != 0)
+		return (TXN_UNRESOLVED);
+	return (state[0]);
+}
+
+/*
+ * cache_value_append --
+ *	Append the current WiredTiger cursor's value to a cache record.
+ *
+ *	wtcursor: the cursor; cursor->v/cursor->len hold the existing cache
+ *	    record (cursor->len of 0 means there is none yet).
+ *	remove_op: non-zero to append a remove tombstone instead of a value.
+ *
+ *	On return cursor->v/cursor->len and the Helium record's val/val_len
+ *	describe the extended cache record.  Returns 0 on success or the
+ *	os_errno() value if the buffer cannot be grown.
+ */
+static int
+cache_value_append(WT_CURSOR *wtcursor, int remove_op)
+{
+	CURSOR *cursor;
+	HE_ITEM *r;
+	WT_EXTENSION_API *wtext;
+	WT_SESSION *session;
+	uint64_t txnid;
+	size_t len;
+	uint32_t entries, vlen;
+	uint8_t *p;
+
+	session = wtcursor->session;
+	cursor = (CURSOR *)wtcursor;
+	wtext = cursor->wtext;
+
+	r = &cursor->record;
+
+	/*
+	 * A cache update is 4B that counts the number of entries in the update,
+	 * followed by sets of: 8B of txn ID then either a remove tombstone or a
+	 * 4B length and variable-length data pair.  Grow the value buffer, then
+	 * append the cursor's information.
+	 */
+	len = cursor->len +				/* current length */
+	    sizeof(uint32_t) +				/* entries */
+	    sizeof(uint64_t) +				/* txn ID */
+	    1 +						/* remove byte */
+	    (remove_op ? 0 :				/* optional data */
+	    sizeof(uint32_t) + wtcursor->value.size) +
+	    32;						/* slop */
+
+	if (len > cursor->mem_len) {
+		if ((p = realloc(cursor->v, len)) == NULL)
+			return (os_errno());
+		cursor->v = p;
+		cursor->mem_len = len;
+	}
+
+	/* Get the transaction ID. */
+	txnid = wtext->transaction_id(wtext, session);
+
+	/* Update the number of records in this value. */
+	if (cursor->len == 0) {
+		entries = 1;
+		cursor->len = sizeof(uint32_t);
+	} else {
+		memcpy(&entries, cursor->v, sizeof(uint32_t));
+		++entries;
+	}
+	memcpy(cursor->v, &entries, sizeof(uint32_t));
+
+	/*
+	 * Copy the WiredTiger cursor's data into place: txn ID, remove
+	 * tombstone, data length, data.
+	 *
+	 * XXX
+	 * Not endian-portable, we're writing a native transaction ID to the
+	 * store.
+	 */
+	p = cursor->v + cursor->len;
+	memcpy(p, &txnid, sizeof(uint64_t));
+	p += sizeof(uint64_t);
+	if (remove_op)
+		*p++ = REMOVE_TOMBSTONE;
+	else {
+		*p++ = ' ';
+
+		/*
+		 * The stored length is exactly 4B.  Convert the cursor's value
+		 * size to a 4B integer first rather than copying the leading
+		 * 4B of the size field: if that field is wider than 4B, its
+		 * leading bytes are only the low-order bytes on little-endian
+		 * machines.
+		 */
+		vlen = (uint32_t)wtcursor->value.size;
+		memcpy(p, &vlen, sizeof(uint32_t));
+		p += sizeof(uint32_t);
+		memcpy(p, wtcursor->value.data, wtcursor->value.size);
+		p += wtcursor->value.size;
+	}
+	cursor->len = (size_t)(p - cursor->v);
+
+	/* Update the underlying Helium record. */
+	r->val = cursor->v;
+	r->val_len = cursor->len;
+
+	return (0);
+}
+
+/*
+ * cache_value_unmarshall --
+ *	Split the cache value held in the cursor's buffer into the cursor's
+ *	array of CACHE_RECORD entries.
+ *
+ *	The value starts with a 4B entry count; each entry is an 8B
+ *	transaction ID, a one-byte marker and, unless the marker is the remove
+ *	tombstone, a 4B length followed by that many bytes of data.  The
+ *	entries' data pointers reference the cursor's buffer, nothing is
+ *	copied.  Returns 0 on success or the os_errno() value if the record
+ *	array cannot be grown.
+ */
+static int
+cache_value_unmarshall(WT_CURSOR *wtcursor)
+{
+	CACHE_RECORD *rec;
+	CURSOR *cursor;
+	uint32_t left, nentries, nslots;
+	uint8_t *src;
+
+	cursor = (CURSOR *)wtcursor;
+
+	/* Make sure there's a record slot for every entry (plus some). */
+	memcpy(&nentries, cursor->v, sizeof(uint32_t));
+	if (nentries > cursor->cache_slots) {
+		nslots = nentries + 20;
+		src = realloc(
+		    cursor->cache, nslots * sizeof(cursor->cache[0]));
+		if (src == NULL)
+			return (os_errno());
+
+		cursor->cache = (CACHE_RECORD *)src;
+		cursor->cache_slots = nslots;
+	}
+
+	/* Step through the value one entry at a time. */
+	src = cursor->v + sizeof(uint32_t);
+	rec = cursor->cache;
+	for (left = nentries; left > 0; --left, ++rec) {
+		memcpy(&rec->txnid, src, sizeof(uint64_t));
+		src += sizeof(uint64_t);
+
+		rec->remove = (*src == REMOVE_TOMBSTONE) ? 1 : 0;
+		++src;
+
+		/* Removes carry no length or data. */
+		if (rec->remove)
+			continue;
+
+		memcpy(&rec->len, src, sizeof(uint32_t));
+		src += sizeof(uint32_t);
+		rec->v = src;
+		src += rec->len;
+	}
+	cursor->cache_entries = nentries;
+
+	return (0);
+}
+
+/*
+ * cache_value_aborted --
+ *	Return 1 if the transaction that wrote a cache entry has been aborted,
+ *	0 otherwise.
+ */
+static inline int
+cache_value_aborted(WT_CURSOR *wtcursor, CACHE_RECORD *cp)
+{
+	/*
+	 * This function mostly exists to carry this explanation.
+	 *
+	 * On rollback, WiredTiger resets the transaction IDs of updated
+	 * entries to an aborted state.  Doing the same here would mean either
+	 * tracking every entry a transaction updated or scanning the cache
+	 * for them during rollback, both of which are expensive.  Instead,
+	 * callers ask whether the transaction was aborted before they call
+	 * the underlying WiredTiger visibility function.
+	 */
+	return (txn_state(wtcursor, cp->txnid) == TXN_ABORTED);
+}
+
+/*
+ * cache_value_committed --
+ *	Return 1 if the transaction that wrote a cache entry has been
+ *	committed, 0 otherwise.
+ */
+static inline int
+cache_value_committed(WT_CURSOR *wtcursor, CACHE_RECORD *cp)
+{
+	return (txn_state(wtcursor, cp->txnid) == TXN_COMMITTED);
+}
+
+/*
+ * cache_value_update_check --
+ *	Decide whether an update may proceed given the updates already in the
+ *	unmarshalled cache entry.
+ *
+ *	Returns 0 if the update can go ahead, WT_ROLLBACK if, under snapshot
+ *	isolation, the entry holds an update that is neither aborted nor
+ *	visible to the running transaction.
+ */
+static int
+cache_value_update_check(WT_CURSOR *wtcursor)
+{
+	CACHE_RECORD *rec;
+	CURSOR *cursor;
+	WT_EXTENSION_API *wtext;
+	WT_SESSION *session;
+	u_int n;
+
+	session = wtcursor->session;
+	cursor = (CURSOR *)wtcursor;
+	wtext = cursor->wtext;
+
+	/* The check only matters for snapshot isolation. */
+	if (wtext->transaction_isolation_level(
+	    wtext, session) != WT_TXN_ISO_SNAPSHOT)
+		return (0);
+
+	/*
+	 * An entry that hasn't been aborted and that we can't see is a
+	 * conflict: return a deadlock.
+	 */
+	for (n = 0; n < cursor->cache_entries; ++n) {
+		rec = &cursor->cache[n];
+		if (cache_value_aborted(wtcursor, rec))
+			continue;
+		if (!wtext->transaction_visible(wtext, session, rec->txnid))
+			return (WT_ROLLBACK);
+	}
+	return (0);
+}
+
+/*
+ * cache_value_visible --
+ *	Find the most recent update in the unmarshalled cache entry that is
+ *	visible to the running transaction.
+ *
+ *	Returns 1 and sets *cpp to the update if one is found; returns 0 with
+ *	*cpp set to NULL otherwise.
+ */
+static int
+cache_value_visible(WT_CURSOR *wtcursor, CACHE_RECORD **cpp)
+{
+	CACHE_RECORD *rec;
+	CURSOR *cursor;
+	WT_EXTENSION_API *wtext;
+	WT_SESSION *session;
+	u_int n;
+
+	*cpp = NULL;
+
+	session = wtcursor->session;
+	cursor = (CURSOR *)wtcursor;
+	wtext = cursor->wtext;
+
+	/*
+	 * Entries are stored in update order, so the newest one is last: walk
+	 * the array backward and stop at the first acceptable entry.
+	 */
+	for (n = cursor->cache_entries; n > 0; --n) {
+		rec = &cursor->cache[n - 1];
+		if (cache_value_aborted(wtcursor, rec))
+			continue;
+		if (!wtext->transaction_visible(wtext, session, rec->txnid))
+			continue;
+
+		*cpp = rec;
+		return (1);
+	}
+	return (0);
+}
+
+/*
+ * cache_value_visible_all --
+ *	Return 1 if every update in the unmarshalled cache entry has a
+ *	transaction ID below "oldest", 0 if any update's ID is at or above it.
+ */
+static int
+cache_value_visible_all(WT_CURSOR *wtcursor, uint64_t oldest)
+{
+	CURSOR *cursor;
+	u_int n;
+
+	cursor = (CURSOR *)wtcursor;
+
+	/*
+	 * "oldest" is the oldest transaction ID not yet visible to a running
+	 * transaction.  Any update at or past it might still be wanted by a
+	 * running transaction, in which case the entry has to stay in the
+	 * cache.  (This could be tightened: if the only update still required
+	 * is also the one we'd migrate to the primary, migrating it would
+	 * still be OK.)
+	 */
+	for (n = 0; n < cursor->cache_entries; ++n)
+		if (cursor->cache[n].txnid >= oldest)
+			return (0);
+	return (1);
+}
+
+/*
+ * cache_value_last_committed --
+ *	Find the most recent committed update in the unmarshalled cache entry
+ *	(recovery processing).  Sets *cpp to the update, or to NULL if no
+ *	update was committed.
+ */
+static void
+cache_value_last_committed(WT_CURSOR *wtcursor, CACHE_RECORD **cpp)
+{
+	CACHE_RECORD *rec;
+	CURSOR *cursor;
+	u_int n;
+
+	*cpp = NULL;
+
+	cursor = (CURSOR *)wtcursor;
+
+	/*
+	 * This is the recovery version of picking the update to migrate into
+	 * the primary.  The entry is known to be visible, but only an update
+	 * that was committed before the failure may be migrated.
+	 *
+	 * Entries are stored in update order, so scan from the last entry
+	 * toward the first.
+	 */
+	for (n = cursor->cache_entries; n > 0; --n) {
+		rec = &cursor->cache[n - 1];
+		if (!cache_value_committed(wtcursor, rec))
+			continue;
+
+		*cpp = rec;
+		return;
+	}
+}
+
+/*
+ * cache_value_last_not_aborted --
+ *	Find the most recent update in the unmarshalled cache entry that was
+ *	not aborted (normal processing).  Sets *cpp to the update, or to NULL
+ *	if every update was aborted.
+ */
+static void
+cache_value_last_not_aborted(WT_CURSOR *wtcursor, CACHE_RECORD **cpp)
+{
+	CACHE_RECORD *rec;
+	CURSOR *cursor;
+	u_int n;
+
+	*cpp = NULL;
+
+	cursor = (CURSOR *)wtcursor;
+
+	/*
+	 * This is the normal-processing version of picking the update to
+	 * migrate into the primary.  There's no need to test for a commit:
+	 * the caller has already confirmed every update for this cache key is
+	 * globally visible, so each one is either committed or aborted.
+	 *
+	 * Entries are stored in update order, so scan from the last entry
+	 * toward the first.
+	 */
+	for (n = cursor->cache_entries; n > 0; --n) {
+		rec = &cursor->cache[n - 1];
+		if (cache_value_aborted(wtcursor, rec))
+			continue;
+
+		*cpp = rec;
+		return;
+	}
+}
+
+/*
+ * cache_value_txnmin --
+ *	Store the smallest transaction ID found in the unmarshalled cache
+ *	entry in *txnminp (UINT64_MAX if the entry has no updates).
+ */
+static void
+cache_value_txnmin(WT_CURSOR *wtcursor, uint64_t *txnminp)
+{
+	CURSOR *cursor;
+	uint64_t lowest;
+	u_int n;
+
+	cursor = (CURSOR *)wtcursor;
+
+	/* Track the oldest transaction ID seen in the cache entry. */
+	lowest = UINT64_MAX;
+	for (n = 0; n < cursor->cache_entries; ++n)
+		if (cursor->cache[n].txnid < lowest)
+			lowest = cursor->cache[n].txnid;
+	*txnminp = lowest;
+}
+
+/*
+ * key_max_err --
+ * Common error when a WiredTiger key is too large.
+ *
+ * Reports the offending key length (len) together with HE_MAX_KEY_LEN
+ * through the ERET macro, using EINVAL as the error value.
+ */
+static int
+key_max_err(WT_EXTENSION_API *wtext, WT_SESSION *session, size_t len)
+{
+ /*
+ * NOTE(review): ret is never referenced by name and the function has no
+ * explicit return statement; presumably the ERET macro (defined outside
+ * this view) assigns ret and returns it -- confirm before changing.
+ */
+ int ret = 0;
+
+ ERET(wtext, session, EINVAL,
+ "key length (%zu bytes) larger than the maximum Helium "
+ "key length of %d bytes",
+ len, HE_MAX_KEY_LEN);
+}
+
+/*
+ * copyin_key --
+ * Copy a WT_CURSOR key to a HE_ITEM key.
+ *
+ * wtcursor: the cursor; its key (or record number) is copied into the
+ * cursor's Helium record.
+ * allocate_key: if non-zero and the cursor is configured for append, a new
+ * record number is allocated instead of using the cursor's.
+ *
+ * Returns 0 on success, otherwise the error from the lock, pack or
+ * key-length check that failed.
+ */
+static inline int
+copyin_key(WT_CURSOR *wtcursor, int allocate_key)
+{
+ CURSOR *cursor;
+ HE_ITEM *r;
+ WT_EXTENSION_API *wtext;
+ WT_SESSION *session;
+ WT_SOURCE *ws;
+ size_t size;
+ int ret = 0;
+
+ session = wtcursor->session;
+ cursor = (CURSOR *)wtcursor;
+ ws = cursor->ws;
+ wtext = cursor->wtext;
+
+ r = &cursor->record;
+ if (ws->config_recno) {
+ /*
+ * Allocate a new record for append operations.
+ *
+ * A specified record number could potentially be larger than
+ * the maximum known record number, update the maximum number
+ * as necessary.
+ *
+ * Assume we can compare 8B values without locking them, and
+ * test again after acquiring the lock.
+ *
+ * XXX
+ * If the put fails for some reason, we'll have incremented the
+ * maximum record number past the correct point. I can't think
+ * of a reason any application would care or notice, but it's
+ * not quite right.
+ */
+ if (allocate_key && cursor->config_append) {
+ /* Take the next record number under the source's lock. */
+ if ((ret = writelock(wtext, session, &ws->lock)) != 0)
+ return (ret);
+ wtcursor->recno = ++ws->append_recno;
+ if ((ret = unlock(wtext, session, &ws->lock)) != 0)
+ return (ret);
+ } else if (wtcursor->recno > ws->append_recno) {
+ /* Raise the maximum; re-test now that the lock is held. */
+ if ((ret = writelock(wtext, session, &ws->lock)) != 0)
+ return (ret);
+ if (wtcursor->recno > ws->append_recno)
+ ws->append_recno = wtcursor->recno;
+ if ((ret = unlock(wtext, session, &ws->lock)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Pack the record number into the Helium key using the "r"
+ * format; struct_size supplies the packed length.
+ */
+ if ((ret = wtext->struct_size(wtext, session,
+ &size, "r", wtcursor->recno)) != 0 ||
+ (ret = wtext->struct_pack(wtext, session,
+ r->key, HE_MAX_KEY_LEN, "r", wtcursor->recno)) != 0)
+ return (ret);
+ r->key_len = size;
+ } else {
+ /* I'm not sure this test is necessary, but it's cheap. */
+ if (wtcursor->key.size > HE_MAX_KEY_LEN)
+ return (
+ key_max_err(wtext, session, wtcursor->key.size));
+
+ /*
+ * A set cursor key might reference application memory, which
+ * is only OK until the cursor operation has been called (in
+ * other words, we can only reference application memory from
+ * the WT_CURSOR.set_key call until the WT_CURSOR.op call).
+ * For this reason, do a full copy, don't just reference the
+ * WT_CURSOR key's data.
+ */
+ memcpy(r->key, wtcursor->key.data, wtcursor->key.size);
+ r->key_len = wtcursor->key.size;
+ }
+ return (0);
+}
+
+/*
+ * copyout_key --
+ *	Make the Helium record's key available through the WT_CURSOR.
+ *
+ *	For record-number objects the key is unpacked into the cursor's recno
+ *	and the unpack's return value is returned; otherwise the cursor's key
+ *	is pointed at the record's key (no copy) and 0 is returned.
+ */
+static inline int
+copyout_key(WT_CURSOR *wtcursor)
+{
+	CURSOR *cursor;
+	HE_ITEM *r;
+	WT_EXTENSION_API *wtext;
+	WT_SESSION *session;
+
+	session = wtcursor->session;
+	cursor = (CURSOR *)wtcursor;
+	wtext = cursor->wtext;
+	r = &cursor->record;
+
+	if (!cursor->ws->config_recno) {
+		wtcursor->key.data = r->key;
+		wtcursor->key.size = (size_t)r->key_len;
+		return (0);
+	}
+
+	return (wtext->struct_unpack(wtext,
+	    session, r->key, r->key_len, "r", &wtcursor->recno));
+}
+
+/*
+ * copyout_val --
+ *	Point the WT_CURSOR value at the data to return: the given cache
+ *	record's data if cp is not NULL, otherwise the cursor's own value
+ *	buffer.  Nothing is copied.  Always returns 0.
+ */
+static inline int
+copyout_val(WT_CURSOR *wtcursor, CACHE_RECORD *cp)
+{
+	CURSOR *cursor;
+
+	if (cp != NULL) {
+		wtcursor->value.data = cp->v;
+		wtcursor->value.size = cp->len;
+		return (0);
+	}
+
+	cursor = (CURSOR *)wtcursor;
+	wtcursor->value.data = cursor->v;
+	wtcursor->value.size = cursor->len;
+	return (0);
+}
+
+/*
+ * nextprev --
+ * Cursor next/prev.
+ *
+ * wtcursor: the cursor; its Helium record's key is the starting position.
+ * fname: name of the Helium function, used in error messages.
+ * f: he_next or he_prev.
+ *
+ * Looks for the neighboring key in both the cache store and the primary
+ * store and returns whichever collates closer, skipping keys whose visible
+ * cache entry is a remove. Returns 0 with the cursor's key and value set,
+ * WT_NOTFOUND if neither store has another entry, or an error.
+ */
+static int
+nextprev(WT_CURSOR *wtcursor, const char *fname,
+ int (*f)(he_t, HE_ITEM *, size_t, size_t))
+{
+ CACHE_RECORD *cp;
+ CURSOR *cursor;
+ HE_ITEM *r;
+ WT_EXTENSION_API *wtext;
+ WT_ITEM a, b;
+ WT_SESSION *session;
+ WT_SOURCE *ws;
+ int cache_ret, cache_rm, cmp, ret = 0;
+ void *p;
+
+ session = wtcursor->session;
+ cursor = (CURSOR *)wtcursor;
+ ws = cursor->ws;
+ wtext = cursor->wtext;
+ r = &cursor->record;
+
+ cache_rm = 0;
+
+ /*
+ * If the cache isn't yet in use, it's a simpler problem, just check
+ * the store. We don't care if we race, we're not guaranteeing any
+ * special behavior with respect to phantoms.
+ */
+ if (ws->he_cache_inuse == 0) {
+ cache_ret = WT_NOTFOUND;
+ goto cache_clean;
+ }
+
+skip_deleted:
+ /*
+ * The next/prev key/value pair might be in the cache, which means we
+ * are making two calls and returning the best choice. As each call
+ * overwrites both key and value, we have to have a copy of the key
+ * for the second call plus the returned key and value from the first
+ * call. That's why each cursor has 3 temporary buffers.
+ *
+ * First, copy the key.
+ */
+ if (cursor->t1.mem_len < r->key_len) {
+ if ((p = realloc(cursor->t1.v, r->key_len)) == NULL)
+ return (os_errno());
+ cursor->t1.v = p;
+ cursor->t1.mem_len = r->key_len;
+ }
+ memcpy(cursor->t1.v, r->key, r->key_len);
+ cursor->t1.len = r->key_len;
+
+ /*
+ * Move through the cache until we either find a record with a visible
+ * entry, or we reach the end/beginning.
+ */
+ for (cache_rm = 0;;) {
+ if ((ret = helium_call(wtcursor, fname, ws->he_cache, f)) != 0)
+ break;
+ if ((ret = cache_value_unmarshall(wtcursor)) != 0)
+ return (ret);
+
+ /* If there's no visible entry, move to the next one. */
+ if (!cache_value_visible(wtcursor, &cp))
+ continue;
+
+ /*
+ * If the entry has been deleted, remember that and continue.
+ * We can't just skip the entry because it might be a delete
+ * of an entry in the primary store, which means the cache
+ * entry stops us from returning the primary store's entry.
+ */
+ if (cp->remove)
+ cache_rm = 1;
+
+ /*
+ * Copy the cache key. If the cache's entry wasn't a delete,
+ * copy the value as well, we may return the cache entry.
+ */
+ if (cursor->t2.mem_len < r->key_len) {
+ if ((p = realloc(cursor->t2.v, r->key_len)) == NULL)
+ return (os_errno());
+ cursor->t2.v = p;
+ cursor->t2.mem_len = r->key_len;
+ }
+ memcpy(cursor->t2.v, r->key, r->key_len);
+ cursor->t2.len = r->key_len;
+
+ if (cache_rm)
+ break;
+
+ if (cursor->t3.mem_len < cp->len) {
+ if ((p = realloc(cursor->t3.v, cp->len)) == NULL)
+ return (os_errno());
+ cursor->t3.v = p;
+ cursor->t3.mem_len = cp->len;
+ }
+ memcpy(cursor->t3.v, cp->v, cp->len);
+ cursor->t3.len = cp->len;
+
+ break;
+ }
+ /* Only 0 (cache candidate found) and WT_NOTFOUND continue from here. */
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ cache_ret = ret;
+
+ /* Copy the original key back into place. */
+ memcpy(r->key, cursor->t1.v, cursor->t1.len);
+ r->key_len = cursor->t1.len;
+
+cache_clean:
+ /* Get the next/prev entry from the store. */
+ ret = helium_call(wtcursor, fname, ws->he, f);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+
+ /* If no entries in either the cache or the primary, we're done. */
+ if (cache_ret == WT_NOTFOUND && ret == WT_NOTFOUND)
+ return (WT_NOTFOUND);
+
+ /*
+ * If both the cache and the primary had entries, decide which is a
+ * better choice and pretend we didn't find the other one.
+ */
+ if (cache_ret == 0 && ret == 0) {
+ a.data = r->key; /* a is the primary */
+ a.size = (uint32_t)r->key_len;
+ b.data = cursor->t2.v; /* b is the cache */
+ b.size = (uint32_t)cursor->t2.len;
+ if ((ret = wtext->collate(wtext, session, &a, &b, &cmp)) != 0)
+ return (ret);
+
+ /*
+ * On equal keys (cmp == 0) the cache entry wins in both
+ * directions. Presumably cmp < 0 means the primary key sorts
+ * before the cache key -- confirm against the collate API.
+ */
+ if (f == he_next) {
+ if (cmp >= 0)
+ ret = WT_NOTFOUND;
+ else
+ cache_ret = WT_NOTFOUND;
+ } else {
+ if (cmp <= 0)
+ ret = WT_NOTFOUND;
+ else
+ cache_ret = WT_NOTFOUND;
+ }
+ }
+
+ /*
+ * If the cache is the key we'd choose, but it's a delete, skip past it
+ * by moving from the deleted key to the next/prev item in either the
+ * primary or the cache.
+ */
+ if (cache_ret == 0 && cache_rm) {
+ memcpy(r->key, cursor->t2.v, cursor->t2.len);
+ r->key_len = cursor->t2.len;
+ goto skip_deleted;
+ }
+
+ /* If taking the cache's entry, copy the value into place. */
+ if (cache_ret == 0) {
+ memcpy(r->key, cursor->t2.v, cursor->t2.len);
+ r->key_len = cursor->t2.len;
+
+ memcpy(cursor->v, cursor->t3.v, cursor->t3.len);
+ cursor->len = cursor->t3.len;
+ }
+
+ /* Copy out the chosen key/value pair. */
+ if ((ret = copyout_key(wtcursor)) != 0)
+ return (ret);
+ if ((ret = copyout_val(wtcursor, NULL)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * helium_cursor_next --
+ *	WT_CURSOR.next method: move the cursor forward using he_next.
+ */
+static int
+helium_cursor_next(WT_CURSOR *wtcursor)
+{
+	int ret;
+
+	ret = nextprev(wtcursor, "he_next", he_next);
+	return (ret);
+}
+
+/*
+ * helium_cursor_prev --
+ *	WT_CURSOR.prev method: move the cursor backward using he_prev.
+ */
+static int
+helium_cursor_prev(WT_CURSOR *wtcursor)
+{
+	int ret;
+
+	ret = nextprev(wtcursor, "he_prev", he_prev);
+	return (ret);
+}
+
+/*
+ * helium_cursor_reset --
+ *	WT_CURSOR.reset method.  Always returns 0.
+ */
+static int
+helium_cursor_reset(WT_CURSOR *wtcursor)
+{
+	/*
+	 * A zero-length key is what resets the cursor: a next/prev operation
+	 * that follows returns the first/last record of the object.
+	 */
+	((CURSOR *)wtcursor)->record.key_len = 0;
+	return (0);
+}
+
+/*
+ * helium_cursor_search --
+ *	WT_CURSOR.search method.
+ *
+ *	The cache store is consulted first: a visible cache entry decides the
+ *	result (WT_NOTFOUND if it's a remove, its value otherwise).  If the
+ *	cache has no record for the key, or none of its entries is visible,
+ *	the primary store is searched.
+ */
+static int
+helium_cursor_search(WT_CURSOR *wtcursor)
+{
+	CACHE_RECORD *visible;
+	WT_SOURCE *ws;
+	int ret = 0;
+
+	ws = ((CURSOR *)wtcursor)->ws;
+
+	/* Load the WiredTiger cursor's key into the Helium record. */
+	ret = copyin_key(wtcursor, 0);
+	if (ret != 0)
+		return (ret);
+
+	/* Try the cache; anything other than found/not-found is fatal. */
+	ret = helium_call(wtcursor, "he_lookup", ws->he_cache, he_lookup);
+	if (ret != 0 && ret != WT_NOTFOUND)
+		return (ret);
+	if (ret == 0) {
+		ret = cache_value_unmarshall(wtcursor);
+		if (ret != 0)
+			return (ret);
+		if (cache_value_visible(wtcursor, &visible)) {
+			if (visible->remove)
+				return (WT_NOTFOUND);
+			return (copyout_val(wtcursor, visible));
+		}
+	}
+
+	/* Nothing usable in the cache, fall back to the primary store. */
+	ret = helium_call(wtcursor, "he_lookup", ws->he, he_lookup);
+	if (ret != 0)
+		return (ret);
+
+	return (copyout_val(wtcursor, NULL));
+}
+
+/*
+ * helium_cursor_search_near --
+ *	WT_CURSOR.search_near method.
+ *
+ *	Tries an exact match, then the next larger key, then the next smaller
+ *	key; *exact is set to 0, 1 or -1 respectively for the attempt that
+ *	succeeded.  Returns 0 on success, otherwise the error of the attempt
+ *	that stopped the search (WT_NOTFOUND if all three found nothing).
+ *
+ *	XXX
+ *	This may not be sufficient with multiple threads of control: the exact
+ *	search can fail, another thread can then insert (and commit) an exact
+ *	match, and we could return the wrong value.  Revisit once the
+ *	transactional code is in place.
+ */
+static int
+helium_cursor_search_near(WT_CURSOR *wtcursor, int *exact)
+{
+	int ret = 0;
+
+	/* Exact match. */
+	ret = helium_cursor_search(wtcursor);
+	if (ret == 0) {
+		*exact = 0;
+		return (0);
+	}
+	if (ret != WT_NOTFOUND)
+		return (ret);
+
+	/* A larger key. */
+	ret = helium_cursor_next(wtcursor);
+	if (ret == 0) {
+		*exact = 1;
+		return (0);
+	}
+	if (ret != WT_NOTFOUND)
+		return (ret);
+
+	/* A smaller key. */
+	ret = helium_cursor_prev(wtcursor);
+	if (ret == 0)
+		*exact = -1;
+	return (ret);
+}
+
+/*
+ * helium_cursor_insert --
+ * WT_CURSOR.insert method.
+ *
+ * Appends the cursor's value as a new entry to the key's cache record and
+ * writes the record to the cache store, all under the source's write lock.
+ * When the cursor is not configured for overwrite, a visible existing entry
+ * (in the cache or the primary store) yields WT_DUPLICATE_KEY. On success,
+ * notification at transaction resolution is requested.
+ */
+static int
+helium_cursor_insert(WT_CURSOR *wtcursor)
+{
+ CACHE_RECORD *cp;
+ CURSOR *cursor;
+ HE_ITEM *r;
+ HELIUM_SOURCE *hs;
+ WT_EXTENSION_API *wtext;
+ WT_SESSION *session;
+ WT_SOURCE *ws;
+ int ret = 0;
+
+ session = wtcursor->session;
+ cursor = (CURSOR *)wtcursor;
+ wtext = cursor->wtext;
+ ws = cursor->ws;
+ hs = ws->hs;
+ r = &cursor->record;
+
+ /* Get the WiredTiger cursor's key. */
+ if ((ret = copyin_key(wtcursor, 1)) != 0)
+ return (ret);
+
+ /*
+ * NOTE(review): r->val and r->val_len still hold whatever the record
+ * contained before this call; the new value is only installed by
+ * cache_value_append below, so this message doesn't show it.
+ */
+ VMSG(wtext, session, VERBOSE_L2,
+ "I %.*s.%.*s", (int)r->key_len, r->key, (int)r->val_len, r->val);
+
+ /* Clear the value, assume we're adding the first cache entry. */
+ cursor->len = 0;
+
+ /* Updates are read-modify-writes, lock the underlying cache. */
+ if ((ret = writelock(wtext, session, &ws->lock)) != 0)
+ return (ret);
+
+ /* Read the record from the cache store. */
+ switch (ret = helium_call(
+ wtcursor, "he_lookup", ws->he_cache, he_lookup)) {
+ case 0:
+ /* Crack the record. */
+ if ((ret = cache_value_unmarshall(wtcursor)) != 0)
+ goto err;
+
+ /* Check if the update can proceed. */
+ if ((ret = cache_value_update_check(wtcursor)) != 0)
+ goto err;
+
+ if (cursor->config_overwrite)
+ break;
+
+ /*
+ * If overwrite is false, a visible entry (that's not a removed
+ * entry), is an error. We're done checking if there is a
+ * visible entry in the cache, otherwise repeat the check on the
+ * primary store.
+ */
+ if (cache_value_visible(wtcursor, &cp)) {
+ if (cp->remove)
+ break;
+
+ ret = WT_DUPLICATE_KEY;
+ goto err;
+ }
+ /* FALLTHROUGH */
+ case WT_NOTFOUND:
+ if (cursor->config_overwrite)
+ break;
+
+ /*
+ * If overwrite is false, an entry is an error.
+ *
+ * NOTE(review): when reached by falling through from case 0,
+ * the append below relies on this lookup leaving cursor->v and
+ * cursor->len (the cache record) untouched when it returns
+ * WT_NOTFOUND -- confirm he_lookup doesn't modify the value
+ * buffer on a miss.
+ */
+ if ((ret = helium_call(
+ wtcursor, "he_lookup", ws->he, he_lookup)) != WT_NOTFOUND) {
+ if (ret == 0)
+ ret = WT_DUPLICATE_KEY;
+ goto err;
+ }
+ ret = 0;
+ break;
+ default:
+ goto err;
+ }
+
+ /*
+ * Create a new value using the current cache record plus the WiredTiger
+ * cursor's value, and update the cache.
+ */
+ if ((ret = cache_value_append(wtcursor, 0)) != 0)
+ goto err;
+ if ((ret = he_update(ws->he_cache, r)) != 0)
+ EMSG(wtext, session, ret, "he_update: %s", he_strerror(ret));
+
+ /* Update the state while still holding the lock. */
+ if (ws->he_cache_inuse == 0)
+ ws->he_cache_inuse = 1;
+
+ /* Discard the lock. */
+err: ESET(unlock(wtext, session, &ws->lock));
+
+ /* If successful, request notification at transaction resolution. */
+ if (ret == 0)
+ ESET(
+ wtext->transaction_notify(wtext, session, &hs->txn_notify));
+
+ return (ret);
+}
+
+/*
+ * update --
+ * Update or remove an entry.
+ *
+ * wtcursor: the cursor supplying the key and (for updates) the value.
+ * remove_op: non-zero to append a remove tombstone rather than a value.
+ *
+ * Appends a new entry to the key's cache record and writes the record to
+ * the cache store, all under the source's write lock. When the cursor is
+ * not configured for overwrite, the key must already exist: a visible
+ * remove in the cache, or a miss in the primary store, is an error. On
+ * success, notification at transaction resolution is requested.
+ */
+static int
+update(WT_CURSOR *wtcursor, int remove_op)
+{
+ CACHE_RECORD *cp;
+ CURSOR *cursor;
+ HE_ITEM *r;
+ HELIUM_SOURCE *hs;
+ WT_EXTENSION_API *wtext;
+ WT_SESSION *session;
+ WT_SOURCE *ws;
+ int ret = 0;
+
+ session = wtcursor->session;
+ cursor = (CURSOR *)wtcursor;
+ wtext = cursor->wtext;
+ ws = cursor->ws;
+ hs = ws->hs;
+ r = &cursor->record;
+
+ /* Get the WiredTiger cursor's key. */
+ if ((ret = copyin_key(wtcursor, 0)) != 0)
+ return (ret);
+
+ /*
+ * NOTE(review): r->val and r->val_len still hold whatever the record
+ * contained before this call; the new value is only installed by
+ * cache_value_append below, so this message doesn't show it.
+ */
+ VMSG(wtext, session, VERBOSE_L2,
+ "%c %.*s.%.*s",
+ remove_op ? 'R' : 'U',
+ (int)r->key_len, r->key, (int)r->val_len, r->val);
+
+ /* Clear the value, assume we're adding the first cache entry. */
+ cursor->len = 0;
+
+ /* Updates are read-modify-writes, lock the underlying cache. */
+ if ((ret = writelock(wtext, session, &ws->lock)) != 0)
+ return (ret);
+
+ /* Read the record from the cache store. */
+ switch (ret = helium_call(
+ wtcursor, "he_lookup", ws->he_cache, he_lookup)) {
+ case 0:
+ /* Crack the record. */
+ if ((ret = cache_value_unmarshall(wtcursor)) != 0)
+ goto err;
+
+ /* Check if the update can proceed. */
+ if ((ret = cache_value_update_check(wtcursor)) != 0)
+ goto err;
+
+ if (cursor->config_overwrite)
+ break;
+
+ /*
+ * If overwrite is false, no entry (or a removed entry), is an
+ * error. We're done checking if there is a visible entry in
+ * the cache, otherwise repeat the check on the primary store.
+ */
+ if (cache_value_visible(wtcursor, &cp)) {
+ if (!cp->remove)
+ break;
+
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+ /* FALLTHROUGH */
+ case WT_NOTFOUND:
+ if (cursor->config_overwrite)
+ break;
+
+ /* If overwrite is false, no entry is an error. */
+ if ((ret =
+ helium_call(wtcursor, "he_lookup", ws->he, he_lookup)) != 0)
+ goto err;
+
+ /*
+ * All we care about is the cache entry, which didn't exist;
+ * clear the returned value, we're about to "append" to it.
+ *
+ * NOTE(review): this case is also reached by falling through
+ * from case 0, where a cache record did exist but had no
+ * visible entry. Resetting the length makes the append below
+ * start a fresh record, so the he_update that follows appears
+ * to replace the existing cache record rather than extend it
+ * -- confirm that dropping those entries is intended.
+ */
+ cursor->len = 0;
+ break;
+ default:
+ goto err;
+ }
+
+ /*
+ * Create a new cache value based on the current cache record plus the
+ * WiredTiger cursor's value.
+ */
+ if ((ret = cache_value_append(wtcursor, remove_op)) != 0)
+ goto err;
+
+ /* Push the record into the cache. */
+ if ((ret = he_update(ws->he_cache, r)) != 0)
+ EMSG(wtext, session, ret, "he_update: %s", he_strerror(ret));
+
+ /* Update the state while still holding the lock. */
+ if (ws->he_cache_inuse == 0)
+ ws->he_cache_inuse = 1;
+
+ /* Discard the lock. */
+err: ESET(unlock(wtext, session, &ws->lock));
+
+ /* If successful, request notification at transaction resolution. */
+ if (ret == 0)
+ ESET(
+ wtext->transaction_notify(wtext, session, &hs->txn_notify));
+
+ return (ret);
+}
+
+/*
+ * helium_cursor_update --
+ *	WT_CURSOR.update method: an update that is not a remove.
+ */
+static int
+helium_cursor_update(WT_CURSOR *wtcursor)
+{
+	int ret;
+
+	ret = update(wtcursor, 0);
+	return (ret);
+}
+
+/*
+ * helium_cursor_remove --
+ *	WT_CURSOR.remove method.
+ */
+static int
+helium_cursor_remove(WT_CURSOR *wtcursor)
+{
+	WT_SOURCE *ws;
+
+	ws = ((CURSOR *)wtcursor)->ws;
+
+	/* The ordinary case: append a remove tombstone. */
+	if (!ws->config_bitfield)
+		return (update(wtcursor, 1));
+
+	/*
+	 * Removing a bitfield in WiredTiger really means updating it with a
+	 * value of zero: store a single zero byte.
+	 */
+	wtcursor->value.data = "";
+	wtcursor->value.size = 1;
+	return (update(wtcursor, 0));
+}
+
+/*
+ * helium_cursor_close --
+ *	WT_CURSOR.close method.
+ *
+ *	Drops the cursor's reference on its source (under the source's lock)
+ *	and destroys the cursor.  The cursor is destroyed even if the lock
+ *	can't be acquired; the lock/unlock result is what gets returned.
+ */
+static int
+helium_cursor_close(WT_CURSOR *wtcursor)
+{
+	CURSOR *cursor;
+	WT_EXTENSION_API *wtext;
+	WT_SESSION *session;
+	WT_SOURCE *ws;
+	int ret = 0;
+
+	cursor = (CURSOR *)wtcursor;
+	session = wtcursor->session;
+	ws = cursor->ws;
+	wtext = cursor->wtext;
+
+	ret = writelock(wtext, session, &ws->lock);
+	if (ret == 0) {
+		--ws->ref;
+		ret = unlock(wtext, session, &ws->lock);
+	}
+
+	cursor_destroy(cursor);
+	return (ret);
+}
+
+/*
+ * ws_source_name --
+ *	Build the name of an object on the Helium device.
+ *
+ *	Application URIs are "helium:device/name".  The device name isn't
+ *	interesting and names on the device should be obviously WiredTiger's,
+ *	so the result is WT_NAME_PREFIX followed by everything after the first
+ *	slash, followed by the optional suffix.  The name is returned in
+ *	allocated memory through pp; the caller frees it.  Returns 0 on
+ *	success, the os_errno() value if the allocation fails, or an EINVAL
+ *	error through ERET for a malformed URI.
+ */
+static int
+ws_source_name(WT_DATA_SOURCE *wtds,
+    WT_SESSION *session, const char *uri, const char *suffix, char **pp)
+{
+	WT_EXTENSION_API *wtext;
+	size_t len;
+	int ret = 0;
+	const char *name, *tail;
+
+	wtext = ((DATA_SOURCE *)wtds)->wtext;
+
+	/* The URI needs both the scheme and a slash. */
+	name = prefix_match(uri, "helium:") ? strchr(uri, '/') : NULL;
+	if (name == NULL)
+		ERET(wtext, session, EINVAL, "%s: illegal Helium URI", uri);
+	++name;
+
+	tail = suffix == NULL ? "" : suffix;
+
+	/* Room for the three pieces, plus a little slop. */
+	len = strlen(WT_NAME_PREFIX) + strlen(name) + strlen(tail) + 5;
+	if ((*pp = malloc(len)) == NULL)
+		return (os_errno());
+	(void)snprintf(*pp, len, "%s%s%s", WT_NAME_PREFIX, name, tail);
+	return (0);
+}
+
+/*
+ * ws_source_close --
+ *	Close a WT_SOURCE reference: commit and close the primary store,
+ *	close the cache store, destroy the lock if it was initialized and
+ *	free the structure.  Failures along the way are reported but don't
+ *	stop the teardown.
+ */
+static int
+ws_source_close(WT_EXTENSION_API *wtext, WT_SESSION *session, WT_SOURCE *ws)
+{
+	int ret = 0, t_ret;
+
+	/*
+	 * Open cursors shouldn't be possible here, the upper layers of
+	 * WiredTiger prevent it; complain, but carry on regardless.
+	 */
+	if (ws->ref != 0)
+		EMSG(wtext, session, WT_ERROR,
+		    "%s: open object with %u open cursors being closed",
+		    ws->uri, ws->ref);
+
+	/* The primary store: commit, then close. */
+	if (ws->he != NULL) {
+		t_ret = he_commit(ws->he);
+		if (t_ret != 0)
+			EMSG(wtext, session, t_ret,
+			    "he_commit: %s: %s", ws->uri, he_strerror(t_ret));
+
+		t_ret = he_close(ws->he);
+		if (t_ret != 0)
+			EMSG(wtext, session, t_ret,
+			    "he_close: %s: %s", ws->uri, he_strerror(t_ret));
+
+		ws->he = NULL;
+	}
+
+	/* The cache store: close only. */
+	if (ws->he_cache != NULL) {
+		t_ret = he_close(ws->he_cache);
+		if (t_ret != 0)
+			EMSG(wtext, session, t_ret,
+			    "he_close: %s(cache): %s",
+			    ws->uri, he_strerror(t_ret));
+
+		ws->he_cache = NULL;
+	}
+
+	if (ws->lockinit)
+		ESET(lock_destroy(wtext, session, &ws->lock));
+
+	free(ws->uri);
+	OVERWRITE_AND_FREE(ws);
+
+	return (ret);
+}
+
+/*
+ * ws_source_open_object --
+ *	Open an object in the Helium store.
+ *
+ *	The object's name is built from the URI and the optional suffix by
+ *	ws_source_name.  The handle returned by he_open is stored in *hep
+ *	(NULL if the object couldn't be opened).  Returns 0 on success or the
+ *	error from building the name or opening the object.
+ */
+static int
+ws_source_open_object(WT_DATA_SOURCE *wtds, WT_SESSION *session,
+    HELIUM_SOURCE *hs,
+    const char *uri, const char *suffix, int flags, he_t *hep)
+{
+	WT_EXTENSION_API *wtext;
+	he_t handle;
+	char *name;
+	int ret = 0;
+
+	*hep = NULL;
+
+	wtext = ((DATA_SOURCE *)wtds)->wtext;
+	name = NULL;
+
+	/* Work out what the object is called on the device. */
+	ret = ws_source_name(wtds, session, uri, suffix, &name);
+	if (ret != 0)
+		return (ret);
+
+	/* Open the underlying Helium object. */
+	VMSG(wtext, session, VERBOSE_L1, "open %s/%s", hs->name, name);
+	handle = he_open(hs->device, name, flags, NULL);
+	if (handle == NULL) {
+		ret = os_errno();
+		EMSG(wtext, session, ret,
+		    "he_open: %s/%s: %s", hs->name, name, he_strerror(ret));
+	}
+	*hep = handle;
+
+	free(name);
+	return (ret);
+}
+
+/* Flags accepted by ws_source_open. */
+#define WS_SOURCE_OPEN_BUSY 0x01 /* Fail if source busy */
+#define WS_SOURCE_OPEN_GLOBAL 0x02 /* Keep the global lock */
+
+/*
+ * ws_source_open --
+ * Return a locked WiredTiger source, allocating and opening if it doesn't
+ * already exist.
+ *
+ * On success *refp references the source. Without WS_SOURCE_OPEN_GLOBAL the
+ * caller returns holding the object's lock; with it, the caller returns
+ * holding the data source's global lock instead. With WS_SOURCE_OPEN_BUSY
+ * an existing source whose reference count is non-zero yields EBUSY. On
+ * any error *refp is NULL and no lock taken here is left held by the paths
+ * below.
+ */
+static int
+ws_source_open(WT_DATA_SOURCE *wtds, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config, u_int flags, WT_SOURCE **refp)
+{
+ DATA_SOURCE *ds;
+ HELIUM_SOURCE *hs;
+ WT_CONFIG_ITEM a;
+ WT_EXTENSION_API *wtext;
+ WT_SOURCE *ws;
+ size_t len;
+ int oflags, ret = 0;
+ const char *p, *t;
+
+ *refp = NULL;
+
+ ds = (DATA_SOURCE *)wtds;
+ wtext = ds->wtext;
+ ws = NULL;
+
+ /*
+ * The URI will be "helium:" followed by a Helium name and object name
+ * pair separated by a slash, for example, "helium:volume/object".
+ */
+ if (!prefix_match(uri, "helium:"))
+ goto bad_name;
+ p = uri + strlen("helium:");
+ /*
+ * Reject an empty store name, a missing slash and an empty object name.
+ * The bad_name label sits on the if-body so the prefix check above can
+ * share the same error return.
+ */
+ if (p[0] == '/' || (t = strchr(p, '/')) == NULL || t[1] == '\0')
+bad_name: ERET(wtext, session, EINVAL, "%s: illegal name format", uri);
+ /* Length of the store name, i.e. the text between "helium:" and '/'. */
+ len = (size_t)(t - p);
+
+ /* Find a matching Helium device. */
+ for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+ if (string_match(hs->name, p, len))
+ break;
+ if (hs == NULL)
+ ERET(wtext, NULL,
+ EINVAL, "%s: no matching Helium store found", uri);
+
+ /*
+ * We're about to walk the Helium device's list of files, acquire the
+ * global lock.
+ */
+ if ((ret = writelock(wtext, session, &ds->global_lock)) != 0)
+ return (ret);
+
+ /*
+ * Check for a match: if we find one, optionally trade the global lock
+ * for the object's lock, optionally check if the object is busy, and
+ * return.
+ */
+ for (ws = hs->ws_head; ws != NULL; ws = ws->next)
+ if (strcmp(ws->uri, uri) == 0) {
+ /* Check to see if the object is busy. */
+ if (ws->ref != 0 && (flags & WS_SOURCE_OPEN_BUSY)) {
+ ret = EBUSY;
+ ESET(unlock(wtext, session, &ds->global_lock));
+ return (ret);
+ }
+ /* Swap the global lock for an object lock. */
+ if (!(flags & WS_SOURCE_OPEN_GLOBAL)) {
+ ret = writelock(wtext, session, &ws->lock);
+ ESET(unlock(wtext, session, &ds->global_lock));
+ if (ret != 0)
+ return (ret);
+ }
+ *refp = ws;
+ return (0);
+ }
+
+ /* Allocate and initialize a new underlying WiredTiger source object. */
+ if ((ws = calloc(1, sizeof(*ws))) == NULL ||
+ (ws->uri = strdup(uri)) == NULL) {
+ ret = os_errno();
+ goto err;
+ }
+ if ((ret = lock_init(wtext, session, &ws->lock)) != 0)
+ goto err;
+ /* Record that the lock exists so ws_source_close destroys it. */
+ ws->lockinit = 1;
+ ws->hs = hs;
+
+ /*
+ * Open the underlying Helium objects, then push the change.
+ *
+ * The naming scheme is simple: the URI names the primary store, and the
+ * URI with a trailing suffix names the associated caching store.
+ *
+ * We can set truncate flag, we always set the create flag, our caller
+ * handles attempts to create existing objects.
+ */
+ oflags = HE_O_CREATE;
+ if ((ret = wtext->config_get(wtext,
+ session, config, "helium_o_truncate", &a)) == 0 && a.val != 0)
+ oflags |= HE_O_TRUNCATE;
+ /* A missing "helium_o_truncate" key is not an error. */
+ if (ret != 0 && ret != WT_NOTFOUND)
+ EMSG_ERR(wtext, session, ret,
+ "helium_o_truncate configuration: %s",
+ wtext->strerror(ret));
+
+ if ((ret = ws_source_open_object(
+ wtds, session, hs, uri, NULL, oflags, &ws->he)) != 0)
+ goto err;
+ if ((ret = ws_source_open_object(
+ wtds, session, hs, uri, WT_NAME_CACHE, oflags, &ws->he_cache)) != 0)
+ goto err;
+ if ((ret = he_commit(ws->he)) != 0)
+ EMSG_ERR(wtext, session, ret,
+ "he_commit: %s", he_strerror(ret));
+
+ /* Optionally trade the global lock for the object lock. */
+ if (!(flags & WS_SOURCE_OPEN_GLOBAL) &&
+ (ret = writelock(wtext, session, &ws->lock)) != 0)
+ goto err;
+
+ /* Insert the new entry at the head of the list. */
+ ws->next = hs->ws_head;
+ hs->ws_head = ws;
+
+ /* Ownership moves to the list; clear ws so it is not closed below. */
+ *refp = ws;
+ ws = NULL;
+
+ /*
+ * The "if (0)" body is skipped on the success path and is only entered
+ * through the err label, where a partially built source is discarded.
+ */
+ if (0) {
+err: if (ws != NULL)
+ ESET(ws_source_close(wtext, session, ws));
+ }
+
+ /*
+ * If there was an error or our caller doesn't need the global lock,
+ * release the global lock.
+ */
+ if (!(flags & WS_SOURCE_OPEN_GLOBAL) || ret != 0)
+ ESET(unlock(wtext, session, &ds->global_lock));
+
+ return (ret);
+}
+
+/*
+ * master_uri_get --
+ *	Look up the metadata (master) record stored for a URI; the record is
+ *	handed back through valuep by the extension API's metadata_search.
+ */
+static int
+master_uri_get(WT_DATA_SOURCE *wtds,
+    WT_SESSION *session, const char *uri, const char **valuep)
+{
+	WT_EXTENSION_API *wtext = ((DATA_SOURCE *)wtds)->wtext;
+
+	return (wtext->metadata_search(wtext, session, uri, valuep));
+}
+
+/*
+ * master_uri_drop --
+ *	Remove the metadata (master) record stored for a URI by forwarding to
+ *	the extension API's metadata_remove.
+ */
+static int
+master_uri_drop(WT_DATA_SOURCE *wtds, WT_SESSION *session, const char *uri)
+{
+	WT_EXTENSION_API *wtext = ((DATA_SOURCE *)wtds)->wtext;
+
+	return (wtext->metadata_remove(wtext, session, uri));
+}
+
+/*
+ * master_uri_rename --
+ *	Move the metadata (master) record from one URI to another: copy the
+ *	record under the new name, then delete the old one. If deleting the
+ *	old record fails, the freshly inserted copy is removed again on a
+ *	best-effort basis.
+ */
+static int
+master_uri_rename(WT_DATA_SOURCE *wtds,
+    WT_SESSION *session, const char *uri, const char *newuri)
+{
+	WT_EXTENSION_API *wtext;
+	const char *record;
+	int ret = 0;
+
+	wtext = ((DATA_SOURCE *)wtds)->wtext;
+	record = NULL;
+
+	/* Read the existing record and store it under the new name. */
+	ret = master_uri_get(wtds, session, uri, &record);
+	if (ret == 0)
+		ret = wtext->metadata_insert(wtext, session, newuri, record);
+
+	/* Only once the copy exists is the original discarded. */
+	if (ret == 0) {
+		ret = wtext->metadata_remove(wtext, session, uri);
+		if (ret != 0)
+			(void)wtext->metadata_remove(wtext, session, newuri);
+	}
+
+	free((void *)record);
+	return (ret);
+}
+
+/*
+ * master_uri_set --
+ *	Set the Helium master record for a URI.
+ *
+ *	The record is built from the "key_format", "value_format" and
+ *	"helium_o_compress" configuration values (formats default to "u" and
+ *	compression to off when the key is not found) and inserted into the
+ *	metadata. Returns 0 on success; if the record already exists, returns
+ *	EEXIST when "exclusive" is configured and 0 otherwise. Any other
+ *	failure is reported and returned.
+ */
+static int
+master_uri_set(WT_DATA_SOURCE *wtds,
+    WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
+{
+	DATA_SOURCE *ds;
+	WT_CONFIG_ITEM a, b, c;
+	WT_EXTENSION_API *wtext;
+	int exclusive, len, ret = 0;
+	char value[1024];
+
+	ds = (DATA_SOURCE *)wtds;
+	wtext = ds->wtext;
+
+	exclusive = 0;
+	if ((ret =
+	    wtext->config_get(wtext, session, config, "exclusive", &a)) == 0)
+		exclusive = a.val != 0;
+	else if (ret != WT_NOTFOUND)
+		ERET(wtext, session, ret,
+		    "exclusive configuration: %s", wtext->strerror(ret));
+
+	/* Get the key/value format strings. */
+	if ((ret = wtext->config_get(
+	    wtext, session, config, "key_format", &a)) != 0) {
+		if (ret == WT_NOTFOUND) {
+			a.str = "u";
+			a.len = 1;
+		} else
+			ERET(wtext, session, ret,
+			    "key_format configuration: %s",
+			    wtext->strerror(ret));
+	}
+	if ((ret = wtext->config_get(
+	    wtext, session, config, "value_format", &b)) != 0) {
+		if (ret == WT_NOTFOUND) {
+			b.str = "u";
+			b.len = 1;
+		} else
+			ERET(wtext, session, ret,
+			    "value_format configuration: %s",
+			    wtext->strerror(ret));
+	}
+
+	/* Get the compression configuration. */
+	if ((ret = wtext->config_get(
+	    wtext, session, config, "helium_o_compress", &c)) != 0) {
+		if (ret == WT_NOTFOUND)
+			c.val = 0;
+		else
+			ERET(wtext, session, ret,
+			    "helium_o_compress configuration: %s",
+			    wtext->strerror(ret));
+	}
+
+	/*
+	 * Build the record. The format strings come from the caller and have
+	 * no fixed size, so refuse to store a record that did not fit in the
+	 * buffer rather than silently inserting a truncated configuration.
+	 */
+	len = snprintf(value, sizeof(value),
+	    "wiredtiger_helium_version=(major=%d,minor=%d),"
+	    "key_format=%.*s,value_format=%.*s,"
+	    "helium_o_compress=%d",
+	    WIREDTIGER_HELIUM_MAJOR, WIREDTIGER_HELIUM_MINOR,
+	    (int)a.len, a.str, (int)b.len, b.str, c.val ? 1 : 0);
+	if (len < 0 || (size_t)len >= sizeof(value)) {
+		ret = EINVAL;
+		ERET(wtext, session, ret,
+		    "%s: master record does not fit in %d bytes",
+		    uri, (int)sizeof(value));
+	}
+
+	/*
+	 * Create a new reference using insert (which fails if the record
+	 * already exists).
+	 */
+	if ((ret = wtext->metadata_insert(wtext, session, uri, value)) == 0)
+		return (0);
+	if (ret == WT_DUPLICATE_KEY)
+		return (exclusive ? EEXIST : 0);
+	ERET(wtext, session, ret, "%s: %s", uri, wtext->strerror(ret));
+}
+
+/*
+ * helium_session_open_cursor --
+ *	WT_SESSION.open_cursor method.
+ *
+ *	Allocates a CURSOR, applies the "append"/"overwrite"/collator
+ *	configuration, wires up the cursor methods, and attaches the cursor to
+ *	the (locked) WT_SOURCE for the URI. The first cursor opened on a source
+ *	also configures the source from its master record. On success the
+ *	source's reference count is incremented and *new_cursor is set; on
+ *	failure *new_cursor stays NULL, the cursor is destroyed and a non-zero
+ *	error is returned.
+ */
+static int
+helium_session_open_cursor(WT_DATA_SOURCE *wtds, WT_SESSION *session,
+    const char *uri, WT_CONFIG_ARG *config, WT_CURSOR **new_cursor)
+{
+	CURSOR *cursor;
+	DATA_SOURCE *ds;
+	WT_CONFIG_ITEM v;
+	WT_CONFIG_PARSER *config_parser;
+	WT_CURSOR *wtcursor;
+	WT_EXTENSION_API *wtext;
+	WT_SOURCE *ws;
+	int locked, ret, tret;
+	const char *value;
+
+	*new_cursor = NULL;
+
+	config_parser = NULL;
+	cursor = NULL;
+	ds = (DATA_SOURCE *)wtds;
+	wtext = ds->wtext;
+	ws = NULL;
+	locked = 0;
+	ret = tret = 0;
+	value = NULL;
+
+	/* Allocate and initialize a cursor. */
+	if ((cursor = calloc(1, sizeof(CURSOR))) == NULL)
+		return (os_errno());
+
+	if ((ret = wtext->config_get(		/* Parse configuration */
+	    wtext, session, config, "append", &v)) != 0)
+		EMSG_ERR(wtext, session, ret,
+		    "append configuration: %s", wtext->strerror(ret));
+	cursor->config_append = v.val != 0;
+
+	if ((ret = wtext->config_get(
+	    wtext, session, config, "overwrite", &v)) != 0)
+		EMSG_ERR(wtext, session, ret,
+		    "overwrite configuration: %s", wtext->strerror(ret));
+	cursor->config_overwrite = v.val != 0;
+
+	if ((ret = wtext->collator_config(wtext, session, config)) != 0)
+		EMSG_ERR(wtext, session, ret,
+		    "collator configuration: %s", wtext->strerror(ret));
+
+	/* Finish initializing the cursor. */
+	cursor->wtcursor.close = helium_cursor_close;
+	cursor->wtcursor.insert = helium_cursor_insert;
+	cursor->wtcursor.next = helium_cursor_next;
+	cursor->wtcursor.prev = helium_cursor_prev;
+	cursor->wtcursor.remove = helium_cursor_remove;
+	cursor->wtcursor.reset = helium_cursor_reset;
+	cursor->wtcursor.search = helium_cursor_search;
+	cursor->wtcursor.search_near = helium_cursor_search_near;
+	cursor->wtcursor.update = helium_cursor_update;
+
+	cursor->wtext = wtext;
+	cursor->record.key = cursor->__key;
+	if ((cursor->v = malloc(128)) == NULL) {
+		/*
+		 * Record the failure: "ret" is still 0 from the calls above,
+		 * and jumping to err without setting it would report success
+		 * while leaving *new_cursor NULL.
+		 */
+		ret = os_errno();
+		goto err;
+	}
+	cursor->mem_len = 128;
+
+	/* Get a locked reference to the WiredTiger source. */
+	if ((ret = ws_source_open(wtds, session, uri, config, 0, &ws)) != 0)
+		goto err;
+	locked = 1;
+	cursor->ws = ws;
+
+	/*
+	 * If this is the first access to the URI, we have to configure it
+	 * using information stored in the master record.
+	 */
+	if (!ws->configured) {
+		if ((ret = master_uri_get(wtds, session, uri, &value)) != 0)
+			goto err;
+
+		if ((ret = wtext->config_parser_open(wtext,
+		    session, value, strlen(value), &config_parser)) != 0)
+			EMSG_ERR(wtext, session, ret,
+			    "Configuration string parser: %s",
+			    wtext->strerror(ret));
+		if ((ret = config_parser->get(
+		    config_parser, "key_format", &v)) != 0)
+			EMSG_ERR(wtext, session, ret,
+			    "key_format configuration: %s",
+			    wtext->strerror(ret));
+		ws->config_recno = v.len == 1 && v.str[0] == 'r';
+
+		if ((ret = config_parser->get(
+		    config_parser, "value_format", &v)) != 0)
+			EMSG_ERR(wtext, session, ret,
+			    "value_format configuration: %s",
+			    wtext->strerror(ret));
+		/*
+		 * isdigit requires a value representable as unsigned char (or
+		 * EOF); cast so a byte with the high bit set is well-defined.
+		 */
+		ws->config_bitfield = v.len == 2 &&
+		    isdigit((unsigned char)v.str[0]) && v.str[1] == 't';
+
+		if ((ret = config_parser->get(
+		    config_parser, "helium_o_compress", &v)) != 0)
+			EMSG_ERR(wtext, session, ret,
+			    "helium_o_compress configuration: %s",
+			    wtext->strerror(ret));
+		ws->config_compress = v.val ? 1 : 0;
+
+		/*
+		 * If it's a record-number key, read the last record from the
+		 * object and set the allocation record value.
+		 */
+		if (ws->config_recno) {
+			wtcursor = (WT_CURSOR *)cursor;
+			if ((ret = helium_cursor_reset(wtcursor)) != 0)
+				goto err;
+
+			if ((ret = helium_cursor_prev(wtcursor)) == 0)
+				ws->append_recno = wtcursor->recno;
+			else if (ret != WT_NOTFOUND)
+				goto err;
+
+			if ((ret = helium_cursor_reset(wtcursor)) != 0)
+				goto err;
+		}
+
+		ws->configured = 1;
+	}
+
+	/*
+	 * Increment the open reference count to pin the URI and unlock it.
+	 *
+	 * Clear "locked" before the attempt so the error path does not unlock
+	 * a second time, and undo the reference if the unlock fails: the
+	 * cursor is destroyed on that path and must not keep the source
+	 * pinned.
+	 */
+	++ws->ref;
+	locked = 0;
+	if ((ret = unlock(wtext, session, &ws->lock)) != 0) {
+		--ws->ref;
+		goto err;
+	}
+
+	*new_cursor = (WT_CURSOR *)cursor;
+
+	/* Only reached through the err label; skipped on success. */
+	if (0) {
+err:		if (ws != NULL && locked)
+			ESET(unlock(wtext, session, &ws->lock));
+		cursor_destroy(cursor);
+	}
+	if (config_parser != NULL &&
+	    (tret = config_parser->close(config_parser)) != 0)
+		EMSG(wtext, session, tret,
+		    "WT_CONFIG_PARSER.close: %s", wtext->strerror(tret));
+
+	free((void *)value);
+	return (ret);
+}
+
+/*
+ * helium_session_create --
+ *	WT_SESSION.create method: make sure the Helium objects behind the URI
+ *	exist, then record the URI's master record.
+ */
+static int
+helium_session_create(WT_DATA_SOURCE *wtds,
+    WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
+{
+	WT_EXTENSION_API *wtext;
+	WT_SOURCE *ws;
+	int ret = 0;
+
+	wtext = ((DATA_SOURCE *)wtds)->wtext;
+
+	/*
+	 * Opening the source creates it if needed and returns it locked; the
+	 * lock is dropped straight away because nothing else is done with it.
+	 */
+	ret = ws_source_open(wtds, session, uri, config, 0, &ws);
+	if (ret == 0)
+		ret = unlock(wtext, session, &ws->lock);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * Add the URI master record unless it is already there.
+	 *
+	 * Running unlocked is fine here: WiredTiger single-threads creates,
+	 * so serializing them is not this layer's job.
+	 *
+	 * Should the WiredTiger record fail to be written, the Helium store
+	 * is left as it is; nothing is inconsistent and a later create is
+	 * expected to finish the job.
+	 */
+	return (master_uri_set(wtds, session, uri, config));
+}
+
+/*
+ * helium_session_drop --
+ * WT_SESSION.drop method.
+ *
+ * Unlinks the WT_SOURCE from its HELIUM_SOURCE list, removes both Helium
+ * objects, closes the source and removes the metadata record. Any failure
+ * after the source was opened is converted to WT_PANIC. The global lock is
+ * held from ws_source_open until the final unlock.
+ */
+static int
+helium_session_drop(WT_DATA_SOURCE *wtds,
+ WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
+{
+ DATA_SOURCE *ds;
+ HELIUM_SOURCE *hs;
+ WT_EXTENSION_API *wtext;
+ WT_SOURCE **p, *ws;
+ int ret = 0;
+
+ ds = (DATA_SOURCE *)wtds;
+ wtext = ds->wtext;
+
+ /*
+ * Get a locked reference to the data source: hold the global lock,
+ * we're changing the HELIUM_SOURCE's list of WT_SOURCE objects.
+ *
+ * Remove the entry from the WT_SOURCE list -- it's a singly-linked
+ * list, find the reference to it.
+ */
+ if ((ret = ws_source_open(wtds, session, uri, config,
+ WS_SOURCE_OPEN_BUSY | WS_SOURCE_OPEN_GLOBAL, &ws)) != 0)
+ return (ret);
+ hs = ws->hs;
+ /* p walks the "next" links so the matching link can be rewritten. */
+ for (p = &hs->ws_head; *p != NULL; p = &(*p)->next)
+ if (*p == ws) {
+ *p = (*p)->next;
+ break;
+ }
+
+ /*
+ * Drop the underlying Helium objects. The handles are cleared so that
+ * ws_source_close below does not commit or close them again.
+ */
+ ESET(he_remove(ws->he));
+ ws->he = NULL; /* The handle is dead. */
+ ESET(he_remove(ws->he_cache));
+ ws->he_cache = NULL; /* The handle is dead. */
+
+ /* Close the source, discarding the structure. */
+ ESET(ws_source_close(wtext, session, ws));
+ ws = NULL;
+
+ /* Discard the metadata entry. */
+ ESET(master_uri_drop(wtds, session, uri));
+
+ /*
+ * If we have an error at this point, panic -- there's an inconsistency
+ * in what WiredTiger knows about and the underlying store.
+ */
+ if (ret != 0)
+ ret = WT_PANIC;
+
+ ESET(unlock(wtext, session, &ds->global_lock));
+ return (ret);
+}
+
+/*
+ * helium_session_rename --
+ *	WT_SESSION.rename method.
+ *
+ *	Renames the in-memory WT_SOURCE, both underlying Helium objects and the
+ *	metadata record. Fails with the ws_source_open error (for example when
+ *	the object is busy) or with the allocation error if the new name
+ *	cannot be copied; any failure after the in-memory name has been
+ *	replaced is converted to WT_PANIC.
+ */
+static int
+helium_session_rename(WT_DATA_SOURCE *wtds, WT_SESSION *session,
+    const char *uri, const char *newuri, WT_CONFIG_ARG *config)
+{
+	DATA_SOURCE *ds;
+	WT_EXTENSION_API *wtext;
+	WT_SOURCE *ws;
+	int ret = 0;
+	char *p;
+
+	ds = (DATA_SOURCE *)wtds;
+	wtext = ds->wtext;
+
+	/*
+	 * Get a locked reference to the data source; hold the global lock,
+	 * we are going to change the object's name, and we can't allow
+	 * other threads walking the list and comparing against the name.
+	 */
+	if ((ret = ws_source_open(wtds, session, uri, config,
+	    WS_SOURCE_OPEN_BUSY | WS_SOURCE_OPEN_GLOBAL, &ws)) != 0)
+		return (ret);
+
+	/* Get a copy of the new name for the WT_SOURCE structure. */
+	if ((p = strdup(newuri)) == NULL) {
+		ret = os_errno();
+		goto err;
+	}
+	free(ws->uri);
+	ws->uri = p;
+
+	/*
+	 * Rename the underlying Helium objects.
+	 *
+	 * The name buffer is reset to NULL before each lookup and freed
+	 * unconditionally afterwards: a rename is only attempted while no
+	 * error has been recorded, but a buffer allocated after an earlier
+	 * failure must still be released, and a lookup that fails before
+	 * allocating must not leave "p" aliasing ws->uri.
+	 */
+	p = NULL;
+	ESET(ws_source_name(wtds, session, newuri, NULL, &p));
+	if (ret == 0)
+		ESET(he_rename(ws->he, p));
+	free(p);
+
+	p = NULL;
+	ESET(ws_source_name(wtds, session, newuri, WT_NAME_CACHE, &p));
+	if (ret == 0)
+		ESET(he_rename(ws->he_cache, p));
+	free(p);
+
+	/* Update the metadata record. */
+	ESET(master_uri_rename(wtds, session, uri, newuri));
+
+	/*
+	 * If we have an error at this point, panic -- there's an inconsistency
+	 * in what WiredTiger knows about and the underlying store.
+	 */
+	if (ret != 0)
+		ret = WT_PANIC;
+
+err:	ESET(unlock(wtext, session, &ds->global_lock));
+
+	return (ret);
+}
+
+/*
+ * helium_session_truncate --
+ *	WT_SESSION.truncate method: empty both the primary and the cache
+ *	Helium object behind a URI. The open fails if the object is busy.
+ */
+static int
+helium_session_truncate(WT_DATA_SOURCE *wtds,
+    WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
+{
+	WT_EXTENSION_API *wtext;
+	WT_SOURCE *ws;
+	int ret = 0, he_ret;
+
+	wtext = ((DATA_SOURCE *)wtds)->wtext;
+
+	/* The source comes back holding its own lock. */
+	ret = ws_source_open(
+	    wtds, session, uri, config, WS_SOURCE_OPEN_BUSY, &ws);
+	if (ret != 0)
+		return (ret);
+
+	/* Primary namespace first; a failure does not skip the cache. */
+	he_ret = he_truncate(ws->he);
+	if (he_ret != 0)
+		EMSG(wtext, session, he_ret,
+		    "he_truncate: %s: %s", ws->uri, he_strerror(he_ret));
+
+	/* Then the cache namespace. */
+	he_ret = he_truncate(ws->he_cache);
+	if (he_ret != 0)
+		EMSG(wtext, session, he_ret,
+		    "he_truncate: %s: %s", ws->uri, he_strerror(he_ret));
+
+	ESET(unlock(wtext, session, &ws->lock));
+	return (ret);
+}
+
+/*
+ * helium_session_verify --
+ *	WT_SESSION.verify method: a no-op that always reports success.
+ */
+static int
+helium_session_verify(WT_DATA_SOURCE *wtds,
+    WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
+{
+	/* Nothing is checked; the casts only mark the arguments as unused. */
+	(void)config;
+	(void)uri;
+	(void)session;
+	(void)wtds;
+
+	return (0);
+}
+
+/*
+ * helium_session_checkpoint --
+ *	WT_SESSION.checkpoint method.
+ *
+ *	Commits the volume handle of every opened Helium source. Stops at, and
+ *	returns, the first commit failure; returns 0 when every commit
+ *	succeeds or when no source is open. The configuration is unused.
+ */
+static int
+helium_session_checkpoint(
+    WT_DATA_SOURCE *wtds, WT_SESSION *session, WT_CONFIG_ARG *config)
+{
+	DATA_SOURCE *ds;
+	HELIUM_SOURCE *hs;
+	WT_EXTENSION_API *wtext;
+	int ret = 0;
+
+	(void)config;
+
+	ds = (DATA_SOURCE *)wtds;
+	wtext = ds->wtext;
+
+	/*
+	 * Flush all volumes: each HELIUM_SOURCE on the list has its own
+	 * device, so walk the whole list rather than committing only the
+	 * entry at the head.
+	 */
+	for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+		if ((ret = he_commit(hs->he_volume)) != 0)
+			ERET(wtext, session, ret,
+			    "he_commit: %s: %s", hs->device, he_strerror(ret));
+
+	return (0);
+}
+
+/*
+ * helium_source_close --
+ * Discard a HELIUM_SOURCE: stop and join its cleaner thread, close every
+ * WT_SOURCE on its list, close the transaction store if this source owns
+ * it, commit and close the volume handle, then free the structure.
+ * Teardown continues after a failing step; the local "ret" is returned.
+ */
+static int
+helium_source_close(
+ WT_EXTENSION_API *wtext, WT_SESSION *session, HELIUM_SOURCE *hs)
+{
+ WT_SOURCE *ws;
+ int ret = 0, tret;
+
+ /* Resolve the cache into the primary one last time and quit. */
+ if (hs->cleaner_id != 0) {
+ /* Ask the cleaner to make its final pass, then wait for it. */
+ hs->cleaner_stop = 1;
+
+ if ((tret = pthread_join(hs->cleaner_id, NULL)) != 0)
+ EMSG(wtext, session, tret,
+ "pthread_join: %s", strerror(tret));
+ hs->cleaner_id = 0;
+ }
+
+ /* Close the underlying WiredTiger sources. */
+ while ((ws = hs->ws_head) != NULL) {
+ /* Unlink before closing: ws_source_close frees the entry. */
+ hs->ws_head = ws->next;
+ ESET(ws_source_close(wtext, session, ws));
+ }
+
+ /* If the owner, close the database transaction store. */
+ if (hs->he_txn != NULL && hs->he_owner) {
+ if ((tret = he_close(hs->he_txn)) != 0)
+ EMSG(wtext, session, tret,
+ "he_close: %s: %s: %s",
+ hs->name, WT_NAME_TXN, he_strerror(tret));
+ hs->he_txn = NULL;
+ }
+
+ /* Flush and close the Helium source. */
+ if (hs->he_volume != NULL) {
+ if ((tret = he_commit(hs->he_volume)) != 0)
+ EMSG(wtext, session, tret,
+ "he_commit: %s: %s",
+ hs->device, he_strerror(tret));
+
+ if ((tret = he_close(hs->he_volume)) != 0)
+ EMSG(wtext, session, tret,
+ "he_close: %s: %s: %s",
+ hs->name, WT_NAME_INIT, he_strerror(tret));
+ hs->he_volume = NULL;
+ }
+
+ free(hs->name);
+ free(hs->device);
+ OVERWRITE_AND_FREE(hs);
+
+ return (ret);
+}
+
+/*
+ * cache_cleaner --
+ *	Migrate information from the cache to the primary store.
+ *
+ *	The cursor's WT_SOURCE (cursor->ws) selects the cache/primary pair.
+ *	Pass 1 copies eligible cache entries into the primary store and
+ *	commits it. Pass 2 (normal mode only) removes globally visible entries
+ *	from the cache under the source's lock and reports through *txnminp
+ *	the smallest transaction ID among the entries that remain.
+ *
+ *	A NULL txnminp selects recovery mode, in which "oldest" is not used
+ *	and pass 2 is skipped. Returns 0 on success, otherwise an error.
+ */
+static int
+cache_cleaner(WT_EXTENSION_API *wtext,
+    WT_CURSOR *wtcursor, uint64_t oldest, uint64_t *txnminp)
+{
+	CACHE_RECORD *cp;
+	CURSOR *cursor;
+	HE_ITEM *r;
+	WT_SOURCE *ws;
+	uint64_t txnid;
+	int locked, pushed, recovery, ret = 0;
+
+	/*
+	 * Called in two ways: in normal processing mode where we're supplied a
+	 * value for the oldest transaction ID not yet visible to a running
+	 * transaction, and we're tracking the smallest transaction ID
+	 * referenced by any cache entry, and in recovery mode where neither of
+	 * those are true.
+	 */
+	if (txnminp == NULL)
+		recovery = 1;
+	else {
+		recovery = 0;
+		*txnminp = UINT64_MAX;
+	}
+
+	cursor = (CURSOR *)wtcursor;
+	ws = cursor->ws;
+	r = &cursor->record;
+	locked = pushed = 0;
+
+	/*
+	 * For every cache key where all updates are globally visible:
+	 *	Migrate the most recent update value to the primary store.
+	 */
+	for (r->key_len = 0; (ret =
+	    helium_call(wtcursor, "he_next", ws->he_cache, he_next)) == 0;) {
+		/*
+		 * Unmarshall the value, and if all of the updates are globally
+		 * visible, update the primary with the last committed update.
+		 * In normal processing, the last committed update test is for
+		 * a globally visible update that's not explicitly aborted.  In
+		 * recovery processing, the last committed update test is for
+		 * an explicitly committed update.  See the underlying functions
+		 * for more information.
+		 */
+		if ((ret = cache_value_unmarshall(wtcursor)) != 0)
+			goto err;
+		if (!recovery && !cache_value_visible_all(wtcursor, oldest))
+			continue;
+		if (recovery)
+			cache_value_last_committed(wtcursor, &cp);
+		else
+			cache_value_last_not_aborted(wtcursor, &cp);
+		if (cp == NULL)
+			continue;
+
+		pushed = 1;
+		if (cp->remove) {
+			if ((ret = he_delete(ws->he, r)) == 0)
+				continue;
+
+			/*
+			 * Updates confined to the cache may not appear in the
+			 * primary at all, that is, an insert and remove pair
+			 * may be confined to the cache.
+			 */
+			if (ret == HE_ERR_ITEM_NOT_FOUND) {
+				ret = 0;
+				continue;
+			}
+			ERET(wtext, NULL, ret,
+			    "he_delete: %s", he_strerror(ret));
+		} else {
+			r->val = cp->v;
+			r->val_len = cp->len;
+			/*
+			 * If compression configured for this datastore, set the
+			 * compression flag, we're updating the "real" store.
+			 */
+			if (ws->config_compress)
+				r->flags |= HE_I_COMPRESS;
+			ret = he_update(ws->he, r);
+			r->flags = 0;
+			if (ret == 0)
+				continue;
+
+			ERET(wtext, NULL, ret,
+			    "he_update: %s", he_strerror(ret));
+		}
+	}
+
+	/* Running off the end of the cache is the normal loop exit. */
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+	if (ret != 0)
+		ERET(wtext, NULL, ret, "he_next: %s", he_strerror(ret));
+
+	/*
+	 * If we didn't move any keys from the cache to the primary, quit.  It's
+	 * possible we could still remove values from the cache, but not likely,
+	 * and another pass would probably be wasted effort (especially locked).
+	 */
+	if (!pushed)
+		return (0);
+
+	/*
+	 * Push the store to stable storage for correctness.  (It doesn't matter
+	 * what Helium handle we commit, so we just commit one of them.)
+	 */
+	if ((ret = he_commit(ws->he)) != 0)
+		ERET(wtext, NULL, ret, "he_commit: %s", he_strerror(ret));
+
+	/*
+	 * If we're performing recovery, that's all we need to do, we're going
+	 * to simply discard the cache, there's no reason to remove entries one
+	 * at a time.
+	 */
+	if (recovery)
+		return (0);
+
+	/*
+	 * For every cache key where all updates are globally visible:
+	 *	Remove the cache key.
+	 *
+	 * We're updating the cache, which requires a lock during normal
+	 * cleaning.
+	 */
+	if ((ret = writelock(wtext, NULL, &ws->lock)) != 0)
+		goto err;
+	locked = 1;
+
+	for (r->key_len = 0; (ret =
+	    helium_call(wtcursor, "he_next", ws->he_cache, he_next)) == 0;) {
+		/*
+		 * Unmarshall the value, and if all of the updates are globally
+		 * visible, remove the cache entry.
+		 */
+		if ((ret = cache_value_unmarshall(wtcursor)) != 0)
+			goto err;
+		if (cache_value_visible_all(wtcursor, oldest)) {
+			if ((ret = he_delete(ws->he_cache, r)) != 0)
+				EMSG_ERR(wtext, NULL, ret,
+				    "he_delete: %s", he_strerror(ret));
+			continue;
+		}
+
+		/*
+		 * If the entry will remain in the cache, figure out the oldest
+		 * transaction for which it contains an update (which might be
+		 * different from the oldest transaction in the system).  We
+		 * need the oldest transaction ID that appears anywhere in any
+		 * cache, it limits the records we can discard from the
+		 * transaction store.
+		 */
+		cache_value_txnmin(wtcursor, &txnid);
+		if (txnid < *txnminp)
+			*txnminp = txnid;
+	}
+
+	/*
+	 * Resolve the loop's exit status BEFORE releasing the lock: the
+	 * unlock's return value must not overwrite an iteration error. The
+	 * lock is released in one place, at the err label, on both the
+	 * success and failure paths.
+	 */
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+	if (ret != 0)
+		EMSG_ERR(wtext, NULL, ret, "he_next: %s", he_strerror(ret));
+
+err:	if (locked)
+		ESET(unlock(wtext, NULL, &ws->lock));
+
+	return (ret);
+}
+
+/*
+ * txn_cleaner --
+ *	Walk the transaction store and delete every entry whose transaction
+ *	ID (read from the start of the key) is below txnmin, the oldest ID
+ *	still referenced from any cache.
+ */
+static int
+txn_cleaner(WT_CURSOR *wtcursor, he_t he_txn, uint64_t txnmin)
+{
+	WT_EXTENSION_API *wtext;
+	HE_ITEM *r;
+	uint64_t txnid;
+	int ret = 0;
+
+	wtext = ((CURSOR *)wtcursor)->wtext;
+	r = &((CURSOR *)wtcursor)->record;
+
+	/* A zero-length key starts the walk at the first entry. */
+	r->key_len = 0;
+	while ((ret = helium_call(wtcursor, "he_next", he_txn, he_next)) == 0) {
+		memcpy(&txnid, r->key, sizeof(txnid));
+
+		/* Entries at or past the minimum are still needed. */
+		if (txnid >= txnmin)
+			continue;
+
+		if ((ret = he_delete(he_txn, r)) != 0)
+			ERET(wtext, NULL, ret,
+			    "he_delete: %s", he_strerror(ret));
+	}
+
+	/* Anything other than running off the end of the store is an error. */
+	if (ret != 0 && ret != WT_NOTFOUND)
+		ERET(wtext, NULL, ret, "he_next: %s", he_strerror(ret));
+
+	return (0);
+}
+
+/*
+ * fake_cursor --
+ *	Fake up enough of a cursor to do Helium operations.
+ *
+ *	Allocates a zeroed CURSOR with a 128-byte value buffer, points the
+ *	record key at the cursor's embedded key storage and leaves the
+ *	session NULL. Returns 0 and stores the cursor in *wtcursorp, or
+ *	returns the allocation error and leaves *wtcursorp untouched.
+ */
+static int
+fake_cursor(WT_EXTENSION_API *wtext, WT_CURSOR **wtcursorp)
+{
+	CURSOR *cursor;
+	WT_CURSOR *wtcursor;
+	int ret;
+
+	/*
+	 * Fake a cursor.
+	 */
+	if ((cursor = calloc(1, sizeof(CURSOR))) == NULL)
+		return (os_errno());
+	cursor->wtext = wtext;
+	cursor->record.key = cursor->__key;
+	if ((cursor->v = malloc(128)) == NULL) {
+		/*
+		 * Capture the error before calling free: free is not
+		 * guaranteed to leave errno untouched on every platform.
+		 */
+		ret = os_errno();
+		free(cursor);
+		return (ret);
+	}
+	cursor->mem_len = 128;
+
+	/*
+	 * !!!
+	 * Fake cursors don't have WT_SESSION handles.
+	 */
+	wtcursor = (WT_CURSOR *)cursor;
+	wtcursor->session = NULL;
+
+	*wtcursorp = wtcursor;
+	return (0);
+}
+
+/*
+ * cache_cleaner_worker --
+ * Thread to migrate data from the cache to the primary.
+ *
+ * The argument is the HELIUM_SOURCE to clean. The thread loops until it
+ * observes hs->cleaner_stop, making one last pass after the flag is seen,
+ * and exits early on any error. It always returns NULL.
+ */
+static void *
+cache_cleaner_worker(void *arg)
+{
+ struct timeval t;
+ CURSOR *cursor;
+ HELIUM_SOURCE *hs;
+ HE_STATS stats;
+ WT_CURSOR *wtcursor;
+ WT_EXTENSION_API *wtext;
+ WT_SOURCE *ws;
+ uint64_t oldest, txnmin, txntmp;
+ int cleaner_stop, delay, ret = 0;
+
+ hs = (HELIUM_SOURCE *)arg;
+
+ cursor = NULL;
+ wtext = hs->wtext;
+
+ /*
+ * NOTE(review): if this fails, cursor is still NULL when cursor_destroy
+ * runs at the err label -- confirm cursor_destroy accepts NULL.
+ */
+ if ((ret = fake_cursor(wtext, &wtcursor)) != 0)
+ EMSG_ERR(wtext, NULL, ret, "cleaner: %s", strerror(ret));
+ cursor = (CURSOR *)wtcursor;
+
+ for (cleaner_stop = delay = 0; !cleaner_stop;) {
+ /*
+ * Check if this will be the final run; cleaner_stop is declared
+ * volatile, and so the read will happen. We don't much care if
+ * there's extra loops, it's enough if a read eventually happens
+ * and finds the variable set. Store the read locally, reading
+ * the variable twice might race.
+ */
+ cleaner_stop = hs->cleaner_stop;
+
+ /*
+ * Delay if this isn't the final run and the last pass didn't
+ * find any work to do.
+ */
+ if (!cleaner_stop && delay != 0) {
+ t.tv_sec = delay;
+ t.tv_usec = 0;
+ /* select with no descriptors is used purely as a sleep. */
+ (void)select(0, NULL, NULL, NULL, &t);
+ }
+
+ /* Run at least every 5 seconds. */
+ if (delay < 5)
+ ++delay;
+
+ /*
+ * Clean the datastore caches, depending on their size. It's
+ * both more and less expensive to return values from the cache:
+ * more because we have to marshall/unmarshall the values, less
+ * because there's only a single call, to the cache store rather
+ * one to the cache and one to the primary. I have no tuning
+ * information, for now simply set the limit at 50MB.
+ */
+#undef CACHE_SIZE_TRIGGER
+#define CACHE_SIZE_TRIGGER (50 * 1048576)
+ /* ws is left non-NULL if any cache exceeds the trigger size. */
+ for (ws = hs->ws_head; ws != NULL; ws = ws->next) {
+ if ((ret = he_stats(ws->he_cache, &stats)) != 0)
+ EMSG_ERR(wtext, NULL,
+ ret, "he_stats: %s", he_strerror(ret));
+ if (stats.size > CACHE_SIZE_TRIGGER)
+ break;
+ }
+ /* Nothing is over the limit: skip cleaning unless this is the last run. */
+ if (!cleaner_stop && ws == NULL)
+ continue;
+
+ /* There was work to do, don't delay before checking again. */
+ delay = 0;
+
+ /*
+ * Get the oldest transaction ID not yet visible to a running
+ * transaction. Do this before doing anything else, avoiding
+ * any race with creating new WT_SOURCE handles.
+ */
+ oldest = wtext->transaction_oldest(wtext);
+
+ /*
+ * If any cache needs cleaning, clean them all, because we have
+ * to know the minimum transaction ID referenced by any cache.
+ *
+ * For each cache/primary pair, migrate whatever records we can,
+ * tracking the lowest transaction ID of any entry in any cache.
+ */
+ txnmin = UINT64_MAX;
+ for (ws = hs->ws_head; ws != NULL; ws = ws->next) {
+ /* Point the shared fake cursor at the source being cleaned. */
+ cursor->ws = ws;
+ if ((ret = cache_cleaner(
+ wtext, wtcursor, oldest, &txntmp)) != 0)
+ goto err;
+ if (txntmp < txnmin)
+ txnmin = txntmp;
+ }
+
+ /*
+ * Discard any transactions less than the minimum transaction ID
+ * referenced in any cache.
+ *
+ * !!!
+ * I'm playing fast-and-loose with whether or not the cursor
+ * references an underlying WT_SOURCE, there's a structural
+ * problem here.
+ */
+ cursor->ws = NULL;
+ if ((ret = txn_cleaner(wtcursor, hs->he_txn, txnmin)) != 0)
+ goto err;
+ }
+
+ /* Reached both on normal loop exit and on error. */
+err: cursor_destroy(cursor);
+ return (NULL);
+}
+
+/*
+ * helium_config_read --
+ *	Parse the Helium configuration.
+ *
+ *	Recognized keys: "helium_devices" (copied into a newly allocated,
+ *	nul-terminated string stored in *devicep), "helium_env_read_cache_size"
+ *	and "helium_env_write_cache_size" (stored in envp, with *env_setp set
+ *	to 1), and "helium_o_volume_truncate" (ORs HE_O_VOLUME_TRUNCATE into
+ *	*flagsp when non-zero). Any other key fails with EINVAL. *env_setp and
+ *	*flagsp are zeroed on entry; *devicep is only written when the key is
+ *	present. The configuration parser is closed on every path once it has
+ *	been opened.
+ */
+static int
+helium_config_read(WT_EXTENSION_API *wtext, WT_CONFIG_ITEM *config,
+    char **devicep, HE_ENV *envp, int *env_setp, int *flagsp)
+{
+	WT_CONFIG_ITEM k, v;
+	WT_CONFIG_PARSER *config_parser;
+	int ret = 0, tret;
+
+	*env_setp = 0;
+	*flagsp = 0;
+
+	/* Traverse the configuration arguments list. */
+	if ((ret = wtext->config_parser_open(
+	    wtext, NULL, config->str, config->len, &config_parser)) != 0)
+		ERET(wtext, NULL, ret,
+		    "WT_EXTENSION_API.config_parser_open: %s",
+		    wtext->strerror(ret));
+	while ((ret = config_parser->next(config_parser, &k, &v)) == 0) {
+		if (string_match("helium_devices", k.str, k.len)) {
+			/*
+			 * The value is not nul-terminated: allocate one extra
+			 * zeroed byte. On failure leave through the err label
+			 * so the parser is closed rather than leaked.
+			 */
+			if ((*devicep = calloc(1, v.len + 1)) == NULL) {
+				ret = os_errno();
+				goto err;
+			}
+			memcpy(*devicep, v.str, v.len);
+			continue;
+		}
+		if (string_match("helium_env_read_cache_size", k.str, k.len)) {
+			envp->read_cache_size = (uint64_t)v.val;
+			*env_setp = 1;
+			continue;
+		}
+		if (string_match("helium_env_write_cache_size", k.str, k.len)) {
+			envp->write_cache_size = (uint64_t)v.val;
+			*env_setp = 1;
+			continue;
+		}
+		if (string_match("helium_o_volume_truncate", k.str, k.len)) {
+			if (v.val != 0)
+				*flagsp |= HE_O_VOLUME_TRUNCATE;
+			continue;
+		}
+		EMSG_ERR(wtext, NULL, EINVAL,
+		    "unknown configuration key value pair %.*s=%.*s",
+		    (int)k.len, k.str, (int)v.len, v.str);
+	}
+	/* WT_NOTFOUND from next() marks the end of the list. */
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+	if (ret != 0)
+		EMSG_ERR(wtext, NULL, ret,
+		    "WT_CONFIG_PARSER.next: %s", wtext->strerror(ret));
+
+err:	if ((tret = config_parser->close(config_parser)) != 0)
+		EMSG(wtext, NULL, tret,
+		    "WT_CONFIG_PARSER.close: %s", wtext->strerror(tret));
+
+	return (ret);
+}
+
+/*
+ * helium_source_open --
+ *	Allocate and open a Helium source.
+ *
+ *	k is the WiredTiger-level name of the source and v its configuration
+ *	string. Fails with EINVAL if a source with the same name is already
+ *	on the list or if the configuration names no device. On success the
+ *	new source is inserted at the head of ds->hs_head; on failure the
+ *	partially built source is discarded with helium_source_close.
+ */
+static int
+helium_source_open(DATA_SOURCE *ds, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v)
+{
+	struct he_env env;
+	HELIUM_SOURCE *hs;
+	WT_EXTENSION_API *wtext;
+	int env_set, flags, ret = 0;
+
+	wtext = ds->wtext;
+	hs = NULL;
+
+	VMSG(wtext, NULL, VERBOSE_L1, "volume %.*s=%.*s",
+	    (int)k->len, k->str, (int)v->len, v->str);
+
+	/*
+	 * Check for a Helium source we've already opened: we don't check the
+	 * value (which implies you can open the same underlying stores using
+	 * more than one name, but I don't know of any problems that causes),
+	 * we only check the key, that is, the top-level WiredTiger name.
+	 */
+	for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+		if (string_match(hs->name, k->str, k->len))
+			ERET(wtext, NULL,
+			    EINVAL, "%s: device already open", hs->name);
+
+	/* Allocate and initialize a new underlying Helium source object. */
+	if ((hs = calloc(1, sizeof(*hs))) == NULL ||
+	    (hs->name = calloc(1, k->len + 1)) == NULL) {
+		/*
+		 * Capture the error before calling free: free is not
+		 * guaranteed to leave errno untouched on every platform.
+		 */
+		ret = os_errno();
+		free(hs);
+		return (ret);
+	}
+	/* The key is not nul-terminated; the extra zeroed byte terminates it. */
+	memcpy(hs->name, k->str, k->len);
+	hs->txn_notify.notify = txn_notify;
+	hs->wtext = wtext;
+
+	/* Read the configuration, require a device naming the Helium store. */
+	memset(&env, 0, sizeof(env));
+	if ((ret = helium_config_read(
+	    wtext, v, &hs->device, &env, &env_set, &flags)) != 0)
+		goto err;
+	if (hs->device == NULL)
+		EMSG_ERR(wtext, NULL,
+		    EINVAL, "%s: no Helium volumes specified", hs->name);
+
+	/*
+	 * Open the Helium volume, creating it if necessary.  We have to open
+	 * an object at the same time, that's why we have object flags as well
+	 * as volume flags.
+	 */
+	flags |= HE_O_CREATE |
+	    HE_O_TRUNCATE | HE_O_VOLUME_CLEAN | HE_O_VOLUME_CREATE;
+	if ((hs->he_volume = he_open(
+	    hs->device, WT_NAME_INIT, flags, env_set ? &env : NULL)) == NULL) {
+		ret = os_errno();
+		EMSG_ERR(wtext, NULL, ret,
+		    "he_open: %s: %s: %s",
+		    hs->name, WT_NAME_INIT, he_strerror(ret));
+	}
+
+	/* Insert the new entry at the head of the list. */
+	hs->next = ds->hs_head;
+	ds->hs_head = hs;
+
+	/* Only reached through the err label; skipped on success. */
+	if (0) {
+err:		if (hs != NULL)
+			ESET(helium_source_close(wtext, NULL, hs));
+	}
+	return (ret);
+}
+
+/*
+ * helium_source_open_txn --
+ *	Open the database-wide transaction store.
+ *
+ *	Every opened Helium source is probed for an existing transaction store
+ *	object (WT_NAME_TXN). If none is found, one is created on the source at
+ *	the tail of ds->hs_head. The resulting handle is then stored in every
+ *	Helium source, and the source holding the store is flagged as its owner.
+ *
+ *	Returns 0 on success, EINVAL if no Helium source has been opened,
+ *	WT_PANIC if more than one transaction store is found, or a non-zero
+ *	error code if the store cannot be created or committed.
+ */
+static int
+helium_source_open_txn(DATA_SOURCE *ds)
+{
+	HELIUM_SOURCE *hs, *hs_txn;
+	WT_EXTENSION_API *wtext;
+	he_t he_txn, t;
+	int ret = 0;
+
+	wtext = ds->wtext;
+
+	/*
+	 * Without at least one Helium source there is nowhere to put the
+	 * transaction store: fail cleanly rather than dereferencing an empty
+	 * list when searching for the source to create it on.
+	 */
+	if (ds->hs_head == NULL)
+		ERET(wtext, NULL, EINVAL,
+		    "no Helium sources configured, unable to open the "
+		    "transaction store");
+
+	/*
+	 * The global txn namespace is per connection, it spans multiple Helium
+	 * sources.
+	 *
+	 * We've opened the Helium sources: check to see if any of them already
+	 * have a transaction store, and make sure we only find one.
+	 */
+	hs_txn = NULL;
+	he_txn = NULL;
+	for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+		if ((t = he_open(hs->device, WT_NAME_TXN, 0, NULL)) != NULL) {
+			if (hs_txn != NULL) {
+				/*
+				 * Close both store handles: the one just
+				 * opened and the one found earlier (the
+				 * handle, not the HELIUM_SOURCE owning it).
+				 */
+				(void)he_close(t);
+				(void)he_close(he_txn);
+				ERET(wtext, NULL, WT_PANIC,
+				    "found multiple transaction stores, "
+				    "unable to proceed");
+			}
+			he_txn = t;
+			hs_txn = hs;
+		}
+
+	/*
+	 * If we didn't find a transaction store, open a transaction store in
+	 * the first Helium source we loaded. (It could just as easily be
+	 * the last one we loaded, we're just picking one, but picking the first
+	 * seems slightly less likely to make people wonder.)
+	 *
+	 * New sources are inserted at the head of the list, so the first one
+	 * loaded is the list's tail.
+	 */
+	if ((hs = hs_txn) == NULL) {
+		for (hs = ds->hs_head; hs->next != NULL; hs = hs->next)
+			;
+		if ((he_txn = he_open(
+		    hs->device, WT_NAME_TXN, HE_O_CREATE, NULL)) == NULL) {
+			ret = os_errno();
+			ERET(wtext, NULL, ret,
+			    "he_open: %s: %s: %s",
+			    hs->name, WT_NAME_TXN, he_strerror(ret));
+		}
+
+		/*
+		 * Push the change. The handle hasn't been stored anywhere yet,
+		 * so discard it on failure instead of leaking it.
+		 */
+		if ((ret = he_commit(he_txn)) != 0) {
+			(void)he_close(he_txn);
+			ERET(wtext, NULL, ret,
+			    "he_commit: %s", he_strerror(ret));
+		}
+	}
+	VMSG(wtext, NULL, VERBOSE_L1, "%s" "transactional store on %s",
+	    hs_txn == NULL ? "creating " : "", hs->name);
+
+	/* Set the owner field, this Helium source has to be closed last. */
+	hs->he_owner = 1;
+
+	/* Add a reference to the transaction store in each Helium source. */
+	for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+		hs->he_txn = he_txn;
+
+	return (0);
+}
+
+/*
+ * helium_source_recover_namespace --
+ * Recover a single cache/primary pair in a Helium namespace.
+ *
+ * The stored object name is translated back into a "helium:" URI, the
+ * pair is opened through ws_source_open, the cache is processed by
+ * cache_cleaner and then truncated. All WT_SOURCE objects on the Helium
+ * source's list are closed before returning, on success as well as on
+ * failure. Returns 0 on success and a non-zero error code otherwise.
+ */
+static int
+helium_source_recover_namespace(WT_DATA_SOURCE *wtds,
+ HELIUM_SOURCE *hs, const char *name, WT_CONFIG_ARG *config)
+{
+ CURSOR *cursor;
+ DATA_SOURCE *ds;
+ WT_CURSOR *wtcursor;
+ WT_EXTENSION_API *wtext;
+ WT_SOURCE *ws;
+ size_t len;
+ int ret = 0;
+ const char *p;
+ char *uri;
+
+ ds = (DATA_SOURCE *)wtds;
+ wtext = ds->wtext;
+ cursor = NULL;
+ ws = NULL;
+ uri = NULL;
+
+ /*
+ * The name we store on the Helium device is a translation of the
+ * WiredTiger name: do the reverse process here so we can use the
+ * standard source-open function.
+ *
+ * The name is assumed to start with WT_NAME_PREFIX (the enumeration
+ * callback in this file only collects such names). The extra 10 bytes
+ * of length cover the '/' separator and the trailing NUL, with slack.
+ */
+ p = name + strlen(WT_NAME_PREFIX);
+ len = strlen("helium:") + strlen(hs->name) + strlen(p) + 10;
+ if ((uri = malloc(len)) == NULL) {
+ ret = os_errno();
+ goto err;
+ }
+ (void)snprintf(uri, len, "helium:%s/%s", hs->name, p);
+
+ /*
+ * Open the cache/primary pair by going through the full open process,
+ * instantiating the underlying WT_SOURCE object.
+ *
+ * NOTE(review): the immediate unlock suggests ws_source_open returns
+ * the WT_SOURCE with its lock held -- confirm against ws_source_open.
+ */
+ if ((ret = ws_source_open(wtds, NULL, uri, config, 0, &ws)) != 0)
+ goto err;
+ if ((ret = unlock(wtext, NULL, &ws->lock)) != 0)
+ goto err;
+
+ /* Fake up a cursor. */
+ if ((ret = fake_cursor(wtext, &wtcursor)) != 0)
+ EMSG_ERR(wtext, NULL, ret, "recovery: %s", strerror(ret));
+ cursor = (CURSOR *)wtcursor;
+ cursor->ws = ws;
+
+ /* Process, then clear, the cache. */
+ if ((ret = cache_cleaner(wtext, wtcursor, 0, NULL)) != 0)
+ goto err;
+ if ((ret = he_truncate(ws->he_cache)) != 0)
+ EMSG_ERR(wtext, NULL, ret,
+ "he_truncate: %s(cache): %s", ws->uri, he_strerror(ret));
+
+ /*
+ * Close the underlying WiredTiger sources.
+ *
+ * The success path falls through to this label as well. Every entry on
+ * hs->ws_head is closed, not only the one opened above.
+ */
+err: while ((ws = hs->ws_head) != NULL) {
+ hs->ws_head = ws->next;
+ ESET(ws_source_close(wtext, NULL, ws));
+ }
+
+ /*
+ * cursor is still NULL if we failed before the fake cursor was created;
+ * presumably cursor_destroy tolerates that -- TODO confirm.
+ */
+ cursor_destroy(cursor);
+ free(uri);
+
+ return (ret);
+}
+
+/*
+ * helium_namespace_cookie --
+ * Accumulator passed to helium_namespace_list: a growable array of
+ * allocated object-name copies, kept NULL-terminated as names are added.
+ */
+struct helium_namespace_cookie {
+ char **list; /* Allocated array of name copies */
+ u_int list_cnt; /* Number of names stored in the array */
+ u_int list_max; /* Number of slots allocated in the array */
+};
+
+/*
+ * helium_namespace_list --
+ *	Enumeration callback: append a copy of each object name we're going
+ *	to recover to the list held in the cookie.
+ */
+static int
+helium_namespace_list(void *cookie, const char *name)
+{
+	struct helium_namespace_cookie *acc;
+	void *grown;
+	u_int grown_max;
+
+	acc = cookie;
+
+	/*
+	 * Skip anything without a WiredTiger prefix, the WT_NAME_INIT and
+	 * WT_NAME_TXN objects, and objects whose final suffix is WT_NAME_CACHE.
+	 */
+	if (!prefix_match(name, WT_NAME_PREFIX) ||
+	    strcmp(name, WT_NAME_INIT) == 0 ||
+	    strcmp(name, WT_NAME_TXN) == 0 ||
+	    string_match(
+	    strrchr(name, '.'), WT_NAME_CACHE, strlen(WT_NAME_CACHE)))
+		return (0);
+
+	/*
+	 * Grow the array 20 slots at a time, always keeping a slot free for
+	 * the terminating NULL entry.
+	 */
+	if (acc->list_cnt + 1 >= acc->list_max) {
+		grown_max = acc->list_max + 20;
+		grown = realloc(acc->list, grown_max * sizeof(acc->list[0]));
+		if (grown == NULL)
+			return (os_errno());
+		acc->list = grown;
+		acc->list_max = grown_max;
+	}
+
+	/* Store a private copy of the name and re-terminate the list. */
+	if ((acc->list[acc->list_cnt] = strdup(name)) == NULL)
+		return (os_errno());
+	acc->list[++acc->list_cnt] = NULL;
+	return (0);
+}
+
+/*
+ * helium_source_recover --
+ *	Recover the HELIUM_SOURCE.
+ *
+ *	Enumerates the objects on the source's device, recovers each collected
+ *	cache/primary pair, then truncates the transaction store. The list of
+ *	collected names is released on every exit path.
+ *
+ *	Returns 0 on success and a non-zero error code otherwise.
+ */
+static int
+helium_source_recover(
+    WT_DATA_SOURCE *wtds, HELIUM_SOURCE *hs, WT_CONFIG_ARG *config)
+{
+	struct helium_namespace_cookie names;
+	DATA_SOURCE *ds;
+	WT_EXTENSION_API *wtext;
+	u_int i;
+	int ret = 0;
+
+	ds = (DATA_SOURCE *)wtds;
+	wtext = ds->wtext;
+	memset(&names, 0, sizeof(names));
+
+	VMSG(wtext, NULL, VERBOSE_L1, "recover %s", hs->name);
+
+	/*
+	 * Get a list of the cache/primary object pairs in the Helium source.
+	 *
+	 * The enumeration can fail after the callback has already collected
+	 * some names, so go through the common cleanup rather than returning
+	 * directly, otherwise the partial list is leaked.
+	 */
+	if ((ret = he_enumerate(
+	    hs->device, helium_namespace_list, &names)) != 0)
+		EMSG_ERR(wtext, NULL, ret,
+		    "he_enumerate: %s: %s", hs->name, he_strerror(ret));
+
+	/* Recover the objects. */
+	for (i = 0; i < names.list_cnt; ++i)
+		if ((ret = helium_source_recover_namespace(
+		    wtds, hs, names.list[i], config)) != 0)
+			goto err;
+
+	/* Clear the transaction store. */
+	if ((ret = he_truncate(hs->he_txn)) != 0)
+		EMSG_ERR(wtext, NULL, ret,
+		    "he_truncate: %s: %s: %s",
+		    hs->name, WT_NAME_TXN, he_strerror(ret));
+
+	/* Release the collected names; reached on success and on failure. */
+err:	for (i = 0; i < names.list_cnt; ++i)
+		free(names.list[i]);
+	free(names.list);
+
+	return (ret);
+}
+
+/*
+ * helium_terminate --
+ *	Unload the data-source: close every Helium source, release the global
+ *	lock and free the data-source structure.
+ */
+static int
+helium_terminate(WT_DATA_SOURCE *wtds, WT_SESSION *session)
+{
+	DATA_SOURCE *ds = (DATA_SOURCE *)wtds;
+	WT_EXTENSION_API *wtext = ds->wtext;
+	HELIUM_SOURCE *hs, *owner;
+	int ret = 0;
+
+	/* Lock the system down (only if the lock was ever initialized). */
+	if (ds->lockinit)
+		ret = writelock(wtext, session, &ds->global_lock);
+
+	/*
+	 * Unlink and close the Helium sources one at a time, setting aside the
+	 * source that "owns" the database transaction store so that it is
+	 * closed after all of the others.
+	 */
+	for (owner = NULL; (hs = ds->hs_head) != NULL;) {
+		ds->hs_head = hs->next;
+		if (hs->he_owner)
+			owner = hs;
+		else
+			ESET(helium_source_close(wtext, session, hs));
+	}
+	if (owner != NULL)
+		ESET(helium_source_close(wtext, session, owner));
+
+	/* Unlock and destroy the system. */
+	if (ds->lockinit) {
+		ESET(unlock(wtext, session, &ds->global_lock));
+		ESET(lock_destroy(wtext, NULL, &ds->global_lock));
+	}
+
+	OVERWRITE_AND_FREE(ds);
+
+	return (ret);
+}
+
+/*
+ * wiredtiger_extension_init --
+ *	Initialize the Helium connector code.
+ *
+ *	Checks the Helium header and library versions, allocates the
+ *	data-source structure, opens each Helium source named in the "config"
+ *	extension argument, opens the transaction store, runs recovery, starts
+ *	a cleaner thread per source, then registers the extra session.create
+ *	options and the "helium:" data source with the connection.
+ *
+ *	Returns 0 on success. On failure everything built so far is torn down
+ *	and a non-zero error code is returned.
+ */
+int
+wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+	/*
+	 * List of the WT_DATA_SOURCE methods -- it's static so it breaks at
+	 * compile-time should the structure change underneath us.
+	 */
+	static const WT_DATA_SOURCE wtds = {
+		helium_session_create,		/* session.create */
+		NULL,				/* No session.compaction */
+		helium_session_drop,		/* session.drop */
+		helium_session_open_cursor,	/* session.open_cursor */
+		helium_session_rename,		/* session.rename */
+		NULL,				/* No session.salvage */
+		helium_session_truncate,	/* session.truncate */
+		NULL,				/* No session.range_truncate */
+		helium_session_verify,		/* session.verify */
+		helium_session_checkpoint,	/* session.checkpoint */
+		helium_terminate		/* termination */
+	};
+	static const char *session_create_opts[] = {
+		"helium_o_compress=0",		/* HE_I_COMPRESS */
+		"helium_o_truncate=0",		/* HE_O_TRUNCATE */
+		NULL
+	};
+	DATA_SOURCE *ds;
+	HELIUM_SOURCE *hs;
+	WT_CONFIG_ITEM k, v;
+	WT_CONFIG_PARSER *config_parser;
+	WT_EXTENSION_API *wtext;
+	int vmajor, vminor, ret = 0;
+	const char **p;
+
+	config_parser = NULL;
+	ds = NULL;
+
+	wtext = connection->get_extension_api(connection);
+
+	/*
+	 * Check the library version. Nothing has been allocated yet, so these
+	 * failures can return directly.
+	 */
+#if HE_VERSION_MAJOR != 2 || HE_VERSION_MINOR != 2
+	ERET(wtext, NULL, EINVAL,
+	    "unsupported Levyx/Helium header file %d.%d, expected version 2.2",
+	    HE_VERSION_MAJOR, HE_VERSION_MINOR);
+#endif
+	he_version(&vmajor, &vminor);
+	if (vmajor != 2 || vminor != 2)
+		ERET(wtext, NULL, EINVAL,
+		    "unsupported Levyx/Helium library version %d.%d, expected "
+		    "version 2.2", vmajor, vminor);
+
+	/* Allocate and initialize the local data-source structure. */
+	if ((ds = calloc(1, sizeof(DATA_SOURCE))) == NULL)
+		return (os_errno());
+	ds->wtds = wtds;
+	ds->wtext = wtext;
+	if ((ret = lock_init(wtext, NULL, &ds->global_lock)) != 0)
+		goto err;
+	ds->lockinit = 1;
+
+	/* Get the configuration string. */
+	if ((ret = wtext->config_get(wtext, NULL, config, "config", &v)) != 0)
+		EMSG_ERR(wtext, NULL, ret,
+		    "WT_EXTENSION_API.config_get: config: %s",
+		    wtext->strerror(ret));
+
+	/* Step through the list of Helium sources, opening each one. */
+	if ((ret = wtext->config_parser_open(
+	    wtext, NULL, v.str, v.len, &config_parser)) != 0)
+		EMSG_ERR(wtext, NULL, ret,
+		    "WT_EXTENSION_API.config_parser_open: config: %s",
+		    wtext->strerror(ret));
+	while ((ret = config_parser->next(config_parser, &k, &v)) == 0) {
+		/* The verbose setting is a global switch, not a source. */
+		if (string_match("helium_verbose", k.str, k.len)) {
+			verbose = v.val == 0 ? 0 : 1;
+			continue;
+		}
+		if ((ret = helium_source_open(ds, &k, &v)) != 0)
+			goto err;
+	}
+	/* WT_NOTFOUND is the normal end of the configuration list. */
+	if (ret != WT_NOTFOUND)
+		EMSG_ERR(wtext, NULL, ret,
+		    "WT_CONFIG_PARSER.next: config: %s",
+		    wtext->strerror(ret));
+	if ((ret = config_parser->close(config_parser)) != 0)
+		EMSG_ERR(wtext, NULL, ret,
+		    "WT_CONFIG_PARSER.close: config: %s",
+		    wtext->strerror(ret));
+	config_parser = NULL;
+
+	/*
+	 * Find and open the database transaction store. On failure go through
+	 * the common cleanup: returning directly would leak the data-source
+	 * structure, its lock and every Helium source opened above.
+	 */
+	if ((ret = helium_source_open_txn(ds)) != 0)
+		goto err;
+
+	/* Recover each Helium source. */
+	for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+		if ((ret = helium_source_recover(&ds->wtds, hs, config)) != 0)
+			goto err;
+
+	/* Start each Helium source cleaner thread. */
+	for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+		if ((ret = pthread_create(
+		    &hs->cleaner_id, NULL, cache_cleaner_worker, hs)) != 0)
+			EMSG_ERR(wtext, NULL, ret,
+			    "%s: pthread_create: cleaner thread: %s",
+			    hs->name, strerror(ret));
+
+	/* Add Helium-specific WT_SESSION.create configuration options. */
+	for (p = session_create_opts; *p != NULL; ++p)
+		if ((ret = connection->configure_method(connection,
+		    "session.create", "helium:", *p, "boolean", NULL)) != 0)
+			EMSG_ERR(wtext, NULL, ret,
+			    "WT_CONNECTION.configure_method: session.create: "
+			    "%s: %s",
+			    *p, wtext->strerror(ret));
+
+	/* Add the data source */
+	if ((ret = connection->add_data_source(
+	    connection, "helium:", (WT_DATA_SOURCE *)ds, NULL)) != 0)
+		EMSG_ERR(wtext, NULL, ret,
+		    "WT_CONNECTION.add_data_source: %s", wtext->strerror(ret));
+	return (0);
+
+	/*
+	 * Common failure path: helium_terminate closes the Helium sources and
+	 * frees the data-source structure; the configuration parser is closed
+	 * if it is still open.
+	 */
+err:	if (ds != NULL)
+		ESET(helium_terminate((WT_DATA_SOURCE *)ds, NULL));
+	if (config_parser != NULL)
+		(void)config_parser->close(config_parser);
+	return (ret);
+}
+
+/*
+ * wiredtiger_extension_terminate --
+ *	Shutdown hook for the Helium connector; there is nothing to do here,
+ *	it always reports success.
+ */
+int
+wiredtiger_extension_terminate(WT_CONNECTION *connection)
+{
+	/* The connection handle isn't needed. */
+	(void)connection;
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/lang/java/Makefile.am b/src/third_party/wiredtiger/lang/java/Makefile.am
new file mode 100644
index 00000000000..94a7cb2702d
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/Makefile.am
@@ -0,0 +1,91 @@
+AM_CPPFLAGS = -I$(top_srcdir)
+
+JAVADEST = src/com/wiredtiger/db
+JAVADESTFULL = $(srcdir)/$(JAVADEST)
+JAVAEXAMPLES = $(top_srcdir)/examples/java/com/wiredtiger/examples
+JAVATEST = $(top_srcdir)/test/java/com/wiredtiger/test
+BUILT_SOURCES = wiredtiger_wrap.c
+SWIG_SOURCES = wiredtiger.i
+
+JDOCDIR = $(top_srcdir)/docs/java
+# The Java documentation is currently generated by Doxygen - disable javadoc
+#java_DATA = $(JDOCDIR)/index.html
+
+javadir = $(datadir)/java/$(PACKAGE)-$(PACKAGE_VERSION)
+JAVA_SRC = \
+ $(JAVADESTFULL)/AsyncCallback.java \
+ $(JAVADESTFULL)/AsyncOp.java \
+ $(JAVADESTFULL)/AsyncOpType.java \
+ $(JAVADESTFULL)/Connection.java \
+ $(JAVADESTFULL)/Cursor.java \
+ $(JAVADESTFULL)/SearchStatus.java \
+ $(JAVADESTFULL)/PackFormatInputStream.java \
+ $(JAVADESTFULL)/PackInputStream.java \
+ $(JAVADESTFULL)/PackOutputStream.java \
+ $(JAVADESTFULL)/PackUtil.java \
+ $(JAVADESTFULL)/Session.java \
+ $(JAVADESTFULL)/WiredTigerException.java \
+ $(JAVADESTFULL)/WiredTigerPackingException.java \
+ $(JAVADESTFULL)/WiredTigerPanicException.java \
+ $(JAVADESTFULL)/WiredTigerRollbackException.java \
+ $(JAVADESTFULL)/wiredtiger.java \
+ $(JAVADESTFULL)/wiredtigerConstants.java \
+ $(JAVADESTFULL)/wiredtigerJNI.java \
+ $(JAVAEXAMPLES)/ex_access.java \
+ $(JAVAEXAMPLES)/ex_all.java \
+ $(JAVAEXAMPLES)/ex_async.java \
+ $(JAVAEXAMPLES)/ex_call_center.java \
+ $(JAVAEXAMPLES)/ex_cursor.java \
+ $(JAVAEXAMPLES)/ex_log.java \
+ $(JAVAEXAMPLES)/ex_schema.java \
+ $(JAVAEXAMPLES)/ex_stat.java \
+ $(JAVAEXAMPLES)/ex_thread.java
+
+JAVA_JUNIT = \
+ $(JAVATEST)/AutoCloseTest.java \
+ $(JAVATEST)/AsyncTest.java \
+ $(JAVATEST)/CursorTest.java \
+ $(JAVATEST)/CursorTest02.java \
+ $(JAVATEST)/ExceptionTest.java \
+ $(JAVATEST)/PackTest.java \
+ $(JAVATEST)/WiredTigerSuite.java
+
+dist_java_JAVA = $(JAVA_SRC) @JAVA_JUNIT@
+dist_java_DATA = wiredtiger.jar
+
+EXTRA_JAVA = $(JAVA_JUNIT)
+
+java_LTLIBRARIES = libwiredtiger_java.la
+
+TESTS_JUNIT = AllJunitTests
+
+TESTS = @TESTS_JUNIT@
+
+# Generate the AllJunitTests wrapper script. The script sets the library
+# search paths relative to its own directory and runs @JUNIT@ on
+# com.wiredtiger.test.WiredTigerSuite. The WT_HOME directory is created
+# alongside it.
+AllJunitTests:
+ echo "#! /bin/sh" > $@
+ echo 'SCRIPT_DIR=`dirname $$0`' >> $@
+ echo 'env LD_LIBRARY_PATH=$$SCRIPT_DIR/../../.libs:$$SCRIPT_DIR/.libs DYLD_LIBRARY_PATH=$$SCRIPT_DIR/../../.libs JAVA_LIBRARY_PATH=$$SCRIPT_DIR/.libs @JUNIT@ com.wiredtiger.test.WiredTigerSuite' >> $@
+ chmod +x $@
+ mkdir -p WT_HOME
+
+CPPFLAGS += $(JNI_CPPFLAGS)
+# Some warnings when compiling the generated code are unavoidable
+CFLAGS += -w
+libwiredtiger_java_la_SOURCES = $(BUILT_SOURCES) $(SWIG_SOURCES)
+#libwiredtiger_java_la_LDFLAGS = -module
+libwiredtiger_java_la_LIBADD = $(abs_top_builddir)/libwiredtiger.la
+
+all-local: wiredtiger.jar
+
+# Regenerate the SWIG JNI wrapper whenever wiredtiger.in or the SWIG
+# interface file changes. SWIG is run from the source directory with
+# $(JAVADEST) as its -outdir and the com.wiredtiger.db package name.
+$(srcdir)/wiredtiger_wrap.c: $(top_srcdir)/src/include/wiredtiger.in $(SWIG_SOURCES)
+ (cd $(srcdir) && \
+ $(SWIG) -Wall -v -java -nodefaultctor -nodefaultdtor -package com.wiredtiger.db -I$(abs_top_builddir) -outdir $(JAVADEST) -o wiredtiger_wrap.c wiredtiger.i)
+
+# Build the javadoc HTML into $(JDOCDIR) from wiredtiger.java,
+# wiredtigerConstants.java and the capitalized class files in
+# $(JAVADESTFULL), documenting public members only.
+$(JDOCDIR)/index.html: $(dist_java_JAVA)
+ mkdir -p $(JDOCDIR)
+ javadoc -public -d $(JDOCDIR) -link http://docs.oracle.com/javase/6/docs/api $(JAVADESTFULL)/wiredtiger.java $(JAVADESTFULL)/wiredtigerConstants.java $(JAVADESTFULL)/[A-Z]*.java
+
+# Package the com/ tree found under $(top_builddir) into wiredtiger.jar,
+# then copy the jar into the current directory.
+wiredtiger.jar: $(dist_java_JAVA) classjava.stamp
+ (cd $(top_builddir) && \
+ $(JAR) -cf wiredtiger.jar com/)
+ cp $(top_builddir)/wiredtiger.jar .
diff --git a/src/third_party/wiredtiger/lang/java/java_doc.i b/src/third_party/wiredtiger/lang/java/java_doc.i
new file mode 100644
index 00000000000..31bad525330
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/java_doc.i
@@ -0,0 +1,59 @@
+/* DO NOT EDIT: automatically built by dist/java_doc.py. */
+
+COPYDOC(__wt_cursor, WT_CURSOR, get_key)
+COPYDOC(__wt_cursor, WT_CURSOR, get_value)
+COPYDOC(__wt_cursor, WT_CURSOR, set_key)
+COPYDOC(__wt_cursor, WT_CURSOR, set_value)
+COPYDOC(__wt_cursor, WT_CURSOR, compare)
+COPYDOC(__wt_cursor, WT_CURSOR, next)
+COPYDOC(__wt_cursor, WT_CURSOR, prev)
+COPYDOC(__wt_cursor, WT_CURSOR, reset)
+COPYDOC(__wt_cursor, WT_CURSOR, search)
+COPYDOC(__wt_cursor, WT_CURSOR, search_near)
+COPYDOC(__wt_cursor, WT_CURSOR, insert)
+COPYDOC(__wt_cursor, WT_CURSOR, update)
+COPYDOC(__wt_cursor, WT_CURSOR, remove)
+COPYDOC(__wt_cursor, WT_CURSOR, close)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, get_key)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, get_value)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, set_key)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, set_value)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, search)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, insert)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, update)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, remove)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, compact)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, get_id)
+COPYDOC(__wt_async_op, WT_ASYNC_OP, get_type)
+COPYDOC(__wt_session, WT_SESSION, close)
+COPYDOC(__wt_session, WT_SESSION, reconfigure)
+COPYDOC(__wt_session, WT_SESSION, open_cursor)
+COPYDOC(__wt_session, WT_SESSION, create)
+COPYDOC(__wt_session, WT_SESSION, compact)
+COPYDOC(__wt_session, WT_SESSION, drop)
+COPYDOC(__wt_session, WT_SESSION, log_printf)
+COPYDOC(__wt_session, WT_SESSION, rename)
+COPYDOC(__wt_session, WT_SESSION, salvage)
+COPYDOC(__wt_session, WT_SESSION, truncate)
+COPYDOC(__wt_session, WT_SESSION, upgrade)
+COPYDOC(__wt_session, WT_SESSION, verify)
+COPYDOC(__wt_session, WT_SESSION, begin_transaction)
+COPYDOC(__wt_session, WT_SESSION, commit_transaction)
+COPYDOC(__wt_session, WT_SESSION, rollback_transaction)
+COPYDOC(__wt_session, WT_SESSION, checkpoint)
+COPYDOC(__wt_session, WT_SESSION, transaction_pinned_range)
+COPYDOC(__wt_connection, WT_CONNECTION, async_flush)
+COPYDOC(__wt_connection, WT_CONNECTION, async_new_op)
+COPYDOC(__wt_connection, WT_CONNECTION, close)
+COPYDOC(__wt_connection, WT_CONNECTION, reconfigure)
+COPYDOC(__wt_connection, WT_CONNECTION, configure_method)
+COPYDOC(__wt_connection, WT_CONNECTION, is_new)
+COPYDOC(__wt_connection, WT_CONNECTION, open_session)
+COPYDOC(__wt_connection, WT_CONNECTION, load_extension)
+COPYDOC(__wt_connection, WT_CONNECTION, add_data_source)
+COPYDOC(__wt_connection, WT_CONNECTION, add_collator)
+COPYDOC(__wt_connection, WT_CONNECTION, add_compressor)
+COPYDOC(__wt_connection, WT_CONNECTION, add_extractor)
+COPYDOC(__wt_config_parser, WT_CONFIG_PARSER, close)
+COPYDOC(__wt_config_parser, WT_CONFIG_PARSER, next)
+COPYDOC(__wt_config_parser, WT_CONFIG_PARSER, get)
diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/AsyncCallback.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/AsyncCallback.java
new file mode 100644
index 00000000000..4f6fb5df133
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/AsyncCallback.java
@@ -0,0 +1,42 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.db;
+
+/**
+ * Callback interface for asynchronous operations: implement it to be
+ * told when an operation has finished.
+ */
+public interface AsyncCallback {
+    /**
+     * Called when an asynchronous operation completes.
+     *
+     * \param op The operation that completed
+     * \param opReturn The return value of the operation
+     * \param flags Flags (currently 0).
+     */
+    int notify(AsyncOp op, int opReturn, int flags);
+}
diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java
new file mode 100644
index 00000000000..fc4b99ae435
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java
@@ -0,0 +1,184 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.db;
+
+import java.io.ByteArrayInputStream;
+import java.lang.StringBuffer;
+import com.wiredtiger.db.PackUtil;
+import com.wiredtiger.db.WiredTigerPackingException;
+
+/**
+ * An internal helper class for consuming pack format strings.
+ *
+ * Applications should not need to use this class.
+ */
+public class PackFormatInputStream {
+
+    /** The complete format string being consumed. */
+    protected String format;
+    /** Offset into the format string of the current entry. */
+    protected int formatOff;
+    /** Repeat count still pending for the current entry, zero if none. */
+    protected int formatRepeatCount;
+
+    /**
+     * Constructor for a format stream.
+     *
+     * \param format the encoded format backing string.
+     */
+    protected PackFormatInputStream(String format) {
+        this.format = format;
+        formatOff = 0;
+        formatRepeatCount = 0;
+    }
+
+    /**
+     * Standard toString - returns the string used during construction.
+     */
+    @Override
+    public String toString() {
+        return format;
+    }
+
+    /**
+     * Returns the approximate count of elements left in the format.
+     * This method does not account for repeat counts or string length
+     * encodings - so should be used as a guide only.
+     */
+    public int available() {
+        return format.length() - formatOff + formatRepeatCount;
+    }
+
+    /**
+     * Reset the current stream position.
+     */
+    public void reset() {
+        formatOff = 0;
+        formatRepeatCount = 0;
+    }
+
+    /**
+     * Return the decoded type for the next entry in the format stream. Does
+     * not adjust the position of the stream.
+     *
+     * Any leading special characters and digits (repeat counts and sizes)
+     * are skipped to find the type character.
+     *
+     * \throws WiredTigerPackingException if the stream is at the end of the
+     *         format, or the remainder of the format has no type character.
+     */
+    protected char getType()
+    throws WiredTigerPackingException {
+        int end = format.length();
+        if (formatOff >= end) {
+            throw new WiredTigerPackingException(
+                "No more fields in format.");
+        }
+
+        // Scan with a local position so the stream itself isn't moved, and
+        // never index past the end of a malformed format string.
+        int pos = formatOff;
+        while (pos < end && PackUtil.PackSpecialCharacters.indexOf(
+            format.charAt(pos)) != -1) {
+            pos++;
+        }
+        // Skip repeat counts and sizes
+        while (pos < end && Character.isDigit(format.charAt(pos))) {
+            pos++;
+        }
+        if (pos >= end) {
+            throw new WiredTigerPackingException(
+                "Format ends without a type character: " + format);
+        }
+        return format.charAt(pos);
+    }
+
+    /**
+     * Check to see if the next entry is compatible with the requested type.
+     * The comparison ignores case.
+     *
+     * \param asking the format type to match.
+     * \param consume indicates whether to update the stream position.
+     * \throws WiredTigerPackingException if the next entry has a different
+     *         type, or there is no next entry.
+     */
+    protected void checkType(char asking, boolean consume)
+    throws WiredTigerPackingException {
+
+        char expected = getType();
+        if (Character.toLowerCase(expected) != Character.toLowerCase(asking))
+            throw new WiredTigerPackingException(
+                "Format mismatch. Wanted: " + asking + ", got: " + expected);
+        if (consume) {
+            consume();
+        }
+    }
+
+    /**
+     * Move the format stream position ahead one position.
+     *
+     * While a repeat count is pending it is decremented and the offset only
+     * moves past the type character when the count runs out.
+     */
+    protected void consume() {
+        if (formatRepeatCount > 1) {
+            --formatRepeatCount;
+        } else if (formatRepeatCount == 1) {
+            formatRepeatCount = 0;
+            ++formatOff;
+        } else {
+            while (PackUtil.PackSpecialCharacters.indexOf(
+                format.charAt(formatOff)) != -1) {
+                ++formatOff;
+            }
+
+            // Don't need to worry about String or byte array size counts
+            // since they have already been consumed.
+            //
+            // NOTE(review): a count of N read here appears to permit N + 1
+            // calls before the offset moves past the type character -
+            // confirm that is the intended repeat-count behavior.
+            formatRepeatCount = getIntFromFormat(true);
+            if (formatRepeatCount == 0) {
+                ++formatOff;
+            }
+        }
+    }
+
+    /**
+     * Decode an integer from the format string, return zero if not starting
+     * on a digit. Decoding stops at the first non-digit or at the end of
+     * the format string.
+     *
+     * \param advance whether to move the stream position.
+     */
+    private int getIntFromFormat(boolean advance) {
+        int end = format.length();
+        int valueLen = 0;
+        int countOff;
+        for (countOff = 0;
+            formatOff + countOff < end &&
+            Character.isDigit(format.charAt(formatOff + countOff));
+            countOff++) {
+            valueLen *= 10;
+            valueLen += Character.digit(format.charAt(formatOff + countOff), 10);
+        }
+        if (advance) {
+            formatOff += countOff;
+        }
+        return valueLen;
+    }
+
+    /**
+     * Retrieve a length from the format string. Either for a repeat count
+     * or a string length. Return one if no explicit repeat count.
+     *
+     * \param advance whether to move the stream position.
+     */
+    protected int getLengthFromFormat(boolean advance) {
+        int valueLen = getIntFromFormat(advance);
+        if (valueLen == 0) {
+            valueLen = 1;
+        }
+        return valueLen;
+    }
+}
+
diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackInputStream.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackInputStream.java
new file mode 100644
index 00000000000..f0e5bb9663d
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackInputStream.java
@@ -0,0 +1,340 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.db;
+
+import java.io.ByteArrayInputStream;
+import java.lang.StringBuffer;
+import com.wiredtiger.db.PackUtil;
+import com.wiredtiger.db.WiredTigerPackingException;
+
+/**
+ * An internal helper class for decoding WiredTiger packed values.
+ *
+ * Applications should not need to use this class.
+ */
+public class PackInputStream {
+
+ protected PackFormatInputStream format;
+ protected byte[] value;
+ protected int valueOff;
+ protected int valueLen;
+
+ /**
+ * Constructor.
+ *
+ * \param format A String that contains the WiredTiger format that
+ * defines the layout of this packed value.
+ * \param value The raw bytes that back the stream.
+ */
+ public PackInputStream(String format, byte[] value) {
+ this(format, value, 0, value.length);
+ }
+
+ /**
+ * Constructor.
+ *
+ * \param format A String that contains the WiredTiger format that
+ * defines the layout of this packed value.
+ * \param value The raw bytes that back the stream.
+ * \param off Offset into the value array at which the stream begins.
+ * \param len Length of the value array that forms the stream.
+ */
+ public PackInputStream(String format, byte[] value, int off, int len) {
+ this.format = new PackFormatInputStream(format);
+ this.value = value;
+ this.valueOff = off;
+ this.valueLen = len;
+ }
+
+ /**
+ * Returns the raw packing format string.
+ */
+ public String getFormat() {
+ return format.toString();
+ }
+
+ /**
+ * Returns the raw value byte array.
+ */
+ public byte[] getValue() {
+ return value;
+ }
+
+ /**
+ * Retrieves a byte field from the stream.
+ */
+ public byte getByte()
+ throws WiredTigerPackingException {
+ format.checkType('b', false);
+ format.consume();
+ return (byte)(value[valueOff++] - 0x80);
+ }
+
+ /**
+ * Retrieves a byte array field from the stream.
+ *
+ * \param dest The byte array where the returned value will be stored. The
+ * array should be large enough to store the entire data item,
+ * if it is not, a truncated value will be returned.
+ */
+ public void getByteArray(byte[] dest)
+ throws WiredTigerPackingException {
+ this.getByteArray(dest, 0, dest.length);
+ }
+
+ /**
+ * Retrieves a byte array field from the stream.
+ *
+ * \param dest The byte array where the returned value will be stored.
+ * \param off Offset into the destination buffer to start copying into.
+ * \param len The length should be large enough to store the entire data
+ * item, if it is not, a truncated value will be returned.
+ */
+ public void getByteArray(byte[] dest, int off, int len)
+ throws WiredTigerPackingException {
+ format.checkType('U', false);
+ getByteArrayInternal(getByteArrayLength(), dest, off, len);
+
+ }
+
+ /**
+ * Retrieves a byte array field from the stream. Creates a new byte array
+ * that is the size of the object being retrieved.
+ */
+ public byte[] getByteArray()
+ throws WiredTigerPackingException {
+ int itemLen = getByteArrayLength();
+ byte[] unpacked = new byte[itemLen];
+ getByteArrayInternal(itemLen, unpacked, 0, itemLen);
+ return unpacked;
+ }
+
+ /**
+ * Finds the length of a byte array. Either by decoding the length from
+ * the format or using the remaining size of the stream.
+ */
+ private int getByteArrayLength()
+ throws WiredTigerPackingException {
+ int itemLen = 0;
+ /* The rest of the buffer is a byte array. */
+ if (format.available() == 1) {
+ itemLen = valueLen - valueOff;
+ } else {
+ itemLen = unpackInt(false);
+ }
+ return itemLen;
+ }
+
+ /**
+ * Do the work of retrieving a byte array.
+ */
+ private void getByteArrayInternal(
+ int itemLen, byte[] dest, int off, int destLen)
+ throws WiredTigerPackingException {
+ /* TODO: padding. */
+ int copyLen = itemLen;
+ if (itemLen > destLen) {
+ copyLen = destLen;
+ }
+ format.consume();
+ System.arraycopy(value, valueOff, dest, off, copyLen);
+ valueOff += itemLen;
+ }
+
+ /**
+  * Retrieves an integer field from the stream.
+  *
+  * Lower case format types are signed, so negative values are accepted
+  * and the result is range-checked against the Java int range. The
+  * upper case types 'I' and 'L' are unsigned and are rejected when the
+  * decoded value is negative once narrowed to a Java int.
+  *
+  * \return The decoded value.
+  */
+ public int getInt()
+ throws WiredTigerPackingException {
+     format.checkType('i', false);
+     // Read the type before consuming the field.
+     final char type = format.getType();
+     final boolean unsignedType = (type == 'I' || type == 'L');
+     format.consume();
+
+     if (unsignedType) {
+         // The unsigned path of unpackInt rejects values that would
+         // come out negative as an int.
+         return unpackInt(false);
+     }
+
+     // Signed field: decode as a signed long and check the range here,
+     // so that negative values written by the packer round-trip.
+     final long decoded = unpackLong(true);
+     if (decoded < Integer.MIN_VALUE || decoded > Integer.MAX_VALUE) {
+         throw new WiredTigerPackingException("Overflow unpacking integer.");
+     }
+     return (int)decoded;
+ }
+
+ /**
+  * Retrieves a long field from the stream.
+  *
+  * The signed type 'q' accepts the full Java long range. The unsigned
+  * type 'Q' is rejected when the decoded value does not fit in a
+  * non-negative Java long.
+  *
+  * \return The decoded value.
+  */
+ public long getLong()
+ throws WiredTigerPackingException {
+     format.checkType('q', false);
+     // Only the upper case 'Q' is unsigned; everything else is signed.
+     final boolean signed = (format.getType() != 'Q');
+     format.consume();
+     return unpackLong(signed);
+ }
+
+ /**
+  * Retrieves a record field from the stream. The value is decoded as
+  * unsigned, so a value that would be negative as a Java long is
+  * reported as an overflow.
+  *
+  * \return The decoded record value.
+  */
+ public long getRecord()
+ throws WiredTigerPackingException {
+     format.checkType('r', false);
+     format.consume();
+     final long recno = unpackLong(false);
+     return recno;
+ }
+
+ /**
+  * Retrieves a short field from the stream.
+  *
+  * The signed type 'h' accepts negative values and is range-checked
+  * against the Java short range. The unsigned type 'H' is rejected
+  * when the decoded value is negative once narrowed to a Java short.
+  *
+  * \return The decoded value.
+  */
+ public short getShort()
+ throws WiredTigerPackingException {
+     format.checkType('h', false);
+     // Read the type before consuming the field.
+     final boolean unsignedType = (format.getType() == 'H');
+     format.consume();
+
+     if (unsignedType) {
+         // The unsigned path of unpackShort rejects values that would
+         // come out negative as a short.
+         return unpackShort(false);
+     }
+
+     // Signed field: decode as a signed long and check the range here,
+     // so that negative values written by the packer round-trip.
+     final long decoded = unpackLong(true);
+     if (decoded < Short.MIN_VALUE || decoded > Short.MAX_VALUE) {
+         throw new WiredTigerPackingException("Overflow unpacking short.");
+     }
+     return (short)decoded;
+ }
+
+ /**
+  * Retrieves a string field from the stream.
+  *
+  * An upper case 'S' field is null terminated: the string runs up to
+  * the terminator, which is then skipped. Any other string type has
+  * its length defined in the format and carries no terminator, so
+  * exactly that many bytes are consumed.
+  *
+  * \return The decoded string, built with the default charset.
+  */
+ public String getString()
+ throws WiredTigerPackingException {
+     int stringLength = 0;
+     // Number of terminator bytes to step over after the string.
+     int terminatorLen = 0;
+     format.checkType('S', false);
+     if (format.getType() != 'S') {
+         // Get the length for a fixed length string.
+         stringLength = format.getLengthFromFormat(true);
+     } else {
+         // The string is null terminated, but we need to know how many
+         // bytes are consumed - which won't necessarily match up to the
+         // string length.
+         while (valueOff + stringLength < value.length &&
+             value[valueOff + stringLength] != 0) {
+             stringLength++;
+         }
+         terminatorLen = 1;
+     }
+     format.consume();
+     String result = new String(value, valueOff, stringLength);
+     // Only null terminated strings have a trailing byte to skip;
+     // skipping one after a fixed length field would misalign the
+     // next field.
+     valueOff += stringLength + terminatorLen;
+     return result;
+ }
+
+ /**
+  * Decodes an encoded short from the stream. This method does bounds
+  * checking, to ensure values fit, since some values may be encoded as
+  * unsigned values, and Java types are all signed.
+  *
+  * \param signed When true the value must lie within the Java short
+  * range; when false the value must not be negative once narrowed to
+  * a short.
+  * \return The decoded value.
+  */
+ private short unpackShort(boolean signed)
+ throws WiredTigerPackingException {
+     long ret = unpackLong(true);
+     // A signed value overflows when it is above the maximum or below
+     // the minimum of the target type.
+     if ((signed && (ret > Short.MAX_VALUE || ret < Short.MIN_VALUE)) ||
+         (!signed && (short)ret < 0)) {
+         throw new WiredTigerPackingException("Overflow unpacking short.");
+     }
+     return (short)ret;
+ }
+
+ /**
+  * Decodes an encoded integer from the stream. This method does bounds
+  * checking, to ensure values fit, since some values may be encoded as
+  * unsigned values, and Java types are all signed.
+  *
+  * \param signed When true the value must lie within the Java int
+  * range; when false the value must not be negative once narrowed to
+  * an int.
+  * \return The decoded value.
+  */
+ private int unpackInt(boolean signed)
+ throws WiredTigerPackingException {
+     long ret = unpackLong(true);
+     // A signed value overflows when it is above the maximum or below
+     // the minimum of the target type.
+     if ((signed && (ret > Integer.MAX_VALUE || ret < Integer.MIN_VALUE)) ||
+         (!signed && (int)ret < 0)) {
+         throw new WiredTigerPackingException("Overflow unpacking integer.");
+     }
+     return (int)ret;
+ }
+
+ /**
+ * Decodes an encoded long from the stream and advances the stream
+ * offset past it. This method does bounds checking, to ensure values
+ * fit, since some values may be encoded as unsigned values, and Java
+ * types are all signed.
+ * The packing format is defined in the WiredTiger C integer packing
+ * implementation, which is at src/include/intpack.i
+ *
+ * \param signed When false, a decoded value that is negative as a Java
+ * long is reported as an overflow.
+ * \return The decoded value.
+ * \throws WiredTigerPackingException if the first byte does not carry
+ * a known marker, or on unsigned overflow.
+ */
+ private long unpackLong(boolean signed)
+ throws WiredTigerPackingException {
+ int len;
+ long unpacked = 0;
+ // The high four bits of the first byte select the encoding.
+ switch (value[valueOff] & 0xf0) {
+ case PackUtil.NEG_MULTI_MARKER & 0xff:
+ // The low four bits are subtracted from SIZEOF_LONG to give the
+ // number of payload bytes that follow.
+ len = (int)PackUtil.SIZEOF_LONG - (value[valueOff++] & 0xf);
+
+ // 0xffffffff is an int literal (-1); it widens to a long with all
+ // bits set, so the bytes not present in the stream read as 0xff.
+ for (unpacked = 0xffffffff; len != 0; --len) {
+ unpacked = (unpacked << 8) | value[valueOff++] & 0xff;
+ }
+ break;
+ case PackUtil.NEG_2BYTE_MARKER & 0xff:
+ case (PackUtil.NEG_2BYTE_MARKER | 0x10) & 0xff:
+ // Low five bits of the first byte, then one more byte, relative
+ // to NEG_2BYTE_MIN.
+ unpacked = PackUtil.GET_BITS(value[valueOff++], 5, 0) << 8;
+ unpacked |= value[valueOff++] & 0xff;
+ unpacked += PackUtil.NEG_2BYTE_MIN;
+ break;
+ case PackUtil.NEG_1BYTE_MARKER & 0xff:
+ case (PackUtil.NEG_1BYTE_MARKER | 0x10) & 0xff:
+ case (PackUtil.NEG_1BYTE_MARKER | 0x20) & 0xff:
+ case (PackUtil.NEG_1BYTE_MARKER | 0x30) & 0xff:
+ // Low six bits of the single byte, relative to NEG_1BYTE_MIN.
+ unpacked = PackUtil.NEG_1BYTE_MIN +
+ PackUtil.GET_BITS(value[valueOff++], 6, 0);
+ break;
+ case PackUtil.POS_1BYTE_MARKER & 0xff:
+ case (PackUtil.POS_1BYTE_MARKER | 0x10) & 0xff:
+ case (PackUtil.POS_1BYTE_MARKER | 0x20) & 0xff:
+ case (PackUtil.POS_1BYTE_MARKER | 0x30) & 0xff:
+ // Low six bits of the single byte are the value itself.
+ unpacked = PackUtil.GET_BITS(value[valueOff++], 6, 0);
+ break;
+ case PackUtil.POS_2BYTE_MARKER & 0xff:
+ case (PackUtil.POS_2BYTE_MARKER | 0x10) & 0xff:
+ // Low five bits of the first byte, then one more byte, relative
+ // to the first value past the one-byte range.
+ unpacked = PackUtil.GET_BITS(value[valueOff++], 5, 0) << 8;
+ unpacked |= value[valueOff++] & 0xff;
+ unpacked += PackUtil.POS_1BYTE_MAX + 1;
+ break;
+ case PackUtil.POS_MULTI_MARKER & 0xff:
+ // There are four length bits in the first byte.
+ len = (value[valueOff++] & 0xf);
+
+ for (unpacked = 0; len != 0; --len) {
+ unpacked = (unpacked << 8) | value[valueOff++] & 0xff;
+ }
+ // The payload is relative to the first value past the two-byte
+ // range.
+ unpacked += PackUtil.POS_2BYTE_MAX + 1;
+ break;
+ default:
+ // High nibbles 0x0 and 0xf match none of the markers above.
+ throw new WiredTigerPackingException(
+ "Error decoding packed value.");
+ }
+ // Check for overflow if decoding an unsigned value - since Java only
+ // supports signed values.
+ if (!signed && unpacked < 0) {
+ throw new WiredTigerPackingException("Overflow unpacking long.");
+ }
+
+ return (unpacked);
+ }
+}
+
diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackOutputStream.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackOutputStream.java
new file mode 100644
index 00000000000..185068d2093
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackOutputStream.java
@@ -0,0 +1,264 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.db;
+
+import java.io.ByteArrayOutputStream;
+import java.lang.StringBuffer;
+import com.wiredtiger.db.WiredTigerPackingException;
+
+/**
+ * An internal helper class for encoding WiredTiger packed values.
+ *
+ * Fields are appended one at a time; every add method checks the next
+ * field of the format string supplied to the constructor before writing.
+ *
+ * Applications should not need to use this class.
+ */
+public class PackOutputStream {
+
+    /* Size of the scratch buffer used to encode a single integer. */
+    final static int MAX_INT_BYTES = 21;
+    /* Cursor over the format string describing the packed fields. */
+    protected PackFormatInputStream format;
+    /* Accumulates the packed bytes. */
+    protected ByteArrayOutputStream packed;
+    /* Scratch buffer reused by packLong. */
+    protected byte[] intBuf;
+
+    /**
+     * Constructor.
+     *
+     * \param format A String that contains the WiredTiger format that
+     *               defines the layout of this packed value.
+     */
+    public PackOutputStream(String format) {
+        this.format = new PackFormatInputStream(format);
+        intBuf = new byte[MAX_INT_BYTES];
+        packed = new ByteArrayOutputStream(100);
+    }
+
+    /**
+     * Returns the raw packing format string.
+     */
+    public String getFormat() {
+        return format.toString();
+    }
+
+    /**
+     * Returns the current packed value.
+     */
+    public byte[] getValue() {
+        return packed.toByteArray();
+    }
+
+    /**
+     * Reset the stream position: both the format cursor and the packed
+     * output are reset.
+     */
+    public void reset() {
+        format.reset();
+        packed.reset();
+    }
+
+    /**
+     * Add a byte field to the stream.
+     *
+     * \param value The byte value to be added.
+     */
+    public void addByte(byte value)
+    throws WiredTigerPackingException {
+        format.checkType('b', true);
+        /* Translate to maintain ordering with the sign bit. */
+        byte input = (byte)(value + 0x80);
+        packed.write(input);
+    }
+
+    /**
+     * Add a byte array field to the stream.
+     *
+     * \param value The byte array value to be added.
+     */
+    public void addByteArray(byte[] value)
+    throws WiredTigerPackingException {
+        this.addByteArray(value, 0, value.length);
+    }
+
+    /**
+     * Add a byte array field to the stream.
+     *
+     * \param value The byte array value to be added.
+     * \param off The offset from the start of value to begin using the array.
+     * \param len The length of the value to encode.
+     */
+    public void addByteArray(byte[] value, int off, int len)
+    throws WiredTigerPackingException {
+        format.checkType('U', true);
+        // If this is not the last item, store the size.
+        if (format.available() > 0) {
+            packLong(len, false);
+        }
+
+        packed.write(value, off, len);
+        /* TODO: padding. */
+    }
+
+    /**
+     * Add an integer field to the stream.
+     *
+     * \param value The integer value to be added.
+     */
+    public void addInt(int value)
+    throws WiredTigerPackingException {
+        format.checkType('i', true);
+        packLong(value, true);
+    }
+
+    /**
+     * Add a long field to the stream.
+     *
+     * \param value The long value to be added.
+     */
+    public void addLong(long value)
+    throws WiredTigerPackingException {
+        format.checkType('q', true);
+        packLong(value, true);
+    }
+
+    /**
+     * Add a record field to the stream.
+     *
+     * \param value The record value to be added.
+     */
+    public void addRecord(long value)
+    throws WiredTigerPackingException {
+        format.checkType('r', true);
+        packLong(value, true);
+    }
+
+    /**
+     * Add a short field to the stream.
+     *
+     * \param value The short value to be added.
+     */
+    public void addShort(short value)
+    throws WiredTigerPackingException {
+        format.checkType('h', true);
+        packLong(value, true);
+    }
+
+    /**
+     * Add a string field to the stream.
+     *
+     * A fixed length field is truncated to, or padded with zero bytes up
+     * to, the length given in the format. A variable length field is
+     * written in full, followed by a null terminator. Lengths are
+     * measured in encoded bytes, not characters.
+     *
+     * \param value The string value to be added.
+     */
+    public void addString(String value)
+    throws WiredTigerPackingException {
+        format.checkType('s', false);
+        char fieldFormat = format.getType();
+        // Use the default Charset.
+        byte[] encoded = value.getBytes();
+        int writeLen;
+        int padBytes;
+        // Strings have two possible encodings. A lower case 's' is not null
+        // terminated, and has a length defined in the format (default 1). An
+        // upper case 'S' is variable length and has a null terminator.
+        if (fieldFormat == 's') {
+            int fieldLen = format.getLengthFromFormat(true);
+            // Never read past the end of the encoded value: write what
+            // exists and make up the remainder with padding.
+            writeLen = Math.min(fieldLen, encoded.length);
+            padBytes = fieldLen - writeLen;
+        } else {
+            writeLen = encoded.length;
+            padBytes = 1; // Null terminator
+        }
+        // We're done pulling information from the field now.
+        format.consume();
+
+        packed.write(encoded, 0, writeLen);
+        while (padBytes-- > 0) {
+            packed.write(0);
+        }
+    }
+
+    /**
+     * Add a long field to the stream.
+     * The packing format is defined in the WiredTiger C integer packing
+     * implementation, which is at src/include/intpack.i
+     *
+     * \param x The long value to be added.
+     * \param signed Whether the value is signed or unsigned.
+     */
+    private void packLong(long x, boolean signed)
+    throws WiredTigerPackingException {
+        int offset = 0;
+
+        if (!signed && x < 0) {
+            throw new WiredTigerPackingException("Overflow packing long.");
+        }
+
+        if (x < PackUtil.NEG_2BYTE_MIN) {
+            intBuf[offset] = PackUtil.NEG_MULTI_MARKER;
+            int lz = Long.numberOfLeadingZeros(~x) / 8;
+            int len = PackUtil.SIZEOF_LONG - lz;
+
+            //
+            // There are four size bits we can use in the first
+            // byte. For negative numbers, we store the number of
+            // leading 0xff bytes to maintain ordering (if this is
+            // not obvious, it may help to remember that -1 is the
+            // largest negative number).
+            intBuf[offset++] |= (lz & 0xf);
+
+            for (int shift = (len - 1) << 3;
+                len != 0; shift -= 8, --len) {
+                intBuf[offset++] = (byte)(x >> shift);
+            }
+        } else if (x < PackUtil.NEG_1BYTE_MIN) {
+            x -= PackUtil.NEG_2BYTE_MIN;
+            intBuf[offset++] =
+                (byte)(PackUtil.NEG_2BYTE_MARKER | PackUtil.GET_BITS(x, 13, 8));
+            intBuf[offset++] = PackUtil.GET_BITS(x, 8, 0);
+        } else if (x < 0) {
+            x -= PackUtil.NEG_1BYTE_MIN;
+            intBuf[offset++] =
+                (byte)(PackUtil.NEG_1BYTE_MARKER | PackUtil.GET_BITS(x, 6, 0));
+        } else if (x <= PackUtil.POS_1BYTE_MAX) {
+            intBuf[offset++] =
+                (byte)(PackUtil.POS_1BYTE_MARKER | PackUtil.GET_BITS(x, 6, 0));
+        } else if (x <= PackUtil.POS_2BYTE_MAX) {
+            x -= PackUtil.POS_1BYTE_MAX + 1;
+            intBuf[offset++] =
+                (byte)(PackUtil.POS_2BYTE_MARKER | PackUtil.GET_BITS(x, 13, 8));
+            intBuf[offset++] = PackUtil.GET_BITS(x, 8, 0);
+        } else {
+            x -= PackUtil.POS_2BYTE_MAX + 1;
+            intBuf[offset] = PackUtil.POS_MULTI_MARKER;
+            int lz = Long.numberOfLeadingZeros(x) / 8;
+            int len = PackUtil.SIZEOF_LONG - lz;
+
+            // There are four bits we can use in the first byte.
+            intBuf[offset++] |= (len & 0xf);
+
+            for (int shift = (len - 1) << 3;
+                len != 0; --len, shift -= 8) {
+                intBuf[offset++] = (byte)(x >> shift);
+            }
+        }
+        packed.write(intBuf, 0, offset);
+    }
+}
diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackUtil.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackUtil.java
new file mode 100644
index 00000000000..c8804891da5
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/PackUtil.java
@@ -0,0 +1,69 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.db;
+
+import java.lang.String;
+
+/**
+ * An internal helper class with utilities for packing and unpacking values.
+ *
+ * Applications should not need to use this class.
+ */
+class PackUtil {
+    /* Constants. */
+
+    /* Markers stored in the high bits of the first byte of a packed integer. */
+    final static byte NEG_MULTI_MARKER = (byte)0x10;
+    final static byte NEG_2BYTE_MARKER = (byte)0x20;
+    final static byte NEG_1BYTE_MARKER = (byte)0x40;
+    final static byte POS_1BYTE_MARKER = (byte)0x80;
+    final static byte POS_2BYTE_MARKER = (byte)0xc0;
+    final static byte POS_MULTI_MARKER = (byte)0xe0;
+
+    /* Value range boundaries between the one, two and multi byte encodings. */
+    final static int NEG_1BYTE_MIN = ((-1) << 6);
+    final static int NEG_2BYTE_MIN = (((-1) << 13) + NEG_1BYTE_MIN);
+    final static int POS_1BYTE_MAX = ((1 << 6) - 1);
+    final static int POS_2BYTE_MAX = ((1 << 13) + POS_1BYTE_MAX);
+
+    // See: http://docs.python.org/2/library/struct.html for an explanation
+    // of what these special characters mean.
+    // TODO: Care about byte ordering and padding in packed formats.
+    final static String PackSpecialCharacters = "@=<>!x";
+
+    final static int SIZEOF_LONG = 8;
+
+    /**
+     * Extract bits from a value, counting from LSB == 0. The bits
+     * [end, start) are returned, shifted down so that bit 'end' becomes
+     * bit 0 of the result.
+     *
+     * \param x The value to extract bits from.
+     * \param start One past the highest bit to extract (0 to 63).
+     * \param end The lowest bit to extract.
+     * \return The extracted bits, narrowed to a byte.
+     */
+    public static byte GET_BITS(long x, int start, int end) {
+        // Build the mask as a long: an int shift would wrap for start >= 32.
+        return (byte)((x & ((1L << start) - 1)) >> end);
+    }
+}
diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerException.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerException.java
new file mode 100644
index 00000000000..b437ab98eee
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerException.java
@@ -0,0 +1,39 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.db;
+
+/**
+ * An exception that is generated by the WiredTiger application.
+ *
+ * It extends RuntimeException, so it is unchecked: callers are not
+ * required to declare or catch it.
+ */
+public class WiredTigerException extends RuntimeException {
+ /**
+ * Constructor.
+ *
+ * \param msg The detail message describing the failure.
+ */
+ public WiredTigerException(String msg) {
+ super(msg);
+ }
+}
diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java
new file mode 100644
index 00000000000..4f08f60b956
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java
@@ -0,0 +1,41 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.db;
+
+/**
+ * An exception that is generated by the WiredTiger application during
+ * encoding or decoding of packed values, for example when a value
+ * overflows the type being packed or unpacked.
+ */
+public class WiredTigerPackingException extends WiredTigerException {
+ /**
+ * Constructor.
+ *
+ * \param msg The detail message describing the packing failure.
+ */
+ public WiredTigerPackingException(String msg) {
+ super(msg);
+ }
+}
+
diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerPanicException.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerPanicException.java
new file mode 100644
index 00000000000..2b10beda752
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerPanicException.java
@@ -0,0 +1,42 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.db;
+
+/**
+ * An exception that is generated by the WiredTiger application when
+ * there is an underlying problem that requires the application to exit
+ * and restart.
+ */
+public class WiredTigerPanicException extends WiredTigerException {
+ /**
+ * Constructor.
+ *
+ * \param msg The detail message describing the failure.
+ */
+ public WiredTigerPanicException(String msg) {
+ super(msg);
+ }
+}
+
diff --git a/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerRollbackException.java b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerRollbackException.java
new file mode 100644
index 00000000000..0521b43aac9
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/src/com/wiredtiger/db/WiredTigerRollbackException.java
@@ -0,0 +1,41 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.db;
+
+/**
+ * An exception that is generated by the WiredTiger application
+ * when there is a conflict between concurrent operations.
+ */
+public class WiredTigerRollbackException extends WiredTigerException {
+ /**
+ * Constructor.
+ *
+ * \param msg The detail message describing the conflict.
+ */
+ public WiredTigerRollbackException(String msg) {
+ super(msg);
+ }
+}
+
diff --git a/src/third_party/wiredtiger/lang/java/wiredtiger.i b/src/third_party/wiredtiger/lang/java/wiredtiger.i
new file mode 100644
index 00000000000..bb6e0bebe21
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/java/wiredtiger.i
@@ -0,0 +1,1849 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * wiredtiger.i
+ * The SWIG interface file defining the wiredtiger Java API.
+ */
+
+%module wiredtiger
+
+%include "enums.swg"
+%include "typemaps.i"
+%include "stdint.i"
+
+/*
+ * Code added to the generated JNI class: a static initializer that loads
+ * the "wiredtiger_java" native library. If the library cannot be linked,
+ * the error is printed to stderr and the JVM exits with status 1.
+ */
+%pragma(java) jniclasscode=%{
+ static {
+ try {
+ System.loadLibrary("wiredtiger_java");
+ } catch (UnsatisfiedLinkError e) {
+ System.err.println("Native code library failed to load. \n" + e);
+ System.exit(1);
+ }
+ }
+%}
+
+%{
+#include "src/include/wt_internal.h"
+
+/*
+ * Closed handle checking:
+ *
+ * The typedef WT_CURSOR_NULLABLE used in wiredtiger.h is only made
+ * visible to the SWIG parser and is used to identify arguments of
+ * Cursor type that are permitted to be null. Likewise, typedefs
+ * WT_{CURSOR,SESSION,CONNECTION}_CLOSED identify 'close' calls that
+ * need explicit nulling of the swigCPtr. These typedefs permit
+ * special casing in typemaps for input args.
+ *
+ * We want SWIG to see these 'fake' typenames, but not the compiler.
+ */
+#define WT_CURSOR_NULLABLE WT_CURSOR
+#define WT_CURSOR_CLOSED WT_CURSOR
+#define WT_SESSION_CLOSED WT_SESSION
+#define WT_CONNECTION_CLOSED WT_CONNECTION
+
+/*
+ * For Connections, Sessions and Cursors created in Java, each of
+ * WT_CONNECTION_IMPL, WT_SESSION_IMPL and WT_CURSOR has a
+ * lang_private field that stores a pointer to a JAVA_CALLBACK, allocated
+ * during the various open calls. The {conn,session,cursor}CloseHandler()
+ * functions reach into the associated Java object, set the swigCPtr
+ * to 0, and free the JAVA_CALLBACK. Typemaps matching Connection,
+ * Session and Cursor args use the NULL_CHECK macro, which checks if
+ * swigCPtr is 0.
+ */
+typedef struct {
+ JavaVM *javavm; /* Used in async threads to craft a jnienv */
+ JNIEnv *jnienv; /* jni env that created the Session/Cursor */
+ jobject jobj; /* the java Session/Cursor/AsyncOp object */
+ jobject jcallback; /* callback object for async ops */
+ jfieldID cptr_fid; /* cached Cursor.swigCPtr field id in session */
+ jfieldID asynccptr_fid; /* cached AsyncOp.swigCptr fid in conn */
+ jfieldID kunp_fid; /* cached AsyncOp.keyUnpacker fid in conn */
+ jfieldID vunp_fid; /* cached AsyncOp.valueUnpacker fid in conn */
+ jmethodID notify_mid; /* cached AsyncCallback.notify mid in conn */
+} JAVA_CALLBACK;
+
+/*
+ * throwWiredTigerException --
+ *     Raise the Java exception that corresponds to a WiredTiger error
+ *     code: WT_PANIC and WT_ROLLBACK have dedicated exception classes,
+ *     every other code maps to the generic WiredTigerException. The
+ *     exception message is the text from wiredtiger_strerror. Nothing
+ *     is thrown here if the exception class cannot be found.
+ */
+static void throwWiredTigerException(JNIEnv *jenv, int err) {
+    const char *clname;
+    jclass excep;
+
+    /* Pick the exception class name for this error code. */
+    clname = (err == WT_PANIC) ?
+        "com/wiredtiger/db/WiredTigerPanicException" :
+        (err == WT_ROLLBACK) ?
+        "com/wiredtiger/db/WiredTigerRollbackException" :
+        "com/wiredtiger/db/WiredTigerException";
+
+    excep = (*jenv)->FindClass(jenv, clname);
+    if (excep != NULL)
+        (*jenv)->ThrowNew(jenv, excep, wiredtiger_strerror(err));
+}
+
+%}
+
+/* No finalizers */
+%typemap(javafinalize) SWIGTYPE ""
+
+/* Event handlers are not supported in Java. */
+%typemap(in, numinputs=0) WT_EVENT_HANDLER * %{ $1 = NULL; %}
+
+/* Allow silently passing the Java object and JNIEnv into our code. */
+%typemap(in, numinputs=0) jobject *jthis %{ $1 = jarg1_; %}
+%typemap(in, numinputs=0) JNIEnv * %{ $1 = jenv; %}
+
+/* 64 bit typemaps. */
+%typemap(jni) uint64_t "jlong"
+%typemap(jtype) uint64_t "long"
+%typemap(jstype) uint64_t "long"
+
+%typemap(javain) uint64_t "$javainput"
+%typemap(javaout) uint64_t {
+ return $jnicall;
+}
+
+/* Return byte[] from cursor.get_value */
+%typemap(jni) WT_ITEM, WT_ITEM * "jbyteArray"
+%typemap(jtype) WT_ITEM, WT_ITEM * "byte[]"
+%typemap(jstype) WT_ITEM, WT_ITEM * "byte[]"
+
+%typemap(javain) WT_ITEM, WT_ITEM * "$javainput"
+%typemap(javaout) WT_ITEM, WT_ITEM * {
+ return $jnicall;
+}
+
+/*
+ * Input: point the argument at a local WT_ITEM whose data and size come
+ * from the elements and length of the Java byte array.
+ */
+%typemap(in) WT_ITEM * (WT_ITEM item) %{
+ $1 = &item;
+ $1->data = (*jenv)->GetByteArrayElements(jenv, $input, 0);
+ $1->size = (size_t)(*jenv)->GetArrayLength(jenv, $input);
+%}
+
+/*
+ * After the call: release the array elements obtained by the input
+ * typemap (release mode 0).
+ */
+%typemap(argout) WT_ITEM * %{
+ (*jenv)->ReleaseByteArrayElements(jenv, $input, (void *)$1->data, 0);
+%}
+
+/*
+ * Return value: a WT_ITEM with NULL data becomes a null Java array;
+ * otherwise a new byte array of the item's size is allocated and the
+ * item's data is copied into it.
+ */
+%typemap(out) WT_ITEM %{
+ if ($1.data == NULL)
+ $result = NULL;
+ else if (($result = (*jenv)->NewByteArray(jenv, (jsize)$1.size)) != NULL) {
+ (*jenv)->SetByteArrayRegion(jenv,
+ $result, 0, (jsize)$1.size, $1.data);
+ }
+%}
+
+/* Don't require empty config strings. */
+%typemap(default) const char *config %{ $1 = NULL; %}
+
+/*
+ * Integer return values: any non-zero value other than WT_NOTFOUND is
+ * turned into a Java exception via throwWiredTigerException; zero and
+ * WT_NOTFOUND are passed back to the Java caller unchanged.
+ */
+%typemap(out) int %{
+ if ($1 != 0 && $1 != WT_NOTFOUND) {
+ throwWiredTigerException(jenv, $1);
+ return $null;
+ }
+ $result = $1;
+%}
+
+/*
+ * NULL_CHECK(val, name) --
+ *     If val is null, throw a Java NullPointerException whose message
+ *     is "<name> is null" and return from the wrapper.
+ */
+%define NULL_CHECK(val, name)
+ if (!val) {
+ SWIG_JavaThrowException(jenv, SWIG_JavaNullPointerException,
+ #name " is null");
+ return $null;
+ }
+%enddef
+
+/*
+ * WT_CLASS(type, class, name, closeHandler) --
+ *     Define the typemaps for one wrapped handle type.
+ *     type:         the C struct type of the handle.
+ *     class:        the prefix of the fake class ## _CLOSED and
+ *                   class ## _NULLABLE type names.
+ *     name:         the argument name matched for the implicit 'self'.
+ *     closeHandler: statement run by the _CLOSED typemap after the
+ *                   null check (may be empty).
+ */
+%define WT_CLASS(type, class, name, closeHandler)
+/*
+ * Extra 'self' elimination.
+ * The methods we're wrapping look like this:
+ * struct __wt_xxx {
+ * int method(WT_XXX *, ...otherargs...);
+ * };
+ * To SWIG, that is equivalent to:
+ * int method(struct __wt_xxx *self, WT_XXX *, ...otherargs...);
+ * and we use consecutive argument matching of typemaps to convert two args to
+ * one.
+ */
+%typemap(in, numinputs=0) type *name {
+ $1 = *(type **)&jarg1;
+ NULL_CHECK($1, $1_name)
+}
+
+%typemap(in, numinputs=0) class ## _CLOSED *name {
+ $1 = *(type **)&jarg1;
+ NULL_CHECK($1, $1_name)
+ closeHandler;
+}
+
+%typemap(in) class ## _NULLABLE * {
+ $1 = *(type **)&$input;
+}
+
+%typemap(in) type * {
+ $1 = *(type **)&$input;
+ NULL_CHECK($1, $1_name)
+}
+
+%typemap(javaimports) type "
+/**
+ * @copydoc class
+ * @ingroup wt_java
+ */"
+%enddef
+
+%pragma(java) moduleimports=%{
+/**
+ * @defgroup wt_java WiredTiger Java API
+ *
+ * Java wrappers around the WiredTiger C API.
+ */
+
+/**
+ * @ingroup wt_java
+ */
+%}
+
+WT_CLASS(struct __wt_connection, WT_CONNECTION, connection, connCloseHandler($1))
+WT_CLASS(struct __wt_session, WT_SESSION, session, sessionCloseHandler($1))
+WT_CLASS(struct __wt_cursor, WT_CURSOR, cursor, cursorCloseHandler($1))
+WT_CLASS(struct __wt_async_op, WT_ASYNC_OP, op, )
+
+%define COPYDOC(SIGNATURE_CLASS, CLASS, METHOD)
+%javamethodmodifiers SIGNATURE_CLASS::METHOD "
+ /**
+ * @copydoc CLASS::METHOD
+ */
+ public ";
+%enddef
+
+%include "java_doc.i"
+
+/* WT_ASYNC_OP customization. */
+/* First, replace the varargs get / set methods with Java equivalents. */
+%ignore __wt_async_op::get_key;
+%ignore __wt_async_op::get_value;
+%ignore __wt_async_op::set_key;
+%ignore __wt_async_op::set_value;
+%ignore __wt_async_op::insert;
+%ignore __wt_async_op::remove;
+%ignore __wt_async_op::search;
+%ignore __wt_async_op::update;
+%immutable __wt_async_op::connection;
+%immutable __wt_async_op::key_format;
+%immutable __wt_async_op::value_format;
+
+%javamethodmodifiers __wt_async_op::key_format "protected";
+%javamethodmodifiers __wt_async_op::value_format "protected";
+
+/* WT_CURSOR customization. */
+/* First, replace the varargs get / set methods with Java equivalents. */
+%ignore __wt_cursor::get_key;
+%ignore __wt_cursor::get_value;
+%ignore __wt_cursor::set_key;
+%ignore __wt_cursor::set_value;
+%ignore __wt_cursor::insert;
+%ignore __wt_cursor::remove;
+%ignore __wt_cursor::search;
+%ignore __wt_cursor::search_near;
+%ignore __wt_cursor::update;
+%javamethodmodifiers __wt_cursor::next "protected";
+%rename (next_wrap) __wt_cursor::next;
+%javamethodmodifiers __wt_cursor::prev "protected";
+%rename (prev_wrap) __wt_cursor::prev;
+%javamethodmodifiers __wt_cursor::key_format "protected";
+%javamethodmodifiers __wt_cursor::value_format "protected";
+
+%ignore __wt_cursor::compare(WT_CURSOR *, WT_CURSOR *, int *);
+%rename (compare_wrap) __wt_cursor::compare;
+%rename (AsyncOpType) WT_ASYNC_OPTYPE;
+%rename (getKeyFormat) __wt_async_op::getKey_format;
+%rename (getValueFormat) __wt_async_op::getValue_format;
+%rename (getType) __wt_async_op::get_type;
+
+/* SWIG magic to turn Java byte strings into data / size. */
+%apply (char *STRING, int LENGTH) { (char *data, int size) };
+
+/* Status from search_near */
+%javaconst(1);
+%inline %{
+enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER };
+%}
+
+%wrapper %{
+/*
+ * javaClose --
+ *     Detach a Java wrapper object from its C handle: store 0 in the
+ * object's long "swigCPtr" field (the same as 'jobj.swigCPtr = 0;' in
+ * Java), then drop the global reference held in jcb->jobj.
+ *
+ * When pfid is non-NULL it acts as a cache for the "swigCPtr" field ID:
+ * a non-NULL *pfid is used as is, otherwise the ID is looked up from the
+ * object's class and stored back through pfid. Always returns 0.
+ */
+static int
+javaClose(JNIEnv *env, JAVA_CALLBACK *jcb, jfieldID *pfid)
+{
+    jfieldID cptr_fid;
+
+    /* Start from the caller's cached field ID, if there is one. */
+    cptr_fid = (pfid == NULL) ? NULL : *pfid;
+    if (cptr_fid == NULL) {
+        cptr_fid = (*env)->GetFieldID(env,
+            (*env)->GetObjectClass(env, jcb->jobj), "swigCPtr", "J");
+        if (pfid != NULL)
+            *pfid = cptr_fid;
+    }
+
+    (*env)->SetLongField(env, jcb->jobj, cptr_fid, 0L);
+    (*env)->DeleteGlobalRef(env, jcb->jobj);
+    return (0);
+}
+
+/*
+ * connCloseHandler --
+ *     Close handler for connections: clear the connection's lang_private,
+ * invalidate the Java object recorded there and free the callback
+ * structure using the connection's default session. Returns the result
+ * of javaClose.
+ */
+static int
+connCloseHandler(WT_CONNECTION *conn_arg)
+{
+    JAVA_CALLBACK *conn_jcb;
+    WT_CONNECTION_IMPL *conn_impl;
+    int close_ret;
+
+    conn_impl = (WT_CONNECTION_IMPL *)conn_arg;
+
+    /* Unhook the callback data from the handle before tearing it down. */
+    conn_jcb = (JAVA_CALLBACK *)conn_impl->lang_private;
+    conn_impl->lang_private = NULL;
+
+    close_ret = javaClose(conn_jcb->jnienv, conn_jcb, NULL);
+    __wt_free(conn_impl->default_session, conn_jcb);
+
+    return (close_ret);
+}
+
+/*
+ * sessionCloseHandler --
+ *     Close handler for sessions: clear the session's lang_private,
+ * invalidate the Java object recorded there and free the callback
+ * structure. Returns the result of javaClose.
+ */
+static int
+sessionCloseHandler(WT_SESSION *session_arg)
+{
+    JAVA_CALLBACK *session_jcb;
+    WT_SESSION_IMPL *session_impl;
+    int close_ret;
+
+    session_impl = (WT_SESSION_IMPL *)session_arg;
+
+    /* Unhook the callback data from the handle before tearing it down. */
+    session_jcb = (JAVA_CALLBACK *)session_impl->lang_private;
+    session_impl->lang_private = NULL;
+
+    close_ret = javaClose(session_jcb->jnienv, session_jcb, NULL);
+    __wt_free(session_impl, session_jcb);
+
+    return (close_ret);
+}
+
+/*
+ * cursorCloseHandler --
+ *     Close handler for cursors: clear the cursor's lang_private,
+ * invalidate the Java object recorded there and free the callback
+ * structure. When the owning session has callback data, its cptr_fid
+ * member is handed to javaClose as the field ID cache. Returns the
+ * result of javaClose.
+ */
+static int
+cursorCloseHandler(WT_CURSOR *cursor)
+{
+    JAVA_CALLBACK *cursor_jcb, *session_jcb;
+    WT_SESSION_IMPL *session_impl;
+    jfieldID *cached_fid;
+    int close_ret;
+
+    session_impl = (WT_SESSION_IMPL *)cursor->session;
+    cursor_jcb = (JAVA_CALLBACK *)cursor->lang_private;
+    session_jcb = (JAVA_CALLBACK *)session_impl->lang_private;
+    cursor->lang_private = NULL;
+
+    /* Without session callback data there is no cache to use. */
+    cached_fid = NULL;
+    if (session_jcb != NULL)
+        cached_fid = &session_jcb->cptr_fid;
+
+    close_ret = javaClose(cursor_jcb->jnienv, cursor_jcb, cached_fid);
+    __wt_free(session_impl, cursor_jcb);
+
+    return (close_ret);
+}
+
+/*
+ * javaCloseHandler --
+ *     Event handler close callback: dispatch to the cursor close handler
+ * when a cursor is supplied, otherwise to the session close handler.
+ * The handler argument is not used.
+ */
+static int
+javaCloseHandler(WT_EVENT_HANDLER *handler, WT_SESSION *session,
+    WT_CURSOR *cursor)
+{
+    WT_UNUSED(handler);
+
+    return (cursor == NULL ?
+        sessionCloseHandler(session) : cursorCloseHandler(cursor));
+}
+
+WT_EVENT_HANDLER javaApiEventHandler = {NULL, NULL, NULL, javaCloseHandler};
+
+/*
+ * javaAsyncHandler --
+ *     Async notify callback: make sure the calling native thread has a
+ * JNI environment, call the Java callback object's notify method with
+ * the AsyncOp, then invalidate the AsyncOp and release the callback
+ * data.
+ *
+ * cb is unused; opret and flags are forwarded to the Java notify method.
+ *
+ * Returns 0 only when the handler itself succeeded, the Java callback
+ * returned 0 without throwing, and opret is 0 or WT_NOTFOUND; returns 1
+ * otherwise.
+ */
+static int
+javaAsyncHandler(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *asyncop, int opret,
+    uint32_t flags)
+{
+    int closeret, envret, ret;
+    JAVA_CALLBACK *jcb, *conn_jcb;
+    JavaVM *javavm;
+    jclass cls;
+    jfieldID fid;
+    jmethodID mid;
+    JNIEnv *jenv;
+    WT_ASYNC_OP_IMPL *op;
+    WT_SESSION_IMPL *session;
+
+    WT_UNUSED(cb);
+    op = (WT_ASYNC_OP_IMPL *)asyncop;
+    session = O2S(op);
+    jcb = (JAVA_CALLBACK *)asyncop->c.lang_private;
+    conn_jcb = (JAVA_CALLBACK *)S2C(session)->lang_private;
+    asyncop->c.lang_private = NULL;
+
+    /*
+     * There is no usable JNI environment until the thread is known to
+     * be attached; the cleanup code below tests for NULL.
+     */
+    jenv = NULL;
+
+    /*
+     * We rely on the fact that the async machinery uses a pool of
+     * threads. Here we attach the current native (POSIX)
+     * thread to a Java thread and never detach it. If the native
+     * thread was previously seen by this callback, it will be
+     * attached to the same Java thread as before without
+     * incurring the cost of the thread initialization.
+     * Marking the Java thread as a daemon means its existence
+     * won't keep an application from exiting.
+     */
+    javavm = jcb->javavm;
+    envret = (*javavm)->GetEnv(javavm, (void **)&jenv, JNI_VERSION_1_6);
+    if (envret == JNI_EDETACHED) {
+        if ((*javavm)->AttachCurrentThreadAsDaemon(javavm,
+            (void **)&jenv, NULL) != 0) {
+            jenv = NULL;
+            ret = EBUSY;
+            goto err;
+        }
+    } else if (envret != JNI_OK) {
+        jenv = NULL;
+        ret = EBUSY;
+        goto err;
+    }
+
+    /*
+     * Look up any needed field and method ids, and cache them
+     * in the connection's lang_private. fid and mids are
+     * stable.
+     */
+    if (conn_jcb->notify_mid == NULL) {
+        /* Any JNI error until the actual callback is unexpected. */
+        ret = EINVAL;
+
+        cls = (*jenv)->GetObjectClass(jenv, jcb->jobj);
+        if (cls == NULL)
+            goto err;
+        fid = (*jenv)->GetFieldID(jenv, cls,
+            "keyUnpacker", "Lcom/wiredtiger/db/PackInputStream;");
+        if (fid == NULL)
+            goto err;
+        conn_jcb->kunp_fid = fid;
+
+        fid = (*jenv)->GetFieldID(jenv, cls,
+            "valueUnpacker", "Lcom/wiredtiger/db/PackInputStream;");
+        if (fid == NULL)
+            goto err;
+        conn_jcb->vunp_fid = fid;
+
+        cls = (*jenv)->GetObjectClass(jenv, jcb->jcallback);
+        if (cls == NULL)
+            goto err;
+        mid = (*jenv)->GetMethodID(jenv, cls, "notify",
+            "(Lcom/wiredtiger/db/AsyncOp;II)I");
+        if (mid == NULL)
+            goto err;
+        conn_jcb->notify_mid = mid;
+    }
+
+    /*
+     * Invalidate the unpackers so any calls to op.getKey()
+     * and op.getValue get fresh results.
+     */
+    (*jenv)->SetObjectField(jenv, jcb->jobj, conn_jcb->kunp_fid, NULL);
+    (*jenv)->SetObjectField(jenv, jcb->jobj, conn_jcb->vunp_fid, NULL);
+
+    /* Call the registered callback. */
+    ret = (*jenv)->CallIntMethod(jenv, jcb->jcallback, conn_jcb->notify_mid,
+        jcb->jobj, opret, flags);
+
+    /*
+     * ExceptionCheck is used rather than ExceptionOccurred because it
+     * does not create a local reference, and this thread is never
+     * detached. If the callback threw, the value returned by
+     * CallIntMethod is meaningless, so report a failure instead.
+     */
+    if ((*jenv)->ExceptionCheck(jenv)) {
+        (*jenv)->ExceptionDescribe(jenv);
+        (*jenv)->ExceptionClear(jenv);
+        ret = EINVAL;
+    }
+    if (0) {
+err:    __wt_err(session, ret, "Java async callback error");
+        /*
+         * A failed lookup leaves a Java exception pending; clear it
+         * so the cleanup JNI calls below are legal.
+         */
+        if (jenv != NULL && (*jenv)->ExceptionCheck(jenv)) {
+            (*jenv)->ExceptionDescribe(jenv);
+            (*jenv)->ExceptionClear(jenv);
+        }
+    }
+
+    /*
+     * Invalidate the AsyncOp, further use throws NullPointerException.
+     * This needs a JNI environment; without one the Java references
+     * cannot be released and only the C structure is freed. The result
+     * of the close is kept separate so it cannot hide an earlier error
+     * or a non-zero return from the callback.
+     */
+    if (jenv != NULL) {
+        closeret = javaClose(jenv, jcb, &conn_jcb->asynccptr_fid);
+        if (ret == 0)
+            ret = closeret;
+        (*jenv)->DeleteGlobalRef(jenv, jcb->jcallback);
+    }
+
+    __wt_free(session, jcb);
+
+    if (ret == 0 && (opret == 0 || opret == WT_NOTFOUND))
+        return (0);
+    return (1);
+}
+
+WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
+%}
+
+%extend __wt_async_op {
+
+    /* Fetch the op's key; a non-zero result is passed to throwWiredTigerException. */
+    %javamethodmodifiers get_key_wrap "protected";
+    WT_ITEM get_key_wrap(JNIEnv *jenv) {
+        WT_ITEM key;
+        int err;
+
+        key.data = NULL;
+        err = $self->get_key($self, &key);
+        if (err != 0)
+            throwWiredTigerException(jenv, err);
+        return key;
+    }
+
+    /* Fetch the op's value; a non-zero result is passed to throwWiredTigerException. */
+    %javamethodmodifiers get_value_wrap "protected";
+    WT_ITEM get_value_wrap(JNIEnv *jenv) {
+        WT_ITEM value;
+        int err;
+
+        value.data = NULL;
+        err = $self->get_value($self, &value);
+        if (err != 0)
+            throwWiredTigerException(jenv, err);
+        return value;
+    }
+
+    /* Set key and value, then call insert and return its result. */
+    %javamethodmodifiers insert_wrap "protected";
+    int insert_wrap(WT_ITEM *k, WT_ITEM *v) {
+        int status;
+
+        $self->set_key($self, k);
+        $self->set_value($self, v);
+        status = $self->insert($self);
+        return status;
+    }
+
+    /* Set the key, then call remove and return its result. */
+    %javamethodmodifiers remove_wrap "protected";
+    int remove_wrap(WT_ITEM *k) {
+        int status;
+
+        $self->set_key($self, k);
+        status = $self->remove($self);
+        return status;
+    }
+
+    /* Set the key, then call search and return its result. */
+    %javamethodmodifiers search_wrap "protected";
+    int search_wrap(WT_ITEM *k) {
+        int status;
+
+        $self->set_key($self, k);
+        status = $self->search($self);
+        return status;
+    }
+
+    /* Set key and value, then call update and return its result. */
+    %javamethodmodifiers update_wrap "protected";
+    int update_wrap(WT_ITEM *k, WT_ITEM *v) {
+        int status;
+
+        $self->set_key($self, k);
+        $self->set_value($self, v);
+        status = $self->update($self);
+        return status;
+    }
+
+    /*
+     * Record a global reference to the Java AsyncOp object in the op's
+     * callback data and release the local reference that was passed in.
+     */
+    %javamethodmodifiers java_init "protected";
+    int java_init(jobject jasyncop) {
+        JAVA_CALLBACK *op_jcb;
+
+        op_jcb = (JAVA_CALLBACK *)$self->c.lang_private;
+        op_jcb->jobj = JCALL1(NewGlobalRef, op_jcb->jnienv, jasyncop);
+        JCALL1(DeleteLocalRef, op_jcb->jnienv, jasyncop);
+        return (0);
+    }
+}
+
+/*
+ * Java class body for AsyncOp: besides the usual SWIG pointer fields it
+ * keeps the key/value format strings and the packers/unpackers built
+ * from them.
+ */
+%typemap(javabody) struct __wt_async_op %{
+    private long swigCPtr;
+    protected boolean swigCMemOwn;
+    protected String keyFormat;
+    protected String valueFormat;
+    protected PackOutputStream keyPacker;
+    protected PackOutputStream valuePacker;
+    protected PackInputStream keyUnpacker;
+    protected PackInputStream valueUnpacker;
+
+    protected $javaclassname(long cPtr, boolean cMemoryOwn) {
+        this.swigCMemOwn = cMemoryOwn;
+        this.swigCPtr = cPtr;
+        // The format getters need swigCPtr, so it is assigned first.
+        this.keyFormat = getKey_format();
+        this.valueFormat = getValue_format();
+        this.keyPacker = new PackOutputStream(this.keyFormat);
+        this.valuePacker = new PackOutputStream(this.valueFormat);
+        wiredtigerJNI.AsyncOp_java_init(swigCPtr, this, this);
+    }
+
+    protected static long getCPtr($javaclassname obj) {
+        if (obj == null)
+            return 0;
+        return obj.swigCPtr;
+    }
+%}
+
+%typemap(javacode) struct __wt_async_op %{
+
+    /**
+     * Get the format string of this async_op's key.
+     *
+     * \return The key format.
+     */
+    public String getKeyFormat() {
+        return keyFormat;
+    }
+
+    /**
+     * Get the format string of this async_op's value.
+     *
+     * \return The value format.
+     */
+    public String getValueFormat() {
+        return valueFormat;
+    }
+
+    /**
+     * Pack a byte onto the end of the key being built.
+     * Any cached key unpacker is discarded.
+     *
+     * \param value The byte to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putKeyByte(byte value)
+    throws WiredTigerPackingException {
+        keyUnpacker = null;
+        keyPacker.addByte(value);
+        return this;
+    }
+
+    /**
+     * Pack a whole byte array onto the end of the key being built.
+     *
+     * \param value The bytes to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putKeyByteArray(byte[] value)
+    throws WiredTigerPackingException {
+        return putKeyByteArray(value, 0, value.length);
+    }
+
+    /**
+     * Pack part of a byte array onto the end of the key being built.
+     * Any cached key unpacker is discarded.
+     *
+     * \param value The array holding the bytes to append.
+     * \param off The offset into value at which to start.
+     * \param len The number of bytes to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putKeyByteArray(byte[] value, int off, int len)
+    throws WiredTigerPackingException {
+        keyUnpacker = null;
+        keyPacker.addByteArray(value, off, len);
+        return this;
+    }
+
+    /**
+     * Pack an integer onto the end of the key being built.
+     * Any cached key unpacker is discarded.
+     *
+     * \param value The integer to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putKeyInt(int value)
+    throws WiredTigerPackingException {
+        keyUnpacker = null;
+        keyPacker.addInt(value);
+        return this;
+    }
+
+    /**
+     * Pack a long onto the end of the key being built.
+     * Any cached key unpacker is discarded.
+     *
+     * \param value The long to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putKeyLong(long value)
+    throws WiredTigerPackingException {
+        keyUnpacker = null;
+        keyPacker.addLong(value);
+        return this;
+    }
+
+    /**
+     * Pack a short integer onto the end of the key being built.
+     * Any cached key unpacker is discarded.
+     *
+     * \param value The short to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putKeyShort(short value)
+    throws WiredTigerPackingException {
+        keyUnpacker = null;
+        keyPacker.addShort(value);
+        return this;
+    }
+
+    /**
+     * Pack a string onto the end of the key being built.
+     * Any cached key unpacker is discarded.
+     *
+     * \param value The string to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putKeyString(String value)
+    throws WiredTigerPackingException {
+        keyUnpacker = null;
+        keyPacker.addString(value);
+        return this;
+    }
+
+    /**
+     * Pack a byte onto the end of the value being built.
+     * Any cached value unpacker is discarded.
+     *
+     * \param value The byte to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putValueByte(byte value)
+    throws WiredTigerPackingException {
+        valueUnpacker = null;
+        valuePacker.addByte(value);
+        return this;
+    }
+
+    /**
+     * Pack a whole byte array onto the end of the value being built.
+     *
+     * \param value The bytes to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putValueByteArray(byte[] value)
+    throws WiredTigerPackingException {
+        return putValueByteArray(value, 0, value.length);
+    }
+
+    /**
+     * Pack part of a byte array onto the end of the value being built.
+     * Any cached value unpacker is discarded.
+     *
+     * \param value The array holding the bytes to append.
+     * \param off The offset into value at which to start.
+     * \param len The number of bytes to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putValueByteArray(byte[] value, int off, int len)
+    throws WiredTigerPackingException {
+        valueUnpacker = null;
+        valuePacker.addByteArray(value, off, len);
+        return this;
+    }
+
+    /**
+     * Pack an integer onto the end of the value being built.
+     * Any cached value unpacker is discarded.
+     *
+     * \param value The integer to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putValueInt(int value)
+    throws WiredTigerPackingException {
+        valueUnpacker = null;
+        valuePacker.addInt(value);
+        return this;
+    }
+
+    /**
+     * Pack a long onto the end of the value being built.
+     * Any cached value unpacker is discarded.
+     *
+     * \param value The long to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putValueLong(long value)
+    throws WiredTigerPackingException {
+        valueUnpacker = null;
+        valuePacker.addLong(value);
+        return this;
+    }
+
+    /**
+     * Pack a short integer onto the end of the value being built.
+     * Any cached value unpacker is discarded.
+     *
+     * \param value The short to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putValueShort(short value)
+    throws WiredTigerPackingException {
+        valueUnpacker = null;
+        valuePacker.addShort(value);
+        return this;
+    }
+
+    /**
+     * Pack a string onto the end of the value being built.
+     * Any cached value unpacker is discarded.
+     *
+     * \param value The string to append.
+     * \return This async_op object, so put calls can be chained.
+     */
+    public AsyncOp putValueString(String value)
+    throws WiredTigerPackingException {
+        valueUnpacker = null;
+        valuePacker.addString(value);
+        return this;
+    }
+
+    /**
+     * Unpack the next byte from the async_op's key.
+     *
+     * \return The requested value.
+     */
+    public byte getKeyByte()
+    throws WiredTigerPackingException {
+        return getKeyUnpacker().getByte();
+    }
+
+    /**
+     * Unpack a byte array from the async_op's key into a caller buffer.
+     *
+     * \param output The byte array where the returned value will be stored.
+     *               The array should be large enough to store the entire
+     *               data item, if not a truncated value will be returned.
+     */
+    public void getKeyByteArray(byte[] output)
+    throws WiredTigerPackingException {
+        getKeyByteArray(output, 0, output.length);
+    }
+
+    /**
+     * Unpack a byte array from the async_op's key into part of a caller
+     * buffer.
+     *
+     * \param output The byte array where the returned value will be stored.
+     * \param off Offset into the destination buffer to start copying into.
+     * \param len The length should be large enough to store the entire
+     *            data item, if not a truncated value will be returned.
+     */
+    public void getKeyByteArray(byte[] output, int off, int len)
+    throws WiredTigerPackingException {
+        getKeyUnpacker().getByteArray(output, off, len);
+    }
+
+    /**
+     * Unpack a byte array from the async_op's key.
+     *
+     * \return The requested value.
+     */
+    public byte[] getKeyByteArray()
+    throws WiredTigerPackingException {
+        return getKeyUnpacker().getByteArray();
+    }
+
+    /**
+     * Unpack the next integer from the async_op's key.
+     *
+     * \return The requested value.
+     */
+    public int getKeyInt()
+    throws WiredTigerPackingException {
+        return getKeyUnpacker().getInt();
+    }
+
+    /**
+     * Unpack the next long from the async_op's key.
+     *
+     * \return The requested value.
+     */
+    public long getKeyLong()
+    throws WiredTigerPackingException {
+        return getKeyUnpacker().getLong();
+    }
+
+    /**
+     * Unpack the next short integer from the async_op's key.
+     *
+     * \return The requested value.
+     */
+    public short getKeyShort()
+    throws WiredTigerPackingException {
+        return getKeyUnpacker().getShort();
+    }
+
+    /**
+     * Unpack the next string from the async_op's key.
+     *
+     * \return The requested value.
+     */
+    public String getKeyString()
+    throws WiredTigerPackingException {
+        return getKeyUnpacker().getString();
+    }
+
+    /**
+     * Unpack the next byte from the async_op's value.
+     *
+     * \return The requested value.
+     */
+    public byte getValueByte()
+    throws WiredTigerPackingException {
+        return getValueUnpacker().getByte();
+    }
+
+    /**
+     * Unpack a byte array from the async_op's value into a caller buffer.
+     *
+     * \param output The byte array where the returned value will be stored.
+     *               The array should be large enough to store the entire
+     *               data item, if not a truncated value will be returned.
+     */
+    public void getValueByteArray(byte[] output)
+    throws WiredTigerPackingException {
+        getValueByteArray(output, 0, output.length);
+    }
+
+    /**
+     * Unpack a byte array from the async_op's value into part of a caller
+     * buffer.
+     *
+     * \param output The byte array where the returned value will be stored.
+     * \param off Offset into the destination buffer to start copying into.
+     * \param len The length should be large enough to store the entire
+     *            data item, if not a truncated value will be returned.
+     */
+    public void getValueByteArray(byte[] output, int off, int len)
+    throws WiredTigerPackingException {
+        getValueUnpacker().getByteArray(output, off, len);
+    }
+
+    /**
+     * Unpack a byte array from the async_op's value.
+     *
+     * \return The requested value.
+     */
+    public byte[] getValueByteArray()
+    throws WiredTigerPackingException {
+        return getValueUnpacker().getByteArray();
+    }
+
+    /**
+     * Unpack the next integer from the async_op's value.
+     *
+     * \return The requested value.
+     */
+    public int getValueInt()
+    throws WiredTigerPackingException {
+        return getValueUnpacker().getInt();
+    }
+
+    /**
+     * Unpack the next long from the async_op's value.
+     *
+     * \return The requested value.
+     */
+    public long getValueLong()
+    throws WiredTigerPackingException {
+        return getValueUnpacker().getLong();
+    }
+
+    /**
+     * Unpack the next short integer from the async_op's value.
+     *
+     * \return The requested value.
+     */
+    public short getValueShort()
+    throws WiredTigerPackingException {
+        return getValueUnpacker().getShort();
+    }
+
+    /**
+     * Unpack the next string from the async_op's value.
+     *
+     * \return The requested value.
+     */
+    public String getValueString()
+    throws WiredTigerPackingException {
+        return getValueUnpacker().getString();
+    }
+
+    /**
+     * Insert the async_op's current key/value into the table.
+     * Both packers are reset before the native call is made.
+     *
+     * \return The status of the operation.
+     */
+    public int insert()
+    throws WiredTigerException {
+        byte[] packedKey = keyPacker.getValue();
+        byte[] packedValue = valuePacker.getValue();
+        keyPacker.reset();
+        valuePacker.reset();
+        return insert_wrap(packedKey, packedValue);
+    }
+
+    /**
+     * Update the table with the async_op's current key/value.
+     * Both packers are reset before the native call is made.
+     *
+     * \return The status of the operation.
+     */
+    public int update()
+    throws WiredTigerException {
+        byte[] packedKey = keyPacker.getValue();
+        byte[] packedValue = valuePacker.getValue();
+        keyPacker.reset();
+        valuePacker.reset();
+        return update_wrap(packedKey, packedValue);
+    }
+
+    /**
+     * Remove the record matching the async_op's current key from the
+     * table. The key packer is reset before the native call is made.
+     *
+     * \return The status of the operation.
+     */
+    public int remove()
+    throws WiredTigerException {
+        byte[] packedKey = keyPacker.getValue();
+        keyPacker.reset();
+        return remove_wrap(packedKey);
+    }
+
+    /**
+     * Search the table for the async_op's current key.
+     * Both packers are reset after the native call returns.
+     *
+     * \return The status of the operation.
+     */
+    public int search()
+    throws WiredTigerException {
+        int status = search_wrap(keyPacker.getValue());
+        keyPacker.reset();
+        valuePacker.reset();
+        return status;
+    }
+
+    /**
+     * Return the cached key unpacker, creating it from the async_op's
+     * current key when there is none.
+     *
+     * \return The key unpacker.
+     */
+    private PackInputStream getKeyUnpacker()
+    throws WiredTigerPackingException {
+        if (keyUnpacker != null)
+            return keyUnpacker;
+        keyUnpacker = new PackInputStream(keyFormat, get_key_wrap());
+        return keyUnpacker;
+    }
+
+    /**
+     * Return the cached value unpacker, creating it from the async_op's
+     * current value when there is none.
+     *
+     * \return The value unpacker.
+     */
+    private PackInputStream getValueUnpacker()
+    throws WiredTigerPackingException {
+        if (valueUnpacker != null)
+            return valueUnpacker;
+        valueUnpacker = new PackInputStream(valueFormat, get_value_wrap());
+        return valueUnpacker;
+    }
+
+%}
+
+%extend __wt_cursor {
+
+    /* Fetch the cursor's key; a non-zero result is passed to throwWiredTigerException. */
+    %javamethodmodifiers get_key_wrap "protected";
+    WT_ITEM get_key_wrap(JNIEnv *jenv) {
+        WT_ITEM k;
+        int ret;
+        k.data = NULL;
+        if ((ret = $self->get_key($self, &k)) != 0)
+            throwWiredTigerException(jenv, ret);
+        return k;
+    }
+
+    /* Fetch the cursor's value; a non-zero result is passed to throwWiredTigerException. */
+    %javamethodmodifiers get_value_wrap "protected";
+    WT_ITEM get_value_wrap(JNIEnv *jenv) {
+        WT_ITEM v;
+        int ret;
+        v.data = NULL;
+        if ((ret = $self->get_value($self, &v)) != 0)
+            throwWiredTigerException(jenv, ret);
+        return v;
+    }
+
+    /* Set key and value, then call insert and return its result. */
+    %javamethodmodifiers insert_wrap "protected";
+    int insert_wrap(WT_ITEM *k, WT_ITEM *v) {
+        $self->set_key($self, k);
+        $self->set_value($self, v);
+        return $self->insert($self);
+    }
+
+    /* Set the key, then call remove and return its result. */
+    %javamethodmodifiers remove_wrap "protected";
+    int remove_wrap(WT_ITEM *k) {
+        $self->set_key($self, k);
+        return $self->remove($self);
+    }
+
+    /* Set the key, then call search and return its result. */
+    %javamethodmodifiers search_wrap "protected";
+    int search_wrap(WT_ITEM *k) {
+        $self->set_key($self, k);
+        return $self->search($self);
+    }
+
+    /*
+     * Set the key and call search_near, mapping the outcome onto
+     * SearchStatus: FOUND for an exact match, SMALLER or LARGER from the
+     * sign of the comparison, NOTFOUND otherwise. Errors other than
+     * WT_NOTFOUND are passed to throwWiredTigerException.
+     */
+    %javamethodmodifiers search_near_wrap "protected";
+    enum SearchStatus search_near_wrap(JNIEnv *jenv, WT_ITEM *k) {
+        int cmp, ret;
+
+        $self->set_key($self, k);
+        /* Use the $self special variable, as every other wrapper does. */
+        ret = $self->search_near($self, &cmp);
+        if (ret != 0 && ret != WT_NOTFOUND)
+            throwWiredTigerException(jenv, ret);
+        if (ret == 0)
+            return (cmp == 0 ? FOUND : cmp < 0 ? SMALLER : LARGER);
+        return (NOTFOUND);
+    }
+
+    /* Set key and value, then call update and return its result. */
+    %javamethodmodifiers update_wrap "protected";
+    int update_wrap(WT_ITEM *k, WT_ITEM *v) {
+        $self->set_key($self, k);
+        $self->set_value($self, v);
+        return $self->update($self);
+    }
+
+    /*
+     * Compare this cursor's position with another cursor's. A non-zero
+     * result from compare is passed to throwWiredTigerException.
+     */
+    int compare_wrap(JNIEnv *jenv, WT_CURSOR *other) {
+        int cmp, ret;
+
+        /* Give the return value a defined state for the failure path. */
+        cmp = 0;
+        ret = $self->compare($self, other, &cmp);
+        if (ret != 0)
+            throwWiredTigerException(jenv, ret);
+        return cmp;
+    }
+
+    /*
+     * Record a global reference to the Java Cursor object in the cursor's
+     * callback data and release the local reference that was passed in.
+     */
+    %javamethodmodifiers java_init "protected";
+    int java_init(jobject jcursor) {
+        JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)$self->lang_private;
+        jcb->jobj = JCALL1(NewGlobalRef, jcb->jnienv, jcursor);
+        JCALL1(DeleteLocalRef, jcb->jnienv, jcursor);
+        return (0);
+    }
+}
+
+/*
+ * Java class body for Cursor: besides the usual SWIG pointer fields it
+ * keeps the key/value format strings and the packers/unpackers built
+ * from them.
+ */
+%typemap(javabody) struct __wt_cursor %{
+    private long swigCPtr;
+    protected boolean swigCMemOwn;
+    protected String keyFormat;
+    protected String valueFormat;
+    protected PackOutputStream keyPacker;
+    protected PackOutputStream valuePacker;
+    protected PackInputStream keyUnpacker;
+    protected PackInputStream valueUnpacker;
+
+    protected $javaclassname(long cPtr, boolean cMemoryOwn) {
+        this.swigCMemOwn = cMemoryOwn;
+        this.swigCPtr = cPtr;
+        // The format getters need swigCPtr, so it is assigned first.
+        this.keyFormat = getKey_format();
+        this.valueFormat = getValue_format();
+        this.keyPacker = new PackOutputStream(this.keyFormat);
+        this.valuePacker = new PackOutputStream(this.valueFormat);
+        wiredtigerJNI.Cursor_java_init(swigCPtr, this, this);
+    }
+
+    protected static long getCPtr($javaclassname obj) {
+        if (obj == null)
+            return 0;
+        return obj.swigCPtr;
+    }
+%}
+
+%typemap(javacode) struct __wt_cursor %{
+
+ /**
+ * Retrieve the format string for this cursor's key.
+ */
+ public String getKeyFormat() {
+ return keyFormat;
+ }
+
+ /**
+ * Retrieve the format string for this cursor's value.
+ */
+ public String getValueFormat() {
+ return valueFormat;
+ }
+
+ /**
+ * Append a byte to the cursor's key.
+ *
+ * \param value The value to append.
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyByte(byte value)
+ throws WiredTigerPackingException {
+ keyPacker.addByte(value);
+ return this;
+ }
+
+ /**
+ * Append a byte array to the cursor's key.
+ *
+ * \param value The value to append.
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyByteArray(byte[] value)
+ throws WiredTigerPackingException {
+ this.putKeyByteArray(value, 0, value.length);
+ return this;
+ }
+
+ /**
+ * Append a byte array to the cursor's key.
+ *
+ * \param value The value to append.
+ * \param off The offset into value at which to start.
+ * \param len The length of the byte array.
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyByteArray(byte[] value, int off, int len)
+ throws WiredTigerPackingException {
+ keyPacker.addByteArray(value, off, len);
+ return this;
+ }
+
+ /**
+ * Append an integer to the cursor's key.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyInt(int value)
+ throws WiredTigerPackingException {
+ keyPacker.addInt(value);
+ return this;
+ }
+
+ /**
+ * Append a long to the cursor's key.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyLong(long value)
+ throws WiredTigerPackingException {
+ keyPacker.addLong(value);
+ return this;
+ }
+
+ /**
+ * Append a short integer to the cursor's key.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyShort(short value)
+ throws WiredTigerPackingException {
+ keyPacker.addShort(value);
+ return this;
+ }
+
+ /**
+ * Append a string to the cursor's key.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyString(String value)
+ throws WiredTigerPackingException {
+ keyPacker.addString(value);
+ return this;
+ }
+
+ /**
+ * Append a byte to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueByte(byte value)
+ throws WiredTigerPackingException {
+ valuePacker.addByte(value);
+ return this;
+ }
+
+ /**
+ * Append a byte array to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueByteArray(byte[] value)
+ throws WiredTigerPackingException {
+ this.putValueByteArray(value, 0, value.length);
+ return this;
+ }
+
+ /**
+ * Append a byte array to the cursor's value.
+ *
+ * \param value The value to append
+ * \param off The offset into value at which to start.
+ * \param len The length of the byte array.
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueByteArray(byte[] value, int off, int len)
+ throws WiredTigerPackingException {
+ valuePacker.addByteArray(value, off, len);
+ return this;
+ }
+
+ /**
+ * Append an integer to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueInt(int value)
+ throws WiredTigerPackingException {
+ valuePacker.addInt(value);
+ return this;
+ }
+
+ /**
+ * Append a long to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueLong(long value)
+ throws WiredTigerPackingException {
+ valuePacker.addLong(value);
+ return this;
+ }
+
+ /**
+ * Append a short integer to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueShort(short value)
+ throws WiredTigerPackingException {
+ valuePacker.addShort(value);
+ return this;
+ }
+
+ /**
+ * Append a string to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueString(String value)
+ throws WiredTigerPackingException {
+ valuePacker.addString(value);
+ return this;
+ }
+
+ /**
+ * Retrieve a byte from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public byte getKeyByte()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getByte();
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's key.
+ *
+ * \param output The byte array where the returned value will be stored.
+ * The array should be large enough to store the entire
+ * data item, if not a truncated value will be returned.
+ */
+ public void getKeyByteArray(byte[] output)
+ throws WiredTigerPackingException {
+ this.getKeyByteArray(output, 0, output.length);
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's key.
+ *
+ * \param output The byte array where the returned value will be stored.
+ * \param off Offset into the destination buffer to start copying into.
+ * \param len The length should be large enough to store the entire
+ * data item, if not a truncated value will be returned.
+ */
+ public void getKeyByteArray(byte[] output, int off, int len)
+ throws WiredTigerPackingException {
+ keyUnpacker.getByteArray(output, off, len);
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public byte[] getKeyByteArray()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getByteArray();
+ }
+
+ /**
+ * Retrieve an integer from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public int getKeyInt()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getInt();
+ }
+
+ /**
+ * Retrieve a long from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public long getKeyLong()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getLong();
+ }
+
+ /**
+ * Retrieve a short integer from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public short getKeyShort()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getShort();
+ }
+
+ /**
+ * Retrieve a string from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public String getKeyString()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getString();
+ }
+
+ /**
+ * Retrieve a byte from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public byte getValueByte()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getByte();
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's value.
+ *
+ * \param output The byte array where the returned value will be stored.
+ * The array should be large enough to store the entire
+ * data item, if not a truncated value will be returned.
+ */
+ public void getValueByteArray(byte[] output)
+ throws WiredTigerPackingException {
+ this.getValueByteArray(output, 0, output.length);
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's value.
+ *
+ * \param output The byte array where the returned value will be stored.
+ * \param off Offset into the destination buffer to start copying into.
+ * \param len The length should be large enough to store the entire
+ * data item, if not a truncated value will be returned.
+ */
+ public void getValueByteArray(byte[] output, int off, int len)
+ throws WiredTigerPackingException {
+ valueUnpacker.getByteArray(output, off, len);
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public byte[] getValueByteArray()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getByteArray();
+ }
+
+ /**
+ * Retrieve an integer from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public int getValueInt()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getInt();
+ }
+
+ /**
+ * Retrieve a long from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public long getValueLong()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getLong();
+ }
+
+ /**
+ * Retrieve a short integer from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public short getValueShort()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getShort();
+ }
+
+ /**
+ * Retrieve a string from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public String getValueString()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getString();
+ }
+
+ /**
+ * Insert the cursor's current key/value into the table.
+ *
+ * \return The status of the operation.
+ */
+ public int insert()
+ throws WiredTigerException {
+ byte[] key = keyPacker.getValue();
+ byte[] value = valuePacker.getValue();
+ keyPacker.reset();
+ valuePacker.reset();
+ return insert_wrap(key, value);
+ }
+
+ /**
+ * Update the table with the cursor's current key/value.
+ *
+ * \return The status of the operation.
+ */
+ public int update()
+ throws WiredTigerException {
+ byte[] key = keyPacker.getValue();
+ byte[] value = valuePacker.getValue();
+ keyPacker.reset();
+ valuePacker.reset();
+ return update_wrap(key, value);
+ }
+
+ /**
+ * Remove the record identified by the cursor's current key from the table.
+ *
+ * The packed key is captured and the key packer is reset before the
+ * native remove is invoked.
+ *
+ * \return The status of the operation.
+ */
+ public int remove()
+ throws WiredTigerException {
+ byte[] key = keyPacker.getValue();
+ keyPacker.reset();
+ return remove_wrap(key);
+ }
+
+ /**
+ * Compare this cursor's position to another Cursor.
+ *
+ * \return The result of the comparison.
+ */
+ public int compare(Cursor other)
+ throws WiredTigerException {
+ return compare_wrap(other);
+ }
+
+ /**
+ * Retrieve the next item in the table.
+ *
+ * Both packers are reset. When the native call returns 0 the key and
+ * value unpackers are rebuilt from the cursor's new position; for any
+ * other return they are set to null.
+ *
+ * \return The status of the operation (0 on success).
+ */
+ public int next()
+ throws WiredTigerException {
+ int ret = next_wrap();
+ keyPacker.reset();
+ valuePacker.reset();
+ keyUnpacker = (ret == 0) ?
+ new PackInputStream(keyFormat, get_key_wrap()) : null;
+ valueUnpacker = (ret == 0) ?
+ new PackInputStream(valueFormat, get_value_wrap()) : null;
+ return ret;
+ }
+
+ /**
+ * Retrieve the previous item in the table.
+ *
+ * Both packers are reset. When the native call returns 0 the key and
+ * value unpackers are rebuilt from the cursor's new position; for any
+ * other return they are set to null.
+ *
+ * \return The status of the operation (0 on success).
+ */
+ public int prev()
+ throws WiredTigerException {
+ int ret = prev_wrap();
+ keyPacker.reset();
+ valuePacker.reset();
+ keyUnpacker = (ret == 0) ?
+ new PackInputStream(keyFormat, get_key_wrap()) : null;
+ valueUnpacker = (ret == 0) ?
+ new PackInputStream(valueFormat, get_value_wrap()) : null;
+ return ret;
+ }
+
+ /**
+ * Search for an item in the table, using the key currently held by the
+ * key packer.
+ *
+ * Both packers are reset after the native call. When it returns 0 the
+ * key and value unpackers are rebuilt from the cursor's position; for
+ * any other return they are set to null.
+ *
+ * \return The status of the operation (0 on success).
+ */
+ public int search()
+ throws WiredTigerException {
+ int ret = search_wrap(keyPacker.getValue());
+ keyPacker.reset();
+ valuePacker.reset();
+ keyUnpacker = (ret == 0) ?
+ new PackInputStream(keyFormat, get_key_wrap()) : null;
+ valueUnpacker = (ret == 0) ?
+ new PackInputStream(valueFormat, get_value_wrap()) : null;
+ return ret;
+ }
+
+ /**
+ * Search for an item near the key currently held by the key packer
+ * (delegates to search_near_wrap).
+ *
+ * Both packers are reset after the native call. Unless the returned
+ * status is SearchStatus.NOTFOUND, the key and value unpackers are
+ * rebuilt from the cursor's position; on NOTFOUND they are set to null.
+ *
+ * \return The SearchStatus reported by the native search.
+ */
+ public SearchStatus search_near()
+ throws WiredTigerException {
+ SearchStatus ret = search_near_wrap(keyPacker.getValue());
+ keyPacker.reset();
+ valuePacker.reset();
+ keyUnpacker = (ret != SearchStatus.NOTFOUND) ?
+ new PackInputStream(keyFormat, get_key_wrap()) : null;
+ valueUnpacker = (ret != SearchStatus.NOTFOUND) ?
+ new PackInputStream(valueFormat, get_value_wrap()) : null;
+ return ret;
+ }
+%}
+
+/* Put a WiredTigerException on all wrapped methods. We'd like this
+ * to only apply to methods returning int. SWIG doesn't have a way
+ * to do this, so we remove the exception for simple getters and such.
+ */
+%javaexception("com.wiredtiger.db.WiredTigerException") { $action; }
+%javaexception("") wiredtiger_strerror { $action; }
+%javaexception("") __wt_async_op::connection { $action; }
+%javaexception("") __wt_async_op::get_type { $action; }
+%javaexception("") __wt_async_op::get_id { $action; }
+%javaexception("") __wt_async_op::key_format { $action; }
+%javaexception("") __wt_async_op::value_format { $action; }
+%javaexception("") __wt_connection::get_home { $action; }
+%javaexception("") __wt_connection::is_new { $action; }
+%javaexception("") __wt_connection::java_init { $action; }
+%javaexception("") __wt_cursor::key_format { $action; }
+%javaexception("") __wt_cursor::session { $action; }
+%javaexception("") __wt_cursor::uri { $action; }
+%javaexception("") __wt_cursor::value_format { $action; }
+%javaexception("") __wt_session::connection { $action; }
+%javaexception("") __wt_session::java_init { $action; }
+
+/* Remove / rename parts of the C API that we don't want in Java. */
+%immutable __wt_cursor::session;
+%immutable __wt_cursor::uri;
+%immutable __wt_cursor::key_format;
+%immutable __wt_cursor::value_format;
+%immutable __wt_session::connection;
+
+%ignore __wt_collator;
+%ignore __wt_connection::add_collator;
+%ignore __wt_compressor;
+%ignore __wt_connection::add_compressor;
+%ignore __wt_data_source;
+%ignore __wt_connection::add_data_source;
+%ignore __wt_event_handler;
+%ignore __wt_extractor;
+%ignore __wt_connection::add_extractor;
+%ignore __wt_item;
+%ignore __wt_lsn;
+%ignore __wt_session::msg_printf;
+
+%ignore wiredtiger_struct_pack;
+%ignore wiredtiger_struct_size;
+%ignore wiredtiger_struct_unpack;
+
+%ignore wiredtiger_version;
+
+%ignore __wt_connection::get_extension_api;
+%ignore wiredtiger_extension_init;
+%ignore wiredtiger_extension_terminate;
+
+%define REQUIRE_WRAP(typedefname, name, javaname)
+%ignore name;
+%javamethodmodifiers name##_wrap "
+ /**
+ * @copydoc typedefname
+ */
+ public ";
+%rename(javaname) name##_wrap;
+%enddef
+
+REQUIRE_WRAP(::wiredtiger_open, wiredtiger_open, open)
+REQUIRE_WRAP(WT_CONNECTION::async_new_op,
+ __wt_connection::async_new_op, async_new_op)
+REQUIRE_WRAP(WT_CONNECTION::open_session,
+ __wt_connection::open_session, open_session)
+REQUIRE_WRAP(WT_SESSION::transaction_pinned_range,
+ __wt_session::transaction_pinned_range, transaction_pinned_range)
+REQUIRE_WRAP(WT_SESSION::open_cursor, __wt_session::open_cursor, open_cursor)
+REQUIRE_WRAP(WT_ASYNC_OP::get_id, __wt_async_op::get_id,getId)
+
+%rename(AsyncOp) __wt_async_op;
+%rename(Cursor) __wt_cursor;
+%rename(Session) __wt_session;
+%rename(Connection) __wt_connection;
+
+%define TRACKED_CLASS(jclassname, ctypename, java_init_fcn, implclass)
+%ignore jclassname::jclassname();
+
+%typemap(javabody) struct ctypename %{
+ private long swigCPtr;
+ protected boolean swigCMemOwn;
+
+ protected $javaclassname(long cPtr, boolean cMemoryOwn) {
+ swigCMemOwn = cMemoryOwn;
+ swigCPtr = cPtr;
+ java_init_fcn(swigCPtr, this, this);
+ }
+
+ protected static long getCPtr($javaclassname obj) {
+ return (obj == null) ? 0 : obj.swigCPtr;
+ }
+%}
+
+%extend ctypename {
+ %javamethodmodifiers java_init "protected";
+ int java_init(jobject jsess) {
+ implclass *session = (implclass *)$self;
+ JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)session->lang_private;
+ jcb->jobj = JCALL1(NewGlobalRef, jcb->jnienv, jsess);
+ JCALL1(DeleteLocalRef, jcb->jnienv, jsess);
+ return (0);
+ }
+}
+%enddef
+
+TRACKED_CLASS(Session, __wt_session, wiredtigerJNI.Session_java_init, WT_SESSION_IMPL)
+TRACKED_CLASS(Connection, __wt_connection, wiredtigerJNI.Connection_java_init, WT_CONNECTION_IMPL)
+/* Note: Cursor incorporates the elements of TRACKED_CLASS into its
+ * custom constructor and %extend clause.
+ */
+
+%include "wiredtiger.h"
+
+/* Return new connections, sessions and cursors. */
+%inline {
+/*
+ * wiredtiger_open_wrap --
+ *	Open a connection using the Java event handler, then allocate a
+ *	zeroed JAVA_CALLBACK, record the JNI environment in it and attach it
+ *	to the connection's lang_private field. Any non-zero return is
+ *	reported by throwing a WiredTigerException.
+ *
+ * NOTE(review): if the JAVA_CALLBACK allocation fails after
+ * wiredtiger_open() succeeded, the connection is not closed here and the
+ * non-NULL handle is still returned alongside the exception -- confirm
+ * whether the connection should be closed on that path.
+ */
+WT_CONNECTION *wiredtiger_open_wrap(JNIEnv *jenv, const char *home, const char *config) {
+ extern WT_EVENT_HANDLER javaApiEventHandler;
+ WT_CONNECTION *conn = NULL;
+ WT_CONNECTION_IMPL *connimpl;
+ JAVA_CALLBACK *jcb;
+ int ret;
+ if ((ret = wiredtiger_open(home, &javaApiEventHandler, config, &conn)) != 0)
+ goto err;
+
+ connimpl = (WT_CONNECTION_IMPL *)conn;
+ if ((ret = __wt_calloc_def(connimpl->default_session, 1, &jcb)) != 0)
+ goto err;
+
+ jcb->jnienv = jenv;
+ connimpl->lang_private = jcb;
+
+err: if (ret != 0)
+ throwWiredTigerException(jenv, ret);
+ return conn;
+}
+}
+
+%extend __wt_connection {
+ /*
+ * Allocate an async operation bound to the Java async handler, then
+ * attach a JAVA_CALLBACK holding the JNI environment, the JavaVM and a
+ * global reference to the caller's callback object. The operation is
+ * switched to raw mode (WT_CURSTD_RAW). Any non-zero return is reported
+ * by throwing a WiredTigerException.
+ *
+ * NOTE(review): the return value of GetJavaVM is not checked, and if
+ * the JAVA_CALLBACK allocation fails the already-allocated async op is
+ * still returned alongside the exception -- confirm both are intended.
+ */
+ WT_ASYNC_OP *async_new_op_wrap(JNIEnv *jenv, const char *uri,
+ const char *config, jobject callbackObject) {
+ extern WT_ASYNC_CALLBACK javaApiAsyncHandler;
+ WT_ASYNC_OP *asyncop = NULL;
+ WT_CONNECTION_IMPL *connimpl;
+ JAVA_CALLBACK *jcb;
+ int ret;
+
+ if ((ret = $self->async_new_op($self, uri, config, &javaApiAsyncHandler, &asyncop)) != 0)
+ goto err;
+
+ connimpl = (WT_CONNECTION_IMPL *)$self;
+ if ((ret = __wt_calloc_def(connimpl->default_session, 1, &jcb)) != 0)
+ goto err;
+
+ jcb->jnienv = jenv;
+ (*jenv)->GetJavaVM(jenv, &jcb->javavm);
+ /* Promote the callback to a global reference, then drop the local one. */
+ jcb->jcallback = JCALL1(NewGlobalRef, jcb->jnienv, callbackObject);
+ JCALL1(DeleteLocalRef, jcb->jnienv, callbackObject);
+ asyncop->c.lang_private = jcb;
+ asyncop->c.flags |= WT_CURSTD_RAW;
+
+err: if (ret != 0)
+ throwWiredTigerException(jenv, ret);
+ return asyncop;
+ }
+}
+
+%extend __wt_connection {
+ /*
+ * Open a session using the Java event handler, then allocate a zeroed
+ * JAVA_CALLBACK from that session, record the JNI environment in it and
+ * attach it to the session's lang_private field. Any non-zero return is
+ * reported by throwing a WiredTigerException.
+ *
+ * NOTE(review): if the JAVA_CALLBACK allocation fails, the session
+ * opened above is not closed here and is still returned alongside the
+ * exception -- confirm this is intended.
+ */
+ WT_SESSION *open_session_wrap(JNIEnv *jenv, const char *config) {
+ extern WT_EVENT_HANDLER javaApiEventHandler;
+ WT_SESSION *session = NULL;
+ WT_SESSION_IMPL *sessionimpl;
+ JAVA_CALLBACK *jcb;
+ int ret;
+
+ if ((ret = $self->open_session($self, &javaApiEventHandler, config, &session)) != 0)
+ goto err;
+
+ sessionimpl = (WT_SESSION_IMPL *)session;
+ if ((ret = __wt_calloc_def(sessionimpl, 1, &jcb)) != 0)
+ goto err;
+
+ jcb->jnienv = jenv;
+ sessionimpl->lang_private = jcb;
+
+err: if (ret != 0)
+ throwWiredTigerException(jenv, ret);
+ return session;
+ }
+}
+
+%extend __wt_session {
+ /*
+ * Open a cursor (optionally duplicating to_dup), switch it to raw mode
+ * (WT_CURSTD_RAW), then allocate a zeroed JAVA_CALLBACK from the
+ * cursor's session, record the JNI environment in it and attach it to
+ * the cursor's lang_private field. Any non-zero return is reported by
+ * throwing a WiredTigerException.
+ *
+ * NOTE(review): if the JAVA_CALLBACK allocation fails, the cursor
+ * opened above is not closed here and is still returned alongside the
+ * exception -- confirm this is intended.
+ */
+ WT_CURSOR *open_cursor_wrap(JNIEnv *jenv, const char *uri, WT_CURSOR_NULLABLE *to_dup, const char *config) {
+ WT_CURSOR *cursor = NULL;
+ JAVA_CALLBACK *jcb;
+ int ret;
+
+ if ((ret = $self->open_cursor($self, uri, to_dup, config, &cursor)) != 0)
+ goto err;
+
+ cursor->flags |= WT_CURSTD_RAW;
+
+ if ((ret = __wt_calloc_def((WT_SESSION_IMPL *)cursor->session,
+ 1, &jcb)) != 0)
+ goto err;
+
+ jcb->jnienv = jenv;
+ cursor->lang_private = jcb;
+
+err: if (ret != 0)
+ throwWiredTigerException(jenv, ret);
+ return cursor;
+ }
+}
+
+%extend __wt_async_op {
+ long get_id_wrap(JNIEnv *jenv) {
+ WT_UNUSED(jenv);
+ return (self->get_id(self));
+ }
+}
+
+%extend __wt_session {
+ /*
+ * Return the session's pinned transaction range, throwing a
+ * WiredTigerException when the underlying call returns non-zero (in
+ * which case the value returned is the initial 0 unless the call
+ * modified it).
+ *
+ * NOTE(review): the range is a uint64_t but is returned through a C
+ * 'long'; confirm no truncation is possible on platforms where long is
+ * narrower than 64 bits.
+ */
+ long transaction_pinned_range_wrap(JNIEnv *jenv) {
+ int ret;
+ uint64_t range = 0;
+ ret = self->transaction_pinned_range(self, &range);
+ if (ret != 0)
+ throwWiredTigerException(jenv, ret);
+ return range;
+ }
+}
diff --git a/src/third_party/wiredtiger/lang/python/Makefile.am b/src/third_party/wiredtiger/lang/python/Makefile.am
new file mode 100644
index 00000000000..03c65a57028
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/python/Makefile.am
@@ -0,0 +1,28 @@
+PYSRC = $(top_srcdir)/lang/python
+PYDIRS = -t $(abs_builddir) -I $(abs_top_srcdir):$(abs_top_builddir) -L $(abs_top_builddir)/.libs
+all-local: _wiredtiger.so
+
+# We keep generated Python sources under lang/python: that's where they live
+# in release packages.
+$(PYSRC)/wiredtiger_wrap.c: $(top_srcdir)/src/include/wiredtiger.in $(PYSRC)/wiredtiger.i
+ (cd $(PYSRC) && \
+ $(SWIG) -python -threads -O -Wall -nodefaultctor -nodefaultdtor -I$(abs_top_builddir) wiredtiger.i && \
+ mv wiredtiger.py wiredtiger/__init__.py)
+
+_wiredtiger.so: $(top_builddir)/libwiredtiger.la $(PYSRC)/wiredtiger_wrap.c
+ (cd $(PYSRC) && \
+ $(PYTHON) setup.py build_ext -f -b $(abs_builddir) $(PYDIRS))
+
+install-exec-local:
+ (cd $(PYSRC) && \
+ $(PYTHON) setup.py build_py -d $(abs_builddir)/build && \
+ $(PYTHON) setup.py build_ext -f -b $(abs_builddir)/build $(PYDIRS) && \
+ $(PYTHON) setup.py install_lib -b $(abs_builddir)/build --skip-build $(PYTHON_INSTALL_ARG))
+
+# We build in different places for an install vs running from the tree:
+# clean up both. Don't rely on "setup.py clean" -- everything that should
+# be removed is created under the build directory.
+clean-local:
+ rm -rf build _wiredtiger.so wiredtiger_wrap.o WT_TEST
+
+TESTS = run-ex_access
diff --git a/src/third_party/wiredtiger/lang/python/run-ex_access b/src/third_party/wiredtiger/lang/python/run-ex_access
new file mode 100755
index 00000000000..a6f8348e9fd
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/python/run-ex_access
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+rm -rf WT_TEST ; mkdir WT_TEST
+
+exec env LD_LIBRARY_PATH=../../.libs DYLD_LIBRARY_PATH=../../.libs PYTHONPATH=.:${srcdir} python ${srcdir}/../../examples/python/ex_access.py
diff --git a/src/third_party/wiredtiger/lang/python/setup.py b/src/third_party/wiredtiger/lang/python/setup.py
new file mode 100644
index 00000000000..7d99c872bc3
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/python/setup.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+import re, os, sys
+from distutils.core import setup, Extension
+
+# OS X hack: turn off the Universal binary support that is built into the
+# Python build machinery, just build for the default CPU architecture.
+if not 'ARCHFLAGS' in os.environ:
+ os.environ['ARCHFLAGS'] = ''
+
+# Suppress warnings building SWIG generated code
+extra_cflags = [ '-w' ]
+
+dir = os.path.dirname(__file__)
+
+# Read the version information from the RELEASE_INFO file
+for l in open(os.path.join(dir, '..', '..', 'RELEASE_INFO')):
+ if re.match(r'WIREDTIGER_VERSION_(?:MAJOR|MINOR|PATCH)=', l):
+ exec(l)
+
+wt_ver = '%d.%d' % (WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR)
+
+setup(name='wiredtiger', version=wt_ver,
+ ext_modules=[Extension('_wiredtiger',
+ [os.path.join(dir, 'wiredtiger_wrap.c')],
+ libraries=['wiredtiger'],
+ extra_compile_args=extra_cflags,
+ )],
+ package_dir={'' : dir},
+ packages=['wiredtiger'],
+)
diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger.i b/src/third_party/wiredtiger/lang/python/wiredtiger.i
new file mode 100644
index 00000000000..5e88855276a
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/python/wiredtiger.i
@@ -0,0 +1,1155 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * wiredtiger.i
+ * The SWIG interface file defining the wiredtiger python API.
+ */
+%define DOCSTRING
+"@defgroup wt_python WiredTiger Python API
+Python wrappers around the WiredTiger C API.
+@{
+@cond IGNORE"
+%enddef
+
+%module(docstring=DOCSTRING) wiredtiger
+
+%feature("autodoc", "0");
+
+%pythoncode %{
+from packing import pack, unpack
+## @endcond
+%}
+
+/* Set the input argument to point to a temporary variable */
+%typemap(in, numinputs=0) WT_CONNECTION ** (WT_CONNECTION *temp = NULL) {
+ $1 = &temp;
+}
+%typemap(in, numinputs=0) WT_SESSION ** (WT_SESSION *temp = NULL) {
+ $1 = &temp;
+}
+%typemap(in, numinputs=0) WT_ASYNC_OP ** (WT_ASYNC_OP *temp = NULL) {
+ $1 = &temp;
+}
+%typemap(in, numinputs=0) WT_CURSOR ** (WT_CURSOR *temp = NULL) {
+ $1 = &temp;
+}
+
+%typemap(in) WT_ASYNC_CALLBACK * (PyObject *callback_obj = NULL) %{
+ callback_obj = $input;
+ $1 = &pyApiAsyncCallback;
+%}
+
+%typemap(in, numinputs=0) WT_EVENT_HANDLER * %{
+ $1 = &pyApiEventHandler;
+%}
+
+/* Set the return value to the returned connection, session, or cursor */
+%typemap(argout) WT_CONNECTION ** {
+ $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
+ SWIGTYPE_p___wt_connection, 0);
+}
+%typemap(argout) WT_SESSION ** {
+ $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
+ SWIGTYPE_p___wt_session, 0);
+ if (*$1 != NULL) {
+ PY_CALLBACK *pcb;
+
+ if (__wt_calloc_def((WT_SESSION_IMPL *)(*$1), 1, &pcb) != 0)
+ SWIG_exception_fail(SWIG_MemoryError, "WT calloc failed");
+ else {
+ Py_XINCREF($result);
+ pcb->pyobj = $result;
+ ((WT_SESSION_IMPL *)(*$1))->lang_private = pcb;
+ }
+ }
+}
+/*
+ * Return the new async operation as the Python result. For a non-NULL
+ * handle: switch it to raw mode, publish is_column / key_format /
+ * value_format as attributes on the Python object, and attach a PY_CALLBACK
+ * that holds references to both the Python object and the user's callback.
+ *
+ * NOTE(review): the first argument to __wt_calloc_def below is the async-op
+ * handle cast to WT_ASYNC_OP_IMPL *, whereas the session and cursor typemaps
+ * pass a WT_SESSION_IMPL * -- confirm this is intended.
+ * NOTE(review): the objects created by PyBool_FromLong and
+ * PyString_InternFromString are not released after PyObject_SetAttrString
+ * -- confirm whether a Py_DECREF is needed.
+ */
+%typemap(argout) WT_ASYNC_OP ** {
+ $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
+ SWIGTYPE_p___wt_async_op, 0);
+ if (*$1 != NULL) {
+ PY_CALLBACK *pcb;
+
+ (*$1)->c.flags |= WT_CURSTD_RAW;
+ PyObject_SetAttrString($result, "is_column",
+ PyBool_FromLong(strcmp((*$1)->key_format, "r") == 0));
+ PyObject_SetAttrString($result, "key_format",
+ PyString_InternFromString((*$1)->key_format));
+ PyObject_SetAttrString($result, "value_format",
+ PyString_InternFromString((*$1)->value_format));
+
+ if (__wt_calloc_def((WT_ASYNC_OP_IMPL *)(*$1), 1, &pcb) != 0)
+ SWIG_exception_fail(SWIG_MemoryError, "WT calloc failed");
+ else {
+ pcb->pyobj = $result;
+ Py_XINCREF(pcb->pyobj);
+ /* XXX Is there a way to avoid SWIG's numbering? */
+ /* callback_obj is the local declared by the WT_ASYNC_CALLBACK 'in' typemap; the 5 suffix is SWIG's argument number. */
+ pcb->pyasynccb = callback_obj5;
+ Py_XINCREF(pcb->pyasynccb);
+ (*$1)->c.lang_private = pcb;
+ }
+ }
+}
+
+%typemap(argout) WT_CURSOR ** {
+ $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
+ SWIGTYPE_p___wt_cursor, 0);
+ if (*$1 != NULL) {
+ PY_CALLBACK *pcb;
+ uint32_t json;
+
+ json = (*$1)->flags & WT_CURSTD_DUMP_JSON;
+ if (!json)
+ (*$1)->flags |= WT_CURSTD_RAW;
+ PyObject_SetAttrString($result, "is_json",
+ PyBool_FromLong(json != 0));
+ PyObject_SetAttrString($result, "is_column",
+ PyBool_FromLong(strcmp((*$1)->key_format, "r") == 0));
+ PyObject_SetAttrString($result, "key_format",
+ PyString_InternFromString((*$1)->key_format));
+ PyObject_SetAttrString($result, "value_format",
+ PyString_InternFromString((*$1)->value_format));
+
+ if (__wt_calloc_def((WT_SESSION_IMPL *)(*$1)->session, 1, &pcb) != 0)
+ SWIG_exception_fail(SWIG_MemoryError, "WT calloc failed");
+ else {
+ Py_XINCREF($result);
+ pcb->pyobj = $result;
+ (*$1)->lang_private = pcb;
+ }
+ }
+}
+
+/*
+ * 64 bit typemaps.
+ *
+ * On input, PyLong_AsUnsignedLongLong returns (unsigned long long)-1 both for
+ * that legitimate value and on failure (for example a non-integer or negative
+ * argument), so a pending Python exception is checked to tell the two apart
+ * and the wrapper fails instead of calling into WiredTiger with a bogus value.
+ */
+%typemap(in) uint64_t {
+	$1 = PyLong_AsUnsignedLongLong($input);
+	if ($1 == (uint64_t)-1 && PyErr_Occurred() != NULL)
+		SWIG_fail;
+}
+%typemap(out) uint64_t {
+	$result = PyLong_FromUnsignedLongLong($1);
+}
+
+/* Throw away references after close. */
+%define DESTRUCTOR(class, method)
+%feature("shadow") class::method %{
+ def method(self, *args):
+ '''close(self, config) -> int
+
+ @copydoc class::method'''
+ try:
+ self._freecb()
+ return $action(self, *args)
+ finally:
+ self.this = None
+%}
+%enddef
+DESTRUCTOR(__wt_connection, close)
+DESTRUCTOR(__wt_cursor, close)
+DESTRUCTOR(__wt_session, close)
+
+/* Don't require empty config strings. */
+%typemap(default) const char *config { $1 = NULL; }
+%typemap(default) WT_CURSOR *to_dup { $1 = NULL; }
+
+/*
+ * Error returns other than WT_NOTFOUND generate an exception.
+ * Use our own exception type, in future tailored to the kind
+ * of error.
+ */
+%header %{
+
+#include "src/include/wt_internal.h"
+
+/*
+ * Closed handle checking:
+ *
+ * The typedef WT_CURSOR_NULLABLE used in wiredtiger.h is only made
+ * visible to the SWIG parser and is used to identify arguments of
+ * Cursor type that are permitted to be null. Likewise, typedefs
+ * WT_{CURSOR,SESSION,CONNECTION}_CLOSED identify 'close' calls that
+ * need explicit nulling of the swigCPtr. We do not match the *_CLOSED
+ * typedefs in Python SWIG, as we already have special cased 'close' methods.
+ *
+ * We want SWIG to see these 'fake' typenames, but not the compiler.
+ */
+#define WT_CURSOR_NULLABLE WT_CURSOR
+#define WT_CURSOR_CLOSED WT_CURSOR
+#define WT_SESSION_CLOSED WT_SESSION
+#define WT_CONNECTION_CLOSED WT_CONNECTION
+
+/*
+ * For Connections, Sessions and Cursors created in Python, each of
+ * WT_CONNECTION_IMPL, WT_SESSION_IMPL and WT_CURSOR have a
+ * lang_private field that store a pointer to a PY_CALLBACK, alloced
+ * during the various open calls. {conn,session,cursor}CloseHandler()
+ * functions reach into the associated Python object, set the 'this'
+ * attribute to None, and free the PY_CALLBACK.
+ */
+typedef struct {
+ PyObject *pyobj; /* the python Session/Cursor/AsyncOp object */
+ PyObject *pyasynccb; /* the callback to use for AsyncOp */
+} PY_CALLBACK;
+
+static PyObject *wtError;
+
+static int sessionFreeHandler(WT_SESSION *session_arg);
+static int cursorFreeHandler(WT_CURSOR *cursor_arg);
+%}
+
+%init %{
+ /*
+ * Create an exception type and put it into the _wiredtiger module.
+ * First increment the reference count because PyModule_AddObject
+ * decrements it. Then note that "m" is the local variable for the
+ * module in the SWIG generated code. If there is a SWIG variable for
+ * this, I haven't found it.
+ */
+ wtError = PyErr_NewException("_wiredtiger.WiredTigerError", NULL, NULL);
+ Py_INCREF(wtError);
+ PyModule_AddObject(m, "WiredTigerError", wtError);
+%}
+
+%pythoncode %{
+WiredTigerError = _wiredtiger.WiredTigerError
+
+## @cond DISABLE
+# Implements the iterable contract
+class IterableCursor:
+ def __init__(self, cursor):
+ self.cursor = cursor
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ if self.cursor.next() == WT_NOTFOUND:
+ raise StopIteration
+ return self.cursor.get_keys() + self.cursor.get_values()
+## @endcond
+
+# An abstract class, which must be subclassed with notify() overridden.
+# The base __init__ raises NotImplementedError as well, so a subclass has to
+# provide its own __init__ (without calling this one) to be instantiable.
+class AsyncCallback:
+ def __init__(self):
+ raise NotImplementedError
+
+ # Must be overridden by subclasses; the base version always raises.
+ def notify(self, op, op_ret, flags):
+ raise NotImplementedError
+
+%}
+
+/* Bail out if arg or arg.this is None, else set res to the C pointer. */
+%define CONVERT_WITH_NULLCHECK(argp, res)
+ if ($input == Py_None) {
+ SWIG_exception_fail(SWIG_NullReferenceError,
+ "in method '$symname', "
+ "argument $argnum of type '$type' is None");
+ } else {
+ res = SWIG_ConvertPtr($input, &argp, $descriptor, $disown | 0);
+ if (!SWIG_IsOK(res)) {
+ if (SWIG_Python_GetSwigThis($input) == 0) {
+ SWIG_exception_fail(SWIG_NullReferenceError,
+ "in method '$symname', "
+ "argument $argnum of type '$type' is None");
+ } else {
+ SWIG_exception_fail(SWIG_ArgError(res),
+ "in method '$symname', "
+ "argument $argnum of type '$type'");
+ }
+ }
+ }
+%enddef
+
+/*
+ * Extra 'self' elimination.
+ * The methods we're wrapping look like this:
+ * struct __wt_xxx {
+ * int method(WT_XXX *, ...otherargs...);
+ * };
+ * To SWIG, that is equivalent to:
+ * int method(struct __wt_xxx *self, WT_XXX *, ...otherargs...);
+ * and we use consecutive argument matching of typemaps to convert two args to
+ * one.
+ */
+%define SELFHELPER(type, name)
+%typemap(in) (type *self, type *name) (void *argp = 0, int res = 0) %{
+ CONVERT_WITH_NULLCHECK(argp, res)
+ $2 = $1 = ($ltype)(argp);
+%}
+%typemap(in) type ## _NULLABLE * {
+ $1 = *(type **)&$input;
+}
+
+%enddef
+
+SELFHELPER(struct __wt_connection, connection)
+SELFHELPER(struct __wt_async_op, op)
+SELFHELPER(struct __wt_session, session)
+SELFHELPER(struct __wt_cursor, cursor)
+
+ /*
+ * Create an error exception if it has not already
+ * been done.
+ */
+%define SWIG_ERROR_IF_NOT_SET(result)
+do {
+ if (PyErr_Occurred() == NULL) {
+ /* We could use PyErr_SetObject for more complex reporting. */
+ SWIG_SetErrorMsg(wtError, wiredtiger_strerror(result));
+ }
+ SWIG_fail;
+} while(0)
+%enddef
+
+/* Error handling. Default case: a non-zero return is an error. */
+%exception {
+ $action
+ if (result != 0)
+ SWIG_ERROR_IF_NOT_SET(result);
+}
+
+/* Async operations can return EBUSY when no ops are available. */
+%define EBUSY_OK(m)
+%exception m {
+retry:
+ $action
+ if (result != 0 && result != EBUSY)
+ SWIG_ERROR_IF_NOT_SET(result);
+ else if (result == EBUSY) {
+ __wt_sleep(0, 10000);
+ goto retry;
+ }
+}
+%enddef
+
+/* Any API that returns an enum type uses this. */
+%define ENUM_OK(m)
+%exception m {
+ $action
+}
+%enddef
+
+/* Cursor positioning methods can also return WT_NOTFOUND. */
+%define NOTFOUND_OK(m)
+%exception m {
+ $action
+ if (result != 0 && result != WT_NOTFOUND)
+ SWIG_ERROR_IF_NOT_SET(result);
+}
+%enddef
+
+/* Cursor compare can return any of -1, 0, 1 or WT_NOTFOUND. */
+%define COMPARE_OK(m)
+%exception m {
+ $action
+ if ((result < -1 || result > 1) && result != WT_NOTFOUND)
+ SWIG_ERROR_IF_NOT_SET(result);
+}
+%enddef
+
+EBUSY_OK(__wt_connection::async_new_op)
+ENUM_OK(__wt_async_op::get_type)
+NOTFOUND_OK(__wt_cursor::next)
+NOTFOUND_OK(__wt_cursor::prev)
+NOTFOUND_OK(__wt_cursor::remove)
+NOTFOUND_OK(__wt_cursor::search)
+NOTFOUND_OK(__wt_cursor::update)
+
+COMPARE_OK(__wt_cursor::compare)
+COMPARE_OK(__wt_cursor::search_near)
+
+/* Lastly, some methods need no (additional) error checking. */
+%exception __wt_connection::get_home;
+%exception __wt_connection::is_new;
+%exception __wt_connection::search_near;
+%exception __wt_async_op::_set_key;
+%exception __wt_async_op::_set_value;
+%exception __wt_cursor::_set_key;
+%exception __wt_cursor::_set_key_str;
+%exception __wt_cursor::_set_value;
+%exception __wt_cursor::_set_value_str;
+%exception wiredtiger_strerror;
+%exception wiredtiger_version;
+%exception diagnostic_build;
+%exception verbose_build;
+
+/* WT_ASYNC_OP customization. */
+/* First, replace the varargs get / set methods with Python equivalents. */
+%ignore __wt_async_op::get_key;
+%ignore __wt_async_op::get_value;
+%ignore __wt_async_op::set_key;
+%ignore __wt_async_op::set_value;
+%immutable __wt_async_op::connection;
+
+/* WT_CURSOR customization. */
+/* First, replace the varargs get / set methods with Python equivalents. */
+%ignore __wt_cursor::get_key;
+%ignore __wt_cursor::get_value;
+%ignore __wt_cursor::set_key;
+%ignore __wt_cursor::set_value;
+
+/* Next, override methods that return integers via arguments. */
+%ignore __wt_cursor::compare(WT_CURSOR *, WT_CURSOR *, int *);
+%ignore __wt_cursor::search_near(WT_CURSOR *, int *);
+
+/* SWIG magic to turn Python byte strings into data / size. */
+%apply (char *STRING, int LENGTH) { (char *data, int size) };
+
+/*
+ * Handle binary data returns from get_key/value -- avoid cstring.i: it creates a list of returns.
+ * The temporaries are initialized so the argout typemap never reads an
+ * indeterminate pointer, and the (empty) cleanup typemap uses SWIG's real
+ * "freearg" method name so that it is actually applied.
+ */
+%typemap(in,numinputs=0) (char **datap, int *sizep) (char *data = NULL, int size = 0) { $1 = &data; $2 = &size; }
+%typemap(freearg) (char **datap, int *sizep) "";
+%typemap(argout) (char **datap, int *sizep) {
+	if (*$1)
+		$result = SWIG_FromCharPtrAndSize(*$1, *$2);
+}
+
+/* Handle record number returns from get_recno */
+%typemap(in,numinputs=0) (uint64_t *recnop) (uint64_t recno = 0) { $1 = &recno; }
+%typemap(freearg) (uint64_t *recnop) "";
+%typemap(argout) (uint64_t *recnop) { $result = PyLong_FromUnsignedLongLong(*$1); }
+
+%{
+typedef int int_void;
+%}
+typedef int int_void;
+%typemap(out) int_void { $result = VOID_Object; }
+
+%extend __wt_async_op {
+ /* Get / set keys and values */
+ void _set_key(char *data, int size) {
+ WT_ITEM k;
+ k.data = data;
+ k.size = (uint32_t)size;
+ $self->set_key($self, &k);
+ }
+
+ int_void _set_recno(uint64_t recno) {
+ WT_ITEM k;
+ uint8_t recno_buf[20];
+ size_t size;
+ int ret;
+ if ((ret = wiredtiger_struct_size(NULL,
+ &size, "r", recno)) != 0 ||
+ (ret = wiredtiger_struct_pack(NULL,
+ recno_buf, sizeof (recno_buf), "r", recno)) != 0)
+ return (ret);
+
+ k.data = recno_buf;
+ k.size = (uint32_t)size;
+ $self->set_key($self, &k);
+ return (ret);
+ }
+
+ void _set_value(char *data, int size) {
+ WT_ITEM v;
+ v.data = data;
+ v.size = (uint32_t)size;
+ $self->set_value($self, &v);
+ }
+
+ /* Don't return values, just throw exceptions on failure. */
+ int_void _get_key(char **datap, int *sizep) {
+ WT_ITEM k;
+ int ret = $self->get_key($self, &k);
+ if (ret == 0) {
+ *datap = (char *)k.data;
+ *sizep = (int)k.size;
+ }
+ return (ret);
+ }
+
+ int_void _get_recno(uint64_t *recnop) {
+ WT_ITEM k;
+ int ret = $self->get_key($self, &k);
+ if (ret == 0)
+ ret = wiredtiger_struct_unpack(NULL,
+ k.data, k.size, "q", recnop);
+ return (ret);
+ }
+
+ int_void _get_value(char **datap, int *sizep) {
+ WT_ITEM v;
+ int ret = $self->get_value($self, &v);
+ if (ret == 0) {
+ *datap = (char *)v.data;
+ *sizep = (int)v.size;
+ }
+ return (ret);
+ }
+
+ int _freecb() {
+ return (cursorFreeHandler($self));
+ }
+
+%pythoncode %{
+ def get_key(self):
+ '''get_key(self) -> object
+
+ @copydoc WT_ASYNC_OP::get_key
+ Returns only the first column.'''
+ k = self.get_keys()
+ if len(k) == 1:
+ return k[0]
+ return k
+
+ def get_keys(self):
+ '''get_keys(self) -> (object, ...)
+
+ @copydoc WT_ASYNC_OP::get_key'''
+ if self.is_column:
+ return [self._get_recno(),]
+ else:
+ return unpack(self.key_format, self._get_key())
+
+ def get_value(self):
+ '''get_value(self) -> object
+
+ @copydoc WT_ASYNC_OP::get_value
+ Returns only the first column.'''
+ v = self.get_values()
+ if len(v) == 1:
+ return v[0]
+ return v
+
+ def get_values(self):
+ '''get_values(self) -> (object, ...)
+
+ @copydoc WT_ASYNC_OP::get_value'''
+ return unpack(self.value_format, self._get_value())
+
+ def set_key(self, *args):
+ '''set_key(self) -> None
+
+ @copydoc WT_ASYNC_OP::set_key'''
+ if len(args) == 1 and type(args[0]) == tuple:
+ args = args[0]
+ if self.is_column:
+ self._set_recno(long(args[0]))
+ else:
+ # Keep the Python string pinned
+ self._key = pack(self.key_format, *args)
+ self._set_key(self._key)
+
+ def set_value(self, *args):
+ '''set_value(self) -> None
+
+ @copydoc WT_ASYNC_OP::set_value'''
+ if len(args) == 1 and type(args[0]) == tuple:
+ args = args[0]
+ # Keep the Python string pinned
+ self._value = pack(self.value_format, *args)
+ self._set_value(self._value)
+
+ def __getitem__(self, key):
+ '''Python convenience for searching'''
+ self.set_key(key)
+ if self.search() != 0:
+ raise KeyError
+ return self.get_value()
+
+ def __setitem__(self, key, value):
+ '''Python convenience for inserting'''
+ self.set_key(key)
+ self.set_key(value)
+ self.insert()
+%}
+};
+
+%extend __wt_cursor {
+ /* Get / set keys and values */
+ void _set_key(char *data, int size) {
+ WT_ITEM k;
+ k.data = data;
+ k.size = (uint32_t)size;
+ $self->set_key($self, &k);
+ }
+
+ /* Get / set keys and values */
+ void _set_key_str(char *str) {
+ $self->set_key($self, str);
+ }
+
+ int_void _set_recno(uint64_t recno) {
+ WT_ITEM k;
+ uint8_t recno_buf[20];
+ size_t size;
+ int ret;
+ if ((ret = wiredtiger_struct_size($self->session,
+ &size, "r", recno)) != 0 ||
+ (ret = wiredtiger_struct_pack($self->session,
+ recno_buf, sizeof (recno_buf), "r", recno)) != 0)
+ return (ret);
+
+ k.data = recno_buf;
+ k.size = (uint32_t)size;
+ $self->set_key($self, &k);
+ return (ret);
+ }
+
+ void _set_value(char *data, int size) {
+ WT_ITEM v;
+ v.data = data;
+ v.size = (uint32_t)size;
+ $self->set_value($self, &v);
+ }
+
+ /* Get / set keys and values */
+ void _set_value_str(char *str) {
+ $self->set_value($self, str);
+ }
+
+ /* Don't return values, just throw exceptions on failure. */
+ int_void _get_key(char **datap, int *sizep) {
+ WT_ITEM k;
+ int ret = $self->get_key($self, &k);
+ if (ret == 0) {
+ *datap = (char *)k.data;
+ *sizep = (int)k.size;
+ }
+ return (ret);
+ }
+
+ int_void _get_json_key(char **datap, int *sizep) {
+ const char *k;
+ int ret = $self->get_key($self, &k);
+ if (ret == 0) {
+ *datap = (char *)k;
+ *sizep = strlen(k);
+ }
+ return (ret);
+ }
+
+ int_void _get_recno(uint64_t *recnop) {
+ WT_ITEM k;
+ int ret = $self->get_key($self, &k);
+ if (ret == 0)
+ ret = wiredtiger_struct_unpack($self->session,
+ k.data, k.size, "q", recnop);
+ return (ret);
+ }
+
+ int_void _get_value(char **datap, int *sizep) {
+ WT_ITEM v;
+ int ret = $self->get_value($self, &v);
+ if (ret == 0) {
+ *datap = (char *)v.data;
+ *sizep = (int)v.size;
+ }
+ return (ret);
+ }
+
+ int_void _get_json_value(char **datap, int *sizep) {
+ const char *k;
+ int ret = $self->get_value($self, &k);
+ if (ret == 0) {
+ *datap = (char *)k;
+ *sizep = strlen(k);
+ }
+ return (ret);
+ }
+
+ /* compare and search_near need special handling. */
+ int compare(WT_CURSOR *other) {
+ int cmp = 0;
+ int ret = 0;
+ if (other == NULL) {
+ SWIG_Error(SWIG_NullReferenceError,
+ "in method 'Cursor_compare', "
+ "argument 1 of type 'struct __wt_cursor *' "
+ "is None");
+ ret = EINVAL; /* any non-zero value will do. */
+ }
+ else {
+ ret = $self->compare($self, other, &cmp);
+
+ /*
+ * Map less-than-zero to -1 and greater-than-zero to 1
+ * to avoid colliding with other errors.
+ */
+ ret = ((ret != 0) ? ret :
+ (cmp < 0) ? -1 : (cmp == 0) ? 0 : 1);
+ }
+ return (ret);
+ }
+
+ int search_near() {
+ int cmp = 0;
+ int ret = $self->search_near($self, &cmp);
+ /*
+ * Map less-than-zero to -1 and greater-than-zero to 1 to avoid
+ * colliding with WT_NOTFOUND.
+ */
+ return ((ret != 0) ? ret :
+ (cmp < 0) ? -1 : (cmp == 0) ? 0 : 1);
+ }
+
+ int _freecb() {
+ return (cursorFreeHandler($self));
+ }
+
+%pythoncode %{
def get_key(self):
    '''get_key(self) -> object

    @copydoc WT_CURSOR::get_key
    Returns the single column when get_keys() yields exactly one item,
    otherwise the whole sequence of columns.'''
    columns = self.get_keys()
    return columns[0] if len(columns) == 1 else columns
+
def get_keys(self):
    '''get_keys(self) -> (object, ...)

    @copydoc WT_CURSOR::get_key'''
    # JSON and record-number cursors carry a single key column.
    if self.is_json:
        return [self._get_json_key()]
    if self.is_column:
        return [self._get_recno()]
    # Everything else is a packed key decoded with the cursor's format.
    return unpack(self.key_format, self._get_key())
+
def get_value(self):
    '''get_value(self) -> object

    @copydoc WT_CURSOR::get_value
    Returns the single column when get_values() yields exactly one item,
    otherwise the whole sequence of columns.'''
    columns = self.get_values()
    return columns[0] if len(columns) == 1 else columns
+
def get_values(self):
    '''get_values(self) -> (object, ...)

    @copydoc WT_CURSOR::get_value'''
    if not self.is_json:
        # Packed value: decode it with the cursor's value format.
        return unpack(self.value_format, self._get_value())
    return [self._get_json_value()]
+
def set_key(self, *args):
    '''set_key(self) -> None

    @copydoc WT_CURSOR::set_key'''
    # A single tuple argument is treated as the list of key columns.
    if len(args) == 1 and type(args[0]) is tuple:
        args = args[0]
    if self.is_column:
        self._set_recno(long(args[0]))
        return
    if self.is_json:
        self._set_key_str(args[0])
        return
    # Keep the Python string pinned
    self._key = pack(self.key_format, *args)
    self._set_key(self._key)
+
def set_value(self, *args):
    '''set_value(self) -> None

    @copydoc WT_CURSOR::set_value'''
    if self.is_json:
        self._set_value_str(args[0])
        return
    # A single tuple argument is treated as the list of value columns.
    if len(args) == 1 and type(args[0]) is tuple:
        args = args[0]
    # Keep the Python string pinned
    self._value = pack(self.value_format, *args)
    self._set_value(self._value)
+
def __iter__(self):
    '''Cursor objects support iteration, equivalent to calling
    WT_CURSOR::next until it returns ::WT_NOTFOUND.'''
    # The IterableCursor wrapper is created on first use and then reused.
    if hasattr(self, '_iterable'):
        return self._iterable
    self._iterable = IterableCursor(self)
    return self._iterable
+
def __getitem__(self, key):
    '''Python convenience for searching.

    Sets the key, runs search() and returns get_value() when search()
    reports 0; any other status raises KeyError.'''
    self.set_key(key)
    status = self.search()
    if status == 0:
        return self.get_value()
    raise KeyError
+%}
+};
+
+%extend __wt_session {
+ int log_printf(const char *msg) {
+ return self->log_printf(self, "%s", msg);
+ }
+
+ int _freecb() {
+ return (sessionFreeHandler(self));
+ }
+};
+
+%extend __wt_connection {
/* Nothing to release for a connection: always report success. */
int _freecb()
{
	return (0);
}
+};
+
+%{
/* Return 1 when compiled with HAVE_DIAGNOSTIC defined, otherwise 0. */
int diagnostic_build() {
	int enabled = 0;

#ifdef HAVE_DIAGNOSTIC
	enabled = 1;
#endif
	return enabled;
}
+
/* Return 1 when compiled with HAVE_VERBOSE defined, otherwise 0. */
int verbose_build() {
	int enabled = 0;

#ifdef HAVE_VERBOSE
	enabled = 1;
#endif
	return enabled;
}
+%}
+int diagnostic_build();
+int verbose_build();
+
+/* Remove / rename parts of the C API that we don't want in Python. */
+%immutable __wt_cursor::session;
+%immutable __wt_cursor::uri;
+%ignore __wt_cursor::key_format;
+%ignore __wt_cursor::value_format;
+%immutable __wt_session::connection;
+%immutable __wt_async_op::connection;
+%immutable __wt_async_op::uri;
+%immutable __wt_async_op::config;
+%ignore __wt_async_op::key_format;
+%ignore __wt_async_op::value_format;
+
+%ignore __wt_async_callback;
+%ignore __wt_collator;
+%ignore __wt_compressor;
+%ignore __wt_config_item;
+%ignore __wt_data_source;
+%ignore __wt_event_handler;
+%ignore __wt_extractor;
+%ignore __wt_item;
+%ignore __wt_lsn;
+
+%ignore __wt_connection::add_collator;
+%ignore __wt_connection::add_compressor;
+%ignore __wt_connection::add_data_source;
+%ignore __wt_connection::add_extractor;
+%ignore __wt_connection::get_extension_api;
+%ignore __wt_session::log_printf;
+
+%ignore wiredtiger_struct_pack;
+%ignore wiredtiger_struct_size;
+%ignore wiredtiger_struct_unpack;
+
+%ignore wiredtiger_extension_init;
+%ignore wiredtiger_extension_terminate;
+
+/* Convert 'int *' to output args for wiredtiger_version */
+%apply int *OUTPUT { int * };
+
+%rename(AsyncOp) __wt_async_op;
+%rename(Cursor) __wt_cursor;
+%rename(Session) __wt_session;
+%rename(Connection) __wt_connection;
+
+%include "wiredtiger.h"
+
+/* Add event handler support. */
+%{
+/* Write to and flush the stream. */
+static int
+writeToPythonStream(const char *streamname, const char *message)
+{
+ PyObject *sys, *se, *write_method, *flush_method, *written,
+ *arglist, *arglist2;
+ char *msg;
+ int ret;
+ size_t msglen;
+
+ sys = NULL;
+ se = NULL;
+ write_method = flush_method = NULL;
+ written = NULL;
+ arglist = arglist2 = NULL;
+ msglen = strlen(message);
+ msg = malloc(msglen + 2);
+ strcpy(msg, message);
+ strcpy(&msg[msglen], "\n");
+
+ /* Acquire python Global Interpreter Lock. Otherwise can segfault. */
+ SWIG_PYTHON_THREAD_BEGIN_BLOCK;
+
+ ret = 1;
+ if ((sys = PyImport_ImportModule("sys")) == NULL)
+ goto err;
+ if ((se = PyObject_GetAttrString(sys, streamname)) == NULL)
+ goto err;
+ if ((write_method = PyObject_GetAttrString(se, "write")) == NULL)
+ goto err;
+ if ((flush_method = PyObject_GetAttrString(se, "flush")) == NULL)
+ goto err;
+ if ((arglist = Py_BuildValue("(s)", msg)) == NULL)
+ goto err;
+ if ((arglist2 = Py_BuildValue("()")) == NULL)
+ goto err;
+
+ written = PyObject_CallObject(write_method, arglist);
+ (void)PyObject_CallObject(flush_method, arglist2);
+ ret = 0;
+
+err: Py_XDECREF(arglist2);
+ Py_XDECREF(arglist);
+ Py_XDECREF(flush_method);
+ Py_XDECREF(write_method);
+ Py_XDECREF(se);
+ Py_XDECREF(sys);
+ Py_XDECREF(written);
+
+ /* Release python Global Interpreter Lock */
+ SWIG_PYTHON_THREAD_END_BLOCK;
+
+ if (msg)
+ free(msg);
+ return (ret);
+}
+
+static int
+pythonErrorCallback(WT_EVENT_HANDLER *handler, WT_SESSION *session, int err,
+ const char *message)
+{
+ return writeToPythonStream("stderr", message);
+}
+
+static int
+pythonMessageCallback(WT_EVENT_HANDLER *handler, WT_SESSION *session,
+ const char *message)
+{
+ return writeToPythonStream("stdout", message);
+}
+
+/* Zero out SWIG's pointer to the C object,
+ * equivalent to 'pyobj.this = None' in Python.
+ */
+static int
+pythonClose(PY_CALLBACK *pcb)
+{
+ int ret;
+
+ /*
+ * Ensure the global interpreter lock is held - so that Python
+ * doesn't shut down threads while we use them.
+ */
+ SWIG_PYTHON_THREAD_BEGIN_BLOCK;
+
+ ret = 0;
+ if (PyObject_SetAttrString(pcb->pyobj, "this", Py_None) == -1) {
+ SWIG_Error(SWIG_RuntimeError, "WT SetAttr failed");
+ ret = EINVAL; /* any non-zero value will do. */
+ }
+ Py_XDECREF(pcb->pyobj);
+ Py_XDECREF(pcb->pyasynccb);
+
+ SWIG_PYTHON_THREAD_END_BLOCK;
+
+ return (ret);
+}
+
+/* Session specific close handler. */
+static int
+sessionCloseHandler(WT_SESSION *session_arg)
+{
+ int ret;
+ PY_CALLBACK *pcb;
+ WT_SESSION_IMPL *session;
+
+ ret = 0;
+ session = (WT_SESSION_IMPL *)session_arg;
+ pcb = (PY_CALLBACK *)session->lang_private;
+ session->lang_private = NULL;
+ if (pcb != NULL)
+ ret = pythonClose(pcb);
+ __wt_free(session, pcb);
+
+ return (ret);
+}
+
+/* Cursor specific close handler. */
+static int
+cursorCloseHandler(WT_CURSOR *cursor)
+{
+ int ret;
+ PY_CALLBACK *pcb;
+
+ ret = 0;
+ pcb = (PY_CALLBACK *)cursor->lang_private;
+ cursor->lang_private = NULL;
+ if (pcb != NULL)
+ ret = pythonClose(pcb);
+ __wt_free((WT_SESSION_IMPL *)cursor->session, pcb);
+
+ return (ret);
+}
+
+/* Session specific close handler. */
+static int
+sessionFreeHandler(WT_SESSION *session_arg)
+{
+ PY_CALLBACK *pcb;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)session_arg;
+ pcb = (PY_CALLBACK *)session->lang_private;
+ session->lang_private = NULL;
+ __wt_free(session, pcb);
+ return (0);
+}
+
+/* Cursor specific close handler. */
+static int
+cursorFreeHandler(WT_CURSOR *cursor)
+{
+ PY_CALLBACK *pcb;
+
+ pcb = (PY_CALLBACK *)cursor->lang_private;
+ cursor->lang_private = NULL;
+ __wt_free((WT_SESSION_IMPL *)cursor->session, pcb);
+ return (0);
+}
+
+static int
+pythonCloseCallback(WT_EVENT_HANDLER *handler, WT_SESSION *session,
+ WT_CURSOR *cursor)
+{
+ int ret;
+
+ WT_UNUSED(handler);
+
+ if (cursor != NULL)
+ ret = cursorCloseHandler(cursor);
+ else
+ ret = sessionCloseHandler(session);
+ return (ret);
+}
+
+static WT_EVENT_HANDLER pyApiEventHandler = {
+ pythonErrorCallback, pythonMessageCallback, NULL, pythonCloseCallback
+};
+%}
+
+/* Add async callback support. */
+%{
+
+static int
+pythonAsyncCallback(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *asyncop, int opret,
+ uint32_t flags)
+{
+ int ret, t_ret;
+ PY_CALLBACK *pcb;
+ PyObject *arglist, *notify_method, *pyresult;
+ WT_ASYNC_OP_IMPL *op;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * Ensure the global interpreter lock is held since we'll be
+ * making Python calls now.
+ */
+ SWIG_PYTHON_THREAD_BEGIN_BLOCK;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ session = O2S(op);
+ pcb = (PY_CALLBACK *)asyncop->c.lang_private;
+ asyncop->c.lang_private = NULL;
+ ret = 0;
+
+ if (pcb->pyasynccb == NULL)
+ goto err;
+ if ((arglist = Py_BuildValue("(Oii)", pcb->pyobj,
+ opret, flags)) == NULL)
+ goto err;
+ if ((notify_method = PyObject_GetAttrString(pcb->pyasynccb,
+ "notify")) == NULL)
+ goto err;
+
+ pyresult = PyEval_CallObject(notify_method, arglist);
+ if (pyresult == NULL || !PyArg_Parse(pyresult, "i", &ret))
+ goto err;
+
+ if (0) {
+ if (ret == 0)
+ ret = EINVAL;
+err: __wt_err(session, ret, "python async callback error");
+ }
+ Py_XDECREF(pyresult);
+ Py_XDECREF(notify_method);
+ Py_XDECREF(arglist);
+
+ SWIG_PYTHON_THREAD_END_BLOCK;
+
+ if (pcb != NULL) {
+ if ((t_ret = pythonClose(pcb) != 0) && ret == 0)
+ ret = t_ret;
+ }
+ __wt_free(session, pcb);
+
+ if (ret == 0 && (opret == 0 || opret == WT_NOTFOUND))
+ return (0);
+ else
+ return (1);
+}
+
+static WT_ASYNC_CALLBACK pyApiAsyncCallback = { pythonAsyncCallback };
+%}
+
+%pythoncode %{
# Holder classes: the statistics key constants are attached to stat.conn
# and stat.dsrc as attributes after the module has been loaded.
class stat:
    '''keys for statistics cursors'''

    class conn:
        '''keys for cursors on connection statistics'''

    class dsrc:
        '''keys for cursors on data source statistics'''
+
+## @}
+
+import sys
+# All names starting with 'WT_STAT_DSRC_' are renamed to
+# the wiredtiger.stat.dsrc class, those starting with 'WT_STAT_CONN' are
+# renamed to wiredtiger.stat.conn class.
def _rename_with_prefix(prefix, toclass):
    '''Move every module-level name starting with prefix onto toclass.

    The attribute name on toclass is the original name with the prefix
    removed and lower-cased; the original name is deleted from the
    module.'''
    module = sys.modules[__name__]
    skip = len(prefix)
    matching = [name for name in dir(module) if name.startswith(prefix)]
    for longname in matching:
        setattr(toclass, longname[skip:].lower(), getattr(module, longname))
        delattr(module, longname)
+
# Attach the statistics constants to their holder classes, then remove
# the helper and the loop variables from the module namespace.
for _prefix, _holder in (('WT_STAT_CONN_', stat.conn),
                         ('WT_STAT_DSRC_', stat.dsrc)):
    _rename_with_prefix(_prefix, _holder)
del _rename_with_prefix, _prefix, _holder
+%}
+
diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger/fpacking.py b/src/third_party/wiredtiger/lang/python/wiredtiger/fpacking.py
new file mode 100644
index 00000000000..632c5c5a1c5
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/python/wiredtiger/fpacking.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# WiredTiger fixed-size packing and unpacking functions, using the Python
+# struct library.
+
+import struct
+
def __wt2struct(fmt):
    '''Split a WiredTiger format string for use with the struct module.

    Returns (byte-order prefix, field codes).  The prefix is the leading
    '@', '=', '<', '>' or '!' when present and '>' otherwise; every 'r'
    field code is replaced by 'Q'.  An empty format returns (None, fmt).'''
    if not fmt:
        return None, fmt
    # Big endian with no alignment is the default
    if fmt[0] in '@=<>!':
        tfmt = fmt[0]
        fmt = fmt[1:]
    else:
        tfmt = '>'
    return tfmt, fmt.replace('r', 'Q')


def unpack(fmt, s):
    '''Unpack the byte string s according to the WiredTiger format fmt.

    Returns a tuple of values.  A sized 'S' or 'u' field is read as a
    fixed-length struct 's' field (padding included).  An unsized 'S' is
    read up to its NUL terminator, or to the end of the buffer when no
    terminator is present.  An unsized 'u' is the rest of the buffer when
    it is the last field and is preceded by an 'l' length otherwise.'''
    tfmt, fmt = __wt2struct(fmt)
    if not fmt:
        return ()
    result = ()
    pfmt = tfmt
    sizebytes = 0
    for offset, f in enumerate(fmt):
        if f.isdigit():
            # Collect the repeat count and remember that one was given;
            # it must survive until the field code that follows it.
            pfmt += f
            sizebytes += 1
            continue
        # With a fixed size, everything is encoded as a string
        if f in 'Su' and sizebytes > 0:
            f = 's'
        if f not in 'Su':
            pfmt += f
            sizebytes = 0
            continue

        # We've hit something that needs special handling, split any
        # fixed-size values we've already passed
        if len(pfmt) > 1:
            size = struct.calcsize(pfmt)
            result += struct.unpack_from(pfmt, s)
            s = s[size:]
        if f == 'S':
            l = s.find(b'\0')
            if l < 0:
                # No terminator: take everything that is left.
                l = len(s)
            result += (s[:l],)
            s = s[l + 1:]
        else:
            if offset == len(fmt) - 1:
                result += (s,)
            else:
                l = struct.unpack_from(tfmt + 'l', s)[0]
                s = s[struct.calcsize(tfmt + 'l'):]
                result += (s[:l],)
                s = s[l:]
        pfmt = tfmt
        sizebytes = 0

    if len(pfmt) > 1:
        result += struct.unpack(pfmt, s)
    return result


def pack(fmt, *values):
    '''Pack values into a byte string according to the WiredTiger format.

    String values ('S', 'u', 's') must be byte strings.  An unsized 'S'
    is written up to and including a NUL terminator.  An unsized 'u' is
    written whole; when it is not the last field it is preceded by its
    length as an 'l' field.  An empty format returns an empty byte
    string.'''
    pfmt, fmt = __wt2struct(fmt)
    if not fmt:
        return b''
    values = list(values)
    i = 0           # index of the next unconsumed entry in values
    count = ''      # digits seen since the previous field code
    for offset, f in enumerate(fmt):
        if f.isdigit():
            pfmt += f
            count += f
            continue
        if f in 'Su':
            data = values[i]
            if not count:
                if f == 'S':
                    # Note: this code is being careful about embedded
                    # NUL characters
                    nul = data.find(b'\0')
                    size = (len(data) if nul < 0 else nul) + 1
                else:
                    size = len(data)
                    if offset != len(fmt) - 1:
                        # Length prefix: one extra struct field and one
                        # extra value ahead of the data itself.
                        pfmt += 'l'
                        values.insert(i, size)
                        i += 1
                pfmt += str(size)
            pfmt += 's'
            i += 1
        else:
            pfmt += f
            if f == 'x':
                # Pad bytes consume no value.
                pass
            elif count and f not in 'sp':
                i += int(count)
            else:
                i += 1
        count = ''
    return struct.pack(pfmt, *values)
diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger/intpack-test.py b/src/third_party/wiredtiger/lang/python/wiredtiger/intpack-test.py
new file mode 100644
index 00000000000..b731a5e5adc
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/python/wiredtiger/intpack-test.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
from intpacking import pack_int

# Print the packed encoding, as hex, of each power of two below 2^60 and
# of its negation.
i = 1
while i < 1 << 60:
    for v in (-i, i):
        # bytearray() yields integers for both Python 2 and 3 strings
        hexed = ''.join('%02x' % b for b in bytearray(pack_int(v)))
        print('%d %s' % (v, hexed))
    i <<= 1
diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger/intpacking.py b/src/third_party/wiredtiger/lang/python/wiredtiger/intpacking.py
new file mode 100644
index 00000000000..d9e48bb91f6
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/python/wiredtiger/intpacking.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+import math, struct
+
+# Variable-length integer packing
+# need: up to 64 bits, both signed and unsigned
+#
+# Try hard for small values (up to ~2 bytes), after that, just encode the
+# length in the first byte.
+#
+# First byte  | Next |                        |
+# byte        | bytes| Min Value              | Max Value
+# ------------+------+------------------------+--------------------------------
+# [00 00xxxx] | free | N/A                    | N/A
+# [00 01llll] | 8-l  | -2^64                  | -2^13 - 2^6 - 1
+# [00 1xxxxx] | 1    | -2^13 - 2^6            | -2^6 - 1
+# [01 xxxxxx] | 0    | -2^6                   | -1
+# [10 xxxxxx] | 0    | 0                      | 2^6 - 1
+# [11 0xxxxx] | 1    | 2^6                    | 2^13 + 2^6 - 1
+# [11 10llll] | l    | 2^13 + 2^6             | 2^64 - 1
+# [11 11xxxx] | free | N/A                    | N/A
+
# Marker values: the top bits of the first byte select the encoding.
NEG_MULTI_MARKER = 0x10
NEG_2BYTE_MARKER = 0x20
NEG_1BYTE_MARKER = 0x40
POS_1BYTE_MARKER = 0x80
POS_2BYTE_MARKER = 0xc0
POS_MULTI_MARKER = 0xe0

# Value ranges covered by the one- and two-byte encodings.
NEG_1BYTE_MIN = -2**6
NEG_2BYTE_MIN = -2**13 + NEG_1BYTE_MIN
POS_1BYTE_MAX = 2**6 - 1
POS_2BYTE_MAX = 2**13 + POS_1BYTE_MAX

MINUS_BIT = -1 << 64
UINT64_MASK = 0xffffffffffffffff


def _to_bytes(*octets):
    '''Return a byte string holding the given integer octets.'''
    # bytes(bytearray(...)) gives str on Python 2 and bytes on Python 3.
    return bytes(bytearray(octets))


def _octet(b, index):
    '''Return the byte at index in b as an integer (IndexError if absent).'''
    return bytearray(b[index:index + 1])[0]


def getbits(x, start, end=0):
    '''return the least significant bits of x, from start to end'''
    return (x & ((1 << start) - 1)) >> (end)


def get_int(b, size):
    '''Return the first size bytes of b read as a big-endian integer.

    Raises IndexError when b holds fewer than size bytes.'''
    octets = bytearray(b[:size])
    r = 0
    for i in range(size):
        r = (r << 8) | octets[i]
    return r


def pack_int(x):
    '''Return the variable-length encoding of the integer x as bytes.'''
    if x < NEG_2BYTE_MIN:
        # Two's complement in 8 bytes, leading 0xff bytes dropped; the
        # marker's low bits record how many were dropped.
        packed = struct.pack('>Q', x & UINT64_MASK).lstrip(b'\xff')
        marker = NEG_MULTI_MARKER | getbits(8 - len(packed), 4)
        return _to_bytes(marker) + packed
    elif x < NEG_1BYTE_MIN:
        x -= NEG_2BYTE_MIN
        return _to_bytes(NEG_2BYTE_MARKER | getbits(x, 13, 8), getbits(x, 8))
    elif x < 0:
        x -= NEG_1BYTE_MIN
        return _to_bytes(NEG_1BYTE_MARKER | getbits(x, 6))
    elif x <= POS_1BYTE_MAX:
        return _to_bytes(POS_1BYTE_MARKER | getbits(x, 6))
    elif x <= POS_2BYTE_MAX:
        x -= (POS_1BYTE_MAX + 1)
        return _to_bytes(POS_2BYTE_MARKER | getbits(x, 13, 8), getbits(x, 8))
    else:
        # Offset from the first multi-byte value, leading zero bytes
        # dropped; the marker's low bits record the remaining length.
        packed = struct.pack('>Q', x - (POS_2BYTE_MAX + 1)).lstrip(b'\x00')
        marker = POS_MULTI_MARKER | getbits(len(packed), 4)
        return _to_bytes(marker) + packed


def unpack_int(b):
    '''Decode one packed integer from the start of the byte string b.

    Returns (value, rest) where rest is the unread remainder of b.'''
    marker = _octet(b, 0)
    if marker < NEG_2BYTE_MARKER:
        sz = 8 - getbits(marker, 4)
        return ((-1 << (sz << 3)) | get_int(b[1:], sz), b[sz+1:])
    elif marker < NEG_1BYTE_MARKER:
        low = _octet(b, 1)
        return (NEG_2BYTE_MIN + ((getbits(marker, 5) << 8) | low), b[2:])
    elif marker < POS_1BYTE_MARKER:
        return (NEG_1BYTE_MIN + getbits(marker, 6), b[1:])
    elif marker < POS_2BYTE_MARKER:
        return (getbits(marker, 6), b[1:])
    elif marker < POS_MULTI_MARKER:
        low = _octet(b, 1)
        return (POS_1BYTE_MAX + 1 + ((getbits(marker, 5) << 8) | low), b[2:])
    else:
        sz = getbits(marker, 4)
        return (POS_2BYTE_MAX + 1 + get_int(b[1:], sz), b[sz+1:])
+
# Sanity testing: pack/unpack round trips, and packed values must sort in
# the same order as the integers they encode.
if __name__ == '__main__':
    import random
    import sys

    def _cmp(a, b):
        '''Three-way comparison returning -1, 0 or 1.'''
        return (a > b) - (a < b)

    for big in (100, 10000, 1 << 40, 1 << 64):
        for i in range(1000):
            r = random.randint(-big, big)
            sys.stdout.write("\rChecking %d" % r)
            if unpack_int(pack_int(r))[0] != r:
                sys.stdout.write("\nFound a problem with %d\n" % r)
                break

        sys.stdout.write("\n")

        for i in range(1000):
            r1 = random.randint(-big, big)
            r2 = random.randint(-big, big)
            sys.stdout.write("\rChecking %d, %d" % (r1, r2))
            if _cmp(r1, r2) != _cmp(pack_int(r1), pack_int(r2)):
                sys.stdout.write(
                    "\nFound a problem with %d, %d\n" % (r1, r2))
                break

        sys.stdout.write("\n")
diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger/packing-test.py b/src/third_party/wiredtiger/lang/python/wiredtiger/packing-test.py
new file mode 100644
index 00000000000..3a4e34f3fc1
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/python/wiredtiger/packing-test.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+from packing import pack, unpack
+
def check(fmt, *v):
    '''Print fmt, the values and the hex dump of their packed encoding.'''
    # bytearray() yields integers for both Python 2 and 3 strings
    hexed = ''.join('%02x' % c for c in bytearray(pack(fmt, *v)))
    print('%s %s %s' % (fmt, repr(v), hexed))

# String values are byte strings so the calls behave the same on
# Python 2 and Python 3.
check('iii', 0, 101, -99)
check('3i', 0, 101, -99)
check('iS', 42, b"forty two")
check('u', br"\x42" * 20)
check('uu', br"\x42" * 10, br"\x42" * 10)
diff --git a/src/third_party/wiredtiger/lang/python/wiredtiger/packing.py b/src/third_party/wiredtiger/lang/python/wiredtiger/packing.py
new file mode 100644
index 00000000000..a79bf6bffbd
--- /dev/null
+++ b/src/third_party/wiredtiger/lang/python/wiredtiger/packing.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# WiredTiger variable-length packing and unpacking functions
+
+from intpacking import pack_int, unpack_int
+
def __get_type(fmt):
    '''Split fmt into (encoding type, field codes).

    The type is the leading '.', '@', '<' or '>' when present and '.'
    otherwise.  An empty format returns (None, fmt).'''
    if not fmt:
        return None, fmt
    # Variable-sized encoding is the default (and only supported format in v1)
    if fmt[0] in '.@<>':
        tfmt = fmt[0]
        fmt = fmt[1:]
    else:
        tfmt = '.'
    return tfmt, fmt


def unpack(fmt, s):
    '''Unpack the byte string s according to the WiredTiger format fmt.

    Returns a list of values (an empty tuple for an empty format).
    Raises ValueError for any encoding type other than '.'.'''
    tfmt, fmt = __get_type(fmt)
    if not fmt:
        return ()
    if tfmt != '.':
        raise ValueError('Only variable-length encoding is currently supported')
    result = []
    havesize = size = 0
    for offset, f in enumerate(fmt):
        if f.isdigit():
            size = (size * 10) + int(f)
            havesize = 1
            continue
        elif f == 'x':
            if not havesize:
                size = 1
            s = s[size:]
            # Note: no value, nothing is appended to the result
        elif f in 'Ssu':
            if not havesize:
                if f == 's':
                    size = 1
                elif f == 'S':
                    size = s.find(b'\0')
                    if size < 0:
                        # No terminator: take everything that is left.
                        size = len(s)
                elif f == 'u':
                    if offset == len(fmt) - 1:
                        size = len(s)
                    else:
                        size, s = unpack_int(s)
            result.append(s[:size])
            if f == 'S' and not havesize:
                # Step over the NUL terminator as well.
                size += 1
            s = s[size:]
        elif f in 't':
            # bit type, size is number of bits; always stored in one byte
            result.append(ord(s[0:1]))
            s = s[1:]
        else:
            # integral type
            if not havesize:
                size = 1
            for j in range(size):
                v, s = unpack_int(s)
                result.append(v)
        havesize = size = 0
    return result


def pack(fmt, *values):
    '''Pack values into a byte string according to the WiredTiger format.

    String values ('S', 's', 'u') must be byte strings.  An empty format
    returns an empty byte string.  Raises ValueError for any encoding type
    other than '.', for a 't' field wider than 8 bits, and for a 't' value
    that does not fit in the given number of bits.'''
    tfmt, fmt = __get_type(fmt)
    if not fmt:
        return b''
    if tfmt != '.':
        raise ValueError('Only variable-length encoding is currently supported')
    parts = []
    havesize = i = size = 0
    for offset, f in enumerate(fmt):
        if f.isdigit():
            size = (size * 10) + int(f)
            havesize = 1
            continue
        elif f == 'x':
            parts.append(b'\0' * (size if havesize else 1))
            # Note: no value, don't increment i
        elif f in 'Ssu':
            val = values[i]
            l = val.find(b'\0') if f == 'S' else -1
            if l < 0:
                l = len(val)
            if f == 's' and not havesize:
                # An unsized 's' is exactly one byte; set the size before
                # clamping so longer values are truncated to it.
                havesize = size = 1
            if havesize:
                if l > size:
                    l = size
            elif f == 'u' and offset != len(fmt) - 1:
                parts.append(pack_int(l))
            parts.append(val[:l])
            if f == 'S' and not havesize:
                parts.append(b'\0')
            elif size > l:
                parts.append(b'\0' * (size - l))
            i += 1
        elif f in 't':
            # bit type, size is number of bits
            if not havesize:
                size = 1
            if size > 8:
                raise ValueError("bit count cannot be greater than 8 for 't' encoding")
            mask = (1 << size) - 1
            val = values[i]
            if (mask & val) != val:
                raise ValueError("value out of range for 't' encoding")
            parts.append(bytes(bytearray((val,))))
            i += 1
        else:
            # integral type
            if not havesize:
                size = 1
            for j in range(size):
                parts.append(pack_int(values[i]))
                i += 1
        havesize = size = 0
    return b''.join(parts)
diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c
new file mode 100644
index 00000000000..3cb78e80b09
--- /dev/null
+++ b/src/third_party/wiredtiger/src/async/async_api.c
@@ -0,0 +1,604 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __async_get_format --
+ *	Find or allocate the uri/config/format structure.
+ *
+ *	Looks up a cached WT_ASYNC_FORMAT by the hashes of the uri and config
+ *	strings.  On a miss, an internal session is opened, a cursor on the
+ *	uri is used to learn the key/value formats, and a new entry is pushed
+ *	onto the head of the connection's format queue.  On success the op's
+ *	format pointer and its public/internal format strings reference the
+ *	queue entry.  Returns 0 or an error; on error op->format stays NULL.
+ */
+static int
+__async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
+    const char *config, WT_ASYNC_OP_IMPL *op)
+{
+	WT_ASYNC *async;
+	WT_ASYNC_FORMAT *af;
+	WT_CURSOR *c;
+	WT_DECL_RET;
+	WT_SESSION *wt_session;
+	WT_SESSION_IMPL *session;
+	uint64_t cfg_hash, uri_hash;
+	int locked;
+
+	async = conn->async;
+	c = NULL;
+	locked = 0;
+	op->format = NULL;
+
+	if (uri != NULL)
+		uri_hash = __wt_hash_city64(uri, strlen(uri));
+	else
+		uri_hash = 0;
+	if (config != NULL)
+		cfg_hash = __wt_hash_city64(config, strlen(config));
+	else
+		cfg_hash = 0;
+
+	/*
+	 * We don't need to hold a lock around this walk.  The list is
+	 * permanent and always valid.  We might race an insert and there
+	 * is a possibility a duplicate entry might be inserted, but
+	 * that is not harmful.
+	 */
+	STAILQ_FOREACH(af, &async->formatqh, q) {
+		if (af->uri_hash == uri_hash && af->cfg_hash == cfg_hash)
+			goto setup;
+	}
+	/*
+	 * We didn't find one in the cache.  Allocate and initialize one.
+	 * Insert it at the head expecting LRU usage.  We need a real session
+	 * for the cursor.
+	 */
+	af = NULL;		/* Nothing of ours to free yet. */
+	WT_RET(
+	    __wt_open_internal_session(conn, "async-cursor", 1, 1, &session));
+	wt_session = &session->iface;
+	__wt_spin_lock(session, &async->ops_lock);
+	locked = 1;
+	WT_ERR(__wt_calloc_def(session, 1, &af));
+	WT_ERR(__wt_strdup(session, uri, &af->uri));
+	WT_ERR(__wt_strdup(session, config, &af->config));
+	af->uri_hash = uri_hash;
+	af->cfg_hash = cfg_hash;
+	/*
+	 * Get the key_format and value_format for this URI and store
+	 * it in the structure so that async->set_key/value work.
+	 */
+	WT_ERR(wt_session->open_cursor(wt_session, uri, NULL, NULL, &c));
+	WT_ERR(__wt_strdup(session, c->key_format, &af->key_format));
+	WT_ERR(__wt_strdup(session, c->value_format, &af->value_format));
+	/*
+	 * Forget the cursor before checking the result so a failed close
+	 * is not followed by a second close in the error path.
+	 */
+	ret = c->close(c);
+	c = NULL;
+	if (ret != 0)
+		goto err;
+
+	STAILQ_INSERT_HEAD(&async->formatqh, af, q);
+	__wt_spin_unlock(session, &async->ops_lock);
+	locked = 0;
+	/*
+	 * The entry now belongs to the queue: a failure closing the session
+	 * must not take the error path, which would free a queued entry.
+	 */
+	WT_RET(wt_session->close(wt_session, NULL));
+
+setup:	op->format = af;
+	/*
+	 * Copy the pointers for the formats.  Items in the async format
+	 * queue remain there until the connection is closed.  We must
+	 * initialize the format fields in the async_op, which are publicly
+	 * visible, and its internal cursor used by internal key/value
+	 * functions.
+	 */
+	op->iface.c.key_format = op->iface.key_format = af->key_format;
+	op->iface.c.value_format = op->iface.value_format = af->value_format;
+	return (0);
+
+err:
+	if (c != NULL)
+		(void)c->close(c);
+	/* The allocation itself may be what failed. */
+	if (af != NULL) {
+		__wt_free(session, af->uri);
+		__wt_free(session, af->config);
+		__wt_free(session, af->key_format);
+		__wt_free(session, af->value_format);
+		__wt_free(session, af);
+	}
+	if (locked)
+		__wt_spin_unlock(session, &async->ops_lock);
+	/* Don't leak the internal session; keep the original error. */
+	(void)wt_session->close(wt_session, NULL);
+	return (ret);
+}
+
+/*
+ * __async_new_op_alloc --
+ *	Find and allocate the next available async op handle.
+ *
+ *	Scans the op array starting at the saved index, wrapping around once,
+ *	for a handle in the FREE state and claims it by atomically switching
+ *	it to READY.  The claimed handle then gets its format information, a
+ *	new unique id and an optype of WT_AOP_NONE.  Returns EBUSY when no
+ *	free handle is found, or the error from the format lookup, in which
+ *	case the claimed handle is handed back to the free pool.
+ */
+static int
+__async_new_op_alloc(WT_SESSION_IMPL *session, const char *uri,
+    const char *config, WT_ASYNC_OP_IMPL **opp)
+{
+	WT_ASYNC *async;
+	WT_ASYNC_OP_IMPL *op;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	uint32_t i, save_i, view;
+
+	conn = S2C(session);
+	async = conn->async;
+	WT_STAT_FAST_CONN_INCR(session, async_op_alloc);
+	*opp = NULL;
+
+retry:
+	op = NULL;
+	WT_ORDERED_READ(save_i, async->ops_index);
+	/*
+	 * Look after the last one allocated for a free one.  We'd expect
+	 * ops to be freed mostly FIFO so we should quickly find one.
+	 */
+	for (view = 1, i = save_i; i < conn->async_size; i++, view++) {
+		op = &async->async_ops[i];
+		if (op->state == WT_ASYNCOP_FREE)
+			break;
+	}
+
+	/*
+	 * Loop around back to the beginning if we need to.
+	 */
+	if (op == NULL || op->state != WT_ASYNCOP_FREE)
+		for (i = 0; i < save_i; i++, view++) {
+			op = &async->async_ops[i];
+			if (op->state == WT_ASYNCOP_FREE)
+				break;
+		}
+
+	/*
+	 * We still haven't found one.  Return an error.
+	 */
+	if (op == NULL || op->state != WT_ASYNCOP_FREE) {
+		WT_STAT_FAST_CONN_INCR(session, async_full);
+		WT_RET(EBUSY);
+	}
+	/*
+	 * Set the state of this op handle as READY for the user to use.
+	 * If we can set the state then the op entry is ours.
+	 * Start the next search at the next entry after this one.
+	 */
+	if (!WT_ATOMIC_CAS4(op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) {
+		WT_STAT_FAST_CONN_INCR(session, async_alloc_race);
+		goto retry;
+	}
+	WT_STAT_FAST_CONN_INCRV(session, async_alloc_view, view);
+	if ((ret = __async_get_format(conn, uri, config, op)) != 0) {
+		/*
+		 * The caller never sees this handle (*opp is still NULL), so
+		 * release it here or the slot would stay READY forever.
+		 */
+		WT_PUBLISH(op->state, WT_ASYNCOP_FREE);
+		return (ret);
+	}
+	op->unique_id = WT_ATOMIC_ADD8(async->op_id, 1);
+	op->optype = WT_AOP_NONE;
+	(void)WT_ATOMIC_STORE4(async->ops_index, (i + 1) % conn->async_size);
+	*opp = op;
+	return (0);
+}
+
+/*
+ * __async_config --
+ *	Read the async settings out of the configuration stack.
+ *
+ *	Stores whether async is enabled in *runp and records the op-handle
+ *	count and worker-thread count in the given connection structure.
+ *	Returns the first configuration lookup error, if any.
+ */
+static int
+__async_config(WT_SESSION_IMPL *session,
+    WT_CONNECTION_IMPL *conn, const char **cfg, int *runp)
+{
+	WT_CONFIG_ITEM item;
+
+	/* Async is off unless the configuration turns it on. */
+	WT_RET(__wt_config_gets(session, cfg, "async.enabled", &item));
+	*runp = (item.val == 0) ? 0 : 1;
+
+	/*
+	 * The sizing values are parsed and stored even when async is
+	 * disabled, so a later reconfigure only has to enable it.
+	 */
+	WT_RET(__wt_config_gets(session, cfg, "async.ops_max", &item));
+	conn->async_size = (uint32_t)item.val;
+
+	WT_RET(__wt_config_gets(session, cfg, "async.threads", &item));
+	conn->async_workers = (uint32_t)item.val;
+
+	/* Sanity check that api_data.py is in sync with async.h */
+	WT_ASSERT(session, conn->async_workers <= WT_ASYNC_MAX_WORKERS);
+	return (0);
+}
+
+/*
+ * __wt_async_stats_update --
+ * Update the async stats for return to the application.
+ * Copies the current and maximum work-queue depths from the WT_ASYNC
+ * structure into the connection statistics. Does nothing when the
+ * connection has no WT_ASYNC structure.
+ */
+void
+__wt_async_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS *stats;
+
+ conn = S2C(session);
+ async = conn->async;
+ /* No async structure, nothing to report. */
+ if (async == NULL)
+ return;
+ stats = &conn->stats;
+ WT_STAT_SET(stats, async_cur_queue, async->cur_queue);
+ WT_STAT_SET(stats, async_max_queue, async->max_queue);
+ /*
+ * NOTE(review): a statistics refresh also sets the connection-wide
+ * async server flag, the same flag __wt_async_destroy clears before
+ * joining the workers -- confirm this side effect is intended.
+ */
+ F_SET(conn, WT_CONN_SERVER_ASYNC);
+}
+
+/*
+ * __async_start --
+ * Start the async subsystem. All configuration processing has
+ * already been done by the caller.
+ * Allocates the WT_ASYNC structure, initializes its format queue, ops
+ * lock, flush condition and op handles, then opens one internal session
+ * per configured worker and creates a thread for each of them.
+ * Returns 0 or the first error encountered.
+ */
+static int
+__async_start(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ uint32_t i;
+
+ conn = S2C(session);
+ /*
+ * NOTE(review): async is marked configured before anything has been
+ * allocated, and each WT_RET below returns without undoing earlier
+ * steps -- confirm the callers' failure handling copes with a
+ * partially built WT_ASYNC.
+ */
+ conn->async_cfg = 1;
+ /*
+ * Async is on, allocate the WT_ASYNC structure and initialize the ops.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_ASYNC), &conn->async));
+ async = conn->async;
+ STAILQ_INIT(&async->formatqh);
+ WT_RET(__wt_spin_init(session, &async->ops_lock, "ops"));
+ WT_RET(__wt_cond_alloc(session, "async flush", 0, &async->flush_cond));
+ WT_RET(__wt_async_op_init(session));
+
+ /*
+ * Start up the worker threads.
+ */
+ F_SET(conn, WT_CONN_SERVER_ASYNC);
+ /* First pass: give every worker its own flagged session. */
+ for (i = 0; i < conn->async_workers; i++) {
+ /*
+ * Each worker has its own session. We set both a general
+ * server flag in the connection and an individual flag
+ * in the session. The user may reconfigure the number of
+ * workers and we may want to selectively stop some workers
+ * while leaving the rest running.
+ */
+ WT_RET(__wt_open_internal_session(
+ conn, "async-worker", 1, 1, &async->worker_sessions[i]));
+ F_SET(async->worker_sessions[i], WT_SESSION_SERVER_ASYNC);
+ }
+ /* Second pass: create the threads only once all sessions exist. */
+ for (i = 0; i < conn->async_workers; i++) {
+ /*
+ * Start the threads.
+ */
+ WT_RET(__wt_thread_create(session, &async->worker_tids[i],
+ __wt_async_worker, async->worker_sessions[i]));
+ }
+ /* Publish the initial queue statistics. */
+ __wt_async_stats_update(session);
+ return (0);
+}
+
+/*
+ * __wt_async_create --
+ *	Parse the async configuration and, when async is enabled, bring up
+ *	the async subsystem and its worker threads.  Returns 0 when async
+ *	is not configured.
+ */
+int
+__wt_async_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	int enabled;
+
+	/* Configuration is always parsed so the values get stored. */
+	enabled = 0;
+	WT_RET(__async_config(session, S2C(session), cfg, &enabled));
+
+	/* Nothing more to do unless async was asked for. */
+	return (enabled ? __async_start(session) : 0);
+}
+
+/*
+ * __wt_async_reconfig --
+ *	Reconfigure the async subsystem on a live connection.
+ *
+ *	The new settings are parsed into a scratch connection structure and
+ *	compared with the running state: async may be shut down, started, or
+ *	have its number of worker threads grown or shrunk.  Returns 0 or the
+ *	first error encountered while applying the change.
+ */
+int
+__wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_ASYNC *async;
+	WT_CONNECTION_IMPL *conn, tmp_conn;
+	WT_DECL_RET;
+	WT_SESSION *wt_session;
+	int run;
+	uint32_t i;
+
+	conn = S2C(session);
+	async = conn->async;
+	memset(&tmp_conn, 0, sizeof(tmp_conn));
+	tmp_conn.async_cfg = conn->async_cfg;
+	tmp_conn.async_workers = conn->async_workers;
+	tmp_conn.async_size = conn->async_size;
+
+	/* Handle configuration. */
+	run = conn->async_cfg;
+	WT_RET(__async_config(session, &tmp_conn, cfg, &run));
+
+	/*
+	 * There are some restrictions on the live reconfiguration of async.
+	 * Unlike other subsystems where we simply destroy anything existing
+	 * and restart with the new configuration, async is not so easy.
+	 * If the user is just changing the number of workers, we want to
+	 * allow the existing op handles and other information to remain in
+	 * existence.  So we must handle various combinations of changes
+	 * individually.
+	 *
+	 * One restriction is that if async is currently on, the user cannot
+	 * change the number of async op handles available.  The user can try
+	 * but we do nothing with it.  However we must allow the ops_max config
+	 * string so that a user can completely start async via reconfigure.
+	 */
+
+	/*
+	 * Easy cases:
+	 * 1. If async is on and the user wants it off, shut it down.
+	 * 2. If async is off, and the user wants it on, start it.
+	 * 3. If not a toggle and async is off, we're done.
+	 */
+	if (conn->async_cfg > 0 && !run) {
+		/* Case 1 */
+		WT_TRET(__wt_async_flush(session));
+		/* Keep a flush failure rather than overwriting it. */
+		WT_TRET(__wt_async_destroy(session));
+		conn->async_cfg = 0;
+		return (ret);
+	} else if (conn->async_cfg == 0 && run)
+		/* Case 2 */
+		return (__async_start(session));
+	else if (conn->async_cfg == 0)
+		/* Case 3 */
+		return (0);
+
+	/*
+	 * Running async worker modification cases:
+	 * 4. If number of workers didn't change, we're done.
+	 * 5. If more workers, start new ones.
+	 * 6. If fewer workers, kill some.
+	 */
+	if (conn->async_workers == tmp_conn.async_workers)
+		/* No change in the number of workers. */
+		return (0);
+	if (conn->async_workers < tmp_conn.async_workers) {
+		/* Case 5 */
+		/*
+		 * The worker_sessions array is allocated for the maximum
+		 * allowed number of workers, so starting more is easy.
+		 */
+		for (i = conn->async_workers; i < tmp_conn.async_workers; i++) {
+			/*
+			 * Each worker has its own session.
+			 */
+			WT_RET(__wt_open_internal_session(conn,
+			    "async-worker", 1, 1, &async->worker_sessions[i]));
+			F_SET(async->worker_sessions[i],
+			    WT_SESSION_SERVER_ASYNC);
+		}
+		for (i = conn->async_workers; i < tmp_conn.async_workers; i++) {
+			/*
+			 * Start the threads.
+			 */
+			WT_RET(__wt_thread_create(session,
+			    &async->worker_tids[i], __wt_async_worker,
+			    async->worker_sessions[i]));
+		}
+		conn->async_workers = tmp_conn.async_workers;
+	}
+	if (conn->async_workers > tmp_conn.async_workers) {
+		/* Case 6 */
+		/*
+		 * Stopping an individual async worker is the most complex case.
+		 * We clear the session async flag on the targeted worker thread
+		 * so that only that thread stops, and the others keep running.
+		 *
+		 * The index is unsigned: decrement inside the body so the loop
+		 * still terminates if the new worker count is zero.
+		 */
+		for (i = conn->async_workers; i > tmp_conn.async_workers;) {
+			--i;
+			/*
+			 * Join any worker we're stopping.
+			 * After the thread is stopped, close its session.
+			 */
+			WT_ASSERT(session, async->worker_tids[i] != 0);
+			WT_ASSERT(session, async->worker_sessions[i] != NULL);
+			F_CLR(async->worker_sessions[i],
+			    WT_SESSION_SERVER_ASYNC);
+			WT_TRET(__wt_thread_join(
+			    session, async->worker_tids[i]));
+			async->worker_tids[i] = 0;
+			wt_session = &async->worker_sessions[i]->iface;
+			WT_TRET(wt_session->close(wt_session, NULL));
+			async->worker_sessions[i] = NULL;
+		}
+		conn->async_workers = tmp_conn.async_workers;
+	}
+
+	/* Report any join/close failure collected while stopping workers. */
+	return (ret);
+}
+
+/*
+ * __wt_async_destroy --
+ *	Destroy the async worker threads and async subsystem.
+ *
+ *	Clears the connection's async server flag, joins every worker thread,
+ *	closes the workers' sessions, and frees the op key/value buffers, the
+ *	cached formats, the work queue, the op array and the WT_ASYNC
+ *	structure.  Returns 0 when async is not configured, otherwise the
+ *	first error collected while tearing down.
+ */
+int
+__wt_async_destroy(WT_SESSION_IMPL *session)
+{
+	WT_ASYNC *async;
+	WT_ASYNC_FORMAT *af, *afnext;
+	WT_ASYNC_OP *op;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_SESSION *wt_session;
+	uint32_t i;
+
+	conn = S2C(session);
+	async = conn->async;
+
+	if (!conn->async_cfg)
+		return (0);
+
+	F_CLR(conn, WT_CONN_SERVER_ASYNC);
+	/*
+	 * __async_start marks async as configured before it allocates the
+	 * structure, so a failed start can leave nothing to tear down.
+	 */
+	if (async == NULL)
+		return (0);
+
+	for (i = 0; i < conn->async_workers; i++)
+		if (async->worker_tids[i] != 0) {
+			WT_TRET(__wt_thread_join(
+			    session, async->worker_tids[i]));
+			async->worker_tids[i] = 0;
+		}
+	WT_TRET(__wt_cond_destroy(session, &async->flush_cond));
+
+	/* Close the server threads' sessions. */
+	for (i = 0; i < conn->async_workers; i++)
+		if (async->worker_sessions[i] != NULL) {
+			wt_session = &async->worker_sessions[i]->iface;
+			WT_TRET(wt_session->close(wt_session, NULL));
+			async->worker_sessions[i] = NULL;
+		}
+	/*
+	 * Free any op key/value buffers.  The op array is missing if the
+	 * start failed before the handles were allocated.
+	 */
+	if (async->async_ops != NULL)
+		for (i = 0; i < conn->async_size; i++) {
+			op = (WT_ASYNC_OP *)&async->async_ops[i];
+			if (op->c.key.data != NULL)
+				__wt_buf_free(session, &op->c.key);
+			if (op->c.value.data != NULL)
+				__wt_buf_free(session, &op->c.value);
+		}
+
+	/* Free format resources */
+	af = STAILQ_FIRST(&async->formatqh);
+	while (af != NULL) {
+		afnext = STAILQ_NEXT(af, q);
+		__wt_free(session, af->uri);
+		__wt_free(session, af->config);
+		__wt_free(session, af->key_format);
+		__wt_free(session, af->value_format);
+		__wt_free(session, af);
+		af = afnext;
+	}
+	__wt_free(session, async->async_queue);
+	__wt_free(session, async->async_ops);
+	__wt_spin_destroy(session, &async->ops_lock);
+	__wt_free(session, conn->async);
+
+	return (ret);
+}
+
+/*
+ * __wt_async_flush --
+ * Implementation of the WT_CONN->async_flush method.
+ * Serializes with any flush already in progress, enqueues the dedicated
+ * flush op, and waits on the flush condition until the flush state
+ * reads WT_ASYNC_FLUSH_COMPLETE. Returns 0 immediately when async is
+ * not configured.
+ */
+int
+__wt_async_flush(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ if (!conn->async_cfg)
+ return (0);
+
+ async = conn->async;
+ WT_STAT_FAST_CONN_INCR(session, async_flush);
+ /*
+ * We have to do several things. First we have to prevent
+ * other callers from racing with us so that only one
+ * flush is happening at a time. Next we have to wait for
+ * the worker threads to notice the flush and indicate
+ * that the flush is complete on their side. Then we
+ * clear the flush flags and return.
+ */
+retry:
+ while (async->flush_state != WT_ASYNC_FLUSH_NONE)
+ /*
+ * We're racing an in-progress flush. We need to wait
+ * our turn to start our own. We need to convoy the
+ * racing calls because a later call may be waiting for
+ * specific enqueued ops to be complete before this returns.
+ */
+ __wt_sleep(0, 100000);
+
+ /* Claim the flush; if another caller got there first, wait again. */
+ if (!WT_ATOMIC_CAS4(async->flush_state, WT_ASYNC_FLUSH_NONE,
+ WT_ASYNC_FLUSH_IN_PROGRESS))
+ goto retry;
+ /*
+ * We're the owner of this flush operation. Set the
+ * WT_ASYNC_FLUSH_IN_PROGRESS to block other callers.
+ * We're also preventing all worker threads from taking
+ * things off the work queue with the lock.
+ */
+ /* NOTE(review): no lock is taken in this function -- confirm the comment above. */
+ async->flush_count = 0;
+ (void)WT_ATOMIC_ADD8(async->flush_gen, 1);
+ WT_ASSERT(session, async->flush_op.state == WT_ASYNCOP_FREE);
+ async->flush_op.state = WT_ASYNCOP_READY;
+ /*
+ * NOTE(review): an error from here on jumps to err with flush_state
+ * still IN_PROGRESS and flush_op still READY -- confirm later flush
+ * calls cannot spin forever in the loop above.
+ */
+ WT_ERR(__wt_async_op_enqueue(session, &async->flush_op));
+ /* NOTE(review): the wait is passed a NULL session -- confirm intended. */
+ while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE)
+ WT_ERR(__wt_cond_wait(NULL, async->flush_cond, 100000));
+ /*
+ * Flush is done. Clear the flags.
+ */
+ async->flush_op.state = WT_ASYNCOP_FREE;
+ WT_PUBLISH(async->flush_state, WT_ASYNC_FLUSH_NONE);
+err:
+ return (ret);
+}
+
+/*
+ * __async_runtime_config --
+ *	Apply the per-op runtime options when a handle is allocated.
+ *
+ *	Reads "append" (default 0), "overwrite" (default 1) and "raw"
+ *	(default 0) from the configuration and sets or clears the matching
+ *	flag on the op's embedded cursor.  Returns the first lookup error.
+ */
+static int
+__async_runtime_config(WT_ASYNC_OP_IMPL *op, const char *cfg[])
+{
+	WT_CONFIG_ITEM item;
+	WT_CURSOR *c;
+	WT_SESSION_IMPL *session;
+
+	session = O2S(op);
+	c = &((WT_ASYNC_OP *)op)->c;
+
+	WT_RET(__wt_config_gets_def(session, cfg, "append", 0, &item));
+	if (item.val == 0)
+		F_CLR(c, WT_CURSTD_APPEND);
+	else
+		F_SET(c, WT_CURSTD_APPEND);
+
+	WT_RET(__wt_config_gets_def(session, cfg, "overwrite", 1, &item));
+	if (item.val == 0)
+		F_CLR(c, WT_CURSTD_OVERWRITE);
+	else
+		F_SET(c, WT_CURSTD_OVERWRITE);
+
+	WT_RET(__wt_config_gets_def(session, cfg, "raw", 0, &item));
+	if (item.val == 0)
+		F_CLR(c, WT_CURSTD_RAW);
+	else
+		F_SET(c, WT_CURSTD_RAW);
+
+	return (0);
+}
+
+/*
+ * __wt_async_new_op --
+ *	Implementation of the WT_CONN->async_new_op method.
+ *
+ *	Allocates an op handle for the uri/config pair, applies the runtime
+ *	options, attaches the callback and returns the handle through opp.
+ *	Returns ENOTSUP when async is not configured; on any other failure
+ *	*opp is left NULL.
+ */
+int
+__wt_async_new_op(WT_SESSION_IMPL *session, const char *uri,
+    const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb,
+    WT_ASYNC_OP_IMPL **opp)
+{
+	WT_ASYNC_OP_IMPL *handle;
+	WT_DECL_RET;
+
+	*opp = NULL;
+
+	if (!S2C(session)->async_cfg)
+		return (ENOTSUP);
+
+	handle = NULL;
+	if ((ret = __async_new_op_alloc(session, uri, config, &handle)) == 0 &&
+	    (ret = __async_runtime_config(handle, cfg)) == 0) {
+		handle->cb = cb;
+		*opp = handle;
+		return (0);
+	}
+
+	/* A handle that was allocated before the failure goes back to free. */
+	if (handle != NULL)
+		handle->state = WT_ASYNCOP_FREE;
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/async/async_op.c b/src/third_party/wiredtiger/src/async/async_op.c
new file mode 100644
index 00000000000..9dba2b2b5f3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/async/async_op.c
@@ -0,0 +1,359 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+/*
+ * __async_get_key --
+ *	WT_ASYNC_OP->get_key implementation for op handles: forwards the
+ *	variable arguments to the cursor key getter using the op's embedded
+ *	cursor and that cursor's flags.
+ */
+static int
+__async_get_key(WT_ASYNC_OP *asyncop, ...)
+{
+	WT_CURSOR *c;
+	va_list args;
+	int rc;
+
+	c = &asyncop->c;
+	va_start(args, asyncop);
+	rc = __wt_cursor_get_keyv(c, c->flags, args);
+	va_end(args);
+	return (rc);
+}
+
+/*
+ * __async_set_key --
+ *	WT_ASYNC_OP->set_key implementation for op handles.
+ *
+ *	Sets the key on the op's embedded cursor from the variable arguments.
+ *	Unless the key data already lives in the cursor's own item, or the
+ *	cursor is a record-number cursor, the data is then copied into the
+ *	item.  A failed copy is recorded in the cursor's saved_err field,
+ *	since this function cannot return an error.
+ */
+static void
+__async_set_key(WT_ASYNC_OP *asyncop, ...)
+{
+	WT_CURSOR *c;
+	WT_DECL_RET;
+	va_list ap;
+
+	c = &asyncop->c;
+	va_start(ap, asyncop);
+	__wt_cursor_set_keyv(c, c->flags, ap);
+	/*
+	 * The argument list is not needed past this point: end it now so
+	 * every va_start is paired with a va_end, including on failure.
+	 */
+	va_end(ap);
+
+	if (!WT_DATA_IN_ITEM(&c->key) && !WT_CURSOR_RECNO(c) &&
+	    (ret = __wt_buf_set(O2S((WT_ASYNC_OP_IMPL *)asyncop),
+	    &c->key, c->key.data, c->key.size)) != 0)
+		c->saved_err = ret;
+}
+
+/*
+ * __async_get_value --
+ *	WT_ASYNC_OP->get_value implementation for op handles: forwards the
+ *	variable arguments to the cursor value getter using the op's
+ *	embedded cursor.
+ */
+static int
+__async_get_value(WT_ASYNC_OP *asyncop, ...)
+{
+	WT_CURSOR *c;
+	va_list args;
+	int rc;
+
+	c = &asyncop->c;
+	va_start(args, asyncop);
+	rc = __wt_cursor_get_valuev(c, args);
+	va_end(args);
+	return (rc);
+}
+
+/*
+ * __async_set_value --
+ *	WT_ASYNC_OP->set_value implementation for op handles.
+ *
+ *	Sets the value on the op's embedded cursor from the variable
+ *	arguments, then copies the data into the cursor's own item when it
+ *	is pointing at data elsewhere.  A failed copy is recorded in the
+ *	cursor's saved_err field, since this function cannot return an error.
+ */
+static void
+__async_set_value(WT_ASYNC_OP *asyncop, ...)
+{
+	WT_CURSOR *c;
+	WT_DECL_RET;
+	va_list ap;
+
+	c = &asyncop->c;
+	va_start(ap, asyncop);
+	__wt_cursor_set_valuev(c, ap);
+	/*
+	 * The argument list is not needed past this point: end it now so
+	 * every va_start is paired with a va_end, including on failure.
+	 */
+	va_end(ap);
+
+	/* Copy the data, if it is pointing at data elsewhere. */
+	if (!WT_DATA_IN_ITEM(&c->value) &&
+	    (ret = __wt_buf_set(O2S((WT_ASYNC_OP_IMPL *)asyncop),
+	    &c->value, c->value.data, c->value.size)) != 0)
+		c->saved_err = ret;
+}
+
+/*
+ * __async_op_wrap --
+ *	Shared tail of every async operation: record the operation type on
+ *	the handle, then put the handle on the work queue.
+ */
+static int
+__async_op_wrap(WT_ASYNC_OP_IMPL *op, WT_ASYNC_OPTYPE type)
+{
+	WT_SESSION_IMPL *session;
+
+	op->optype = type;
+	session = O2S(op);
+	return (__wt_async_op_enqueue(session, op));
+}
+
+/*
+ * __async_search --
+ * WT_ASYNC_OP->search implementation for op handles.
+ * Counts the request in the async_op_search statistic and queues the
+ * op with type WT_AOP_SEARCH.
+ */
+static int
+__async_search(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ /* API entry/exit macros; presumably they set up session -- verify. */
+ ASYNCOP_API_CALL(O2C(op), session, search);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_search);
+ WT_ERR(__async_op_wrap(op, WT_AOP_SEARCH));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_insert --
+ * WT_ASYNC_OP->insert implementation for op handles.
+ * Counts the request in the async_op_insert statistic and queues the
+ * op with type WT_AOP_INSERT.
+ */
+static int
+__async_insert(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ /* API entry/exit macros; presumably they set up session -- verify. */
+ ASYNCOP_API_CALL(O2C(op), session, insert);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_insert);
+ WT_ERR(__async_op_wrap(op, WT_AOP_INSERT));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_update --
+ * WT_ASYNC_OP->update implementation for op handles.
+ * Counts the request in the async_op_update statistic and queues the
+ * op with type WT_AOP_UPDATE.
+ */
+static int
+__async_update(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ /* API entry/exit macros; presumably they set up session -- verify. */
+ ASYNCOP_API_CALL(O2C(op), session, update);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_update);
+ WT_ERR(__async_op_wrap(op, WT_AOP_UPDATE));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_remove --
+ * WT_ASYNC_OP->remove implementation for op handles.
+ * Counts the request in the async_op_remove statistic and queues the
+ * op with type WT_AOP_REMOVE.
+ */
+static int
+__async_remove(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ /* API entry/exit macros; presumably they set up session -- verify. */
+ ASYNCOP_API_CALL(O2C(op), session, remove);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_remove);
+ WT_ERR(__async_op_wrap(op, WT_AOP_REMOVE));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_compact --
+ * WT_ASYNC_OP->compact implementation for op handles.
+ * Counts the request in the async_op_compact statistic and queues the
+ * op with type WT_AOP_COMPACT.
+ */
+static int
+__async_compact(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ /* API entry/exit macros; presumably they set up session -- verify. */
+ ASYNCOP_API_CALL(O2C(op), session, compact);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_compact);
+ WT_ERR(__async_op_wrap(op, WT_AOP_COMPACT));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_get_id --
+ *	WT_ASYNC_OP->get_id implementation for op handles: report the
+ *	unique id stored in the handle.
+ */
+static uint64_t
+__async_get_id(WT_ASYNC_OP *asyncop)
+{
+	WT_ASYNC_OP_IMPL *op;
+
+	op = (WT_ASYNC_OP_IMPL *)asyncop;
+	return (op->unique_id);
+}
+
+/*
+ * __async_get_type --
+ *	WT_ASYNC_OP->get_type implementation for op handles: report the
+ *	operation type stored in the handle.
+ */
+static WT_ASYNC_OPTYPE
+__async_get_type(WT_ASYNC_OP *asyncop)
+{
+	WT_ASYNC_OP_IMPL *op;
+
+	op = (WT_ASYNC_OP_IMPL *)asyncop;
+	return (op->optype);
+}
+
+/*
+ * __async_op_init --
+ *	Put one op handle into its initial state: install the public method
+ *	table, clear the format pointers, reset the embedded cursor, record
+ *	the handle's internal id and mark it free.  Always returns 0.
+ */
+static int
+__async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id)
+{
+	WT_ASYNC_OP *asyncop;
+	WT_CURSOR *c;
+
+	asyncop = (WT_ASYNC_OP *)op;
+	c = &asyncop->c;
+
+	/* Public handle: owner, formats and methods. */
+	asyncop->connection = (WT_CONNECTION *)conn;
+	asyncop->value_format = NULL;
+	asyncop->key_format = NULL;
+	c->value_format = NULL;
+	c->key_format = NULL;
+	asyncop->get_key = __async_get_key;
+	asyncop->get_value = __async_get_value;
+	asyncop->set_key = __async_set_key;
+	asyncop->set_value = __async_set_value;
+	asyncop->search = __async_search;
+	asyncop->insert = __async_insert;
+	asyncop->update = __async_update;
+	asyncop->remove = __async_remove;
+	asyncop->compact = __async_compact;
+	asyncop->get_id = __async_get_id;
+	asyncop->get_type = __async_get_type;
+
+	/*
+	 * Embedded cursor: it needs working get/set key/value functions
+	 * and clean key/value related fields.
+	 */
+	c->get_key = __wt_cursor_get_key;
+	c->set_key = __wt_cursor_set_key;
+	c->get_value = __wt_cursor_get_value;
+	c->set_value = __wt_cursor_set_value;
+	c->recno = 0;
+	memset(c->raw_recno_buf, 0, sizeof(c->raw_recno_buf));
+	memset(&c->key, 0, sizeof(c->key));
+	memset(&c->value, 0, sizeof(c->value));
+	c->session = (WT_SESSION *)conn->default_session;
+	c->saved_err = 0;
+	c->flags = 0;
+
+	/* Implementation-private bookkeeping. */
+	op->internal_id = id;
+	op->state = WT_ASYNCOP_FREE;
+	return (0);
+}
+
+/*
+ * __wt_async_op_enqueue --
+ * Enqueue an operation onto the work queue.
+ * The caller's op must be in the READY state. A slot in the ring
+ * buffer is reserved by atomically advancing alloc_head, the op is
+ * stored there and marked ENQUEUED, and the slot is then made visible
+ * to workers by advancing head in reservation order.
+ */
+int
+__wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ uint64_t cur_head, cur_tail, my_alloc, my_slot;
+#ifdef HAVE_DIAGNOSTIC
+ WT_ASYNC_OP_IMPL *my_op;
+#endif
+
+ conn = S2C(session);
+ async = conn->async;
+ /*
+ * Enqueue op at the tail of the work queue.
+ */
+ WT_ASSERT(session, op->state == WT_ASYNCOP_READY);
+ /*
+ * We get our slot in the ring buffer to use.
+ */
+ my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1);
+ my_slot = my_alloc % async->async_qsize;
+
+ /*
+ * Make sure we haven't wrapped around the queue.
+ * If so, wait for the tail to advance off this slot.
+ */
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ while (cur_tail == my_slot) {
+ __wt_yield();
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ }
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Diagnostic builds panic if the reserved slot is still occupied. */
+ WT_ORDERED_READ(my_op, async->async_queue[my_slot]);
+ if (my_op != NULL)
+ return (__wt_panic(session));
+#endif
+ WT_PUBLISH(async->async_queue[my_slot], op);
+ op->state = WT_ASYNCOP_ENQUEUED;
+ /* Track the queue depth and its high-water mark. */
+ if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue)
+ WT_PUBLISH(async->max_queue, async->cur_queue);
+ /*
+ * Multiple threads may be adding ops to the queue. We need to wait
+ * our turn to make our slot visible to workers.
+ */
+ WT_ORDERED_READ(cur_head, async->head);
+ while (cur_head != (my_alloc - 1)) {
+ __wt_yield();
+ WT_ORDERED_READ(cur_head, async->head);
+ }
+ WT_PUBLISH(async->head, my_alloc);
+ /* ret is never assigned in the non-diagnostic path, so this is 0. */
+ return (ret);
+}
+
+/*
+ * __wt_async_op_init --
+ *	Set up every op handle owned by the async subsystem: the dedicated
+ *	flush op, the work queue and the array of user op handles.  On a
+ *	failure after the queue exists, both the op array and the queue are
+ *	released again before the error is returned.
+ */
+int
+__wt_async_op_init(WT_SESSION_IMPL *session)
+{
+	WT_ASYNC *async;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	uint32_t slot;
+
+	conn = S2C(session);
+	async = conn->async;
+
+	/* The flush op is not part of the user array. */
+	WT_RET(__async_op_init(conn, &async->flush_op, OPS_INVALID_INDEX));
+
+	/*
+	 * The work queue is a ring buffer sized so that the head can never
+	 * overlap the tail, with extra room for the flush op.
+	 */
+	async->async_qsize = conn->async_size + 2;
+	WT_RET(__wt_calloc_def(
+	    session, async->async_qsize, &async->async_queue));
+
+	/* The user op handles themselves. */
+	WT_ERR(__wt_calloc_def(session, conn->async_size, &async->async_ops));
+	slot = 0;
+	while (slot < conn->async_size) {
+		WT_ERR(__async_op_init(conn, &async->async_ops[slot], slot));
+		++slot;
+	}
+	return (0);
+
+err:
+	if (async->async_ops != NULL) {
+		__wt_free(session, async->async_ops);
+		async->async_ops = NULL;
+	}
+	if (async->async_queue != NULL) {
+		__wt_free(session, async->async_queue);
+		async->async_queue = NULL;
+	}
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/async/async_worker.c b/src/third_party/wiredtiger/src/async/async_worker.c
new file mode 100644
index 00000000000..74ee2dd2f86
--- /dev/null
+++ b/src/third_party/wiredtiger/src/async/async_worker.c
@@ -0,0 +1,359 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __async_op_dequeue --
+ * Wait for work to be available. Then atomically take it off
+ * the work queue.
+ * On return *op is the claimed op, or NULL when the worker was told to
+ * stop (session or connection async flag cleared) or a flush is in its
+ * FLUSHING state. Returns 0, or the panic error if the connection has
+ * panicked.
+ */
+static int
+__async_op_dequeue(WT_CONNECTION_IMPL *conn, WT_SESSION_IMPL *session,
+ WT_ASYNC_OP_IMPL **op)
+{
+ WT_ASYNC *async;
+ long sleep_usec;
+ uint64_t cur_tail, last_consume, my_consume, my_slot, prev_slot;
+ uint32_t tries;
+
+ async = conn->async;
+ *op = NULL;
+ /*
+ * Wait for work to do. Work is available when async->head moves.
+ * Then grab the slot containing the work. If we lose, try again.
+ */
+retry:
+ tries = 0;
+ sleep_usec = 100;
+ WT_ORDERED_READ(last_consume, async->alloc_tail);
+ /*
+ * We stay in this loop until there is work to do.
+ */
+ while (last_consume == async->head &&
+ async->flush_state != WT_ASYNC_FLUSHING) {
+ WT_STAT_FAST_CONN_INCR(session, async_nowork);
+ if (++tries < MAX_ASYNC_YIELD)
+ /*
+ * Initially when we find no work, allow other
+ * threads to run.
+ */
+ __wt_yield();
+ else {
+ /*
+ * If we haven't found work in a while, start sleeping
+ * to wait for work to arrive instead of spinning.
+ */
+ __wt_sleep(0, sleep_usec);
+ /* Double the sleep each time, up to the cap. */
+ sleep_usec = WT_MIN(sleep_usec * 2,
+ MAX_ASYNC_SLEEP_USECS);
+ }
+ /* Leave with *op still NULL when told to stop. */
+ if (!F_ISSET(session, WT_SESSION_SERVER_ASYNC))
+ return (0);
+ if (!F_ISSET(conn, WT_CONN_SERVER_ASYNC))
+ return (0);
+ if (F_ISSET(conn, WT_CONN_PANIC))
+ return (__wt_panic(session));
+ WT_ORDERED_READ(last_consume, async->alloc_tail);
+ }
+ /* A flush is being processed: return without taking work. */
+ if (async->flush_state == WT_ASYNC_FLUSHING)
+ return (0);
+ /*
+ * Try to increment the tail to claim this slot. If we lose
+ * a race, try again.
+ */
+ my_consume = last_consume + 1;
+ if (!WT_ATOMIC_CAS8(async->alloc_tail, last_consume, my_consume))
+ goto retry;
+ /*
+ * This item of work is ours to process. Clear it out of the
+ * queue and return.
+ */
+ my_slot = my_consume % async->async_qsize;
+ prev_slot = last_consume % async->async_qsize;
+ /* presumably the store macro yields the slot's previous value -- TODO confirm */
+ *op = WT_ATOMIC_STORE8(async->async_queue[my_slot], NULL);
+
+ WT_ASSERT(session, async->cur_queue > 0);
+ WT_ASSERT(session, *op != NULL);
+ WT_ASSERT(session, (*op)->state == WT_ASYNCOP_ENQUEUED);
+ (void)WT_ATOMIC_SUB4(async->cur_queue, 1);
+ (*op)->state = WT_ASYNCOP_WORKING;
+
+ if (*op == &async->flush_op)
+ /*
+ * We're the worker to take the flush op off the queue.
+ */
+ WT_PUBLISH(async->flush_state, WT_ASYNC_FLUSHING);
+ /*
+ * Advance tail_slot in claim order: wait until it reaches the slot
+ * before ours, then publish our slot.
+ */
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ while (cur_tail != prev_slot) {
+ __wt_yield();
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ }
+ WT_PUBLISH(async->tail_slot, my_slot);
+ return (0);
+}
+
+/*
+ * __async_flush_wait --
+ *	Block on the flush condition, in 10000-unit timed waits, for as
+ *	long as the flush state is still FLUSHING and the flush generation
+ *	still equals my_gen.  Returns 0 or the first wait error.
+ */
+static int
+__async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen)
+{
+	WT_DECL_RET;
+
+	for (;;) {
+		if (async->flush_state != WT_ASYNC_FLUSHING ||
+		    async->flush_gen != my_gen)
+			break;
+		if ((ret =
+		    __wt_cond_wait(session, async->flush_cond, 10000)) != 0)
+			break;
+	}
+	return (ret);
+}
+
+/*
+ * __async_worker_cursor --
+ *	Hand the worker a cursor for the given op.
+ *
+ *	Each worker keeps a private list of cursors keyed by the config and
+ *	uri hashes of the op's format.  A matching cached cursor is returned
+ *	when there is one; otherwise a cursor is opened on the worker's
+ *	session, added at the head of the list and returned.  Compact ops
+ *	need no cursor, so *cursorp is left NULL for them.
+ */
+static int
+__async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+    WT_ASYNC_WORKER_STATE *worker, WT_CURSOR **cursorp)
+{
+	WT_ASYNC_CURSOR *cached;
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	WT_SESSION *wt_session;
+
+	wt_session = (WT_SESSION *)session;
+	*cursorp = NULL;
+
+	/* Compact doesn't need a cursor. */
+	if (op->optype == WT_AOP_COMPACT)
+		return (0);
+
+	WT_ASSERT(session, op->format != NULL);
+	STAILQ_FOREACH(cached, &worker->cursorqh, q) {
+		if (op->format->cfg_hash != cached->cfg_hash ||
+		    op->format->uri_hash != cached->uri_hash)
+			continue;
+		/* Same signature: reuse the cached cursor. */
+		*cursorp = cached->c;
+		return (0);
+	}
+
+	/*
+	 * Cache miss.  Open a cursor and remember it at the head of the
+	 * list, expecting LRU usage.
+	 */
+	WT_RET(__wt_calloc_def(session, 1, &cached));
+	if ((ret = wt_session->open_cursor(wt_session,
+	    op->format->uri, NULL, op->format->config, &cursor)) != 0) {
+		__wt_free(session, cached);
+		return (ret);
+	}
+	cached->cfg_hash = op->format->cfg_hash;
+	cached->uri_hash = op->format->uri_hash;
+	cached->c = cursor;
+	STAILQ_INSERT_HEAD(&worker->cursorqh, cached, q);
+	worker->num_cursors++;
+	*cursorp = cursor;
+	return (0);
+}
+
+/*
+ * __async_worker_execop --
+ * A worker thread executes an individual op with a cursor.
+ * For every op type except compact, the raw key (and for insert/update
+ * the raw value) is copied from the op handle's embedded cursor to the
+ * worker's cursor before the operation is dispatched. Returns the
+ * operation's result, or EINVAL for an unknown op type.
+ */
+static int
+__async_worker_execop(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+ WT_CURSOR *cursor)
+{
+ WT_ASYNC_OP *asyncop;
+ WT_ITEM val;
+ WT_SESSION *wt_session;
+
+ asyncop = (WT_ASYNC_OP *)op;
+ /*
+ * Set the key of our local cursor from the async op handle.
+ * If needed, also set the value.
+ */
+ if (op->optype != WT_AOP_COMPACT) {
+ WT_RET(__wt_cursor_get_raw_key(&asyncop->c, &val));
+ __wt_cursor_set_raw_key(cursor, &val);
+ if (op->optype == WT_AOP_INSERT ||
+ op->optype == WT_AOP_UPDATE) {
+ WT_RET(__wt_cursor_get_raw_value(&asyncop->c, &val));
+ __wt_cursor_set_raw_value(cursor, &val);
+ }
+ }
+ switch (op->optype) {
+ case WT_AOP_COMPACT:
+ /* Compact runs on the session and does not touch the cursor. */
+ wt_session = &session->iface;
+ WT_RET(wt_session->compact(wt_session,
+ op->format->uri, op->format->config));
+ break;
+ case WT_AOP_INSERT:
+ WT_RET(cursor->insert(cursor));
+ break;
+ case WT_AOP_UPDATE:
+ WT_RET(cursor->update(cursor));
+ break;
+ case WT_AOP_REMOVE:
+ WT_RET(cursor->remove(cursor));
+ break;
+ case WT_AOP_SEARCH:
+ WT_RET(cursor->search(cursor));
+ /*
+ * Get the value from the cursor and put it into
+ * the op for op->get_value.
+ */
+ WT_RET(__wt_cursor_get_raw_value(cursor, &val));
+ __wt_cursor_set_raw_value(&asyncop->c, &val);
+ break;
+ case WT_AOP_NONE:
+ default:
+ /* An op that was never given a type, or an unrecognized one. */
+ WT_RET_MSG(session, EINVAL, "Unknown async optype %d\n",
+ op->optype);
+ }
+ return (0);
+}
+
+/*
+ * __async_worker_op --
+ * Run one op in its own transaction (except compact), notify the callback, free the op.
+ */
+static int
+__async_worker_op(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+ WT_ASYNC_WORKER_STATE *worker)
+{
+ WT_ASYNC_OP *asyncop;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ int cb_ret;
+
+ asyncop = (WT_ASYNC_OP *)op;
+
+ cb_ret = 0;
+
+ wt_session = &session->iface;
+ if (op->optype != WT_AOP_COMPACT)
+ WT_RET(wt_session->begin_transaction(wt_session, NULL));
+ WT_ASSERT(session, op->state == WT_ASYNCOP_WORKING);
+ WT_RET(__async_worker_cursor(session, op, worker, &cursor));
+ /*
+ * Perform the op, then pass its return value to the user's notify callback, if set.
+ */
+ ret = __async_worker_execop(session, op, cursor);
+ if (op->cb != NULL && op->cb->notify != NULL)
+ cb_ret = op->cb->notify(op->cb, asyncop, ret, 0);
+
+ /*
+ * Commit if the operation succeeded (WT_NOTFOUND counts as success)
+ * and the user callback returned zero. Otherwise roll back.
+ */
+ if (op->optype != WT_AOP_COMPACT) {
+ if ((ret == 0 || ret == WT_NOTFOUND) && cb_ret == 0)
+ WT_TRET(wt_session->commit_transaction(
+ wt_session, NULL));
+ else
+ WT_TRET(wt_session->rollback_transaction(
+ wt_session, NULL));
+ F_CLR(&asyncop->c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ WT_TRET(cursor->reset(cursor));
+ }
+ /*
+ * After the callback returns and the transaction is resolved, release
+ * the op back to the free pool regardless of success or failure.
+ * NOTE(review): the early WT_RET exits above appear to skip this.
+ */
+ WT_PUBLISH(op->state, WT_ASYNCOP_FREE);
+ return (ret);
+}
+
+/*
+ * __wt_async_worker --
+ * Async worker thread: dequeue and run ops, take part in flushes, close cursors on exit.
+ */
+void *
+__wt_async_worker(void *arg)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_CURSOR *ac, *acnext;
+ WT_ASYNC_OP_IMPL *op;
+ WT_ASYNC_WORKER_STATE worker;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint64_t flush_gen;
+
+ session = arg;
+ conn = S2C(session);
+ async = conn->async;
+
+ worker.num_cursors = 0;
+ STAILQ_INIT(&worker.cursorqh);
+ while (F_ISSET(conn, WT_CONN_SERVER_ASYNC) &&
+ F_ISSET(session, WT_SESSION_SERVER_ASYNC)) {
+ WT_ERR(__async_op_dequeue(conn, session, &op));
+ if (op != NULL && op != &async->flush_op) {
+ /*
+ * If an operation fails, we want the worker thread to
+ * keep running, unless there is a panic.
+ */
+ (void)__async_worker_op(session, op, &worker);
+ if (F_ISSET(conn, WT_CONN_PANIC))
+ WT_ERR(__wt_panic(session));
+ } else if (async->flush_state == WT_ASYNC_FLUSHING) {
+ /*
+ * Worker flushing going on. Last worker to the party
+ * needs to set FLUSH_COMPLETE and signal the condition.
+ * If FLUSHING is going on, we do not take anything off
+ * the queue.
+ */
+ WT_ORDERED_READ(flush_gen, async->flush_gen);
+ if (WT_ATOMIC_ADD4(async->flush_count, 1) ==
+ conn->async_workers) {
+ /*
+ * We're last. All workers accounted for so
+ * signal the condition and leave the FLUSHING
+ * state to release the other worker threads.
+ * Set the FLUSH_COMPLETE state so that the
+ * caller can return to the application.
+ */
+ WT_PUBLISH(async->flush_state,
+ WT_ASYNC_FLUSH_COMPLETE);
+ WT_ERR(__wt_cond_signal(session,
+ async->flush_cond));
+ } else
+ /*
+ * We need to wait for the last worker to
+ * signal the condition.
+ */
+ WT_ERR(__async_flush_wait(
+ session, async, flush_gen));
+ }
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "async worker error");
+ }
+ /*
+ * Worker thread cleanup, close our cached cursors and
+ * free all the WT_ASYNC_CURSOR structures.
+ */
+ ac = STAILQ_FIRST(&worker.cursorqh);
+ while (ac != NULL) {
+ acnext = STAILQ_NEXT(ac, q);
+ WT_TRET(ac->c->close(ac->c));
+ __wt_free(session, ac);
+ ac = acnext;
+ }
+ return (NULL);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c
new file mode 100644
index 00000000000..bbd52359157
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_addr.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __block_buffer_to_addr --
+ * Unpack an address cookie into its offset, size and checksum, UPDATING the
+ * caller's buffer reference so it can be called repeatedly to load a buffer.
+ */
+static int
+__block_buffer_to_addr(WT_BLOCK *block,
+ const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+{
+ uint64_t o, s, c;
+
+ WT_RET(__wt_vunpack_uint(pp, 0, &o));
+ WT_RET(__wt_vunpack_uint(pp, 0, &s));
+ WT_RET(__wt_vunpack_uint(pp, 0, &c));
+
+ /*
+ * To avoid storing large offsets, we minimize the value by subtracting
+ * a block for description information, then storing a count of block
+ * allocation units. That implies there is no such thing as an
+ * "invalid" offset though, they could all be valid (other than very
+ * large numbers), which is what we didn't want to store in the first
+ * place. Use the size: writing a block of size 0 makes no sense, so
+ * that's the out-of-band value. Once we're out of this function and
+ * are working with a real file offset, size and checksum triplet, there
+ * can be invalid offsets, that's simpler than testing sizes of 0 all
+ * over the place.
+ */
+ if (s == 0) {
+ *offsetp = 0;
+ *sizep = *cksump = 0;
+ } else {
+ *offsetp = (wt_off_t)(o + 1) * block->allocsize;
+ *sizep = (uint32_t)s * block->allocsize;
+ *cksump = (uint32_t)c;
+ }
+ return (0);
+}
+
+/*
+ * __wt_block_addr_to_buffer --
+ * Pack a file offset, size and checksum into an address cookie at *pp.
+ */
+int
+__wt_block_addr_to_buffer(WT_BLOCK *block,
+ uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t cksum)
+{
+ uint64_t o, s, c;
+
+ /* Reverse of __block_buffer_to_addr (see its comment): size 0 is the out-of-band value. */
+ if (size == 0) {
+ o = WT_BLOCK_INVALID_OFFSET;
+ s = c = 0;
+ } else {
+ o = (uint64_t)offset / block->allocsize - 1;
+ s = size / block->allocsize;
+ c = cksum;
+ }
+ WT_RET(__wt_vpack_uint(pp, 0, o));
+ WT_RET(__wt_vpack_uint(pp, 0, s));
+ WT_RET(__wt_vpack_uint(pp, 0, c));
+ return (0);
+}
+
+/*
+ * __wt_block_buffer_to_addr --
+ * Convert a filesystem address cookie into its components NOT UPDATING
+ * the caller's buffer reference (only our by-value copy of p is passed on).
+ */
+int
+__wt_block_buffer_to_addr(WT_BLOCK *block,
+ const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+{
+ return (__block_buffer_to_addr(block, &p, offsetp, sizep, cksump));
+}
+
+/*
+ * __wt_block_addr_valid --
+ * Return if an address cookie is valid: 1 if its block ends within the file, else 0.
+ */
+int
+__wt_block_addr_valid(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int live)
+{
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(session);
+ WT_UNUSED(addr_size);
+ WT_UNUSED(live);
+
+ /* Crack the cookie into its file offset, size and checksum. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * In diagnostic mode, verify the address isn't on the available list,
+ * or for live systems, the discard list.
+ */
+ WT_RET(__wt_block_misplaced(
+ session, block, "addr-valid", offset, size, live));
+#endif
+
+ /* Valid (1) only if the block ends at or before the end of the file. */
+ return (offset + size > block->fh->size ? 0 : 1);
+}
+
+/*
+ * __wt_block_addr_string --
+ * Format an address cookie into buf as a printable string.
+ */
+int
+__wt_block_addr_string(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+
+ /* Crack the cookie into its file offset, size and checksum. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /* Printable representation: [start-end, size, checksum]. */
+ WT_RET(__wt_buf_fmt(session, buf,
+ "[%" PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)offset, (uintmax_t)offset + size, size, cksum));
+
+ return (0);
+}
+
+/*
+ * __wt_block_buffer_to_ckpt --
+ * Unpack a checkpoint cookie: version byte, root and extent list addresses, then sizes.
+ */
+int
+__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
+{
+ uint64_t a;
+ const uint8_t **pp;
+
+ ci->version = *p++;
+ if (ci->version != WT_BM_CHECKPOINT_VERSION)
+ WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
+
+ pp = &p;
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->root_offset, &ci->root_size, &ci->root_cksum));
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->alloc.offset, &ci->alloc.size, &ci->alloc.cksum));
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->avail.offset, &ci->avail.size, &ci->avail.cksum));
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->discard.offset, &ci->discard.size, &ci->discard.cksum));
+ WT_RET(__wt_vunpack_uint(pp, 0, &a));
+ ci->file_size = (wt_off_t)a;
+ WT_RET(__wt_vunpack_uint(pp, 0, &a));
+ ci->ckpt_size = a;
+
+ return (0);
+}
+
+/*
+ * __wt_block_ckpt_to_buffer --
+ * Pack the components into a checkpoint cookie, in the order buffer_to_ckpt unpacks.
+ */
+int
+__wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci)
+{
+ uint64_t a;
+
+ if (ci->version != WT_BM_CHECKPOINT_VERSION)
+ WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
+
+ (*pp)[0] = ci->version;
+ (*pp)++;
+
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->root_offset, ci->root_size, ci->root_cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->alloc.offset, ci->alloc.size, ci->alloc.cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->avail.offset, ci->avail.size, ci->avail.cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->discard.offset, ci->discard.size, ci->discard.cksum));
+ a = (uint64_t)ci->file_size;
+ WT_RET(__wt_vpack_uint(pp, 0, a));
+ a = (uint64_t)ci->ckpt_size;
+ WT_RET(__wt_vpack_uint(pp, 0, a));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
new file mode 100644
index 00000000000..83c3a40e8e1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -0,0 +1,842 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __ckpt_string(
+ WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
+static int __ckpt_update(
+ WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, int);
+
+/*
+ * __wt_block_ckpt_init --
+ * Clear a checkpoint structure, mark its root invalid and initialize its extent lists.
+ */
+int
+__wt_block_ckpt_init(
+ WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
+{
+ WT_CLEAR(*ci);
+
+ ci->version = WT_BM_CHECKPOINT_VERSION;
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+
+ WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc", 0));
+ WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail", 1));
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->discard, name, "discard", 0));
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->ckpt_avail, name, "ckpt_avail", 1));
+
+ return (0);
+}
+
+/*
+ * __wt_block_checkpoint_load --
+ * Load a checkpoint: crack its cookie and return the root page's address cookie, if any.
+ */
+int
+__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ const uint8_t *addr, size_t addr_size,
+ uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint8_t *endp;
+
+ WT_UNUSED(addr_size);
+ ci = NULL;
+
+ /*
+ * Sometimes we don't find a root page (we weren't given a checkpoint,
+ * or the checkpoint was empty). In that case we return an empty root
+ * address, set that up now.
+ */
+ *root_addr_sizep = 0;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ if (addr != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(session, block, addr, tmp));
+ }
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "%s: load-checkpoint: %s", block->name,
+ addr == NULL ? "[Empty]" : (const char *)tmp->data));
+ }
+
+ /*
+ * There's a single checkpoint in the file that can be written, all of
+ * the others are read-only. We use the same initialization calls for
+ * readonly checkpoints, but the information doesn't persist.
+ */
+ if (checkpoint) {
+ ci = &_ci;
+ WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
+ } else {
+ /*
+ * We depend on the btree level for locking: things will go
+ * bad fast should we open the live system in two handles, or
+ * if we create, salvage, truncate or verify the live/running
+ * file, for that matter.
+ */
+ ci = &block->live;
+ WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
+ }
+
+ /*
+ * If the checkpoint has an on-disk root page, load it. Otherwise, size
+ * the file past the description information.
+ */
+ if (addr == NULL || addr_size == 0)
+ ci->file_size = block->allocsize;
+ else {
+ /* Crack the checkpoint cookie. */
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ /* If verifying, verify sets up next. */
+ if (block->verify)
+ WT_ERR(__wt_verify_ckpt_load(session, block, ci));
+
+ /* Return the address cookie of any root page. */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
+ endp = root_addr;
+ WT_ERR(__wt_block_addr_to_buffer(block, &endp,
+ ci->root_offset, ci->root_size, ci->root_cksum));
+ *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
+ }
+
+ /*
+ * Rolling a checkpoint forward requires the avail list, the
+ * blocks from which we can allocate.
+ */
+ if (!checkpoint)
+ WT_ERR(__wt_block_extlist_read_avail(
+ session, block, &ci->avail, ci->file_size));
+ }
+
+ /*
+ * If the checkpoint can be written, that means anything written after
+ * the checkpoint is no longer interesting, truncate the file. Don't
+ * bother checking the avail list for a block at the end of the file,
+ * that was done when the checkpoint was first written (re-writing the
+ * checkpoint might possibly make it relevant here, but it's unlikely
+ * enough I don't bother).
+ */
+ if (!checkpoint) {
+ /*
+ * The truncate might fail if there's a file mapping (if there's
+ * an open checkpoint on the file), that's OK.
+ */
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size));
+ WT_ERR_BUSY_OK(
+ __wt_ftruncate(session, block->fh, ci->file_size));
+ }
+
+ if (0) {
+err: /*
+ * Don't call checkpoint-unload: unload does real work including
+ * file truncation. If we fail early enough that the checkpoint
+ * information isn't correct, bad things would happen. The only
+ * allocated memory was in the service of verify, clean that up.
+ */
+ if (block->verify)
+ WT_TRET(__wt_verify_ckpt_unload(session, block));
+ }
+
+ /* Read-only checkpoints don't need the original information, discard it. */
+ if (checkpoint && ci != NULL)
+ __wt_block_ckpt_destroy(session, ci);
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_checkpoint_unload --
+ * Unload a checkpoint; for the live system, truncate and discard its extent lists.
+ */
+int
+__wt_block_checkpoint_unload(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, int checkpoint)
+{
+ WT_DECL_RET;
+
+ /* Verify cleanup. */
+ if (block->verify)
+ WT_TRET(__wt_verify_ckpt_unload(session, block));
+
+ /*
+ * If it's the live system, truncate to discard any extended blocks and
+ * discard the active extent lists. Hold the lock even though we're
+ * unloading the live checkpoint, there could be readers active in
+ * other checkpoints.
+ */
+ if (!checkpoint) {
+ /*
+ * The truncate might fail if there's a file mapping (if there's
+ * an open checkpoint on the file), that's OK.
+ */
+ WT_TRET_BUSY_OK(
+ __wt_ftruncate(session, block->fh, block->fh->size));
+
+ __wt_spin_lock(session, &block->live_lock);
+ __wt_block_ckpt_destroy(session, &block->live);
+ __wt_spin_unlock(session, &block->live_lock);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_block_ckpt_destroy --
+ * Free a checkpoint structure's six extent lists (the structure itself is not freed).
+ */
+void
+__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
+{
+ /* Discard the extent lists, including the checkpoint-in-progress copies. */
+ __wt_block_extlist_free(session, &ci->alloc);
+ __wt_block_extlist_free(session, &ci->avail);
+ __wt_block_extlist_free(session, &ci->discard);
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+}
+
+/*
+ * __wt_block_checkpoint --
+ * Create a new checkpoint: write the root page (if any), then process the checkpoint list.
+ */
+int
+__wt_block_checkpoint(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum)
+{
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_RET;
+
+ ci = &block->live;
+
+ /*
+ * Write the root page: it's possible for there to be a checkpoint of
+ * an empty tree, in which case, we store an illegal root offset.
+ *
+ * !!!
+ * We happen to know that checkpoints are single-threaded above us in
+ * the btree engine. That's probably something we want to guarantee
+ * for any WiredTiger block manager.
+ */
+ if (buf == NULL) {
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+ ci->root_size = ci->root_cksum = 0;
+ } else
+ WT_RET(__wt_block_write_off(session, block, buf,
+ &ci->root_offset, &ci->root_size, &ci->root_cksum,
+ data_cksum, 0));
+
+ /*
+ * Checkpoints are potentially reading/writing/merging lots of blocks,
+ * pre-allocate structures for this thread's use.
+ */
+ WT_RET(__wt_block_ext_prealloc(session, 250));
+
+ /* Process the checkpoint list, deleting and updating as required. */
+ ret = __ckpt_process(session, block, ckptbase);
+
+ /* Discard any excessive memory we've allocated, keeping the first error in ret. */
+ WT_TRET(__wt_block_ext_discard(session, 250));
+
+ return (ret);
+}
+
+/*
+ * __ckpt_extlist_read --
+ * Read a checkpoint's alloc and discard extent lists into a newly allocated ckpt->bpriv.
+ */
+static int
+__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+ WT_BLOCK_CKPT *ci;
+
+ /*
+ * Allocate a checkpoint structure, crack the cookie and read the
+ * checkpoint's extent lists.
+ *
+ * Ignore the avail list: checkpoint avail lists are only useful if we
+ * are rolling forward from the particular checkpoint and they represent
+ * our best understanding of what blocks can be allocated. If we are
+ * not operating on the live checkpoint, subsequent checkpoints might
+ * have allocated those blocks, and the avail list is useless. We don't
+ * discard it, because it is useful as part of verification, but we
+ * don't re-write it either.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
+
+ ci = ckpt->bpriv;
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+ WT_RET(__wt_block_extlist_read(
+ session, block, &ci->alloc, ci->file_size));
+ WT_RET(__wt_block_extlist_read(
+ session, block, &ci->discard, ci->file_size));
+
+ return (0);
+}
+
+/*
+ * __ckpt_extlist_fblocks --
+ * If a checkpoint's extent list is going away, free its blocks to the live ckpt_avail list.
+ */
+static int
+__ckpt_extlist_fblocks(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+ /*
+ * Free blocks used to write checkpoint extents into the live system's
+ * checkpoint avail list (they were never on any alloc list). Do not
+ * use the live system's avail list because that list is used to decide
+ * if the file can be truncated, and we can't truncate any part of the
+ * file that contains a previous checkpoint's extents.
+ */
+ return (__wt_block_insert_ext(
+ session, &block->live.ckpt_avail, el->offset, el->size));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __ckpt_verify --
+ * Diagnostic code, confirm each checkpoint array entry has an expected flag combination.
+ */
+static int
+__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+ WT_CKPT *ckpt;
+
+ /*
+ * Fast check that we're seeing what we expect to see: some number of
+ * checkpoints to delete or ignore, terminated by one checkpoint to add.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ switch (ckpt->flags) {
+ case 0:
+ case WT_CKPT_DELETE:
+ case WT_CKPT_DELETE | WT_CKPT_FAKE:
+ case WT_CKPT_FAKE:
+ break;
+ case WT_CKPT_ADD:
+ if (ckpt[1].name == NULL)
+ break;
+ /* FALLTHROUGH */
+ default:
+ return (
+ __wt_illegal_value(session, "checkpoint array"));
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __ckpt_process --
+ * Process the list of checkpoints: roll deleted ones forward, then write the new one.
+ */
+static int
+__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
+{
+ WT_BLOCK_CKPT *a, *b, *ci;
+ WT_CKPT *ckpt, *next_ckpt;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint64_t ckpt_size;
+ int deleting, locked;
+
+ ci = &block->live;
+ locked = 0;
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__ckpt_verify(session, ckptbase));
+#endif
+
+ /*
+ * Checkpoints are a two-step process: first, write a new checkpoint to
+ * disk (including all the new extent lists for modified checkpoints
+ * and the live system). As part of this, create a list of file blocks
+ * newly available for reallocation, based on checkpoints being deleted.
+ * We then return the locations of the new checkpoint information to our
+ * caller. Our caller has to write that information into some kind of
+ * stable storage, and once that's done, we can actually allocate from
+ * that list of newly available file blocks. (We can't allocate from
+ * that list immediately because the allocation might happen before our
+ * caller saves the new checkpoint information, and if we crashed before
+ * the new checkpoint location was saved, we'd have overwritten blocks
+ * still referenced by checkpoints in the system.) In summary, there is
+ * a second step: after our caller saves the checkpoint information, we
+ * are called to add the newly available blocks into the live system's
+ * available list.
+ *
+ * This function is the first step, the second step is in the resolve
+ * function.
+ *
+ * If we're called to checkpoint the same file twice, without the second
+ * resolution step, it's an error at an upper level and our choices are
+ * all bad: either leak blocks or risk crashing with our caller not
+ * having saved the checkpoint information to stable storage. Leaked
+ * blocks are a safer choice, but that means file verify will fail for
+ * the rest of "forever", and the chance of us allocating a block and
+ * then crashing such that it matters is reasonably low: don't leak the
+ * blocks.
+ */
+ if (block->ckpt_inprogress) {
+ __wt_errx(session,
+ "%s: checkpointed without the checkpoint being resolved",
+ block->name);
+
+ WT_RET(__wt_block_checkpoint_resolve(session, block));
+ }
+
+ /*
+ * Extents newly available as a result of deleting previous checkpoints
+ * are added to a list of extents. The list should be empty, but as
+ * described above, there is no "free the checkpoint information" call
+ * into the block manager; if there was an error in an upper level that
+ * resulted in some previous checkpoint never being resolved, the list
+ * may not be empty. We should have caught that with the "checkpoint
+ * in progress" test, but it doesn't cost us anything to be cautious.
+ *
+ * We free the checkpoint's allocation and discard extent lists as part
+ * of the resolution step, not because they're needed at that time, but
+ * because it's potentially a lot of work, and waiting allows the btree
+ * layer to continue eviction sooner. As for the checkpoint-available
+ * list, make sure they get cleaned out.
+ */
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->ckpt_avail, "live", "ckpt_avail", 1));
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+
+ /*
+ * To delete a checkpoint, we'll need checkpoint information for it and
+ * the subsequent checkpoint into which it gets rolled; read them from
+ * disk before we lock things down.
+ */
+ deleting = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+ !F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+ deleting = 1;
+
+ /*
+ * Read the checkpoint and next checkpoint extent lists if we
+ * haven't already read them (we may have already read these
+ * extent blocks if there is more than one deleted checkpoint).
+ */
+ if (ckpt->bpriv == NULL)
+ WT_ERR(__ckpt_extlist_read(session, block, ckpt));
+
+ for (next_ckpt = ckpt + 1;; ++next_ckpt)
+ if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+ break;
+
+ /*
+ * The "next" checkpoint may be the live tree which has no
+ * extent blocks to read.
+ */
+ if (next_ckpt->bpriv == NULL &&
+ !F_ISSET(next_ckpt, WT_CKPT_ADD))
+ WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
+ }
+
+ /*
+ * Hold a lock so the live extent lists and the file size can't change
+ * underneath us. I suspect we'll tighten this if checkpoints take too
+ * much time away from real work: we read the historic checkpoint
+ * information without a lock, but we could also merge and re-write the
+ * deleted and merged checkpoint information without a lock, except for
+ * the final merge of ranges into the live tree.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ locked = 1;
+
+ /*
+ * We've allocated our last page, update the checkpoint size. We need
+ * to calculate the live system's checkpoint size before merging
+ * checkpoint allocation and discard information from the checkpoints
+ * we're deleting, those operations change the underlying byte counts.
+ */
+ ckpt_size = ci->ckpt_size;
+ ckpt_size += ci->alloc.bytes;
+ ckpt_size -= ci->discard.bytes;
+
+ /* Skip the additional processing if we aren't deleting checkpoints. */
+ if (!deleting)
+ goto live_update;
+
+ /*
+ * Delete any no-longer-needed checkpoints: we do this first as it frees
+ * blocks to the live lists, and the freed blocks will then be included
+ * when writing the live extent lists.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+ !F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ if (tmp == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(
+ session, block, ckpt->raw.data, tmp));
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "%s: delete-checkpoint: %s: %s",
+ block->name, ckpt->name, (const char *)tmp->data));
+ }
+
+ /*
+ * Find the checkpoint into which we'll roll this checkpoint's
+ * blocks: it's the next real checkpoint in the list, and it
+ * better have been read in (if it's not the add slot).
+ */
+ for (next_ckpt = ckpt + 1;; ++next_ckpt)
+ if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+ break;
+
+ /*
+ * Set the from/to checkpoint structures, where the "to" value
+ * may be the live tree.
+ */
+ a = ckpt->bpriv;
+ if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+ b = &block->live;
+ else
+ b = next_ckpt->bpriv;
+
+ /*
+ * Free the root page: there's nothing special about this free,
+ * the root page is allocated using normal rules, that is, it
+ * may have been taken from the avail list, and was entered on
+ * the live system's alloc list at that time. We free it into
+ * the checkpoint's discard list, however, not the live system's
+ * list because it appears on the checkpoint's alloc list and so
+ * must be paired in the checkpoint.
+ */
+ if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_ERR(__wt_block_insert_ext(session,
+ &a->discard, a->root_offset, a->root_size));
+
+ /*
+ * Free the blocks used to hold the "from" checkpoint's extent
+ * lists, including the avail list.
+ */
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));
+
+ /*
+ * Roll the "from" alloc and discard extent lists into the "to"
+ * checkpoint's lists.
+ */
+ if (a->alloc.entries != 0)
+ WT_ERR(__wt_block_extlist_merge(
+ session, &a->alloc, &b->alloc));
+ if (a->discard.entries != 0)
+ WT_ERR(__wt_block_extlist_merge(
+ session, &a->discard, &b->discard));
+
+ /*
+ * If the "to" checkpoint is also being deleted, we're done with
+ * it, it's merged into some other checkpoint in the next loop.
+ * This means the extent lists may aggregate over a number of
+ * checkpoints, but that's OK, they're disjoint sets of ranges.
+ */
+ if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
+ continue;
+
+ /*
+ * Find blocks for re-use: wherever the "to" checkpoint's
+ * allocate and discard lists overlap, move the range to
+ * the live system's checkpoint available list.
+ */
+ WT_ERR(__wt_block_extlist_overlap(session, block, b));
+
+ /*
+ * If we're updating the live system's information, we're done.
+ */
+ if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+ continue;
+
+ /*
+ * We have to write the "to" checkpoint's extent lists out in
+ * new blocks, and update its cookie.
+ *
+ * Free the blocks used to hold the "to" checkpoint's extent
+ * lists; don't include the avail list, it's not changing.
+ */
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
+
+ F_SET(next_ckpt, WT_CKPT_UPDATE);
+ }
+
+ /* Rewrite the checkpoints flagged for update by the loop above. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_UPDATE))
+ WT_ERR(__ckpt_update(
+ session, block, ckpt, ckpt->bpriv, 0));
+
+live_update:
+ /* Truncate the file if that's possible. */
+ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));
+
+ /* Update the final, added checkpoint based on the live system. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_ADD)) {
+ /*
+ * Set the checkpoint size for the live system.
+ *
+ * !!!
+ * Our caller wants the final checkpoint size. Setting
+ * the size here violates layering, but the alternative
+ * is a call for the btree layer to crack the checkpoint
+ * cookie into its components, and that's a fair amount
+ * of work.
+ */
+ ckpt->ckpt_size = ci->ckpt_size = ckpt_size;
+
+ WT_ERR(__ckpt_update(session, block, ckpt, ci, 1));
+ }
+
+ /*
+ * Reset the live system's alloc and discard extent lists, leave the
+ * avail list alone. This includes freeing a lot of extents, so do it
+ * outside of the system's lock by copying and resetting the original,
+ * then doing the work later.
+ */
+ ci->ckpt_alloc = ci->alloc;
+ WT_ERR(__wt_block_extlist_init(
+ session, &ci->alloc, "live", "alloc", 0));
+ ci->ckpt_discard = ci->discard;
+ WT_ERR(__wt_block_extlist_init(
+ session, &ci->discard, "live", "discard", 0));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * The first checkpoint in the system should always have an empty
+ * discard list. If we've read that checkpoint and/or created it,
+ * check.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ break;
+ if ((a = ckpt->bpriv) == NULL)
+ a = &block->live;
+ if (a->discard.entries != 0) {
+ __wt_errx(session,
+ "first checkpoint incorrectly has blocks on the discard "
+ "list");
+ WT_ERR(WT_ERROR);
+ }
+#endif
+
+ block->ckpt_inprogress = 1;
+
+err: if (locked)
+ __wt_spin_unlock(session, &block->live_lock);
+
+ /* Discard any checkpoint information we loaded (on success and on error). */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if ((ci = ckpt->bpriv) != NULL)
+ __wt_block_ckpt_destroy(session, ci);
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __ckpt_update --
+ * Update a checkpoint.
+ *
+ * Writes the alloc and discard extent lists of "ci" (plus the avail list
+ * when "is_live" is set), then packs the checkpoint information into
+ * ckpt->raw, the checkpoint's address cookie.
+ *
+ * Returns 0 on success, otherwise the first error returned by a callee.
+ */
+static int
+__ckpt_update(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, int is_live)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint8_t *endp;
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Check the extent list combinations for overlaps. */
+ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
+ WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
+ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
+#endif
+ /*
+ * Write the checkpoint's alloc and discard extent lists. After each
+ * write, remove any allocated blocks from the system's allocation
+ * list, checkpoint extent blocks don't appear on any extent lists.
+ */
+ WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
+ WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
+
+ /*
+ * We only write an avail list for the live system, other checkpoint's
+ * avail lists are static and never change.
+ *
+ * Write the avail list last so it reflects changes due to allocating
+ * blocks for the alloc and discard lists. Second, when we write the
+ * live system's avail list, it's two lists: the current avail list
+ * plus the list of blocks to be made available when the new checkpoint
+ * completes. We can't merge that second list into the real list yet,
+ * it's not truly available until the new checkpoint locations have been
+ * saved to the metadata.
+ */
+ if (is_live)
+ WT_RET(__wt_block_extlist_write(
+ session, block, &ci->avail, &ci->ckpt_avail));
+
+ /*
+ * Set the file size for the live system.
+ *
+ * !!!
+ * We do NOT set the file size when re-writing checkpoints because we
+ * want to test the checkpoint's blocks against a reasonable maximum
+ * file size during verification. This is bad: imagine a checkpoint
+ * appearing early in the file, re-written, and then the checkpoint
+ * requires blocks at the end of the file, blocks after the listed file
+ * size. If the application opens that checkpoint for writing
+ * (discarding subsequent checkpoints), we would truncate the file to
+ * the early chunk, discarding the re-written checkpoint information.
+ * The alternative, updating the file size has its own problems, in
+ * that case we'd work correctly, but we'd lose all of the blocks
+ * between the original checkpoint and the re-written checkpoint.
+ * Currently, there's no API to roll-forward intermediate checkpoints,
+ * if there ever is, this will need to be fixed.
+ */
+ if (is_live)
+ ci->file_size = block->fh->size;
+
+ /*
+ * Copy the checkpoint information into the checkpoint array's address
+ * cookie.
+ *
+ * The pack routine is handed the address of endp; the cookie's size is
+ * then taken as the distance between endp and the start of the buffer.
+ */
+ WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
+ endp = ckpt->raw.mem;
+ WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
+ ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);
+
+ /*
+ * Verbose output only: format the cookie we just built into a scratch
+ * buffer. Once the scratch buffer is allocated, failures jump to the
+ * err label so the buffer is released.
+ */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "%s: create-checkpoint: %s: %s",
+ block->name, ckpt->name, (const char *)tmp->data));
+ }
+
+ /* Reached on success as well as on error. */
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_checkpoint_resolve --
+ *     Resolve a checkpoint: fold the live system's checkpoint-avail list
+ *     into its avail list and discard the lists left over from the
+ *     checkpoint call.
+ *
+ *     Fails with WT_ERROR if no checkpoint is in progress; otherwise returns
+ *     the result of the merge.
+ */
+int
+__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+    WT_BLOCK_CKPT *live;
+    WT_DECL_RET;
+
+    /*
+     * Our caller resolves the checkpoint after writing the checkpoint
+     * information to stable storage; it's an error to get here without a
+     * checkpoint in progress.
+     */
+    if (block->ckpt_inprogress == 0)
+        WT_RET_MSG(session, WT_ERROR,
+            "%s: checkpoint resolved, but no checkpoint in progress",
+            block->name);
+    block->ckpt_inprogress = 0;
+
+    live = &block->live;
+
+    /* The merge modifies the live avail list, do it under the live lock. */
+    __wt_spin_lock(session, &block->live_lock);
+    ret = __wt_block_extlist_merge(
+        session, &live->ckpt_avail, &live->avail);
+    __wt_spin_unlock(session, &block->live_lock);
+
+    /* The checkpoint lists are freed whether or not the merge worked. */
+    __wt_block_extlist_free(session, &live->ckpt_avail);
+    __wt_block_extlist_free(session, &live->ckpt_alloc);
+    __wt_block_extlist_free(session, &live->ckpt_discard);
+
+    return (ret);
+}
+
+/*
+ * __ckpt_string --
+ *     Return a printable string representation of a checkpoint address cookie.
+ *
+ *     The cookie at "addr" is unpacked into a temporary checkpoint structure
+ *     and formatted into "buf": the version, then the root, alloc, avail and
+ *     discard ranges (or "[Empty]" when a range's offset is
+ *     WT_BLOCK_INVALID_OFFSET), then the file size.
+ *
+ *     Returns 0 on success, otherwise the first error returned by a callee.
+ *     The temporary checkpoint structure is destroyed on every path after it
+ *     has been initialized, including error paths.
+ */
+static int
+__ckpt_string(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
+{
+    WT_BLOCK_CKPT *ci, _ci;
+    WT_DECL_RET;
+
+    /* Initialize the checkpoint, crack the cookie. */
+    ci = &_ci;
+    WT_RET(__wt_block_ckpt_init(session, ci, "string"));
+
+    /*
+     * From here on the checkpoint structure is initialized: any failure
+     * must go through the err label so it's destroyed before returning.
+     */
+    WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+    WT_ERR(__wt_buf_fmt(session, buf,
+        "version=%d",
+        ci->version));
+    if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
+        WT_ERR(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
+    else
+        WT_ERR(__wt_buf_catfmt(session, buf,
+            ", root=[%"
+            PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+            (uintmax_t)ci->root_offset,
+            (uintmax_t)(ci->root_offset + ci->root_size),
+            ci->root_size, ci->root_cksum));
+    if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET)
+        WT_ERR(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
+    else
+        WT_ERR(__wt_buf_catfmt(session, buf,
+            ", alloc=[%"
+            PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+            (uintmax_t)ci->alloc.offset,
+            (uintmax_t)(ci->alloc.offset + ci->alloc.size),
+            ci->alloc.size, ci->alloc.cksum));
+    if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET)
+        WT_ERR(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
+    else
+        WT_ERR(__wt_buf_catfmt(session, buf,
+            ", avail=[%"
+            PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+            (uintmax_t)ci->avail.offset,
+            (uintmax_t)(ci->avail.offset + ci->avail.size),
+            ci->avail.size, ci->avail.cksum));
+    if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET)
+        WT_ERR(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
+    else
+        WT_ERR(__wt_buf_catfmt(session, buf,
+            ", discard=[%"
+            PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+            (uintmax_t)ci->discard.offset,
+            (uintmax_t)(ci->discard.offset + ci->discard.size),
+            ci->discard.size, ci->discard.cksum));
+    WT_ERR(__wt_buf_catfmt(session, buf,
+        ", file size=%" PRIuMAX, (uintmax_t)ci->file_size));
+
+    /* Reached on success as well as on error. */
+err:
+    __wt_block_ckpt_destroy(session, ci);
+
+    return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c
new file mode 100644
index 00000000000..007c77f3291
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_compact.c
@@ -0,0 +1,221 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *);
+
+/*
+ * __wt_block_compact_start --
+ *     Start compaction of a file: remember the block manager's current
+ *     allocation plan and switch to first-fit allocation. Always returns 0.
+ */
+int
+__wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+    WT_UNUSED(session);
+
+    /*
+     * Taking the lock isn't strictly required here, but it costs little
+     * and may avoid bugs in the future.
+     */
+    __wt_spin_lock(session, &block->live_lock);
+
+    /* Save the current plan so the end of compaction can restore it. */
+    block->allocfirst_save = block->allocfirst;
+
+    /* First-fit from now on. */
+    block->allocfirst = 1;
+
+    __wt_spin_unlock(session, &block->live_lock);
+
+    return (0);
+}
+
+/*
+ * __wt_block_compact_end --
+ *     End compaction of a file: put back the allocation plan that was saved
+ *     when compaction started. Always returns 0.
+ */
+int
+__wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+    WT_UNUSED(session);
+
+    /*
+     * Taking the lock isn't strictly required here, but it costs little
+     * and may avoid bugs in the future.
+     */
+    __wt_spin_lock(session, &block->live_lock);
+
+    /* Restore the previous allocation plan. */
+    block->allocfirst = block->allocfirst_save;
+
+    __wt_spin_unlock(session, &block->live_lock);
+
+    return (0);
+}
+
+/*
+ * __wt_block_compact_skip --
+ * Return if compaction will shrink the file.
+ *
+ * On return *skipp is 1 if compaction should be skipped and 0 if it should
+ * proceed. *skipp is set to 1 up front, so it's also 1 on any error return.
+ *
+ * Returns 0 on success, otherwise an error from the verbose-output calls.
+ */
+int
+__wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp)
+{
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_EXTLIST *el;
+ WT_FH *fh;
+ wt_off_t avail, ninety;
+
+ *skipp = 1; /* Return a default skip. */
+
+ fh = block->fh;
+
+ /*
+ * We do compaction by copying blocks from the end of the file to the
+ * beginning of the file, and we need some metrics to decide if it's
+ * worth doing. Ignore small files, and files where we are unlikely
+ * to recover 10% of the file.
+ */
+ if (fh->size <= 10 * 1024)
+ return (0);
+
+ /*
+ * The live avail list is examined under the live lock; from here on,
+ * failures jump to the err label so the lock is released.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT))
+ WT_ERR(__block_dump_avail(session, block));
+
+ /* Sum the number of available bytes in the first 90% of the file. */
+ avail = 0;
+ ninety = fh->size - fh->size / 10;
+
+ /* An extent counts in full if it starts before the 90% mark. */
+ el = &block->live.avail;
+ WT_EXT_FOREACH(ext, el->off)
+ if (ext->off < ninety)
+ avail += ext->size;
+
+ /*
+ * If at least 10% of the total file is available and in the first 90%
+ * of the file, we'll try compaction.
+ */
+ if (avail >= fh->size / 10)
+ *skipp = 0;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+ "90%% of the file, require 10%% or %" PRIuMAX "MB (%" PRIuMAX
+ ") to perform compaction, compaction %s",
+ block->name,
+ (uintmax_t)avail / WT_MEGABYTE, (uintmax_t)avail,
+ (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
+ *skipp ? "skipped" : "proceeding"));
+
+ /* Reached on success as well as on error. */
+err: __wt_spin_unlock(session, &block->live_lock);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_compact_page_skip --
+ *     Return if writing a particular page will shrink the file.
+ *
+ *     On return *skipp is 0 if the block referenced by the address cookie
+ *     should be rewritten and 1 otherwise. Returns 0 on success or an error
+ *     from unpacking the address cookie.
+ */
+int
+__wt_block_compact_page_skip(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp)
+{
+    WT_EXT *candidate;
+    wt_off_t limit, offset;
+    uint32_t cksum, size;
+
+    WT_UNUSED(addr_size);
+
+    /* Skip the page unless we find a reason to rewrite it. */
+    *skipp = 1;
+
+    /* Unpack the address cookie to learn the block's offset and size. */
+    WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+    __wt_spin_lock(session, &block->live_lock);
+
+    /*
+     * Rewrite the block when it lives in the last 10% of the file and the
+     * available list has an extent, starting in the first 90% of the file,
+     * that's big enough to hold it. Without the available-list check the
+     * rewrite would simply extend the file. There's an obvious race if the
+     * file is sufficiently busy.
+     */
+    limit = block->fh->size - block->fh->size / 10;
+    if (offset > limit) {
+        WT_EXT_FOREACH(candidate, block->live.avail.off) {
+            if (candidate->off < limit && candidate->size >= size) {
+                *skipp = 0;
+                break;
+            }
+        }
+    }
+
+    __wt_spin_unlock(session, &block->live_lock);
+
+    return (0);
+}
+
+/*
+ * __block_dump_avail --
+ *     Dump out the avail list so we can see what compaction will look like.
+ *
+ *     Prints the file size and total available space, then the available
+ *     space bucketed by file decile (and by percentile when
+ *     __VERBOSE_OUTPUT_PERCENTILE is defined). Returns 0 on success or an
+ *     error from the verbose-output calls.
+ *
+ *     The file size is used as a divisor, so the file must not be empty.
+ */
+static int
+__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+    WT_EXTLIST *el;
+    WT_EXT *ext;
+    wt_off_t chunk, decile[10], percentile[100], size, v;
+    u_int i;
+
+    el = &block->live.avail;
+    size = block->fh->size;
+
+    WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+        "file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX
+        "%% space available %" PRIuMAX "MB (%" PRIuMAX ")",
+        (uintmax_t)size / WT_MEGABYTE, (uintmax_t)size,
+        ((uintmax_t)el->bytes * 100) / (uintmax_t)size,
+        (uintmax_t)el->bytes / WT_MEGABYTE, (uintmax_t)el->bytes));
+
+    /* Nothing to bucket (and el->bytes is used as a divisor below). */
+    if (el->entries == 0)
+        return (0);
+
+    /*
+     * Bucket the available memory into file deciles/percentiles. Large
+     * pieces of memory will cross over multiple buckets, assign to the
+     * decile/percentile in 512B chunks.
+     *
+     * The chunk offset is a wt_off_t: a narrower unsigned counter
+     * multiplied by 512 wraps for extents of 4GB or more, putting chunks in
+     * the wrong buckets, and the counter itself can never reach the chunk
+     * count of a sufficiently large extent.
+     */
+    memset(decile, 0, sizeof(decile));
+    memset(percentile, 0, sizeof(percentile));
+    WT_EXT_FOREACH(ext, el->off)
+        for (chunk = 0; chunk + 512 <= ext->size; chunk += 512) {
+            ++decile[((ext->off + chunk) * 10) / size];
+            ++percentile[((ext->off + chunk) * 100) / size];
+        }
+
+#ifdef __VERBOSE_OUTPUT_PERCENTILE
+    for (i = 0; i < WT_ELEMENTS(percentile); ++i) {
+        v = percentile[i] * 512;
+        WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+            "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %"
+            PRIuMAX "%%)",
+            i, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v,
+            (uintmax_t)((v * 100) / (wt_off_t)el->bytes)));
+    }
+#endif
+    for (i = 0; i < WT_ELEMENTS(decile); ++i) {
+        /* Convert the 512B chunk count back to bytes. */
+        v = decile[i] * 512;
+        WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+            "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %"
+            PRIuMAX "%%)",
+            i * 10, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v,
+            (uintmax_t)((v * 100) / (wt_off_t)el->bytes)));
+    }
+
+    return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
new file mode 100644
index 00000000000..d500f93817a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -0,0 +1,1437 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __block_append(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+static int __block_ext_overlap(WT_SESSION_IMPL *,
+ WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **);
+static int __block_extlist_dump(
+ WT_SESSION_IMPL *, const char *, WT_EXTLIST *, int);
+static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+
+/*
+ * __block_off_srch_last --
+ * Return the last element in the list, along with a stack for appending.
+ *
+ * "head" is the array of per-level list heads. On return, stack[i] holds,
+ * for every level i, the address of the NULL link that ends that level.
+ * Returns NULL if no element was visited.
+ */
+static inline WT_EXT *
+__block_off_srch_last(WT_EXT **head, WT_EXT ***stack)
+{
+ WT_EXT **extp, *last;
+ int i;
+
+ last = NULL; /* The list may be empty */
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ *
+ * extp always points at slot i of a link array (head[] or an element's
+ * next[]); decrementing it moves to slot i - 1 of that same array, that
+ * is, the same position in the list one level down.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+ if (*extp != NULL) {
+ last = *extp;
+ extp = &(*extp)->next[i];
+ } else
+ stack[i--] = extp--;
+ return (last);
+}
+
+/*
+ * __block_off_srch --
+ * Search a by-offset skiplist (either the primary by-offset list, or the
+ * by-offset list referenced by a size entry), for the specified offset.
+ *
+ * On return, stack[i] holds, for every level i, the address of the link at
+ * that level which refers to the first element whose offset is not less than
+ * "off" (or the NULL link ending the level). *stack[0] is therefore the
+ * exact match or next-largest element, or NULL if there is none.
+ */
+static inline void
+__block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, int skip_off)
+{
+ WT_EXT **extp;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ *
+ * Return a stack for an exact match or the next-largest item.
+ *
+ * The WT_EXT structure contains two skiplists, the primary one and the
+ * per-size bucket one: if the skip_off flag is set, offset the skiplist
+ * array by the depth specified in this particular structure.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+ if (*extp != NULL && (*extp)->off < off)
+ extp =
+ &(*extp)->next[i + (skip_off ? (*extp)->depth : 0)];
+ else
+ /*
+ * Record this level's link, then step down a level:
+ * decrementing extp moves to the previous slot of the
+ * same link array.
+ */
+ stack[i--] = extp--;
+}
+
+/*
+ * __block_first_srch --
+ *     Search the skiplist for the first available slot.
+ *
+ *     Returns 1, with "stack" filled in for the chosen extent's offset, if an
+ *     extent of at least "size" bytes exists; returns 0 (stack untouched)
+ *     otherwise.
+ */
+static inline int
+__block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
+{
+    WT_EXT *candidate;
+
+    /*
+     * Walk the extents linearly in offset order; the first one that's
+     * large enough wins, and we build a stack for its offset.
+     */
+    WT_EXT_FOREACH(candidate, head) {
+        if (candidate->size < size)
+            continue;
+
+        __block_off_srch(head, candidate->off, stack, 0);
+        return (1);
+    }
+
+    /* Nothing big enough. */
+    return (0);
+}
+
+/*
+ * __block_size_srch --
+ * Search the by-size skiplist for the specified size.
+ *
+ * On return, stack[i] holds, for every level i, the address of the link at
+ * that level which refers to the first entry whose size is not less than
+ * "size" (or the NULL link ending the level). *stack[0] is therefore the
+ * exact match or next-largest entry, or NULL if there is none.
+ */
+static inline void
+__block_size_srch(WT_SIZE **head, wt_off_t size, WT_SIZE ***stack)
+{
+ WT_SIZE **szp;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ *
+ * Return a stack for an exact match or the next-largest item.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, szp = &head[i]; i >= 0;)
+ if (*szp != NULL && (*szp)->size < size)
+ szp = &(*szp)->next[i];
+ else
+ /*
+ * Record this level's link, then step down a level:
+ * decrementing szp moves to the previous slot of the
+ * same link array.
+ */
+ stack[i--] = szp--;
+}
+
+/*
+ * __block_off_srch_pair --
+ * Search a by-offset skiplist for before/after records of the specified
+ * offset.
+ *
+ * On return, *beforep is the last visited element whose offset is less than
+ * "off" and *afterp is the last visited element whose offset is greater than
+ * or equal to "off" (so an exact match is returned as "after"). Either is
+ * NULL if no such element was seen.
+ */
+static inline void
+__block_off_srch_pair(
+ WT_EXTLIST *el, wt_off_t off, WT_EXT **beforep, WT_EXT **afterp)
+{
+ WT_EXT **head, **extp;
+ int i;
+
+ *beforep = *afterp = NULL;
+
+ head = el->off;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) {
+ /* End of this level: step down without touching before/after. */
+ if (*extp == NULL) {
+ --i;
+ --extp;
+ continue;
+ }
+
+ if ((*extp)->off < off) { /* Keep going at this level */
+ *beforep = *extp;
+ extp = &(*extp)->next[i];
+ } else { /* Drop down a level */
+ *afterp = *extp;
+ --i;
+ --extp;
+ }
+ }
+}
+
+/*
+ * __block_ext_insert --
+ * Insert an extent into an extent list.
+ *
+ * The extent's off, size and depth fields must already be set by the
+ * caller. The extent is linked into the list's by-offset skiplist and, if
+ * the list tracks sizes, into the by-offset skiplist of the matching size
+ * entry (allocating that size entry if it doesn't exist yet). The list's
+ * entry count, byte count and cached last element are updated.
+ *
+ * Returns 0 on success or an error from allocating a size entry.
+ */
+static int
+__block_ext_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
+{
+ WT_EXT **astack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /*
+ * If we are inserting a new size onto the size skiplist, we'll need a
+ * new WT_SIZE structure for that skiplist.
+ */
+ if (el->track_size) {
+ __block_size_srch(el->sz, ext->size, sstack);
+ szp = *sstack[0];
+ if (szp == NULL || szp->size != ext->size) {
+ WT_RET(__wt_block_size_alloc(session, &szp));
+ szp->size = ext->size;
+ /* The new size entry takes its depth from the extent. */
+ szp->depth = ext->depth;
+ for (i = 0; i < ext->depth; ++i) {
+ szp->next[i] = *sstack[i];
+ *sstack[i] = szp;
+ }
+ }
+
+ /*
+ * Insert the new WT_EXT structure into the size element's
+ * offset skiplist.
+ *
+ * The per-size links live in the second half of the extent's
+ * next[] array, starting at index ext->depth.
+ */
+ __block_off_srch(szp->off, ext->off, astack, 1);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i + ext->depth] = *astack[i];
+ *astack[i] = ext;
+ }
+ }
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Lists that don't track sizes never use the per-size links: clear them
+ * so the remove path can assert they're still NULL.
+ */
+ if (!el->track_size)
+ for (i = 0; i < ext->depth; ++i)
+ ext->next[i + ext->depth] = NULL;
+#endif
+
+ /* Insert the new WT_EXT structure into the offset skiplist. */
+ __block_off_srch(el->off, ext->off, astack, 0);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i] = *astack[i];
+ *astack[i] = ext;
+ }
+
+ ++el->entries;
+ el->bytes += (uint64_t)ext->size;
+
+ /* Update the cached end-of-list. */
+ if (ext->next[0] == NULL)
+ el->last = ext;
+
+ return (0);
+}
+
+/*
+ * __block_off_insert --
+ *     Insert a file range into an extent list.
+ *
+ *     Allocates a new extent structure describing the range starting at
+ *     "off" and "size" bytes long, and inserts it into the list. Returns 0 on
+ *     success or an error from the allocation or the insert.
+ */
+static int
+__block_off_insert(
+    WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+    WT_EXT *new_ext;
+
+    /* Get an extent structure to describe the range. */
+    WT_RET(__wt_block_ext_alloc(session, &new_ext));
+
+    new_ext->size = size;
+    new_ext->off = off;
+
+    return (__block_ext_insert(session, el, new_ext));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __block_off_match --
+ *     Return if any part of a specified range appears on a specified extent
+ *     list: 1 if the range [off, off + size) overlaps an entry, 0 if not.
+ */
+static int
+__block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+    WT_EXT *prev, *next;
+
+    /* Find the entries on either side of the offset. */
+    __block_off_srch_pair(el, off, &prev, &next);
+
+    /*
+     * The range is on the list if the preceding entry runs past the start
+     * of the range, or if the range runs past the start of the following
+     * entry.
+     */
+    return ((prev != NULL && prev->off + prev->size > off) ||
+        (next != NULL && off + size > next->off));
+}
+
+/*
+ * __wt_block_misplaced --
+ * Complain if a block appears on the available or discard lists.
+ *
+ * "tag" names the operation being checked and is used in the error message.
+ * The discard list is only checked when "live" is non-zero.
+ *
+ * Returns 0 if the block isn't on either list (or the session has
+ * WT_SESSION_SALVAGE_CORRUPT_OK set); otherwise reports the problem and
+ * returns the result of __wt_panic.
+ */
+int
+__wt_block_misplaced(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live)
+{
+ const char *name;
+
+ /* Name of the list the block was found on; NULL means not found. */
+ name = NULL;
+
+ /*
+ * Don't check during the salvage read phase, we might be reading an
+ * already freed overflow page.
+ */
+ if (F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ return (0);
+
+ /*
+ * Verify a block the btree engine thinks it "owns" doesn't appear on
+ * the available or discard lists (it might reasonably be on the alloc
+ * list, if it was allocated since the last checkpoint). The engine
+ * "owns" a block if it's trying to read or free the block, and those
+ * functions make this check.
+ *
+ * Any block being read or freed should not be "available".
+ *
+ * Any block being read or freed in the live system should not be on the
+ * discard list. (A checkpoint handle might be reading a block which is
+ * on the live system's discard list; any attempt to free a block from a
+ * checkpoint handle has already failed.)
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ if (__block_off_match(&block->live.avail, offset, size))
+ name = "available";
+ else if (live && __block_off_match(&block->live.discard, offset, size))
+ name = "discard";
+ __wt_spin_unlock(session, &block->live_lock);
+
+ /* Report outside the lock. */
+ if (name != NULL) {
+ __wt_errx(session,
+ "%s failed: %" PRIuMAX "/%" PRIu32 " is on the %s list",
+ tag, (uintmax_t)offset, size, name);
+ return (__wt_panic(session));
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __block_off_remove --
+ *     Remove a record from an extent list.
+ *
+ *     The record starting at exactly "off" is unlinked from the by-offset
+ *     skiplist and, if the list tracks sizes, from its size entry's offset
+ *     skiplist (the size entry is freed if that empties it). If "extp" is
+ *     NULL the record is freed, otherwise it's returned in *extp and the
+ *     caller owns it.
+ *
+ *     Returns 0 on success. Panics if no record starts at "off"; returns
+ *     EINVAL if the record's size has no entry on the by-size skiplist.
+ */
+static int
+__block_off_remove(
+    WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, WT_EXT **extp)
+{
+    WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+    WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+    u_int i;
+
+    /* Find and remove the record from the by-offset skiplist. */
+    __block_off_srch(el->off, off, astack, 0);
+    ext = *astack[0];
+    if (ext == NULL || ext->off != off)
+        goto corrupt;
+    for (i = 0; i < ext->depth; ++i)
+        *astack[i] = ext->next[i];
+
+    /*
+     * Find and remove the record from the size's offset skiplist; if that
+     * empties the by-size skiplist entry, remove it as well.
+     */
+    if (el->track_size) {
+        __block_size_srch(el->sz, ext->size, sstack);
+        szp = *sstack[0];
+        /*
+         * NOTE(review): the record has already been unlinked from the
+         * by-offset skiplist at this point, so this return leaves the
+         * list inconsistent; the other lookup failures in this
+         * function panic instead.
+         */
+        if (szp == NULL || szp->size != ext->size)
+            return (EINVAL);
+        __block_off_srch(szp->off, off, astack, 1);
+        ext = *astack[0];
+        if (ext == NULL || ext->off != off)
+            goto corrupt;
+        /* The per-size links start at index ext->depth. */
+        for (i = 0; i < ext->depth; ++i)
+            *astack[i] = ext->next[i + ext->depth];
+        if (szp->off[0] == NULL) {
+            for (i = 0; i < szp->depth; ++i)
+                *sstack[i] = szp->next[i];
+            __wt_block_size_free(session, szp);
+        }
+    }
+#ifdef HAVE_DIAGNOSTIC
+    if (!el->track_size) {
+        int not_null;
+        for (i = 0, not_null = 0; i < ext->depth; ++i)
+            if (ext->next[i + ext->depth] != NULL)
+                not_null = 1;
+        WT_ASSERT(session, not_null == 0);
+    }
+#endif
+
+    --el->entries;
+    el->bytes -= (uint64_t)ext->size;
+
+    /*
+     * Update the cached end-of-list. This is done before the record can
+     * be freed: the value of a pointer to freed memory must not be used,
+     * even for a comparison.
+     */
+    if (el->last == ext)
+        el->last = NULL;
+
+    /* Return the record if our caller wants it, otherwise free it. */
+    if (extp == NULL)
+        __wt_block_ext_free(session, ext);
+    else
+        *extp = ext;
+
+    return (0);
+
+corrupt:
+    WT_PANIC_RET(session, EINVAL,
+        "attempt to remove non-existent offset from an extent list");
+}
+
+/*
+ * __wt_block_off_remove_overlap --
+ * Remove a range from an extent list, where the range may be part of a
+ * overlapping entry.
+ *
+ * The overlapping entry is removed, and whatever part of it lies before
+ * and/or after the range [off, off + size) is put back on the list.
+ *
+ * Returns 0 on success, WT_NOTFOUND if no entry overlaps the range, or an
+ * error from the underlying remove/insert calls.
+ */
+int
+__wt_block_off_remove_overlap(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *before, *after, *ext;
+ wt_off_t a_off, a_size, b_off, b_size;
+
+ WT_ASSERT(session, off != WT_BLOCK_INVALID_OFFSET);
+
+ /* Search for before and after entries for the offset. */
+ __block_off_srch_pair(el, off, &before, &after);
+
+ /*
+ * If "before" or "after" overlaps, retrieve the overlapping entry.
+ *
+ * "a" is the piece of the entry preceding the range, "b" is the piece
+ * following it; a size of zero means there is no such piece.
+ */
+ if (before != NULL && before->off + before->size > off) {
+ WT_RET(__block_off_remove(session, el, before->off, &ext));
+
+ /* Calculate overlapping extents. */
+ a_off = ext->off;
+ a_size = off - ext->off;
+ b_off = off + size;
+ b_size = ext->size - (a_size + size);
+ } else if (after != NULL && off + size > after->off) {
+ WT_RET(__block_off_remove(session, el, after->off, &ext));
+
+ /*
+ * Calculate overlapping extents. There's no initial overlap
+ * since the after extent presumably cannot begin before "off".
+ */
+ a_off = WT_BLOCK_INVALID_OFFSET;
+ a_size = 0;
+ b_off = off + size;
+ b_size = ext->size - (b_off - ext->off);
+ } else
+ return (WT_NOTFOUND);
+
+ /*
+ * If there are overlaps, insert the item; re-use the extent structure
+ * and save the allocation (we know there's no need to merge).
+ *
+ * ext is set to NULL once the structure has been put back on the list,
+ * so the second piece (if any) knows it needs a new allocation and the
+ * final check knows whether the structure is still ours to free.
+ */
+ if (a_size != 0) {
+ ext->off = a_off;
+ ext->size = a_size;
+ WT_RET(__block_ext_insert(session, el, ext));
+ ext = NULL;
+ }
+ if (b_size != 0) {
+ if (ext == NULL)
+ WT_RET(__block_off_insert(session, el, b_off, b_size));
+ else {
+ ext->off = b_off;
+ ext->size = b_size;
+ WT_RET(__block_ext_insert(session, el, ext));
+ ext = NULL;
+ }
+ }
+ /* The range covered the whole entry: nothing was re-inserted. */
+ if (ext != NULL)
+ __wt_block_ext_free(session, ext);
+ return (0);
+}
+
+/*
+ * __block_extend --
+ * Extend the file to allocate space.
+ *
+ * On success *offp is set to the previous file size (the offset of the new
+ * space) and the file handle's size is increased by "size".
+ *
+ * Returns 0 on success, EINVAL if the file is smaller than one allocation
+ * unit, WT_ERROR if the file cannot grow by "size" bytes, or an error from
+ * the verbose-output call.
+ */
+static inline int
+__block_extend(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
+{
+ WT_FH *fh;
+
+ fh = block->fh;
+
+ /*
+ * Callers of this function are expected to have already acquired any
+ * locks required to extend the file.
+ *
+ * We should never be allocating from an empty file.
+ */
+ if (fh->size < block->allocsize)
+ WT_RET_MSG(session, EINVAL,
+ "file has no description information");
+
+ /*
+ * Make sure we don't allocate past the maximum file size. There's no
+ * easy way to know the maximum wt_off_t on a system, so limit growth
+ * to what fits in a signed 8-byte value, INT64_MAX (we currently check
+ * a wt_off_t is 8B in verify_build.h). I don't think we're likely to
+ * see anything bigger for a while.
+ *
+ * The comparison is written as a subtraction so the check itself can't
+ * overflow.
+ */
+ if (fh->size > (wt_off_t)INT64_MAX - size)
+ WT_RET_MSG(session, WT_ERROR,
+ "block allocation failed, file cannot grow further");
+
+ *offp = fh->size;
+ fh->size += size;
+
+ WT_STAT_FAST_DATA_INCR(session, block_extension);
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "file extend %" PRIdMAX "B @ %" PRIdMAX,
+ (intmax_t)size, (intmax_t)*offp));
+
+ return (0);
+}
+
+/*
+ * __wt_block_alloc --
+ * Alloc a chunk of space from the underlying file.
+ *
+ * "size" must be a multiple of the block's allocation size. On success
+ * *offp is the file offset of the allocated space, and the range has been
+ * added to the live system's alloc list.
+ *
+ * Returns 0 on success, EINVAL for a badly sized request, or an error from
+ * a callee. NOTE(review): this function manipulates the live extent lists
+ * without taking the live lock itself; presumably the caller holds it --
+ * confirm against callers.
+ */
+int
+__wt_block_alloc(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
+{
+ WT_EXT *ext, **estack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+
+ /* Assert we're maintaining the by-size skiplist. */
+ WT_ASSERT(session, block->live.avail.track_size != 0);
+
+ WT_STAT_FAST_DATA_INCR(session, block_alloc);
+ if (size % block->allocsize != 0)
+ WT_RET_MSG(session, EINVAL,
+ "cannot allocate a block size %" PRIdMAX " that is not "
+ "a multiple of the allocation size %" PRIu32,
+ (intmax_t)size, block->allocsize);
+
+ /*
+ * Allocation is either first-fit (lowest offset), or best-fit (best
+ * size). If it's first-fit, walk the offset list linearly until we
+ * find an entry that will work.
+ *
+ * If it's best-fit by size, search the by-size skiplist for the size
+ * and take the first entry on the by-size offset list. This means we
+ * prefer best-fit over lower offset, but within a size we'll prefer an
+ * offset appearing earlier in the file.
+ *
+ * If we don't have anything big enough, extend the file.
+ */
+ /* Cheap test first: the whole avail list is too small. */
+ if (block->live.avail.bytes < (uint64_t)size)
+ goto append;
+ if (block->allocfirst) {
+ if (!__block_first_srch(block->live.avail.off, size, estack))
+ goto append;
+ ext = *estack[0];
+ } else {
+ __block_size_srch(block->live.avail.sz, size, sstack);
+ if ((szp = *sstack[0]) == NULL) {
+ /*
+ * The append label sits inside this block: it's the
+ * shared "nothing fits, grow the file" path for all
+ * three of the checks above.
+ */
+append: WT_RET(__block_extend(session, block, offp, size));
+ WT_RET(__block_append(session,
+ &block->live.alloc, *offp, (wt_off_t)size));
+ return (0);
+ }
+
+ /* Take the first record. */
+ ext = szp->off[0];
+ }
+
+ /* Remove the record, and set the returned offset. */
+ WT_RET(__block_off_remove(session, &block->live.avail, ext->off, &ext));
+ *offp = ext->off;
+
+ /* If doing a partial allocation, adjust the record and put it back. */
+ if (ext->size > size) {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "allocate %" PRIdMAX " from range %" PRIdMAX "-%"
+ PRIdMAX ", range shrinks to %" PRIdMAX "-%" PRIdMAX,
+ (intmax_t)size,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)(ext->off + size),
+ (intmax_t)(ext->off + size + ext->size - size)));
+
+ /* The allocation comes off the front of the extent. */
+ ext->off += size;
+ ext->size -= size;
+ WT_RET(__block_ext_insert(session, &block->live.avail, ext));
+ } else {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "allocate range %" PRIdMAX "-%" PRIdMAX,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size)));
+
+ /* The whole extent was consumed. */
+ __wt_block_ext_free(session, ext);
+ }
+
+ /* Add the newly allocated extent to the list of allocations. */
+ WT_RET(__block_merge(
+ session, &block->live.alloc, *offp, (wt_off_t)size));
+ return (0);
+}
+
+/*
+ * __wt_block_free --
+ * Free a cookie-referenced chunk of space to the underlying file.
+ *
+ * The address cookie is unpacked into an offset and size, and that range
+ * is freed under the live lock. "addr_size" is unused.
+ *
+ * Returns 0 on success or an error from a callee.
+ */
+int
+__wt_block_free(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
+{
+ WT_DECL_RET;
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+ WT_STAT_FAST_DATA_INCR(session, block_free);
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)offset, (intmax_t)size));
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__wt_block_misplaced(session, block, "free", offset, size, 1));
+#endif
+ /*
+ * NOTE(review): presumably extent structures are preallocated here so
+ * the locked section below doesn't need to allocate -- confirm against
+ * __wt_block_ext_prealloc.
+ */
+ WT_RET(__wt_block_ext_prealloc(session, 5));
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_off_free(session, block, offset, (wt_off_t)size);
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_off_free --
+ *     Free a file range to the underlying file.
+ *
+ *     Callers are expected to have already acquired any locks required to
+ *     manipulate the extent lists. Returns 0 on success or an error from the
+ *     extent list operations.
+ */
+int
+__wt_block_off_free(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
+{
+    WT_DECL_RET;
+
+    /* Try to take the range off the live system's allocation list. */
+    ret = __wt_block_off_remove_overlap(
+        session, &block->live.alloc, offset, size);
+
+    /*
+     * The range was allocated during this checkpoint, so it can be reused
+     * immediately: merge it into the avail list (which slows file growth
+     * in workloads including repeated overflow record modification).
+     */
+    if (ret == 0)
+        return (__block_merge(
+            session, &block->live.avail, offset, (wt_off_t)size));
+
+    /*
+     * The range wasn't on the allocation list, so it's referenced in a
+     * previous checkpoint: merge it into the discard list.
+     */
+    if (ret == WT_NOTFOUND)
+        return (__block_merge(
+            session, &block->live.discard, offset, (wt_off_t)size));
+
+    /* Anything else is a real error. */
+    return (ret);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_block_extlist_check --
+ *     Return if the extent lists overlap: 0 if no entry of one list overlaps
+ *     an entry of the other, a panic otherwise.
+ */
+int
+__wt_block_extlist_check(
+    WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl)
+{
+    WT_EXT *a, *b;
+
+    /*
+     * Step through both lists together. Whenever the two current entries
+     * don't overlap, advance whichever one ends at or before the start of
+     * the other; if neither can be advanced, they overlap.
+     */
+    for (a = al->off[0], b = bl->off[0]; a != NULL && b != NULL;) {
+        if (a->off + a->size <= b->off)
+            a = a->next[0];
+        else if (b->off + b->size <= a->off)
+            b = b->next[0];
+        else {
+            WT_PANIC_RET(session, EINVAL,
+                "checkpoint merge check: %s list overlaps the %s list",
+                al->name, bl->name);
+        }
+    }
+    return (0);
+}
+#endif
+
+/*
+ * __wt_block_extlist_overlap --
+ *     Review a checkpoint's alloc/discard extent lists, move overlaps into the
+ *     live system's checkpoint-avail list.
+ *
+ *     Returns 0 on success or an error from reconciling an overlap.
+ */
+int
+__wt_block_extlist_overlap(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci)
+{
+    WT_EXT *a_ext, *d_ext;
+
+    a_ext = ci->alloc.off[0];
+    d_ext = ci->discard.off[0];
+
+    /*
+     * Step through both lists together. Whenever the two current entries
+     * don't overlap, advance whichever one ends at or before the start of
+     * the other; otherwise reconcile the overlap. The reconcile call is
+     * handed the addresses of both cursors.
+     */
+    while (a_ext != NULL && d_ext != NULL) {
+        if (a_ext->off + a_ext->size <= d_ext->off)
+            a_ext = a_ext->next[0];
+        else if (d_ext->off + d_ext->size <= a_ext->off)
+            d_ext = d_ext->next[0];
+        else {
+            WT_RET(__block_ext_overlap(session, block,
+                &ci->alloc, &a_ext, &ci->discard, &d_ext));
+        }
+    }
+    return (0);
+}
+
+/*
+ * __block_ext_overlap --
+ * Reconcile two overlapping ranges.
+ *
+ * ap and bp point at the caller's current elements of the extent lists ael
+ * and bel. The bytes the two extents have in common are merged into the
+ * live checkpoint-avail list (block->live.ckpt_avail) and taken out of both
+ * source lists: an extent is either shrunk and re-inserted, split, or
+ * deleted. Whenever an extent is deleted, the caller's pointer (*ap or *bp)
+ * is first advanced to the following element. Returns 0 on success or the
+ * first error reported by a list operation.
+ */
+static int
+__block_ext_overlap(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *ael, WT_EXT **ap, WT_EXTLIST *bel, WT_EXT **bp)
+{
+ WT_EXT *a, *b, **ext;
+ WT_EXTLIST *avail, *el;
+ wt_off_t off, size;
+
+ avail = &block->live.ckpt_avail;
+
+ /*
+ * The ranges overlap, choose the range we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ *             AAAAAAAAAAAAAAAAAA
+ * #1          BBBBBBBBBBBBBBBBBB        ranges are the same
+ * #2   BBBBBBBBBBBBB                    overlaps the beginning
+ * #3                  BBBBBBBBBBBBBBBB  overlaps the end
+ * #4          BBBBB                     B is a prefix of A
+ * #5                BBBBBB              B is middle of A
+ * #6                  BBBBBBBBBB        B is a suffix of A
+ *
+ * and:
+ *
+ *             BBBBBBBBBBBBBBBBBB
+ * #7   AAAAAAAAAAAAA                    same as #3
+ * #8                  AAAAAAAAAAAAAAAA  same as #2
+ * #9          AAAAA                     A is a prefix of B
+ * #10               AAAAAA              A is middle of B
+ * #11                 AAAAAAAAAA        A is a suffix of B
+ *
+ *
+ * By swapping the arguments so "A" is always the lower range, we can
+ * eliminate cases #2, #8, #10 and #11, and only handle 7 cases:
+ *
+ *             AAAAAAAAAAAAAAAAAA
+ * #1          BBBBBBBBBBBBBBBBBB        ranges are the same
+ * #3                  BBBBBBBBBBBBBBBB  overlaps the end
+ * #4          BBBBB                     B is a prefix of A
+ * #5                BBBBBB              B is middle of A
+ * #6                  BBBBBBBBBB        B is a suffix of A
+ *
+ * and:
+ *
+ *             BBBBBBBBBBBBBBBBBB
+ * #7   AAAAAAAAAAAAA                    same as #3
+ * #9          AAAAA                     A is a prefix of B
+ *
+ * The swap below exchanges the local a/b pointers, the caller-pointer
+ * addresses (ap/bp) and the list pointers (ael/bel) together, so after
+ * it "A" consistently names the extent with the lower offset.
+ */
+ a = *ap;
+ b = *bp;
+ if (a->off > b->off) { /* Swap */
+ b = *ap;
+ a = *bp;
+ ext = ap; ap = bp; bp = ext;
+ el = ael; ael = bel; bel = el;
+ }
+
+ if (a->off == b->off) { /* Case #1, #4, #9 */
+ if (a->size == b->size) { /* Case #1 */
+ /*
+ * Move caller's A and B to the next element
+ * Add that A and B range to the avail list
+ * Delete A and B
+ */
+ *ap = (*ap)->next[0];
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, ael, a->off, NULL));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ }
+ else if (a->size > b->size) { /* Case #4 */
+ /*
+ * Remove A from its list
+ * Increment/Decrement A's offset/size by the size of B
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->off += b->size;
+ a->size -= b->size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ } else { /* Case #9 */
+ /*
+ * Remove B from its list
+ * Increment/Decrement B's offset/size by the size of A
+ * Insert B on its list
+ */
+ WT_RET(__block_off_remove(session, bel, b->off, &b));
+ b->off += a->size;
+ b->size -= a->size;
+ WT_RET(__block_ext_insert(session, bel, b));
+
+ /*
+ * Move caller's A to the next element
+ * Add A's range to the avail list
+ * Delete A
+ */
+ *ap = (*ap)->next[0];
+ WT_RET(__block_merge(session, avail, a->off, a->size));
+ WT_RET(__block_off_remove(session, ael, a->off, NULL));
+ }
+ } else if (a->off + a->size == b->off + b->size) { /* Case #6 */
+ /*
+ * Remove A from its list
+ * Decrement A's size by the size of B
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size -= b->size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ } else if /* Case #3, #7 */
+ (a->off + a->size < b->off + b->size) {
+ /*
+ * Add overlap to the avail list
+ * (the overlap runs from B's offset to the end of A)
+ */
+ off = b->off;
+ size = (a->off + a->size) - b->off;
+ WT_RET(__block_merge(session, avail, off, size));
+
+ /*
+ * Remove A from its list
+ * Decrement A's size by the overlap
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size -= size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Remove B from its list
+ * Increment/Decrement B's offset/size by the overlap
+ * Insert B on its list
+ */
+ WT_RET(__block_off_remove(session, bel, b->off, &b));
+ b->off += size;
+ b->size -= size;
+ WT_RET(__block_ext_insert(session, bel, b));
+ } else { /* Case #5 */
+ /* Calculate the offset/size of the trailing part of A. */
+ off = b->off + b->size;
+ size = (a->off + a->size) - off;
+
+ /*
+ * Remove A from its list
+ * Decrement A's size by trailing part of A plus B's size
+ * (that is, A now ends where B begins)
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size = b->off - a->off;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /* Add trailing part of A to A's list as a new element. */
+ WT_RET(__block_merge(session, ael, off, size));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_block_extlist_merge --
+ * Merge one extent list into another.
+ *
+ * Every extent on list a is inserted into list b with __block_merge. The
+ * verbose message is emitted before any swap, so it always names the lists
+ * in argument order. Returns 0 on success or the first error encountered.
+ */
+int
+__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b)
+{
+ WT_EXT *ext;
+ WT_EXTLIST tmp;
+ u_int i;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_BLOCK, "merging %s into %s", a->name, b->name));
+
+ /*
+ * Sometimes the list we are merging is much bigger than the other: if
+ * so, swap the lists around to reduce the amount of work we need to do
+ * during the merge. The size lists have to match as well, so this is
+ * only possible if both lists are tracking sizes, or neither are.
+ *
+ * The swap exchanges the byte count, the entry count and the heads of
+ * the offset and size skiplists at every level.
+ *
+ * NOTE(review): the lists' name and cached "last" pointer (the field
+ * __block_append consults) are not exchanged here; confirm no caller
+ * relies on either after a swap.
+ */
+ if (a->track_size == b->track_size && a->entries > b->entries) {
+ tmp = *a;
+ a->bytes = b->bytes;
+ b->bytes = tmp.bytes;
+ a->entries = b->entries;
+ b->entries = tmp.entries;
+ for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
+ a->off[i] = b->off[i];
+ b->off[i] = tmp.off[i];
+ a->sz[i] = b->sz[i];
+ b->sz[i] = tmp.sz[i];
+ }
+ }
+
+ WT_EXT_FOREACH(ext, a->off)
+ WT_RET(__block_merge(session, b, ext->off, ext->size));
+
+ return (0);
+}
+
+/*
+ * __block_append --
+ * Append a new entry to the allocation list.
+ *
+ * Only valid for lists that do not track sizes (asserted below). The range
+ * off/size either extends the extent that currently ends at off or becomes
+ * a new extent linked in at the position __block_off_srch_last returns.
+ * el->bytes always grows by size. Returns 0 on success or the error from
+ * allocating a new extent.
+ */
+static int
+__block_append(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ WT_ASSERT(session, el->track_size == 0);
+
+ /*
+ * Identical to __block_merge, when we know the file is being extended,
+ * that is, the information is either going to be used to extend the
+ * last object on the list, or become a new object ending the list.
+ *
+ * The terminating element of the list is cached, check it; otherwise,
+ * get a stack for the last object in the skiplist, check for a simple
+ * extension, and otherwise append a new structure.
+ *
+ * When a new structure is appended, each of the first ext->depth
+ * stack slots is pointed at it, and the entry count is incremented.
+ */
+ if ((ext = el->last) != NULL && ext->off + ext->size == off)
+ ext->size += size;
+ else {
+ ext = __block_off_srch_last(el->off, astack);
+ if (ext != NULL && ext->off + ext->size == off)
+ ext->size += size;
+ else {
+ WT_RET(__wt_block_ext_alloc(session, &ext));
+ ext->off = off;
+ ext->size = size;
+
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext;
+ ++el->entries;
+ }
+
+ /* Update the cached end-of-list */
+ el->last = ext;
+ }
+ el->bytes += (uint64_t)size;
+
+ return (0);
+}
+
+/*
+ * __wt_block_insert_ext --
+ * Insert an extent into an extent list, merging if possible.
+ *
+ * External entry point: passes its arguments unchanged to __block_merge
+ * and returns that function's result.
+ */
+int
+__wt_block_insert_ext(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ /*
+ * There are currently two copies of this function (this code is a one-
+ * liner that calls the internal version of the function, which means
+ * the compiler should compress out the function call). It's that way
+ * because the interface is still fluid, I'm not convinced there won't
+ * be a need for a functional split between the internal and external
+ * versions in the future.
+ *
+ * Callers of this function are expected to have already acquired any
+ * locks required to manipulate the extent list.
+ */
+ return (__block_merge(session, el, off, size));
+}
+
+/*
+ * __block_merge --
+ * Insert an extent into an extent list, merging if possible (internal
+ * version).
+ *
+ * The range off/size must not overlap an extent already on the list: an
+ * overlap with either neighbor is reported through WT_PANIC_RET with
+ * EINVAL. A range that abuts the preceding and/or following extent is
+ * folded into it; otherwise a new extent is inserted. Returns 0 on success
+ * or the error from the list operation that failed.
+ */
+static int
+__block_merge(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *ext, *after, *before;
+
+ /*
+ * Retrieve the records preceding/following the offset. If the records
+ * are contiguous with the free'd offset, combine records.
+ *
+ * After the two checks below, "before" and "after" are non-NULL only
+ * if they exactly abut the new range.
+ */
+ __block_off_srch_pair(el, off, &before, &after);
+ if (before != NULL) {
+ if (before->off + before->size > off)
+ WT_PANIC_RET(session, EINVAL,
+ "%s: existing range %" PRIdMAX "-%" PRIdMAX
+ " overlaps with merge range %" PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)before->off,
+ (intmax_t)(before->off + before->size),
+ (intmax_t)off, (intmax_t)(off + size));
+ if (before->off + before->size != off)
+ before = NULL;
+ }
+ if (after != NULL) {
+ if (off + size > after->off)
+ WT_PANIC_RET(session, EINVAL,
+ "%s: merge range %" PRIdMAX "-%" PRIdMAX
+ " overlaps with existing range %" PRIdMAX
+ "-%" PRIdMAX,
+ el->name,
+ (intmax_t)off, (intmax_t)(off + size),
+ (intmax_t)after->off,
+ (intmax_t)(after->off + after->size));
+ if (off + size != after->off)
+ after = NULL;
+ }
+ if (before == NULL && after == NULL) {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: insert range %" PRIdMAX "-%" PRIdMAX,
+ el->name, (intmax_t)off, (intmax_t)(off + size)));
+
+ return (__block_off_insert(session, el, off, size));
+ }
+
+ /*
+ * If the "before" offset range abuts, we'll use it as our new record;
+ * if the "after" offset range also abuts, include its size and remove
+ * it from the system. Else, only the "after" offset range abuts, use
+ * the "after" offset range as our new record. In either case, remove
+ * the record we're going to use, adjust it and re-insert it.
+ */
+ if (before == NULL) {
+ WT_RET(__block_off_remove(session, el, after->off, &ext));
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
+ PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)off, (intmax_t)(off + ext->size + size)));
+
+ ext->off = off;
+ ext->size += size;
+ } else {
+ if (after != NULL) {
+ size += after->size;
+ WT_RET(
+ __block_off_remove(session, el, after->off, NULL));
+ }
+ WT_RET(__block_off_remove(session, el, before->off, &ext));
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
+ PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)ext->off,
+ (intmax_t)(ext->off + ext->size + size)));
+
+ ext->size += size;
+ }
+ return (__block_ext_insert(session, el, ext));
+}
+
+/*
+ * __wt_block_extlist_read_avail --
+ * Read an avail extent list, includes minor special handling.
+ *
+ * Reads the list with __wt_block_extlist_read, then removes the range
+ * el->offset/el->size (the list's own blocks) from it; a "not found"
+ * result from that removal is accepted. In diagnostic builds the whole
+ * operation runs with block->live_lock held. Returns 0 if there is no
+ * list, otherwise the result of the read/remove.
+ */
+int
+__wt_block_extlist_read_avail(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
+{
+ WT_DECL_RET;
+
+ /* If there isn't a list, we're done. */
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * In diagnostic mode, reads are checked against the available and
+ * discard lists (a block being read should never appear on either).
+ * Checkpoint threads may be running in the file, don't race with
+ * them.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+#endif
+
+ WT_ERR(__wt_block_extlist_read(session, block, el, ckpt_size));
+
+ /*
+ * Extent blocks are allocated from the available list: if reading the
+ * avail list, the extent blocks might be included, remove them.
+ */
+ WT_ERR_NOTFOUND_OK(
+ __wt_block_off_remove_overlap(session, el, el->offset, el->size));
+
+err:
+#ifdef HAVE_DIAGNOSTIC
+ __wt_spin_unlock(session, &block->live_lock);
+#endif
+
+ return (ret);
+}
+
+/*
+ * __wt_block_extlist_read --
+ *	Read an extent list.
+ *
+ *	Reads the block at el->offset/el->size/el->cksum into a scratch buffer,
+ *	checks the leading WT_BLOCK_EXTLIST_MAGIC/0 pair, then inserts every
+ *	following offset/size pair into el until the terminating
+ *	WT_BLOCK_INVALID_OFFSET entry is seen.  Each range is validated against
+ *	the block allocation size and ckpt_size before it is inserted.
+ *
+ *	Returns 0 on success (including when there is no list to read) or an
+ *	error; a corrupted list is reported through WT_PANIC_RET.
+ */
+int
+__wt_block_extlist_read(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
+{
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	wt_off_t off, size;
+	int (*func)(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+	const uint8_t *p;
+
+	/* If there isn't a list, we're done. */
+	if (el->offset == WT_BLOCK_INVALID_OFFSET)
+		return (0);
+
+	WT_RET(__wt_scr_alloc(session, el->size, &tmp));
+	WT_ERR(__wt_block_read_off(
+	    session, block, tmp, el->offset, el->size, el->cksum));
+
+#define	WT_EXTLIST_READ(p, v) do { \
+	uint64_t _v; \
+	WT_ERR(__wt_vunpack_uint(&(p), 0, &_v)); \
+	(v) = (wt_off_t)_v; \
+} while (0)
+
+	p = WT_BLOCK_HEADER_BYTE(tmp->mem);
+	WT_EXTLIST_READ(p, off);
+	WT_EXTLIST_READ(p, size);
+	if (off != WT_BLOCK_EXTLIST_MAGIC || size != 0)
+		goto corrupted;
+
+	/*
+	 * If we're not creating both offset and size skiplists, use the simpler
+	 * append API, otherwise do a full merge.  There are two reasons for the
+	 * test: first, checkpoint "available" lists are NOT sorted (checkpoints
+	 * write two separate lists, both of which are sorted but they're not
+	 * merged).  Second, the "available" list is sorted by size as well as
+	 * by offset, and the fast-path append code doesn't support that, it's
+	 * limited to offset.  The test of "track size" is short-hand for "are
+	 * we reading the "available" list.
+	 */
+	func = el->track_size == 0 ? __block_append : __block_merge;
+	for (;;) {
+		WT_EXTLIST_READ(p, off);
+		WT_EXTLIST_READ(p, size);
+		if (off == WT_BLOCK_INVALID_OFFSET)
+			break;
+
+		/*
+		 * We check the offset/size pairs represent valid file ranges,
+		 * then insert them into the list.  We don't necessarily have
+		 * to check for offsets past the end of the checkpoint, but it's
+		 * a cheap test to do here and we'd have to do the check as part
+		 * of file verification, regardless.
+		 */
+		if (off < block->allocsize ||
+		    off % block->allocsize != 0 ||
+		    size % block->allocsize != 0 ||
+		    off + size > ckpt_size) {
+			/*
+			 * The panic macro leaves the function without passing
+			 * through the "err" label, so the scratch buffer has
+			 * to be released here or it is never returned.  This
+			 * path is also the target of the bad-magic jump above.
+			 */
+corrupted:		__wt_scr_free(&tmp);
+			WT_PANIC_RET(session, WT_ERROR,
+			    "file contains a corrupted %s extent list, range %"
+			    PRIdMAX "-%" PRIdMAX " past end-of-file",
+			    el->name,
+			    (intmax_t)off, (intmax_t)(off + size));
+		}
+
+		WT_ERR(func(session, el, off, size));
+	}
+
+	if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
+		WT_ERR(__block_extlist_dump(session, "read extlist", el, 0));
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __wt_block_extlist_write --
+ *	Write an extent list at the tail of the file.
+ *
+ *	Serializes el (followed by the entries of "additional", when that is
+ *	not NULL) into a scratch buffer formatted as a WT_PAGE_BLOCK_MANAGER
+ *	page, writes it, and stores the resulting location in el->offset,
+ *	el->size and el->cksum.  If there are no entries at all, nothing is
+ *	written and el's location is reset to "no list".  The blocks holding
+ *	the written list are then removed from block->live.alloc.
+ *
+ *	Returns 0 on success or the first error encountered.
+ */
+int
+__wt_block_extlist_write(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional)
+{
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	WT_EXT *ext;
+	WT_PAGE_HEADER *dsk;
+	size_t size;
+	uint32_t entries;
+	uint8_t *p;
+
+	if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
+		WT_RET(__block_extlist_dump(session, "write extlist", el, 0));
+
+	/*
+	 * Figure out how many entries we're writing -- if there aren't any
+	 * entries, we're done.
+	 */
+	entries = el->entries + (additional == NULL ? 0 : additional->entries);
+	if (entries == 0) {
+		el->offset = WT_BLOCK_INVALID_OFFSET;
+		el->cksum = el->size = 0;
+		return (0);
+	}
+
+	/*
+	 * Get a scratch buffer, clear the page's header and data, initialize
+	 * the header.
+	 *
+	 * Allocate memory for the extent list entries plus two additional
+	 * entries: the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-
+	 * terminating WT_BLOCK_INVALID_OFFSET/0 pair.
+	 *
+	 * Widen the entry count before multiplying: the product is a byte
+	 * count and must not be computed (and possibly wrap) in 32 bits.
+	 */
+	size = ((size_t)entries + 2) * 2 * WT_INTPACK64_MAXSIZE;
+	WT_RET(__wt_block_write_size(session, block, &size));
+	WT_RET(__wt_scr_alloc(session, size, &tmp));
+	dsk = tmp->mem;
+	memset(dsk, 0, WT_BLOCK_HEADER_BYTE_SIZE);
+	dsk->type = WT_PAGE_BLOCK_MANAGER;
+
+#define	WT_EXTLIST_WRITE(p, v) \
+	WT_ERR(__wt_vpack_uint(&(p), 0, (uint64_t)(v)))
+
+	/* Fill the page's data. */
+	p = WT_BLOCK_HEADER_BYTE(dsk);
+	WT_EXTLIST_WRITE(p, WT_BLOCK_EXTLIST_MAGIC);	/* Initial value */
+	WT_EXTLIST_WRITE(p, 0);
+	WT_EXT_FOREACH(ext, el->off) {			/* Free ranges */
+		WT_EXTLIST_WRITE(p, ext->off);
+		WT_EXTLIST_WRITE(p, ext->size);
+	}
+	if (additional != NULL)
+		WT_EXT_FOREACH(ext, additional->off) {	/* Free ranges */
+			WT_EXTLIST_WRITE(p, ext->off);
+			WT_EXTLIST_WRITE(p, ext->size);
+		}
+	WT_EXTLIST_WRITE(p, WT_BLOCK_INVALID_OFFSET);	/* Ending value */
+	WT_EXTLIST_WRITE(p, 0);
+
+	dsk->u.datalen = WT_PTRDIFF32(p, WT_BLOCK_HEADER_BYTE(dsk));
+	tmp->size = dsk->mem_size = WT_PTRDIFF32(p, dsk);
+
+#ifdef HAVE_DIAGNOSTIC
+	/*
+	 * The extent list is written as a valid btree page because the salvage
+	 * functionality might move into the btree layer some day, besides, we
+	 * don't need another format and this way the page format can be easily
+	 * verified.
+	 */
+	WT_ERR(__wt_verify_dsk(session, "[extent list check]", tmp));
+#endif
+
+	/* Write the extent list to disk. */
+	WT_ERR(__wt_block_write_off(
+	    session, block, tmp, &el->offset, &el->size, &el->cksum, 1, 1));
+
+	/*
+	 * Remove the allocated blocks from the system's allocation list, extent
+	 * blocks never appear on any allocation list.
+	 *
+	 * Fail immediately on error: recording the error and carrying on would
+	 * let the verbose call below overwrite "ret" and lose the failure.
+	 */
+	WT_ERR(__wt_block_off_remove_overlap(
+	    session, &block->live.alloc, el->offset, el->size));
+
+	WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
+	    "%s written %" PRIdMAX "/%" PRIu32,
+	    el->name, (intmax_t)el->offset, el->size));
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __wt_block_extlist_truncate --
+ * Truncate the file based on the last available extent in the list.
+ *
+ * If the last extent on el ends exactly at the cached file size, the
+ * extent is removed from the list, the cached file size is lowered to the
+ * extent's offset and the file is truncated to that offset. Otherwise the
+ * function does nothing. Returns 0 on success (a "busy" result from the
+ * truncate is accepted) or an error.
+ */
+int
+__wt_block_extlist_truncate(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ WT_FH *fh;
+ wt_off_t orig, size;
+
+ fh = block->fh;
+
+ /*
+ * Check if the last available extent is at the end of the file, and if
+ * so, truncate the file and discard the extent.
+ */
+ if ((ext = __block_off_srch_last(el->off, astack)) == NULL)
+ return (0);
+ WT_ASSERT(session, ext->off + ext->size <= fh->size);
+ if (ext->off + ext->size < fh->size)
+ return (0);
+
+ /*
+ * Remove the extent list entry. (Save the value, we need it to reset
+ * the cached file size, and that can't happen until after the extent
+ * list removal succeeds.)
+ *
+ * "orig" is kept only for the verbose message below.
+ */
+ orig = fh->size;
+ size = ext->off;
+ WT_RET(__block_off_remove(session, el, size, NULL));
+ fh->size = size;
+
+ /*
+ * Truncate the file. The truncate might fail if there's a file mapping
+ * (if there's an open checkpoint on the file), that's OK, we'll ignore
+ * those blocks.
+ */
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "truncate file from %" PRIdMAX " to %" PRIdMAX,
+ (intmax_t)orig, (intmax_t)size));
+ WT_RET_BUSY_OK(__wt_ftruncate(session, block->fh, size));
+
+ return (0);
+}
+
+/*
+ * __wt_block_extlist_init --
+ *	Initialize an extent list.
+ *
+ *	Clears el, gives it the name "<name>.<extname>" (a NULL component is
+ *	treated as the empty string), marks it as having no on-disk location
+ *	and records whether it tracks sizes.  Returns 0 on success or the
+ *	error from allocating the name.
+ */
+int
+__wt_block_extlist_init(WT_SESSION_IMPL *session,
+    WT_EXTLIST *el, const char *name, const char *extname, int track_size)
+{
+	size_t size;
+
+	WT_CLEAR(*el);
+
+	/*
+	 * Room for both components, the separating "." and the trailing nul
+	 * byte.  The nul byte is needed whether or not an extension name was
+	 * supplied, otherwise snprintf silently drops the separator.
+	 */
+	size = (name == NULL ? 0 : strlen(name)) +
+	    strlen(".") + (extname == NULL ? 0 : strlen(extname)) + 1;
+	WT_RET(__wt_calloc_def(session, size, &el->name));
+	(void)snprintf(el->name, size, "%s.%s",
+	    name == NULL ? "" : name, extname == NULL ? "" : extname);
+
+	el->offset = WT_BLOCK_INVALID_OFFSET;
+	el->track_size = track_size;
+	return (0);
+}
+
+/*
+ * __wt_block_extlist_free --
+ *	Discard an extent list.
+ *
+ *	Releases the list's name, every extent reachable from level 0 of the
+ *	offset skiplist and every size node reachable from level 0 of the size
+ *	skiplist, then zeroes the list structure so it can be used again.
+ */
+void
+__wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el)
+{
+	WT_EXT *ext, *ext_next;
+	WT_SIZE *szp, *szp_next;
+
+	__wt_free(session, el->name);
+
+	/* Discard the extents; save the successor before each release. */
+	ext = el->off[0];
+	while (ext != NULL) {
+		ext_next = ext->next[0];
+		__wt_free(session, ext);
+		ext = ext_next;
+	}
+
+	/* Discard the size nodes the same way. */
+	szp = el->sz[0];
+	while (szp != NULL) {
+		szp_next = szp->next[0];
+		__wt_free(session, szp);
+		szp = szp_next;
+	}
+
+	/* Extent lists are re-used, clear them. */
+	WT_CLEAR(*el);
+}
+
+/*
+ * __block_extlist_dump --
+ *	Dump an extent list as verbose messages.
+ *
+ *	Emits a summary line (tag, list name, byte count), then one line per
+ *	extent in offset order.  When show_size is non-zero, the by-size view
+ *	follows: one line per size node and one line per extent under it.
+ *	An empty list produces only the summary line, marked " [Empty]".
+ *	Returns 0 on success or the first error from __wt_verbose.
+ */
+static int
+__block_extlist_dump(
+    WT_SESSION_IMPL *session, const char *tag, WT_EXTLIST *el, int show_size)
+{
+	WT_EXT *ext;
+	WT_SIZE *szp;
+
+	WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+	    "%s: %s: %" PRIu64 " bytes, by offset:%s",
+	    tag, el->name, el->bytes, el->entries == 0 ? " [Empty]" : ""));
+	if (el->entries == 0)
+		return (0);
+
+	WT_EXT_FOREACH(ext, el->off)
+		WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+		    "\t{%" PRIuMAX "/%" PRIuMAX "}",
+		    (uintmax_t)ext->off, (uintmax_t)ext->size));
+
+	if (!show_size)
+		return (0);
+
+	/*
+	 * The list is known to be non-empty here (the empty case returned
+	 * above), so the by-size header never carries the " [Empty]" marker
+	 * and no second emptiness test is needed.
+	 */
+	WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+	    "%s: %s: by size:%s", tag, el->name, ""));
+
+	WT_EXT_FOREACH(szp, el->sz) {
+		WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+		    "\t{%" PRIuMAX "}", (uintmax_t)szp->size));
+		WT_EXT_FOREACH_OFF(ext, szp->off)
+			WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+			    "\t\t{%" PRIuMAX "/%" PRIuMAX "}",
+			    (uintmax_t)ext->off, (uintmax_t)ext->size));
+	}
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_map.c b/src/third_party/wiredtiger/src/block/block_map.c
new file mode 100644
index 00000000000..68fb75179d9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_map.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_map --
+ *	Map a segment of the file in, if possible.
+ *
+ *	On return *mapp/*maplenp describe the mapping, or are NULL/0 when the
+ *	file was not mapped.  Always returns 0: not mapping is not an error.
+ */
+int
+__wt_block_map(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp,
+    void **mappingcookie)
+{
+	/* Start from "no mapping". */
+	*maplenp = 0;
+	*(void **)mapp = NULL;
+
+	/*
+	 * Mapping is skipped in three situations:
+	 *
+	 * - the file is being verified: we can't perform checksum validation
+	 *   of mapped segments, and verify has to checksum pages;
+	 * - direct I/O is configured for the file: the Linux open(2)
+	 *   documentation says applications should avoid mixing mmap(2) of
+	 *   files with direct I/O to the same files;
+	 * - the application configured a cache size maximum: we can't control
+	 *   how much of the cache size we use in that case.
+	 */
+	if (block->verify ||
+	    block->fh->direct_io || block->os_cache_max != 0)
+		return (0);
+
+	/*
+	 * Map the file into memory.
+	 * Ignore errors, we'll read the file through the cache if map fails.
+	 */
+	(void)__wt_mmap(session, block->fh, mapp, maplenp, mappingcookie);
+
+	return (0);
+}
+
+/*
+ * __wt_block_unmap --
+ * Unmap any mapped-in segment of the file.
+ *
+ * Passes the mapping (map, maplen, mappingcookie) and the block's file
+ * handle to __wt_munmap and returns its result.
+ */
+int
+__wt_block_unmap(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen,
+ void **mappingcookie)
+{
+ /* Unmap the file from memory. */
+ return (__wt_munmap(session, block->fh, map, maplen, mappingcookie));
+}
diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c
new file mode 100644
index 00000000000..4f7f2898de5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_mgr.c
@@ -0,0 +1,433 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __bm_method_set(WT_BM *, int);
+
+/*
+ * __bm_readonly --
+ * General-purpose "writes not supported on this handle" function.
+ *
+ * Reports ENOTSUP through WT_RET_MSG, naming the block. __bm_method_set
+ * installs this function, cast to the relevant method type, in place of
+ * every write-style method of a read-only handle.
+ */
+static int
+__bm_readonly(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_RET_MSG(session, ENOTSUP,
+ "%s: write operation on read-only checkpoint handle",
+ bm->block->name);
+}
+
+/*
+ * __bm_addr_string --
+ * Return a printable string representation of an address cookie.
+ *
+ * Method wrapper: forwards to __wt_block_addr_string with bm->block and
+ * returns its result.
+ */
+static int
+__bm_addr_string(WT_BM *bm, WT_SESSION_IMPL *session,
+ WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ return (
+ __wt_block_addr_string(session, bm->block, buf, addr, addr_size));
+}
+
+/*
+ * __bm_addr_valid --
+ * Return if an address cookie is valid.
+ *
+ * Method wrapper: forwards to __wt_block_addr_valid with bm->block and the
+ * handle's is_live flag, and returns its result.
+ */
+static int
+__bm_addr_valid(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ return (__wt_block_addr_valid(
+ session, bm->block, addr, addr_size, bm->is_live));
+}
+
+/*
+ * __bm_block_header --
+ * Return the size of the block header.
+ *
+ * Method wrapper: returns the value of __wt_block_header for bm->block.
+ */
+static u_int
+__bm_block_header(WT_BM *bm)
+{
+ return (__wt_block_header(bm->block));
+}
+
+/*
+ * __bm_checkpoint --
+ * Write a buffer into a block, creating a checkpoint.
+ *
+ * Method wrapper: forwards to __wt_block_checkpoint with bm->block and
+ * returns its result.
+ */
+static int
+__bm_checkpoint(WT_BM *bm,
+ WT_SESSION_IMPL *session, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum)
+{
+ return (__wt_block_checkpoint(
+ session, bm->block, buf, ckptbase, data_cksum));
+}
+
+/*
+ * __bm_sync --
+ *	Flush a file to disk.
+ *
+ *	A non-zero "async" selects __wt_fsync_async, zero selects __wt_fsync;
+ *	either way the call is made on the block's file handle and its result
+ *	is returned.
+ */
+static int
+__bm_sync(WT_BM *bm, WT_SESSION_IMPL *session, int async)
+{
+	if (async)
+		return (__wt_fsync_async(session, bm->block->fh));
+	return (__wt_fsync(session, bm->block->fh));
+}
+
+/*
+ * __bm_checkpoint_load --
+ * Load a checkpoint.
+ *
+ * Records whether the handle is live (checkpoint == 0), loads the
+ * checkpoint through __wt_block_checkpoint_load and, for a checkpoint
+ * handle, optionally maps the file (when conn->mmap is set) and switches
+ * the handle to the read-only method table. Returns 0 on success or the
+ * error from the load/map call.
+ */
+static int
+__bm_checkpoint_load(WT_BM *bm, WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size,
+ uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /* If not opening a checkpoint, we're opening the live system. */
+ bm->is_live = !checkpoint;
+ WT_RET(__wt_block_checkpoint_load(session, bm->block,
+ addr, addr_size, root_addr, root_addr_sizep, checkpoint));
+
+ if (checkpoint) {
+ /*
+ * Read-only objects are optionally mapped into memory instead
+ * of being read into cache buffers.
+ */
+ if (conn->mmap)
+ WT_RET(__wt_block_map(session, bm->block,
+ &bm->map, &bm->maplen, &bm->mappingcookie));
+
+ /*
+ * If this handle is for a checkpoint, that is, read-only, there
+ * isn't a lot you can do with it. Although the btree layer
+ * prevents attempts to write a checkpoint reference, paranoia
+ * is healthy.
+ */
+ __bm_method_set(bm, 1);
+ }
+
+ return (0);
+}
+
+/*
+ * __bm_checkpoint_resolve --
+ * Resolve the checkpoint.
+ *
+ * Method wrapper: forwards to __wt_block_checkpoint_resolve with bm->block
+ * and returns its result.
+ */
+static int
+__bm_checkpoint_resolve(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_checkpoint_resolve(session, bm->block));
+}
+
+/*
+ * __bm_checkpoint_unload --
+ * Unload a checkpoint.
+ *
+ * Unmaps the file if the handle has a mapping, then unloads the checkpoint
+ * (passing !bm->is_live as the "checkpoint" flag). Both steps are always
+ * attempted; the error accumulated by WT_TRET is returned.
+ */
+static int
+__bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ /* Unmap any mapped segment. */
+ if (bm->map != NULL)
+ WT_TRET(__wt_block_unmap(session,
+ bm->block, bm->map, bm->maplen, &bm->mappingcookie));
+
+ /* Unload the checkpoint. */
+ WT_TRET(__wt_block_checkpoint_unload(session, bm->block, !bm->is_live));
+
+ return (ret);
+}
+
+/*
+ * __bm_close --
+ * Close a file.
+ *
+ * Closes the underlying block handle and then releases the WT_BM structure
+ * itself, whether or not the close succeeded. A NULL bm is accepted and
+ * treated as success. Returns the result of __wt_block_close.
+ */
+static int
+__bm_close(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ if (bm == NULL) /* Safety check */
+ return (0);
+
+ ret = __wt_block_close(session, bm->block);
+
+ __wt_overwrite_and_free(session, bm);
+ return (ret);
+}
+
+/*
+ * __bm_compact_start --
+ * Start a block manager compaction.
+ *
+ * Method wrapper: forwards to __wt_block_compact_start with bm->block and
+ * returns its result.
+ */
+static int
+__bm_compact_start(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_compact_start(session, bm->block));
+}
+
+/*
+ * __bm_compact_page_skip --
+ * Return if a page is useful for compaction.
+ *
+ * Method wrapper: forwards to __wt_block_compact_page_skip with bm->block;
+ * the answer is delivered through skipp and the call's result is returned.
+ */
+static int
+__bm_compact_page_skip(WT_BM *bm, WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, int *skipp)
+{
+ return (__wt_block_compact_page_skip(
+ session, bm->block, addr, addr_size, skipp));
+}
+
+/*
+ * __bm_compact_skip --
+ * Return if a file can be compacted.
+ *
+ * Method wrapper: forwards to __wt_block_compact_skip with bm->block; the
+ * answer is delivered through skipp and the call's result is returned.
+ */
+static int
+__bm_compact_skip(WT_BM *bm, WT_SESSION_IMPL *session, int *skipp)
+{
+ return (__wt_block_compact_skip(session, bm->block, skipp));
+}
+
+/*
+ * __bm_compact_end --
+ * End a block manager compaction.
+ *
+ * Method wrapper: forwards to __wt_block_compact_end with bm->block and
+ * returns its result.
+ */
+static int
+__bm_compact_end(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_compact_end(session, bm->block));
+}
+
+/*
+ * __bm_free --
+ * Free a block of space to the underlying file.
+ *
+ * Method wrapper: forwards the address cookie to __wt_block_free with
+ * bm->block and returns its result.
+ */
+static int
+__bm_free(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ return (__wt_block_free(session, bm->block, addr, addr_size));
+}
+
+/*
+ * __bm_stat --
+ * Block-manager statistics.
+ *
+ * Method wrapper: has __wt_block_stat fill in stats for bm->block. Always
+ * returns 0.
+ */
+static int
+__bm_stat(WT_BM *bm, WT_SESSION_IMPL *session, WT_DSRC_STATS *stats)
+{
+ __wt_block_stat(session, bm->block, stats);
+ return (0);
+}
+
+/*
+ * __bm_write --
+ * Write a buffer into a block, returning the block's address cookie.
+ *
+ * Method wrapper: forwards to __wt_block_write with bm->block; the cookie
+ * is delivered through addr/addr_sizep and the call's result is returned.
+ */
+static int
+__bm_write(WT_BM *bm, WT_SESSION_IMPL *session,
+ WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum)
+{
+ return (__wt_block_write(
+ session, bm->block, buf, addr, addr_sizep, data_cksum));
+}
+
+/*
+ * __bm_write_size --
+ * Return the buffer size required to write a block.
+ *
+ * Method wrapper: forwards to __wt_block_write_size with bm->block, which
+ * updates *sizep in place, and returns its result.
+ */
+static int
+__bm_write_size(WT_BM *bm, WT_SESSION_IMPL *session, size_t *sizep)
+{
+ return (__wt_block_write_size(session, bm->block, sizep));
+}
+
+/*
+ * __bm_salvage_start --
+ * Start a block manager salvage.
+ *
+ * Method wrapper: forwards to __wt_block_salvage_start with bm->block and
+ * returns its result.
+ */
+static int
+__bm_salvage_start(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_salvage_start(session, bm->block));
+}
+
+/*
+ * __bm_salvage_valid --
+ * Inform salvage a block is valid.
+ *
+ * Method wrapper: forwards the address cookie and the "valid" flag to
+ * __wt_block_salvage_valid with bm->block and returns its result.
+ */
+static int
+__bm_salvage_valid(WT_BM *bm,
+ WT_SESSION_IMPL *session, uint8_t *addr, size_t addr_size, int valid)
+{
+ return (__wt_block_salvage_valid(
+ session, bm->block, addr, addr_size, valid));
+}
+
+/*
+ * __bm_salvage_next --
+ * Return the next block from the file.
+ *
+ * Method wrapper: forwards to __wt_block_salvage_next with bm->block; the
+ * block's cookie is delivered through addr/addr_sizep, end-of-file through
+ * eofp, and the call's result is returned.
+ */
+static int
+__bm_salvage_next(WT_BM *bm,
+ WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, int *eofp)
+{
+ return (__wt_block_salvage_next(
+ session, bm->block, addr, addr_sizep, eofp));
+}
+
+/*
+ * __bm_salvage_end --
+ * End a block manager salvage.
+ *
+ * Method wrapper: forwards to __wt_block_salvage_end with bm->block and
+ * returns its result.
+ */
+static int
+__bm_salvage_end(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_salvage_end(session, bm->block));
+}
+
+/*
+ * __bm_verify_start --
+ * Start a block manager verify.
+ *
+ * Method wrapper: forwards the checkpoint list to __wt_block_verify_start
+ * with bm->block and returns its result.
+ */
+static int
+__bm_verify_start(WT_BM *bm, WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+ return (__wt_block_verify_start(session, bm->block, ckptbase));
+}
+
+/*
+ * __bm_verify_addr --
+ * Verify an address.
+ *
+ * Method wrapper: forwards the address cookie to __wt_block_verify_addr
+ * with bm->block and returns its result.
+ */
+static int
+__bm_verify_addr(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ return (__wt_block_verify_addr(session, bm->block, addr, addr_size));
+}
+
+/*
+ * __bm_verify_end --
+ * End a block manager verify.
+ *
+ * Method wrapper: forwards to __wt_block_verify_end with bm->block and
+ * returns its result.
+ */
+static int
+__bm_verify_end(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_verify_end(session, bm->block));
+}
+
+/*
+ * __bm_method_set --
+ * Set up the legal methods.
+ *
+ * Fills in every method pointer of bm. With readonly set, the methods that
+ * modify the file (checkpoint, checkpoint_resolve, the compact_* and
+ * salvage_* families, free, sync, write and write_size) all point at
+ * __bm_readonly, cast to the method's own function type; the remaining
+ * methods are the same in both modes.
+ *
+ * NOTE(review): the read-only entries call a two-argument function through
+ * pointers declared with more parameters; this relies on the platform
+ * calling convention tolerating the extra arguments.
+ */
+static void
+__bm_method_set(WT_BM *bm, int readonly)
+{
+ if (readonly) {
+ bm->addr_string = __bm_addr_string;
+ bm->addr_valid = __bm_addr_valid;
+ bm->block_header = __bm_block_header;
+ bm->checkpoint = (int (*)(WT_BM *,
+ WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int))__bm_readonly;
+ bm->checkpoint_load = __bm_checkpoint_load;
+ bm->checkpoint_resolve =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->checkpoint_unload = __bm_checkpoint_unload;
+ bm->close = __bm_close;
+ bm->compact_end =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->compact_page_skip = (int (*)(WT_BM *, WT_SESSION_IMPL *,
+ const uint8_t *, size_t, int *))__bm_readonly;
+ bm->compact_skip = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *, int *))__bm_readonly;
+ bm->compact_start =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->free = (int (*)(WT_BM *,
+ WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly;
+ bm->preload = __wt_bm_preload;
+ bm->read = __wt_bm_read;
+ bm->salvage_end = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->salvage_next = (int (*)(WT_BM *, WT_SESSION_IMPL *,
+ uint8_t *, size_t *, int *))__bm_readonly;
+ bm->salvage_start = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->salvage_valid = (int (*)(WT_BM *,
+ WT_SESSION_IMPL *, uint8_t *, size_t, int))__bm_readonly;
+ bm->stat = __bm_stat;
+ bm->sync =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *, int))__bm_readonly;
+ bm->verify_addr = __bm_verify_addr;
+ bm->verify_end = __bm_verify_end;
+ bm->verify_start = __bm_verify_start;
+ bm->write = (int (*)(WT_BM *, WT_SESSION_IMPL *,
+ WT_ITEM *, uint8_t *, size_t *, int))__bm_readonly;
+ bm->write_size = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *, size_t *))__bm_readonly;
+ } else {
+ bm->addr_string = __bm_addr_string;
+ bm->addr_valid = __bm_addr_valid;
+ bm->block_header = __bm_block_header;
+ bm->checkpoint = __bm_checkpoint;
+ bm->checkpoint_load = __bm_checkpoint_load;
+ bm->checkpoint_resolve = __bm_checkpoint_resolve;
+ bm->checkpoint_unload = __bm_checkpoint_unload;
+ bm->close = __bm_close;
+ bm->compact_end = __bm_compact_end;
+ bm->compact_page_skip = __bm_compact_page_skip;
+ bm->compact_skip = __bm_compact_skip;
+ bm->compact_start = __bm_compact_start;
+ bm->free = __bm_free;
+ bm->preload = __wt_bm_preload;
+ bm->read = __wt_bm_read;
+ bm->salvage_end = __bm_salvage_end;
+ bm->salvage_next = __bm_salvage_next;
+ bm->salvage_start = __bm_salvage_start;
+ bm->salvage_valid = __bm_salvage_valid;
+ bm->stat = __bm_stat;
+ bm->sync = __bm_sync;
+ bm->verify_addr = __bm_verify_addr;
+ bm->verify_end = __bm_verify_end;
+ bm->verify_start = __bm_verify_start;
+ bm->write = __bm_write;
+ bm->write_size = __bm_write_size;
+ }
+}
+
+/*
+ * __wt_block_manager_open --
+ * Open a file.
+ *
+ * Allocates a WT_BM, installs the read-write method table and opens the
+ * underlying block handle. On success the new handle is stored in *bmp and
+ * 0 is returned. On failure *bmp is left NULL, the partially built handle
+ * is torn down through its close method and the error is returned.
+ */
+int
+__wt_block_manager_open(WT_SESSION_IMPL *session,
+ const char *filename, const char *cfg[],
+ int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+
+ *bmp = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &bm));
+ __bm_method_set(bm, 0);
+
+ WT_ERR(__wt_block_open(session, filename, cfg,
+ forced_salvage, readonly, allocsize, &bm->block));
+
+ *bmp = bm;
+ return (0);
+
+err: WT_TRET(bm->close(bm, session));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
new file mode 100644
index 00000000000..2fbaa0fe331
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -0,0 +1,330 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *);
+
+/*
+ * __wt_block_manager_truncate --
+ *	Truncate the named file to zero length and rewrite its description
+ * block.
+ */
+int
+__wt_block_manager_truncate(
+    WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
+{
+	WT_DECL_RET;
+	WT_FH *fh;
+
+	/* Nothing to clean up if the open itself fails. */
+	WT_RET(__wt_open(session, filename, 0, 0, WT_FILE_TYPE_DATA, &fh));
+
+	/*
+	 * Discard the file's contents; only if that works, write out fresh
+	 * meta-data.
+	 */
+	if ((ret = __wt_ftruncate(session, fh, (wt_off_t)0)) == 0)
+		ret = __wt_desc_init(session, fh, allocsize);
+
+	/* The handle is closed on both the success and failure paths. */
+	WT_TRET(__wt_close(session, fh));
+
+	return (ret);
+}
+
+/*
+ * __wt_block_manager_create --
+ *	Create the named file and write its initial description block; the
+ * file is removed again if anything after the create fails.
+ */
+int
+__wt_block_manager_create(
+    WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
+{
+	WT_DECL_RET;
+	WT_FH *fh;
+
+	/* Create the file and get a handle on it. */
+	WT_RET(__wt_open(session, filename, 1, 1, WT_FILE_TYPE_DATA, &fh));
+
+	/* Write the meta-data, then close the handle regardless. */
+	ret = __wt_desc_init(session, fh, allocsize);
+	WT_TRET(__wt_close(session, fh));
+
+	if (ret == 0)
+		return (0);
+
+	/* Either the write or the close failed: undo the create. */
+	WT_TRET(__wt_remove(session, filename));
+	return (ret);
+}
+
+/*
+ * __block_destroy --
+ *	Unlink a block handle from the connection's list and release everything
+ * it owns, including the handle itself.
+ */
+static int
+__block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+	WT_DECL_RET;
+
+	/* Take the handle off the connection's queue first. */
+	TAILQ_REMOVE(&S2C(session)->blockqh, block, q);
+
+	/* Name and file handle may be unset if the open failed part-way. */
+	if (block->name != NULL)
+		__wt_free(session, block->name);
+	if (block->fh != NULL)
+		WT_TRET(__wt_close(session, block->fh));
+
+	__wt_spin_destroy(session, &block->live_lock);
+	__wt_overwrite_and_free(session, block);
+
+	return (ret);
+}
+
+/*
+ * __wt_block_open --
+ *	Open a block handle.
+ *
+ * Holding the connection's block_lock, an existing handle with the same file
+ * name is re-used (its reference count is incremented); otherwise a new handle
+ * is allocated, linked into the connection's list, configured from cfg
+ * ("block_allocation", "os_cache_max", "os_cache_dirty_max"), its file opened
+ * and, unless forced_salvage is set, its description block verified.
+ *
+ * Returns 0 with *blockp set on success. On failure *blockp is NULL, any
+ * partially built handle is destroyed and an error is returned.
+ */
+int
+__wt_block_open(WT_SESSION_IMPL *session,
+    const char *filename, const char *cfg[],
+    int forced_salvage, int readonly, uint32_t allocsize, WT_BLOCK **blockp)
+{
+	WT_BLOCK *block;
+	WT_CONFIG_ITEM cval;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	WT_TRET(__wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename));
+
+	conn = S2C(session);
+	*blockp = NULL;
+
+	__wt_spin_lock(session, &conn->block_lock);
+	TAILQ_FOREACH(block, &conn->blockqh, q)
+		if (strcmp(filename, block->name) == 0) {
+			++block->ref;
+			*blockp = block;
+			__wt_spin_unlock(session, &conn->block_lock);
+			return (0);
+		}
+
+	/*
+	 * Basic structure allocation, initialization.
+	 *
+	 * The search above leaves block NULL, and it stays NULL if this
+	 * allocation fails: the error path must not hand it to the destroy
+	 * function.
+	 */
+	WT_ERR(__wt_calloc_def(session, 1, &block));
+	block->ref = 1;
+	TAILQ_INSERT_HEAD(&conn->blockqh, block, q);
+
+	WT_ERR(__wt_strdup(session, filename, &block->name));
+	block->allocsize = allocsize;
+
+	WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval));
+	block->allocfirst =
+	    WT_STRING_MATCH("first", cval.str, cval.len) ? 1 : 0;
+
+	/* Configuration: optional OS buffer cache maximum size. */
+	WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval));
+	block->os_cache_max = (size_t)cval.val;
+#ifdef HAVE_POSIX_FADVISE
+	if (conn->direct_io && block->os_cache_max)
+		WT_ERR_MSG(session, EINVAL,
+		    "os_cache_max not supported in combination with direct_io");
+#else
+	if (block->os_cache_max)
+		WT_ERR_MSG(session, EINVAL,
+		    "os_cache_max not supported if posix_fadvise not "
+		    "available");
+#endif
+
+	/* Configuration: optional immediate write scheduling flag. */
+	WT_ERR(__wt_config_gets(session, cfg, "os_cache_dirty_max", &cval));
+	block->os_cache_dirty_max = (size_t)cval.val;
+#ifdef HAVE_SYNC_FILE_RANGE
+	if (conn->direct_io && block->os_cache_dirty_max)
+		WT_ERR_MSG(session, EINVAL,
+		    "os_cache_dirty_max not supported in combination with "
+		    "direct_io");
+#else
+	if (block->os_cache_dirty_max) {
+		/*
+		 * Ignore any setting if it is not supported.
+		 */
+		block->os_cache_dirty_max = 0;
+		WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
+		    "os_cache_dirty_max ignored when sync_file_range not "
+		    "available"));
+	}
+#endif
+
+	/* Open the underlying file handle. */
+	WT_ERR(__wt_open(session, filename, 0, 0,
+	    readonly ? WT_FILE_TYPE_CHECKPOINT : WT_FILE_TYPE_DATA,
+	    &block->fh));
+
+	/* Initialize the live checkpoint's lock. */
+	WT_ERR(__wt_spin_init(session, &block->live_lock, "block manager"));
+
+	/*
+	 * Read the description information from the first block.
+	 *
+	 * Salvage is a special case: if we're forcing the salvage, we don't
+	 * look at anything, including the description information.
+	 */
+	if (!forced_salvage)
+		WT_ERR(__desc_read(session, block));
+
+	*blockp = block;
+	__wt_spin_unlock(session, &conn->block_lock);
+	return (0);
+
+err:	/* There is nothing to destroy if the handle was never allocated. */
+	if (block != NULL)
+		WT_TRET(__block_destroy(session, block));
+	__wt_spin_unlock(session, &conn->block_lock);
+	return (ret);
+}
+
+/*
+ * __wt_block_close --
+ *	Drop one reference to a block handle, destroying the handle when the
+ * last reference goes away. A NULL handle is silently accepted.
+ */
+int
+__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	/* Safety check: callers may pass a handle that was never opened. */
+	if (block == NULL)
+		return (0);
+
+	conn = S2C(session);
+
+	WT_TRET(__wt_verbose(session, WT_VERB_BLOCK,
+	    "close: %s", block->name == NULL ? "" : block->name ));
+
+	__wt_spin_lock(session, &conn->block_lock);
+
+	/*
+	 * The reference count starts at 1. Never decrement below zero, and
+	 * destroy the handle once no references remain.
+	 */
+	if (block->ref > 0)
+		--block->ref;
+	if (block->ref == 0)
+		WT_TRET(__block_destroy(session, block));
+
+	__wt_spin_unlock(session, &conn->block_lock);
+
+	return (ret);
+}
+
+/*
+ * __wt_desc_init --
+ *	Build a fresh description block (magic, version, checksum) and write it
+ * at offset zero of the file.
+ */
+int
+__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize)
+{
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
+	WT_BLOCK_DESC *desc;
+
+	/* A scratch buffer gives us the alignment direct I/O needs. */
+	WT_RET(__wt_scr_alloc(session, allocsize, &buf));
+	desc = buf->mem;
+	memset(desc, 0, allocsize);
+
+	desc->magic = WT_BLOCK_MAGIC;
+	desc->majorv = WT_BLOCK_MAJOR_VERSION;
+	desc->minorv = WT_BLOCK_MINOR_VERSION;
+
+	/* The checksum covers the whole block with its own field zeroed. */
+	desc->cksum = 0;
+	desc->cksum = __wt_cksum(desc, allocsize);
+
+	ret = __wt_write(session, fh, (wt_off_t)0, (size_t)allocsize, desc);
+
+	__wt_scr_free(&buf);
+	return (ret);
+}
+
+/*
+ * __desc_read --
+ * Read and verify the file's metadata.
+ *
+ * Reads the first allocation-sized block of the file into a scratch buffer
+ * and fails with WT_ERROR if the magic number or checksum does not match, or
+ * if the stored major/minor version is newer than this build's
+ * WT_BLOCK_MAJOR_VERSION/WT_BLOCK_MINOR_VERSION. The scratch buffer is
+ * released on both the success and the error path.
+ */
+static int
+__desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_BLOCK_DESC *desc;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ uint32_t cksum;
+
+ /* Use a scratch buffer to get correct alignment for direct I/O. */
+ WT_RET(__wt_scr_alloc(session, block->allocsize, &buf));
+
+ /* Read the first allocation-sized block and verify the file format. */
+ WT_ERR(__wt_read(session,
+ block->fh, (wt_off_t)0, (size_t)block->allocsize, buf->mem));
+
+ desc = buf->mem;
+ WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: magic %" PRIu32
+ ", major/minor: %" PRIu32 "/%" PRIu32
+ ", checksum %#" PRIx32,
+ block->name, desc->magic,
+ desc->majorv, desc->minorv,
+ desc->cksum));
+
+ /*
+ * We fail the open if the checksum fails, or the magic number is wrong
+ * or the major/minor numbers are unsupported for this version. This
+ * test is done even if the caller is verifying or salvaging the file:
+ * it makes sense for verify, and for salvage we don't overwrite files
+ * without some reason to believe they are WiredTiger files. The user
+ * may have entered the wrong file name, and is now frantically pounding
+ * their interrupt key.
+ *
+ * The stored checksum is saved and its field cleared before the
+ * checksum is recomputed over the whole allocation-sized block.
+ */
+ cksum = desc->cksum;
+ desc->cksum = 0;
+ if (desc->magic != WT_BLOCK_MAGIC ||
+ cksum != __wt_cksum(desc, block->allocsize))
+ WT_ERR_MSG(session, WT_ERROR,
+ "%s does not appear to be a WiredTiger file", block->name);
+
+ if (desc->majorv > WT_BLOCK_MAJOR_VERSION ||
+ (desc->majorv == WT_BLOCK_MAJOR_VERSION &&
+ desc->minorv > WT_BLOCK_MINOR_VERSION))
+ WT_ERR_MSG(session, WT_ERROR,
+ "unsupported WiredTiger file version: this build only "
+ "supports major/minor versions up to %d/%d, and the file "
+ "is version %d/%d",
+ WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION,
+ desc->majorv, desc->minorv);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_block_stat --
+ *	Fill in the block-manager portion of a data-source statistics
+ * structure.
+ */
+void
+__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
+{
+	/*
+	 * The values come out of the live system's structure, so take the
+	 * live lock while copying them. A torn read is unlikely and these are
+	 * only statistics, but this is not a function applications call often
+	 * enough for the lock to matter.
+	 */
+	__wt_spin_lock(session, &block->live_lock);
+
+	/* Format constants compiled into this build. */
+	WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC);
+	WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION);
+	WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION);
+
+	/* Values read from the handle and its live checkpoint. */
+	WT_STAT_SET(stats, allocation_size, block->allocsize);
+	WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size);
+	WT_STAT_SET(stats, block_reuse_bytes, block->live.avail.bytes);
+	WT_STAT_SET(stats, block_size, block->fh->size);
+
+	__wt_spin_unlock(session, &block->live_lock);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c
new file mode 100644
index 00000000000..c528ee4a6aa
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_read.c
@@ -0,0 +1,212 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bm_preload --
+ * Pre-load a page.
+ *
+ * Decodes the address cookie into offset/size/checksum. If the block lies
+ * entirely within the block manager's mapped region, the mapped range is
+ * passed to __wt_mmap_preload. Otherwise, when HAVE_POSIX_FADVISE is defined,
+ * posix_fadvise(POSIX_FADV_WILLNEED) is tried; if that is not compiled in or
+ * fails, the block is read into a scratch buffer that is freed immediately.
+ * The block_preload statistic is incremented on success. addr_size is unused.
+ */
+int
+__wt_bm_preload(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_BLOCK *block;
+ WT_DECL_RET;
+ wt_off_t offset;
+ uint32_t cksum, size;
+ int mapped;
+
+ WT_UNUSED(addr_size);
+ block = bm->block;
+ ret = EINVAL; /* Play games due to conditional compilation: a non-zero start value selects the read fallback below when posix_fadvise is not compiled in */
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /* Check for a mapped block. */
+ mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
+ if (mapped)
+ WT_RET(__wt_mmap_preload(
+ session, (uint8_t *)bm->map + offset, size));
+ else {
+#ifdef HAVE_POSIX_FADVISE
+ ret = posix_fadvise(block->fh->fd,
+ (wt_off_t)offset, (wt_off_t)size, POSIX_FADV_WILLNEED);
+#endif
+ if (ret != 0) {
+ WT_DECL_ITEM(tmp);
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+ ret = __wt_block_read_off(
+ session, block, tmp, offset, size, cksum);
+ __wt_scr_free(&tmp);
+ WT_RET(ret);
+ }
+ }
+
+ WT_STAT_FAST_CONN_INCR(session, block_preload);
+
+ return (0);
+}
+
+/*
+ * __wt_bm_read --
+ * Map or read address cookie referenced block into a buffer.
+ *
+ * If the block lies entirely within the mapped region, buf->data is pointed
+ * directly at the mapping and the function returns without reading. Otherwise
+ * the block is read with __wt_block_read_off. When HAVE_POSIX_FADVISE is
+ * defined and os_cache_max is non-zero, the bytes read are accumulated in
+ * block->os_cache; once the total exceeds os_cache_max the counter is reset
+ * and posix_fadvise(POSIX_FADV_DONTNEED) is issued with offset 0 and length 0.
+ * addr_size is unused.
+ */
+int
+__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
+ WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ WT_BLOCK *block;
+ int mapped;
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+ block = bm->block;
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /*
+ * Map the block if it's possible.
+ */
+ mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
+ if (mapped) {
+ buf->data = (uint8_t *)bm->map + offset;
+ buf->size = size;
+ WT_RET(__wt_mmap_preload(session, buf->data, buf->size));
+
+ WT_STAT_FAST_CONN_INCR(session, block_map_read);
+ WT_STAT_FAST_CONN_INCRV(session, block_byte_map_read, size);
+ return (0);
+ }
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * In diagnostic mode, verify the block we're about to read isn't on
+ * the available list, or for live systems, the discard list.
+ */
+ WT_RET(__wt_block_misplaced(
+ session, block, "read", offset, size, bm->is_live));
+#endif
+ /* Read the block. */
+ WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));
+
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system's buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += size) > block->os_cache_max) {
+ WT_DECL_RET;
+
+ block->os_cache = 0;
+ /* Ignore EINVAL - some file systems don't support the flag. */
+ if ((ret = posix_fadvise(block->fh->fd,
+ (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0 &&
+ ret != EINVAL)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_block_read_off_blind --
+ * Read the block at an offset, try to figure out what it looks like,
+ * debugging only.
+ *
+ * The size and checksum are taken from the header found at the offset, not
+ * from an address cookie. Returns EINVAL without reading the full block if
+ * __wt_block_offset_invalid rejects the offset/size pair.
+ */
+int
+__wt_block_read_off_blind(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset)
+{
+ WT_BLOCK_HEADER *blk;
+ uint32_t cksum, size;
+
+ /*
+ * Make sure the buffer is large enough for the header and read the
+ * first allocation-size block.
+ */
+ WT_RET(__wt_buf_init(session, buf, block->allocsize));
+ WT_RET(__wt_read(
+ session, block->fh, offset, (size_t)block->allocsize, buf->mem));
+ blk = WT_BLOCK_HEADER_REF(buf->mem);
+
+ /*
+ * Copy out the size and checksum (we're about to re-use the buffer),
+ * and if the size isn't insane, read the rest of the block.
+ */
+ size = blk->disk_size;
+ cksum = blk->cksum;
+ if (__wt_block_offset_invalid(block, offset, size))
+ WT_RET_MSG(session, EINVAL,
+ "block at offset %" PRIuMAX " cannot be a valid block, no "
+ "read attempted",
+ (uintmax_t)offset);
+ return (__wt_block_read_off(session, block, buf, offset, size, cksum));
+}
+#endif
+
+/*
+ * __wt_block_read_off --
+ * Read an addr/size pair referenced block into a buffer.
+ *
+ * The buffer is (re)initialized to hold at least size bytes, size bytes are
+ * read from offset, and the checksum is recomputed with the header's cksum
+ * field cleared: over the whole block if WT_BLOCK_DATA_CKSUM is set in the
+ * header, otherwise over the first WT_BLOCK_COMPRESS_SKIP bytes. On a
+ * mismatch the function returns WT_ERROR when block->verify is set or the
+ * session has WT_SESSION_SALVAGE_CORRUPT_OK set, and the result of
+ * __wt_illegal_value otherwise. Read statistics are only updated on success.
+ */
+int
+__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
+{
+ WT_BLOCK_HEADER *blk;
+ size_t bufsize;
+ uint32_t page_cksum;
+
+ WT_RET(__wt_verbose(session, WT_VERB_READ,
+ "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
+ (uintmax_t)offset, size, cksum));
+
+ /*
+ * Grow the buffer as necessary and read the block. Buffers should be
+ * aligned for reading, but there are lots of buffers (for example, file
+ * cursors have two buffers each, key and value), and it's difficult to
+ * be sure we've found all of them. If the buffer isn't aligned, it's
+ * an easy fix: set the flag and guarantee we reallocate it. (Most of
+ * the time on reads, the buffer memory has not yet been allocated, so
+ * we're not adding any additional processing time.)
+ */
+ if (F_ISSET(buf, WT_ITEM_ALIGNED))
+ bufsize = size;
+ else {
+ F_SET(buf, WT_ITEM_ALIGNED);
+ bufsize = WT_MAX(size, buf->memsize + 10);
+ }
+ WT_RET(__wt_buf_init(session, buf, bufsize));
+ WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
+ buf->size = size;
+
+ blk = WT_BLOCK_HEADER_REF(buf->mem);
+ blk->cksum = 0;
+ page_cksum = __wt_cksum(buf->mem,
+ F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
+ if (cksum != page_cksum) {
+ if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ __wt_errx(session,
+ "read checksum error [%"
+ PRIu32 "B @ %" PRIuMAX ", %"
+ PRIu32 " != %" PRIu32 "]",
+ size, (uintmax_t)offset, cksum, page_cksum);
+
+ /*
+ * Panic if a checksum fails during an ordinary read. Note that ||
+ * binds tighter than ?:, so either condition selects WT_ERROR.
+ */
+ return (block->verify ||
+ F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
+ WT_ERROR :
+ __wt_illegal_value(session, block->name));
+ }
+
+ WT_STAT_FAST_CONN_INCR(session, block_read);
+ WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_session.c b/src/third_party/wiredtiger/src/block/block_session.c
new file mode 100644
index 00000000000..fa56b72f49b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_session.c
@@ -0,0 +1,305 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Per session handle cached block manager information.
+ *
+ * Each cache is a singly-linked list of free structures chained through
+ * next[0]; the counts track the list lengths but are treated as advisory by
+ * the allocation functions in this file.
+ */
+typedef struct {
+ WT_EXT *ext_cache; /* List of WT_EXT handles */
+ u_int ext_cache_cnt; /* Count */
+
+ WT_SIZE *sz_cache; /* List of WT_SIZE handles */
+ u_int sz_cache_cnt; /* Count */
+} WT_BLOCK_MGR_SESSION;
+
+/*
+ * __block_ext_alloc --
+ *	Allocate a zeroed WT_EXT structure with room for its skiplist
+ * pointers.
+ */
+static int
+__block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
+{
+	WT_EXT *new_ext;
+	size_t bytes;
+	u_int depth;
+
+	/* Two pointer arrays of "depth" entries trail the structure. */
+	depth = __wt_skip_choose_depth(session);
+	bytes = sizeof(WT_EXT) + depth * 2 * sizeof(WT_EXT *);
+
+	WT_RET(__wt_calloc(session, 1, bytes, &new_ext));
+	new_ext->depth = (uint8_t)depth;
+
+	*extp = new_ext;
+	return (0);
+}
+
+/*
+ * __wt_block_ext_alloc --
+ *	Hand out a WT_EXT structure, preferring the session's cached list over
+ * a fresh allocation.
+ */
+int
+__wt_block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
+{
+	WT_BLOCK_MGR_SESSION *bms;
+	WT_EXT *cached;
+	u_int lvl;
+
+	bms = session->block_manager;
+
+	/* No session cache, or an empty one: allocate a new structure. */
+	if (bms == NULL || bms->ext_cache == NULL)
+		return (__block_ext_alloc(session, extp));
+
+	/* Pop the first cached structure off the list. */
+	cached = bms->ext_cache;
+	bms->ext_cache = cached->next[0];
+
+	/* Clear any left-over references in both pointer arrays. */
+	for (lvl = 0; lvl < cached->depth; ++lvl) {
+		cached->next[lvl + cached->depth] = NULL;
+		cached->next[lvl] = NULL;
+	}
+
+	/*
+	 * The count is only advisory (it limits our exposure to bugs), but it
+	 * must not wrap below zero.
+	 */
+	if (bms->ext_cache_cnt > 0)
+		--bms->ext_cache_cnt;
+
+	*extp = cached;
+	return (0);
+}
+
+/*
+ * __block_ext_prealloc --
+ *	Grow the session's WT_EXT cache until it holds at least max entries.
+ */
+static int
+__block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
+{
+	WT_BLOCK_MGR_SESSION *bms;
+	WT_EXT *fresh;
+
+	bms = session->block_manager;
+
+	while (bms->ext_cache_cnt < max) {
+		WT_RET(__block_ext_alloc(session, &fresh));
+
+		/* Push onto the head of the cached list. */
+		fresh->next[0] = bms->ext_cache;
+		bms->ext_cache = fresh;
+		++bms->ext_cache_cnt;
+	}
+	return (0);
+}
+
+/*
+ * __wt_block_ext_free --
+ *	Return a WT_EXT structure to the session's cached list, or free it if
+ * the session has no block manager cache.
+ */
+void
+__wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext)
+{
+	WT_BLOCK_MGR_SESSION *bms;
+
+	bms = session->block_manager;
+	if (bms == NULL) {
+		__wt_free(session, ext);
+		return;
+	}
+
+	/* Push onto the head of the cached list. */
+	ext->next[0] = bms->ext_cache;
+	bms->ext_cache = ext;
+	++bms->ext_cache_cnt;
+}
+
+/*
+ * __block_ext_discard --
+ *	Shrink the session's WT_EXT cache to at most max entries; a max of
+ * zero discards the entire cache and checks that the count ends at zero.
+ */
+static int
+__block_ext_discard(WT_SESSION_IMPL *session, u_int max)
+{
+	WT_BLOCK_MGR_SESSION *bms;
+	WT_EXT *victim;
+
+	bms = session->block_manager;
+
+	/* Already within the limit: nothing to do. */
+	if (max != 0 && bms->ext_cache_cnt <= max)
+		return (0);
+
+	/* Free from the head of the list until the limit is reached. */
+	while ((victim = bms->ext_cache) != NULL) {
+		bms->ext_cache = victim->next[0];
+		__wt_free(session, victim);
+
+		--bms->ext_cache_cnt;
+		if (max != 0 && bms->ext_cache_cnt <= max)
+			break;
+	}
+
+	if (max == 0 && bms->ext_cache_cnt != 0)
+		WT_RET_MSG(session, WT_ERROR,
+		    "incorrect count in session handle's block manager cache");
+	return (0);
+}
+
+/*
+ * __block_size_alloc --
+ *	Allocate a zeroed WT_SIZE structure.
+ */
+static int
+__block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp)
+{
+	WT_RET(__wt_calloc(session, 1, sizeof(WT_SIZE), szp));
+	return (0);
+}
+
+/*
+ * __wt_block_size_alloc --
+ *	Hand out a WT_SIZE structure, preferring the session's cached list over
+ * a fresh allocation.
+ */
+int
+__wt_block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp)
+{
+	WT_BLOCK_MGR_SESSION *bms;
+	WT_SIZE *cached;
+
+	bms = session->block_manager;
+
+	/* No session cache, or an empty one: allocate a new structure. */
+	if (bms == NULL || bms->sz_cache == NULL)
+		return (__block_size_alloc(session, szp));
+
+	/* Pop the first cached structure off the list. */
+	cached = bms->sz_cache;
+	*szp = cached;
+	bms->sz_cache = cached->next[0];
+
+	/*
+	 * The count is only advisory (it limits our exposure to bugs), but it
+	 * must not wrap below zero.
+	 */
+	if (bms->sz_cache_cnt > 0)
+		--bms->sz_cache_cnt;
+	return (0);
+}
+
+/*
+ * __block_size_prealloc --
+ *	Grow the session's WT_SIZE cache until it holds at least max entries.
+ */
+static int
+__block_size_prealloc(WT_SESSION_IMPL *session, u_int max)
+{
+	WT_BLOCK_MGR_SESSION *bms;
+	WT_SIZE *fresh;
+
+	bms = session->block_manager;
+
+	while (bms->sz_cache_cnt < max) {
+		WT_RET(__block_size_alloc(session, &fresh));
+
+		/* Push onto the head of the cached list. */
+		fresh->next[0] = bms->sz_cache;
+		bms->sz_cache = fresh;
+		++bms->sz_cache_cnt;
+	}
+	return (0);
+}
+
+/*
+ * __wt_block_size_free --
+ *	Return a WT_SIZE structure to the session's cached list, or free it if
+ * the session has no block manager cache.
+ */
+void
+__wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz)
+{
+	WT_BLOCK_MGR_SESSION *bms;
+
+	bms = session->block_manager;
+	if (bms == NULL) {
+		__wt_free(session, sz);
+		return;
+	}
+
+	/* Push onto the head of the cached list. */
+	sz->next[0] = bms->sz_cache;
+	bms->sz_cache = sz;
+	++bms->sz_cache_cnt;
+}
+
+/*
+ * __block_size_discard --
+ *	Shrink the session's WT_SIZE cache to at most max entries; a max of
+ * zero discards the entire cache and checks that the count ends at zero.
+ */
+static int
+__block_size_discard(WT_SESSION_IMPL *session, u_int max)
+{
+	WT_BLOCK_MGR_SESSION *bms;
+	WT_SIZE *victim;
+
+	bms = session->block_manager;
+
+	/* Already within the limit: nothing to do. */
+	if (max != 0 && bms->sz_cache_cnt <= max)
+		return (0);
+
+	/* Free from the head of the list until the limit is reached. */
+	while ((victim = bms->sz_cache) != NULL) {
+		bms->sz_cache = victim->next[0];
+		__wt_free(session, victim);
+
+		--bms->sz_cache_cnt;
+		if (max != 0 && bms->sz_cache_cnt <= max)
+			break;
+	}
+
+	if (max == 0 && bms->sz_cache_cnt != 0)
+		WT_RET_MSG(session, WT_ERROR,
+		    "incorrect count in session handle's block manager cache");
+	return (0);
+}
+
+/*
+ * __block_manager_session_cleanup --
+ *	Discard both structure caches and the session's block manager
+ * information; a session without any is a no-op.
+ */
+static int
+__block_manager_session_cleanup(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+
+	if (session->block_manager != NULL) {
+		/* A limit of zero empties each cache completely. */
+		WT_TRET(__block_ext_discard(session, 0));
+		WT_TRET(__block_size_discard(session, 0));
+
+		__wt_free(session, session->block_manager);
+	}
+
+	return (ret);
+}
+
+/*
+ * __wt_block_ext_prealloc --
+ *	Make sure the session caches hold at least max WT_EXT and max WT_SIZE
+ * structures, creating the session's block manager information on first use.
+ */
+int
+__wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
+{
+	/* First use: allocate the cache and register its cleanup function. */
+	if (session->block_manager == NULL) {
+		WT_RET(__wt_calloc(session, 1,
+		    sizeof(WT_BLOCK_MGR_SESSION), &session->block_manager));
+		session->block_manager_cleanup =
+		    __block_manager_session_cleanup;
+	}
+
+	WT_RET(__block_ext_prealloc(session, max));
+	return (__block_size_prealloc(session, max));
+}
+
+/*
+ * __wt_block_ext_discard --
+ *	Trim the session's WT_EXT and WT_SIZE caches to at most max entries
+ * each after a checkpoint runs.
+ */
+int
+__wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max)
+{
+	WT_RET(__block_ext_discard(session, max));
+	return (__block_size_discard(session, max));
+}
diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c
new file mode 100644
index 00000000000..349daa620f5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_slvg.c
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_salvage_start --
+ *	Prepare a file for salvage: rewrite the description block, set up an
+ * empty live checkpoint and place everything after the first block on the
+ * allocation list.
+ */
+int
+__wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+	wt_off_t len;
+	uint32_t allocsize;
+
+	allocsize = block->allocsize;
+
+	/* The first block's description information is reset. */
+	WT_RET(__wt_desc_init(session, block->fh, allocsize));
+
+	/*
+	 * Salvage finishes by creating a new checkpoint, so get ready to roll
+	 * an empty file forward.
+	 */
+	WT_RET(__wt_block_ckpt_init(session, &block->live, "live"));
+
+	/*
+	 * Round the file down to a whole number of allocation-size blocks:
+	 * by definition, bytes trailing the last block are garbage. A file
+	 * no larger than one block is treated as exactly one block long.
+	 */
+	len = allocsize;
+	if (block->fh->size > allocsize) {
+		len = (block->fh->size / allocsize) * allocsize;
+		if (len != block->fh->size)
+			WT_RET(__wt_ftruncate(session, block->fh, len));
+	}
+	block->live.file_size = len;
+
+	/*
+	 * Reading starts after the first allocation-sized block, which only
+	 * holds description information.
+	 */
+	block->slvg_off = allocsize;
+
+	/*
+	 * The allocation list is the only checkpoint extent that matters
+	 * here. It starts out covering the entire file; blocks we don't want
+	 * are "freed" as the file is processed.
+	 */
+	WT_RET(__wt_block_insert_ext(
+	    session, &block->live.alloc, allocsize, len - allocsize));
+
+	return (0);
+}
+
+/*
+ * __wt_block_salvage_end --
+ *	Finish a file salvage by unloading the checkpoint.
+ */
+int
+__wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+	WT_RET(__wt_block_checkpoint_unload(session, block, 0));
+	return (0);
+}
+
+/*
+ * __wt_block_offset_invalid --
+ *	Return non-zero if the offset/size pair cannot describe a block in
+ * this file.
+ */
+int
+__wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size)
+{
+	/*
+	 * In order: smaller than the minimum page size, not a multiple of the
+	 * allocation size, larger than the maximum page size, or extending
+	 * past the end of the file.
+	 */
+	return (size == 0 ||
+	    size % block->allocsize != 0 ||
+	    size > WT_BTREE_PAGE_SIZE_MAX ||
+	    offset + (wt_off_t)size > block->fh->size);
+}
+
+/*
+ * __wt_block_salvage_next --
+ * Return the address for the next potential block from the file.
+ *
+ * Starting at block->slvg_off, reads one allocation-size chunk at a time and
+ * treats its header's disk_size/cksum as a candidate block. The first
+ * candidate whose size is plausible and whose full read (which validates the
+ * checksum) succeeds is encoded into addr, with the cookie length stored in
+ * *addr_sizep; slvg_off is not advanced past a returned block here. Chunks
+ * that fail are freed and skipped. *eofp is set to 1, and addr is left
+ * untouched, once slvg_off reaches the file size captured on entry.
+ */
+int
+__wt_block_salvage_next(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp)
+{
+ WT_BLOCK_HEADER *blk;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_FH *fh;
+ wt_off_t max, offset;
+ uint32_t allocsize, cksum, size;
+ uint8_t *endp;
+
+ *eofp = 0;
+
+ fh = block->fh;
+ allocsize = block->allocsize;
+ WT_ERR(__wt_scr_alloc(session, allocsize, &tmp));
+
+ /* Read through the file, looking for pages. */
+ for (max = fh->size;;) {
+ offset = block->slvg_off;
+ if (offset >= max) { /* Check eof. */
+ *eofp = 1;
+ goto done;
+ }
+
+ /*
+ * Read the start of a possible page (an allocation-size block),
+ * and get a page length from it. Move to the next allocation
+ * sized boundary, we'll never consider this one again.
+ */
+ WT_ERR(__wt_read(
+ session, fh, offset, (size_t)allocsize, tmp->mem));
+ blk = WT_BLOCK_HEADER_REF(tmp->mem);
+ size = blk->disk_size;
+ cksum = blk->cksum;
+
+ /*
+ * Check the block size: if it's not insane, read the block.
+ * Reading the block validates any checksum; if reading the
+ * block succeeds, return its address as a possible page,
+ * otherwise, move past it.
+ */
+ if (!__wt_block_offset_invalid(block, offset, size) &&
+ __wt_block_read_off(
+ session, block, tmp, offset, size, cksum) == 0)
+ break;
+
+ /* Free the allocation-size block. */
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "skipping %" PRIu32 "B at file offset %" PRIuMAX,
+ allocsize, (uintmax_t)offset));
+ WT_ERR(__wt_block_off_free(
+ session, block, offset, (wt_off_t)allocsize));
+ block->slvg_off += allocsize;
+ }
+
+ /* Re-create the address cookie that should reference this block. */
+ endp = addr;
+ WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
+ *addr_sizep = WT_PTRDIFF(endp, addr);
+
+done:
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_salvage_valid --
+ *	Let salvage know if a block is valid.
+ *
+ * The address cookie is decoded into offset/size/checksum. If the upper layer
+ * accepted the block (valid is non-zero), the salvage offset moves past the
+ * whole block; otherwise a single allocation-size chunk at the block's offset
+ * is freed and the salvage offset moves past just that chunk. addr_size is
+ * unused.
+ */
+int
+__wt_block_salvage_valid(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid)
+{
+	wt_off_t offset;
+	uint32_t size, cksum;
+
+	/* session is used below, so only addr_size is marked unused. */
+	WT_UNUSED(addr_size);
+
+	/*
+	 * Crack the cookie.
+	 * If the upper layer took the block, move past it; if the upper layer
+	 * rejected the block, move past an allocation size chunk and free it.
+	 */
+	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+	if (valid)
+		block->slvg_off = offset + size;
+	else {
+		WT_RET(__wt_block_off_free(
+		    session, block, offset, (wt_off_t)block->allocsize));
+		block->slvg_off = offset + block->allocsize;
+	}
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c
new file mode 100644
index 00000000000..148b4fa9743
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_vrfy.c
@@ -0,0 +1,514 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/* Forward declarations of the file-local helpers defined in this file. */
+static int __verify_ckptfrag_add(
+ WT_SESSION_IMPL *, WT_BLOCK *, wt_off_t, wt_off_t);
+static int __verify_ckptfrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __verify_filefrag_add(
+ WT_SESSION_IMPL *, WT_BLOCK *, const char *, wt_off_t, wt_off_t, int);
+static int __verify_filefrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __verify_last_avail(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+
+/*
+ * The bit list ignores the first block: convert to/from a frag/offset.
+ *
+ * Every macro argument is parenthesized so an expression can be passed as
+ * the fragment or offset without changing the arithmetic.
+ *
+ * NOTE(review): the name WT_wt_off_tO_FRAG looks like the result of a
+ * mechanical off_t -> wt_off_t rename of WT_OFF_TO_FRAG; it is kept as-is
+ * because the rest of the file references it by this name.
+ */
+#define	WT_wt_off_tO_FRAG(block, off)					\
+	((off) / (block)->allocsize - 1)
+#define	WT_FRAG_TO_OFF(block, frag)					\
+	(((wt_off_t)((frag) + 1)) * (block)->allocsize)
+
+/*
+ * __wt_block_verify_start --
+ * Start file verification.
+ *
+ * Locates the last named, non-fake checkpoint in ckptbase, truncates the
+ * file to that checkpoint's size, allocates the per-file fragment bit list
+ * and loads the checkpoint's avail list into it. Returns 0 without setting
+ * block->verify if there is no usable checkpoint or the file holds no data
+ * pages; returns an error if the file size is not a multiple of the
+ * allocation size or if any helper call fails.
+ */
+int
+__wt_block_verify_start(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
+{
+ WT_CKPT *ckpt;
+ wt_off_t size;
+
+ /*
+ * Find the last checkpoint in the list: if there are none, or the only
+ * checkpoint we have is fake, there's no work to do. Don't complain,
+ * that's not our problem to solve.
+ *
+ * The empty-bodied loop leaves ckpt on the list terminator; the second
+ * loop then walks backward until it finds a usable entry or reaches
+ * the start of the list.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ ;
+ for (;; --ckpt) {
+ if (ckpt->name != NULL && !F_ISSET(ckpt, WT_CKPT_FAKE))
+ break;
+ if (ckpt == ckptbase)
+ return (0);
+ }
+
+ /* Truncate the file to the size of the last checkpoint. */
+ WT_RET(__verify_last_truncate(session, block, ckpt));
+
+ /*
+ * We're done if the file has no data pages (this happens if we verify
+ * a file immediately after creation or the checkpoint doesn't reflect
+ * any of the data pages).
+ */
+ size = block->fh->size;
+ if (size <= block->allocsize)
+ return (0);
+
+ /* The file size should be a multiple of the allocation size. */
+ if (size % block->allocsize != 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "the file size is not a multiple of the allocation size");
+
+ /*
+ * Allocate a bit array, where each bit represents a single allocation
+ * size piece of the file (this is how we track the parts of the file
+ * we've verified, and check for multiply referenced or unreferenced
+ * blocks). Storing this on the heap seems reasonable, verifying a 1TB
+ * file with a 512B allocation size would require a 256MB bit array:
+ *
+ * (((1 * 2^40) / 512) / 8) = 256 * 2^20
+ *
+ * To verify larger files than we can handle in this way, we'd have to
+ * write parts of the bit array into a disk file.
+ *
+ * Alternatively, we could switch to maintaining ranges of the file as
+ * we do with the extents, but that has its own failure mode, where we
+ * verify many non-contiguous blocks creating too many entries on the
+ * list to fit into memory.
+ */
+ block->frags = (uint64_t)WT_wt_off_tO_FRAG(block, size);
+ WT_RET(__bit_alloc(session, block->frags, &block->fragfile));
+
+ /*
+ * We maintain an allocation list that is rolled forward through the
+ * set of checkpoints.
+ */
+ WT_RET(__wt_block_extlist_init(
+ session, &block->verify_alloc, "verify", "alloc", 0));
+
+ /*
+ * The only checkpoint avail list we care about is the last one written;
+ * get it now and initialize the list of file fragments.
+ */
+ WT_RET(__verify_last_avail(session, block, ckpt));
+
+ block->verify = 1;
+ return (0);
+}
+
+/*
+ * __verify_last_avail --
+ *	Read the avail list of the last checkpoint and mark each of its
+ * extents in the per-file fragment list.
+ */
+static int
+__verify_last_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+	WT_BLOCK_CKPT *ci, _ci;
+	WT_DECL_RET;
+	WT_EXT *ext;
+	WT_EXTLIST *avail;
+
+	ci = &_ci;
+	WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+	WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+
+	/* A checkpoint without an avail list has nothing to load. */
+	avail = &ci->avail;
+	if (avail->offset == WT_BLOCK_INVALID_OFFSET)
+		goto err;
+
+	WT_ERR(__wt_block_extlist_read_avail(
+	    session, block, avail, ci->file_size));
+
+	/* Stop at the first extent that can't be added. */
+	WT_EXT_FOREACH(ext, avail->off)
+		WT_ERR(__verify_filefrag_add(session, block,
+		    "avail-list chunk", ext->off, ext->size, 1));
+
+err:	__wt_block_ckpt_destroy(session, ci);
+	return (ret);
+}
+
+/*
+ * __verify_last_truncate --
+ *	Shrink the file to the size recorded in the last checkpoint.
+ */
+static int
+__verify_last_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+	WT_BLOCK_CKPT ckpt_info;
+	WT_DECL_RET;
+
+	WT_RET(__wt_block_ckpt_init(session, &ckpt_info, ckpt->name));
+
+	/* Only truncate if the checkpoint cookie could be unpacked. */
+	if ((ret = __wt_block_buffer_to_ckpt(
+	    session, block, ckpt->raw.data, &ckpt_info)) == 0)
+		ret = __wt_ftruncate(session, block->fh, ckpt_info.file_size);
+
+	/* The checkpoint structure is always torn down, pass or fail. */
+	__wt_block_ckpt_destroy(session, &ckpt_info);
+	return (ret);
+}
+
+/*
+ * __wt_block_verify_end --
+ *	End file verification: confirm every file block was verified, then
+ * release the verification state held on the block handle.
+ *
+ * Returns the result of the per-file fragment check; the cleanup below is
+ * performed regardless of that result.
+ */
+int
+__wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+	WT_DECL_RET;
+
+	/* Confirm we verified every file block. */
+	ret = __verify_filefrag_chk(session, block);
+
+	/* Discard the accumulated allocation list. */
+	__wt_block_extlist_free(session, &block->verify_alloc);
+
+	/*
+	 * Discard the fragment tracking lists. Reset the fragment count along
+	 * with them: the file-fragment check treats a zero count as "there is
+	 * no bit list to examine", so a stale count must not outlive the list
+	 * it describes.
+	 */
+	__wt_free(session, block->fragfile);
+	__wt_free(session, block->fragckpt);
+	block->frags = 0;
+
+	block->verify = 0;
+	return (ret);
+}
+
+/*
+ * __wt_verify_ckpt_load --
+ * Verify work done when a checkpoint is loaded.
+ *
+ * Records the checkpoint's file size as the limit for blocks referenced by
+ * this checkpoint, marks the root page and the blocks holding the alloc,
+ * avail and discard lists as seen in the per-file fragment list, rolls the
+ * checkpoint's alloc and discard lists into block->verify_alloc, and builds
+ * the per-checkpoint bit map from the result. Returns 0 on success or the
+ * first error returned by a helper call.
+ */
+int
+__wt_verify_ckpt_load(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci)
+{
+ WT_EXTLIST *el;
+ WT_EXT *ext;
+ uint64_t frag, frags;
+
+ /* Set the maximum file size for this checkpoint. */
+ block->verify_size = ci->file_size;
+
+ /*
+ * Add the root page and disk blocks used to store the extent lists to
+ * the list of blocks we've "seen" from the file.
+ */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "checkpoint",
+ ci->root_offset, (wt_off_t)ci->root_size, 1));
+ if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "alloc list",
+ ci->alloc.offset, (wt_off_t)ci->alloc.size, 1));
+ if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "avail list",
+ ci->avail.offset, (wt_off_t)ci->avail.size, 1));
+ if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "discard list",
+ ci->discard.offset, (wt_off_t)ci->discard.size, 1));
+
+ /*
+ * Checkpoint verification is similar to deleting checkpoints. As we
+ * read each new checkpoint, we merge the allocation lists (accumulating
+ * all allocated pages as we move through the system), and then remove
+ * any pages found in the discard list. The result should be a
+ * one-to-one mapping to the pages we find in this specific checkpoint.
+ */
+ el = &ci->alloc;
+ if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_RET(__wt_block_extlist_read(
+ session, block, el, ci->file_size));
+ WT_RET(__wt_block_extlist_merge(
+ session, el, &block->verify_alloc));
+ __wt_block_extlist_free(session, el);
+ }
+ el = &ci->discard;
+ if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_RET(__wt_block_extlist_read(
+ session, block, el, ci->file_size));
+ WT_EXT_FOREACH(ext, el->off)
+ WT_RET(__wt_block_off_remove_overlap(session,
+ &block->verify_alloc, ext->off, ext->size));
+ __wt_block_extlist_free(session, el);
+ }
+
+ /*
+ * The root page of the checkpoint appears on the alloc list, but not,
+ * at least until the checkpoint is deleted, on a discard list. To
+ * handle this case, remove the root page from the accumulated list of
+ * checkpoint pages, so it doesn't add a new requirement for subsequent
+ * checkpoints.
+ */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_block_off_remove_overlap(session,
+ &block->verify_alloc, ci->root_offset, ci->root_size));
+
+ /*
+ * Allocate the per-checkpoint bit map. The per-checkpoint bit map is
+ * the opposite of the per-file bit map, that is, we set all the bits
+ * that we expect to be set based on the checkpoint's allocation and
+ * discard lists, then clear bits as we verify blocks. When finished
+ * verifying the checkpoint, the bit list should be empty.
+ *
+ * Each extent is converted to a starting fragment and a fragment
+ * count; the range set is inclusive of its last fragment.
+ */
+ WT_RET(__bit_alloc(session, block->frags, &block->fragckpt));
+ el = &block->verify_alloc;
+ WT_EXT_FOREACH(ext, el->off) {
+ frag = (uint64_t)WT_wt_off_tO_FRAG(block, ext->off);
+ frags = (uint64_t)(ext->size / block->allocsize);
+ __bit_nset(block->fragckpt, frag, frag + (frags - 1));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_verify_ckpt_unload --
+ *	Finish verifying a checkpoint as it is unloaded: check that every
+ * expected block was visited, then drop the per-checkpoint bit list.
+ */
+int
+__wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+	int chk_ret;
+
+	/* Run the check first, it needs the bit list freed below. */
+	chk_ret = __verify_ckptfrag_chk(session, block);
+
+	__wt_free(session, block->fragckpt);
+
+	return (chk_ret);
+}
+
+/*
+ * __wt_block_verify_addr --
+ *	Record the block named by an address cookie as verified, in both the
+ * per-file and the per-checkpoint fragment lists.
+ */
+int
+__wt_block_verify_addr(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
+{
+	wt_off_t blk_off;
+	uint32_t blk_cksum, blk_size;
+
+	WT_UNUSED(addr_size);
+
+	/* Unpack the cookie. */
+	WT_RET(__wt_block_buffer_to_addr(
+	    block, addr, &blk_off, &blk_size, &blk_cksum));
+
+	/*
+	 * Per-file list: duplicates are allowed here, the same btree blocks
+	 * are expected to show up in more than one checkpoint.
+	 */
+	WT_RET(__verify_filefrag_add(
+	    session, block, NULL, blk_off, blk_size, 0));
+
+	/*
+	 * Per-checkpoint list: a page can't simply be flagged "verified" when
+	 * it is read, because a page may be read several times while a single
+	 * checkpoint is verified (for example, overflow keys on a row-store
+	 * leaf page are read once for the key sort check and again for the
+	 * general overflow item check). The btree verification code calls
+	 * this function once per logical visit in a checkpoint, which is what
+	 * lets us detect a page referenced more than once within a single
+	 * checkpoint.
+	 */
+	return (__verify_ckptfrag_add(session, block, blk_off, blk_size));
+}
+
+/*
+ * __verify_filefrag_add --
+ *	Mark a chunk of the file as seen in the per-file fragment list; if
+ * nodup is set, fail when any part of the chunk was already marked.
+ */
+static int
+__verify_filefrag_add(WT_SESSION_IMPL *session, WT_BLOCK *block,
+    const char *type, wt_off_t offset, wt_off_t size, int nodup)
+{
+	uint64_t bit, first_frag, nfrags, seen;
+	const char *label, *lparen, *rparen;
+
+	/* The optional type is printed in parentheses after "block". */
+	if (type == NULL)
+		label = lparen = rparen = "";
+	else {
+		lparen = " (";
+		label = type;
+		rparen = ")";
+	}
+	WT_RET(__wt_verbose(session, WT_VERB_VERIFY,
+	    "add file block%s%s%s at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
+	    lparen, label, rparen,
+	    (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size));
+
+	/* The chunk has to lie inside the file. */
+	if (offset + size > block->fh->size)
+		WT_RET_MSG(session, WT_ERROR,
+		    "fragment %" PRIuMAX "-%" PRIuMAX " references "
+		    "non-existent file blocks",
+		    (uintmax_t)offset, (uintmax_t)(offset + size));
+
+	first_frag = (uint64_t)WT_wt_off_tO_FRAG(block, offset);
+	nfrags = (uint64_t)(size / block->allocsize);
+
+	/* When duplicates are illegal, no fragment may already be set. */
+	if (nodup) {
+		bit = first_frag;
+		for (seen = 0; seen < nfrags; ++seen, ++bit)
+			if (__bit_test(block->fragfile, bit))
+				WT_RET_MSG(session, WT_ERROR,
+				    "file fragment at %" PRIuMAX " referenced "
+				    "multiple times",
+				    (uintmax_t)offset);
+	}
+
+	/* Set the whole (inclusive) fragment range. */
+	__bit_nset(block->fragfile, first_frag, first_frag + (nfrags - 1));
+
+	return (0);
+}
+
+/*
+ * __verify_filefrag_chk --
+ * Verify we've checked all the fragments in the file.
+ *
+ * Returns 0 if block->frags is 0 or if no unverified range is found other
+ * than a trailing run at the end of the file; otherwise reports the number
+ * of unverified ranges and returns WT_ERROR. The individual ranges are only
+ * reported when WT_VERB_VERIFY is set. The bit list is modified as it is
+ * scanned: clear bits are set as they are found.
+ */
+static int
+__verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ uint64_t count, first, last;
+
+ /* If there's nothing to verify, it was a fast run. */
+ if (block->frags == 0)
+ return (0);
+
+ /*
+ * It's OK if we have not verified blocks at the end of the file: that
+ * happens if the file is truncated during a checkpoint or load or was
+ * extended after writing a checkpoint. We should never see unverified
+ * blocks anywhere else, though.
+ *
+ * I'm deliberately testing for a last fragment of 0, it makes no sense
+ * there would be no fragments verified, complain if the first fragment
+ * in the file wasn't verified.
+ */
+ for (last = block->frags - 1; last != 0; --last) {
+ if (__bit_test(block->fragfile, last))
+ break;
+ __bit_set(block->fragfile, last);
+ }
+
+ /*
+ * Check for any other file fragments we haven't verified -- every time
+ * we find a bit that's clear, complain. We re-start the search each
+ * time after setting the clear bit(s) we found: it's simpler and this
+ * isn't supposed to happen a lot.
+ *
+ * On exit from the inner loop, last is one past the final clear bit of
+ * the run, so the reported range ends at the offset of the fragment
+ * following the run.
+ */
+ for (count = 0;; ++count) {
+ if (__bit_ffc(block->fragfile, block->frags, &first) != 0)
+ break;
+ __bit_set(block->fragfile, first);
+ for (last = first + 1; last < block->frags; ++last) {
+ if (__bit_test(block->fragfile, last))
+ break;
+ __bit_set(block->fragfile, last);
+ }
+
+ if (!WT_VERBOSE_ISSET(session, WT_VERB_VERIFY))
+ continue;
+
+ __wt_errx(session,
+ "file range %" PRIuMAX "-%" PRIuMAX " never verified",
+ (uintmax_t)WT_FRAG_TO_OFF(block, first),
+ (uintmax_t)WT_FRAG_TO_OFF(block, last));
+ }
+ if (count == 0)
+ return (0);
+
+ __wt_errx(session, "file ranges never verified: %" PRIu64, count);
+ return (WT_ERROR);
+}
+
+/*
+ * __verify_ckptfrag_add --
+ *	Clear a chunk's fragments in the per-checkpoint fragment list, failing
+ * if any of them is not currently set.
+ */
+static int
+__verify_ckptfrag_add(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
+{
+	uint64_t bit, first_frag, nfrags, seen;
+
+	WT_RET(__wt_verbose(session, WT_VERB_VERIFY,
+	    "add checkpoint block at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
+	    (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size));
+
+	/*
+	 * A checkpoint must never reference a block beyond the file size
+	 * stored with the checkpoint.
+	 */
+	if (offset + size > block->verify_size)
+		WT_RET_MSG(session, WT_ERROR,
+		    "fragment %" PRIuMAX "-%" PRIuMAX " references "
+		    "file blocks outside the checkpoint",
+		    (uintmax_t)offset, (uintmax_t)(offset + size));
+
+	first_frag = (uint64_t)WT_wt_off_tO_FRAG(block, offset);
+	nfrags = (uint64_t)(size / block->allocsize);
+
+	/*
+	 * Every fragment must still be set: a clear bit means the chunk was
+	 * either already visited in this checkpoint or never expected.
+	 */
+	bit = first_frag;
+	for (seen = 0; seen < nfrags; ++seen, ++bit) {
+		if (__bit_test(block->fragckpt, bit))
+			continue;
+		WT_RET_MSG(session, WT_ERROR,
+		    "fragment at %" PRIuMAX " referenced multiple "
+		    "times in a single checkpoint or found in the "
+		    "checkpoint but not listed in the checkpoint's "
+		    "allocation list",
+		    (uintmax_t)offset);
+	}
+
+	/* Clear the whole (inclusive) fragment range. */
+	__bit_nclr(block->fragckpt, first_frag, first_frag + (nfrags - 1));
+
+	return (0);
+}
+
+/*
+ * __verify_ckptfrag_chk --
+ * Verify we've checked all the fragments in the checkpoint.
+ *
+ * Returns 0 if there is no per-checkpoint bit list or if no bit in it is
+ * still set; otherwise reports the number of unverified ranges and returns
+ * WT_ERROR. The individual ranges are only reported when WT_VERB_VERIFY is
+ * set. The bit list is modified as it is scanned: set bits are cleared as
+ * they are found.
+ */
+static int
+__verify_ckptfrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ uint64_t count, first, last;
+
+ /*
+ * The checkpoint fragment memory is only allocated as a checkpoint
+ * is successfully loaded; don't check if there's nothing there.
+ */
+ if (block->fragckpt == NULL)
+ return (0);
+
+ /*
+ * Check for checkpoint fragments we haven't verified -- every time we
+ * find a bit that's set, complain. We re-start the search each time
+ * after clearing the set bit(s) we found: it's simpler and this isn't
+ * supposed to happen a lot.
+ *
+ * On exit from the inner loop, last is one past the final set bit of
+ * the run, so the reported range ends at the offset of the fragment
+ * following the run.
+ */
+ for (count = 0;; ++count) {
+ if (__bit_ffs(block->fragckpt, block->frags, &first) != 0)
+ break;
+ __bit_clear(block->fragckpt, first);
+ for (last = first + 1; last < block->frags; ++last) {
+ if (!__bit_test(block->fragckpt, last))
+ break;
+ __bit_clear(block->fragckpt, last);
+ }
+
+ if (!WT_VERBOSE_ISSET(session, WT_VERB_VERIFY))
+ continue;
+
+ __wt_errx(session,
+ "checkpoint range %" PRIuMAX "-%" PRIuMAX " never verified",
+ (uintmax_t)WT_FRAG_TO_OFF(block, first),
+ (uintmax_t)WT_FRAG_TO_OFF(block, last));
+ }
+
+ if (count == 0)
+ return (0);
+
+ __wt_errx(session,
+ "checkpoint ranges never verified: %" PRIu64, count);
+ return (WT_ERROR);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
new file mode 100644
index 00000000000..0da6380e61f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -0,0 +1,269 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_header --
+ *	Report how many bytes the block manager's own header occupies.
+ */
+u_int
+__wt_block_header(WT_BLOCK *block)
+{
+	u_int header_len;
+
+	/* The size is a constant, the handle isn't consulted. */
+	WT_UNUSED(block);
+
+	header_len = (u_int)WT_BLOCK_HEADER_SIZE;
+	return (header_len);
+}
+
+/*
+ * __wt_block_write_size --
+ *	Convert a payload size into the buffer size needed to write it as a
+ * block; the result is stored back through sizep.
+ */
+int
+__wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep)
+{
+	size_t padded;
+
+	WT_UNUSED(session);
+
+	/* Add room for the header, then round up to an allocation unit. */
+	padded = (size_t)
+	    WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize);
+	*sizep = padded;
+
+	/*
+	 * The page size is stored in the block header as a 4B unsigned value,
+	 * so the engine can accept an item that can't be written: a huge
+	 * key/value combined with a large allocation size overflows 4B once
+	 * the write is aligned. Storing the size in allocation-size units
+	 * would fix that, but 4GB objects in a btree make no sense, so the
+	 * limit is simply (4GB - 1KB), which also leaves room for potential
+	 * mode bits.
+	 */
+	if (padded > UINT32_MAX - 1024)
+		return (EINVAL);
+	return (0);
+}
+
+/*
+ * __wt_block_write --
+ *	Write a buffer as a block and hand back the address cookie (and its
+ * length) that references the block.
+ */
+int
+__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
+    WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum)
+{
+	wt_off_t blk_off;
+	uint32_t blk_cksum, blk_size;
+	uint8_t *cookie_end;
+
+	/* Write the block; the block lock is not held by this caller. */
+	WT_RET(__wt_block_write_off(session, block,
+	    buf, &blk_off, &blk_size, &blk_cksum, data_cksum, 0));
+
+	/* Pack the location into the cookie, its length is what was used. */
+	cookie_end = addr;
+	WT_RET(__wt_block_addr_to_buffer(
+	    block, &cookie_end, blk_off, blk_size, blk_cksum));
+	*addr_sizep = WT_PTRDIFF(cookie_end, addr);
+
+	return (0);
+}
+
+/*
+ * __wt_block_write_off --
+ * Write a buffer into a block, returning the block's offset, size and
+ * checksum.
+ *
+ * buf must have WT_ITEM_ALIGNED set and enough memory to be zero-padded to
+ * the next allocation-size boundary, otherwise EINVAL is returned. If
+ * data_cksum is set the whole aligned buffer is checksummed, otherwise only
+ * the leading WT_BLOCK_COMPRESS_SKIP bytes are. If caller_locked is zero,
+ * block->live_lock is acquired here around the allocation; if it is
+ * non-zero the lock is not taken here (the caller is expected to hold it).
+ * If the write itself fails, the allocated range is freed again before the
+ * error is returned. *offsetp, *sizep and *cksump are only set on success.
+ */
+int
+__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
+ int data_cksum, int caller_locked)
+{
+ WT_BLOCK_HEADER *blk;
+ WT_DECL_RET;
+ WT_FH *fh;
+ size_t align_size;
+ wt_off_t offset;
+ int local_locked;
+
+ blk = WT_BLOCK_HEADER_REF(buf->mem);
+ fh = block->fh;
+ local_locked = 0;
+
+ /* Buffers should be aligned for writing. */
+ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
+ WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
+ WT_RET_MSG(session, EINVAL,
+ "direct I/O check: write buffer incorrectly allocated");
+ }
+
+ /*
+ * Align the size to an allocation unit.
+ *
+ * The buffer must be big enough for us to zero to the next allocsize
+ * boundary, this is one of the reasons the btree layer must find out
+ * from the block-manager layer the maximum size of the eventual write.
+ */
+ align_size = WT_ALIGN(buf->size, block->allocsize);
+ if (align_size > buf->memsize) {
+ WT_ASSERT(session, align_size <= buf->memsize);
+ WT_RET_MSG(session, EINVAL,
+ "buffer size check: write buffer incorrectly allocated");
+ }
+ if (align_size > UINT32_MAX) {
+ WT_ASSERT(session, align_size <= UINT32_MAX);
+ WT_RET_MSG(session, EINVAL,
+ "buffer size check: write buffer too large to write");
+ }
+
+ /* Zero out any unused bytes at the end of the buffer. */
+ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);
+
+ /*
+ * Set the disk size so we don't have to incrementally read blocks
+ * during salvage.
+ */
+ blk->disk_size = WT_STORE_SIZE(align_size);
+
+ /*
+ * Update the block's checksum: if our caller specifies, checksum the
+ * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
+ * bytes. The assumption is applications with good compression support
+ * turn off checksums and assume corrupted blocks won't decompress
+ * correctly. However, if compression failed to shrink the block, the
+ * block wasn't compressed, in which case our caller will tell us to
+ * checksum the data to detect corruption. If compression succeeded,
+ * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
+ * because they're not compressed, both to give salvage a quick test
+ * of whether a block is useful and to give us a test so we don't lose
+ * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
+ *
+ * The checksum field is zeroed first because it lies inside the bytes
+ * being checksummed.
+ */
+ blk->flags = 0;
+ if (data_cksum)
+ F_SET(blk, WT_BLOCK_DATA_CKSUM);
+ blk->cksum = 0;
+ blk->cksum = __wt_cksum(
+ buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
+
+ if (!caller_locked) {
+ WT_RET(__wt_block_ext_prealloc(session, 5));
+ __wt_spin_lock(session, &block->live_lock);
+ local_locked = 1;
+ }
+ ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
+
+ /*
+ * Extend the file in chunks. We want to limit the number of threads
+ * extending the file at the same time, so choose the one thread that's
+ * crossing the extended boundary. We don't extend newly created files,
+ * and it's theoretically possible we might wait so long our extension
+ * of the file is passed by another thread writing single blocks, that's
+ * why there's a check in case the extended file size becomes too small:
+ * if the file size catches up, every thread tries to extend it.
+ *
+ * File extension may require locking: some variants of the system call
+ * used to extend the file initialize the extended space. If a writing
+ * thread races with the extending thread, the extending thread might
+ * overwrite already written data, and that would be very, very bad.
+ *
+ * Some variants of the system call to extend the file fail at run-time
+ * based on the filesystem type, fall back to ftruncate in that case,
+ * and remember that ftruncate requires locking.
+ */
+ if (ret == 0 &&
+ fh->extend_len != 0 &&
+ (fh->extend_size <= fh->size ||
+ (offset + fh->extend_len <= fh->extend_size &&
+ offset +
+ fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) {
+ fh->extend_size = offset + fh->extend_len * 2;
+ if (fh->fallocate_available) {
+ /*
+ * Release any locally acquired lock if it's not needed
+ * to extend the file, extending the file might require
+ * updating file metadata, which can be slow. (It may be
+ * a bad idea to configure for file extension on systems
+ * that require locking over the extend call.)
+ */
+ if (!fh->fallocate_requires_locking && local_locked) {
+ __wt_spin_unlock(session, &block->live_lock);
+ local_locked = 0;
+ }
+
+ /* Extend the file. */
+ if ((ret = __wt_fallocate(session,
+ fh, offset, fh->extend_len * 2)) == ENOTSUP) {
+ ret = 0;
+ goto extend_truncate;
+ }
+ } else {
+extend_truncate: /*
+ * We may have a caller lock or a locally acquired lock,
+ * but we need a lock to call ftruncate.
+ */
+ if (!caller_locked && local_locked == 0) {
+ __wt_spin_lock(session, &block->live_lock);
+ local_locked = 1;
+ }
+ /*
+ * The truncate might fail if there's a file mapping
+ * (if there's an open checkpoint on the file), that's
+ * OK.
+ */
+ if ((ret = __wt_ftruncate(
+ session, fh, offset + fh->extend_len * 2)) == EBUSY)
+ ret = 0;
+ }
+ }
+ /* Release any locally acquired lock. */
+ if (local_locked) {
+ __wt_spin_unlock(session, &block->live_lock);
+ local_locked = 0;
+ }
+ WT_RET(ret);
+
+ /* Write the block. */
+ if ((ret =
+ __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
+ if (!caller_locked)
+ __wt_spin_lock(session, &block->live_lock);
+ WT_TRET(__wt_block_off_free(
+ session, block, offset, (wt_off_t)align_size));
+ if (!caller_locked)
+ __wt_spin_unlock(session, &block->live_lock);
+ WT_RET(ret);
+ }
+
+#ifdef HAVE_SYNC_FILE_RANGE
+ /*
+ * Optionally schedule writes for dirty pages in the system buffer
+ * cache, but only if the current session can wait.
+ */
+ if (block->os_cache_dirty_max != 0 &&
+ (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
+ __wt_session_can_wait(session)) {
+ block->os_cache_dirty = 0;
+ WT_RET(__wt_fsync_async(session, fh));
+ }
+#endif
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += align_size) > block->os_cache_max) {
+ block->os_cache = 0;
+ if ((ret = posix_fadvise(fh->fd,
+ (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
+ WT_STAT_FAST_CONN_INCR(session, block_write);
+ WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);
+
+ WT_RET(__wt_verbose(session, WT_VERB_WRITE,
+ "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
+ (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));
+
+ *offsetp = offset;
+ *sizep = WT_STORE_SIZE(align_size);
+ *cksump = blk->cksum;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c
new file mode 100644
index 00000000000..8c8c8bc723e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/bloom/bloom.c
@@ -0,0 +1,351 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define WT_BLOOM_TABLE_CONFIG "key_format=r,value_format=1t,exclusive=true"
+
+/*
+ * __bloom_init --
+ *	Allocate a WT_BLOOM handle and fill in its URI, table configuration
+ * string and session; *bloomp is NULL on failure.
+ */
+static int
+__bloom_init(WT_SESSION_IMPL *session,
+    const char *uri, const char *config, WT_BLOOM **bloomp)
+{
+	WT_BLOOM *handle;
+	WT_DECL_RET;
+	const char *user_config;
+	size_t config_len;
+
+	*bloomp = NULL;
+
+	WT_RET(__wt_calloc_def(session, 1, &handle));
+
+	WT_ERR(__wt_strdup(session, uri, &handle->uri));
+
+	/*
+	 * The configuration is "<user config>,<standard config>": the
+	 * standard settings come last so they override the user's. The two
+	 * extra bytes are the separating comma and the trailing nul.
+	 */
+	user_config = config == NULL ? "" : config;
+	config_len =
+	    strlen(user_config) + strlen(WT_BLOOM_TABLE_CONFIG) + 2;
+	WT_ERR(__wt_calloc_def(session, config_len, &handle->config));
+	(void)snprintf(handle->config, config_len,
+	    "%s,%s", user_config, WT_BLOOM_TABLE_CONFIG);
+
+	handle->session = session;
+
+	*bloomp = handle;
+	return (0);
+
+err:	__wt_free(session, handle->uri);
+	__wt_free(session, handle->config);
+	__wt_free(session, handle->bitstring);
+	__wt_free(session, handle);
+	return (ret);
+}
+
+/*
+ * __bloom_setup --
+ *	Populate the bloom structure.
+ *
+ * Setup is passed in either the count of items expected (n), or the length of
+ * the bitstring (m). Depends on whether the function is called via create or
+ * open: a non-zero n takes precedence and m is derived from it, otherwise m
+ * is used and n is derived from it.
+ *
+ * Returns EINVAL if k is less than 2 or if factor is 0, and 0 otherwise.
+ */
+static int
+__bloom_setup(
+    WT_BLOOM *bloom, uint64_t n, uint64_t m, uint32_t factor, uint32_t k)
+{
+	/*
+	 * A zero factor (bits per item) can never describe a usable filter:
+	 * it would divide by zero when deriving n from m below, and deriving
+	 * m from n would give a zero-length filter, which the "hash % m"
+	 * done on insert and lookup cannot index.
+	 */
+	if (k < 2 || factor == 0)
+		return (EINVAL);
+
+	bloom->k = k;
+	bloom->factor = factor;
+	if (n != 0) {
+		bloom->n = n;
+		bloom->m = bloom->n * bloom->factor;
+	} else {
+		bloom->m = m;
+		bloom->n = bloom->m / bloom->factor;
+	}
+	return (0);
+}
+
+/*
+ * __wt_bloom_create --
+ *
+ * Build a new WT_BLOOM handle and allocate the in-memory bitstring that is
+ * filled while the filter is being populated.
+ *
+ * count - the number of items expected to be inserted
+ * factor - the number of bits used per inserted item
+ * k - the number of hash values set or tested per item
+ */
+int
+__wt_bloom_create(
+    WT_SESSION_IMPL *session, const char *uri, const char *config,
+    uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp)
+{
+	WT_BLOOM *bloom;
+	WT_DECL_RET;
+
+	WT_RET(__bloom_init(session, uri, config, &bloom));
+
+	/* Size the filter, then allocate its bits; undo it all on failure. */
+	if ((ret = __bloom_setup(bloom, count, 0, factor, k)) != 0 ||
+	    (ret = __bit_alloc(session, bloom->m, &bloom->bitstring)) != 0) {
+		(void)__wt_bloom_close(bloom);
+		return (ret);
+	}
+
+	*bloomp = bloom;
+	return (0);
+}
+
+/*
+ * __bloom_open_cursor --
+ *	Open the cursor used to read a Bloom filter, unless the handle
+ * already has one.
+ */
+static int
+__bloom_open_cursor(WT_BLOOM *bloom, WT_CURSOR *owner)
+{
+	WT_CURSOR *cursor;
+	WT_SESSION_IMPL *session;
+	const char *cfg[3];
+
+	/* Nothing to do if a cursor is already cached on the handle. */
+	if (bloom->c != NULL)
+		return (0);
+
+	session = bloom->session;
+	cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+	cfg[1] = bloom->config;
+	cfg[2] = NULL;
+
+	cursor = NULL;
+	WT_RET(__wt_open_cursor(session, bloom->uri, owner, cfg, &cursor));
+
+	/* XXX Layering violation: bump the cache priority for Bloom filters. */
+	((WT_CURSOR_BTREE *)cursor)->btree->evict_priority = WT_EVICT_INT_SKEW;
+
+	bloom->c = cursor;
+	return (0);
+}
+
+/*
+ * __wt_bloom_open --
+ *	Open an existing (created and finalized) Bloom filter for use by a
+ * single session.
+ */
+int
+__wt_bloom_open(WT_SESSION_IMPL *session,
+    const char *uri, uint32_t factor, uint32_t k,
+    WT_CURSOR *owner, WT_BLOOM **bloomp)
+{
+	WT_BLOOM *bloom;
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	uint64_t last_key;
+
+	WT_RET(__bloom_init(session, uri, NULL, &bloom));
+	WT_ERR(__bloom_open_cursor(bloom, owner));
+	cursor = bloom->c;
+
+	/* The largest key in the table gives the size of the filter. */
+	WT_ERR(cursor->prev(cursor));
+	WT_ERR(cursor->get_key(cursor, &last_key));
+	WT_ERR(cursor->reset(cursor));
+
+	WT_ERR(__bloom_setup(bloom, 0, last_key, factor, k));
+
+	*bloomp = bloom;
+	return (0);
+
+err:	(void)__wt_bloom_close(bloom);
+	return (ret);
+}
+
+/*
+ * __wt_bloom_insert --
+ *	Add a key to the in-memory Bloom filter by setting k bits derived
+ * from two hashes of the key.
+ */
+int
+__wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key)
+{
+	uint64_t hash, step;
+	uint32_t remaining;
+
+	hash = __wt_hash_fnv64(key->data, key->size);
+	step = __wt_hash_city64(key->data, key->size);
+
+	/* Each successive bit position is the previous hash plus the step. */
+	for (remaining = bloom->k; remaining > 0; --remaining) {
+		__bit_set(bloom->bitstring, hash % bloom->m);
+		hash += step;
+	}
+	return (0);
+}
+
+/*
+ * __wt_bloom_finalize --
+ *	Write the in-memory Bloom filter into its table and release the
+ * in-memory bitstring; afterwards the filter can only be read.
+ */
+int
+__wt_bloom_finalize(WT_BLOOM *bloom)
+{
+	WT_CURSOR *c;
+	WT_DECL_RET;
+	WT_ITEM chunk;
+	WT_SESSION *wt_session;
+	uint64_t bit_off;
+
+	wt_session = (WT_SESSION *)bloom->session;
+	WT_CLEAR(chunk);
+
+	/*
+	 * Create a bit table to store the bloom filter in.
+	 * TODO: should this call __wt_schema_create directly?
+	 */
+	WT_RET(wt_session->create(wt_session, bloom->uri, bloom->config));
+	WT_RET(wt_session->open_cursor(
+	    wt_session, bloom->uri, NULL, "bulk=bitmap", &c));
+
+	/* Load the bitstring into the table, one chunk per insert. */
+	bit_off = 0;
+	while (bit_off < bloom->m) {
+		/* Convert the bit offset to a byte offset into the string. */
+		chunk.data = bloom->bitstring + (bit_off >> 3);
+		/*
+		 * Stay a little short of the maximum size out of paranoia, in
+		 * case WiredTiger reserves some special sizes. The limit is
+		 * chosen so that a following insert starts on a byte
+		 * boundary.
+		 */
+		chunk.size =
+		    (uint32_t)WT_MIN(bloom->m - bit_off, UINT32_MAX - 127);
+		c->set_value(c, &chunk);
+		WT_ERR(c->insert(c));
+		bit_off += chunk.size;
+	}
+
+err:	WT_TRET(c->close(c));
+	__wt_free(bloom->session, bloom->bitstring);
+	bloom->bitstring = NULL;
+
+	return (ret);
+}
+
+/*
+ * __wt_bloom_hash --
+ *	Compute the pair of hash values that identify a key to the filter.
+ */
+int
+__wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash)
+{
+	uint64_t fnv, city;
+
+	/* The hashes depend only on the key, not on the filter. */
+	WT_UNUSED(bloom);
+
+	fnv = __wt_hash_fnv64(key->data, key->size);
+	city = __wt_hash_city64(key->data, key->size);
+
+	bhash->h1 = fnv;
+	bhash->h2 = city;
+	return (0);
+}
+
+/*
+ * __wt_bloom_hash_get --
+ *	Look up a key, given as its hash signature, in a finalized Bloom
+ * filter. Returns zero if found, WT_NOTFOUND if not.
+ */
+int
+__wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash)
+{
+	WT_CURSOR *c;
+	WT_DECL_RET;
+	uint64_t hash, step;
+	uint32_t probe;
+	uint8_t bit;
+	int result;
+
+	/* Get operations are only supported by finalized bloom filters. */
+	WT_ASSERT(bloom->session, bloom->bitstring == NULL);
+
+	/* The cursor is created the first time through. */
+	WT_ERR(__bloom_open_cursor(bloom, NULL));
+	c = bloom->c;
+
+	result = 0;
+	hash = bhash->h1;
+	step = bhash->h2;
+	for (probe = 0; probe < bloom->k; ++probe) {
+		/*
+		 * WiredTiger tables are 1 based while the original bitstring
+		 * array was 0 based, hence the key is the bit position plus 1.
+		 */
+		c->set_key(c, (hash % bloom->m) + 1);
+		WT_ERR(c->search(c));
+		WT_ERR(c->get_value(c, &bit));
+
+		/* A single clear bit means the key was never inserted. */
+		if (bit == 0) {
+			result = WT_NOTFOUND;
+			break;
+		}
+		hash += step;
+	}
+	WT_ERR(c->reset(c));
+	return (result);
+
+err:	/* Don't return WT_NOTFOUND from a failed search. */
+	if (ret == WT_NOTFOUND)
+		ret = WT_ERROR;
+	__wt_err(bloom->session, ret, "Failed lookup in bloom filter.");
+	return (ret);
+}
+
+/*
+ * __wt_bloom_get --
+ *	Look up a key in the Bloom filter: hash it, then test the hash.
+ * Returns zero if found, WT_NOTFOUND if not.
+ */
+int
+__wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key)
+{
+	WT_BLOOM_HASH signature;
+	int hash_ret;
+
+	if ((hash_ret = __wt_bloom_hash(bloom, key, &signature)) != 0)
+		return (hash_ret);
+
+	return (__wt_bloom_hash_get(bloom, &signature));
+}
+
+/*
+ * __wt_bloom_close --
+ *	Close a Bloom filter handle: close its cursor, if any, and free the
+ * handle along with everything it owns.
+ */
+int
+__wt_bloom_close(WT_BLOOM *bloom)
+{
+	WT_SESSION_IMPL *session;
+	int close_ret;
+
+	session = bloom->session;
+
+	/* Only the cursor close can fail; the memory is freed regardless. */
+	close_ret = bloom->c == NULL ? 0 : bloom->c->close(bloom->c);
+
+	__wt_free(session, bloom->uri);
+	__wt_free(session, bloom->config);
+	__wt_free(session, bloom->bitstring);
+	__wt_free(session, bloom);
+
+	return (close_ret);
+}
+
+/*
+ * __wt_bloom_drop --
+ *	Drop the table backing a Bloom filter and close the handle.
+ */
+int
+__wt_bloom_drop(WT_BLOOM *bloom, const char *config)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	WT_SESSION *wt_session;
+
+	wt_session = (WT_SESSION *)bloom->session;
+
+	/*
+	 * Close any open cursor before the drop, and clear it on the handle
+	 * so the close of the handle below doesn't close it a second time.
+	 */
+	if ((cursor = bloom->c) != NULL) {
+		ret = cursor->close(cursor);
+		bloom->c = NULL;
+	}
+
+	/* Both steps are attempted; the first error is the one kept. */
+	WT_TRET(wt_session->drop(wt_session, bloom->uri, config));
+	WT_TRET(__wt_bloom_close(bloom));
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
new file mode 100644
index 00000000000..e81c951e9f6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __compact_rewrite --
+ * Return if a page needs to be re-written.
+ *
+ * The answer is returned in *skipp, which defaults to 1 (skip the page)
+ * and is otherwise filled in by the block manager's compact_page_skip
+ * method. The root page, pages that are already dirty and pages with no
+ * address are always skipped. The function's own return value is zero
+ * or an error passed back from the address lookup or the block manager.
+ */
+static int
+__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ *skipp = 1; /* Default skip. */
+
+ bm = S2BT(session)->bm;
+ page = ref->page;
+ mod = page->modify; /* May be NULL, checked below. */
+
+ /*
+ * Ignore the root: it may not have a replacement address, and besides,
+ * if anything else gets written, so will it.
+ */
+ if (__wt_ref_is_root(ref))
+ return (0);
+
+ /* Ignore currently dirty pages, they will be written regardless. */
+ if (__wt_page_is_modified(page))
+ return (0);
+
+ /*
+ * If the page is clean, test the original addresses.
+ * If the page is a 1-to-1 replacement, test the replacement addresses.
+ * Ignore empty pages, they get merged into the parent.
+ */
+ if (mod == NULL || F_ISSET(mod, WT_PM_REC_MASK) == 0) {
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ if (addr == NULL) /* No address to test: keep the default. */
+ return (0);
+ WT_RET(
+ bm->compact_page_skip(bm, session, addr, addr_size, skipp));
+ } else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) {
+ /*
+ * The page's modification information can change underfoot if
+ * the page is being reconciled, lock the page down.
+ */
+ WT_PAGE_LOCK(session, page);
+ ret = bm->compact_page_skip(bm, session,
+ mod->mod_replace.addr, mod->mod_replace.size, skipp);
+ WT_PAGE_UNLOCK(session, page);
+ WT_RET(ret); /* Checked only after the page lock is dropped. */
+ } /* Any other reconciliation result keeps the default skip. */
+ return (0);
+}
+
+/*
+ * __wt_compact --
+ * Compact a file.
+ *
+ * Walks the session's tree and marks dirty every page the block manager
+ * reports as worth rewriting; this function itself only marks pages, it
+ * does not write them. The cfg argument is unused. The caller must
+ * hold the schema lock (asserted below). Returns zero immediately if
+ * the block manager says compaction should be skipped.
+ */
+int
+__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_REF *ref;
+ int block_manager_begin, skip;
+
+ WT_UNUSED(cfg);
+
+ conn = S2C(session);
+ btree = S2BT(session);
+ bm = btree->bm;
+ ref = NULL;
+ block_manager_begin = 0; /* Set once compact_start has succeeded. */
+
+ WT_STAT_FAST_DATA_INCR(session, session_compact);
+
+ /*
+ * Check if compaction might be useful -- the API layer will quit trying
+ * to compact the data source if we make no progress, set a flag if the
+ * block layer thinks compaction is possible.
+ */
+ WT_RET(bm->compact_skip(bm, session, &skip));
+ if (skip)
+ return (0);
+
+ /*
+ * Reviewing in-memory pages requires looking at page reconciliation
+ * results, because we care about where the page is stored now, not
+ * where the page was stored when we first read it into the cache.
+ * We need to ensure we don't race with page reconciliation as it's
+ * writing the page modify information.
+ *
+ * There are three ways we call reconciliation: checkpoints, threads
+ * writing leaf pages (usually in preparation for a checkpoint), and
+ * eviction.
+ *
+ * We're holding the schema lock which serializes with checkpoints.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ /*
+ * Get the tree handle's flush lock which blocks threads writing leaf
+ * pages.
+ */
+ __wt_spin_lock(session, &btree->flush_lock);
+
+ /*
+ * That leaves eviction, we don't want to block eviction. Set a flag
+ * so reconciliation knows compaction is running. If reconciliation
+ * sees the flag it locks the page it's writing, we acquire the same
+ * lock when reading the page's modify information, serializing access.
+ * The same page lock blocks work on the page, but compaction is an
+ * uncommon, heavy-weight operation. If it's ever a problem, there's
+ * no reason we couldn't use an entirely separate lock than the page
+ * lock.
+ *
+ * We also need to ensure we don't race with an on-going reconciliation.
+ * After we set the flag, wait for eviction of this file to drain, and
+ * then let eviction continue.
+ */
+ conn->compact_in_memory_pass = 1;
+ WT_ERR(__wt_evict_file_exclusive_on(session));
+ __wt_evict_file_exclusive_off(session);
+
+ /* Start compaction. */
+ WT_ERR(bm->compact_start(bm, session));
+ block_manager_begin = 1;
+
+ /* Walk the tree reviewing pages to see if they should be re-written. */
+ session->compaction = 1; /* NOTE(review): never cleared here -- presumably read by the caller, confirm. */
+ for (;;) {
+ /*
+ * Pages read for compaction aren't "useful"; don't update the
+ * read generation of pages already in memory, and if a page is
+ * read, set its generation to a low value so it is evicted
+ * quickly.
+ */
+ WT_ERR(__wt_tree_walk(session, &ref,
+ WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
+ if (ref == NULL) /* The walk is complete. */
+ break;
+
+ WT_ERR(__compact_rewrite(session, ref, &skip));
+ if (skip)
+ continue;
+
+ /* Rewrite the page: mark the page and tree dirty. */
+ WT_ERR(__wt_page_modify_init(session, ref->page));
+ __wt_page_modify_set(session, ref->page);
+
+ WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
+ }
+
+err: if (ref != NULL) /* Release a page still held when the walk stopped early. */
+ WT_TRET(__wt_page_release(session, ref, 0));
+
+ if (block_manager_begin) /* Only end what was successfully started. */
+ WT_TRET(bm->compact_end(bm, session));
+
+ __wt_spin_unlock(session, &btree->flush_lock);
+
+ conn->compact_in_memory_pass = 0;
+ WT_FULL_BARRIER();
+
+ return (ret);
+}
+
+/*
+ * __wt_compact_page_skip --
+ *	Decide whether compaction needs to read the page behind a reference.
+ *	The answer is returned in *skipp, which defaults to zero (read it).
+ */
+int
+__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
+{
+	WT_BM *bm;
+	const uint8_t *ref_addr;
+	size_t ref_addr_size;
+	u_int addr_type;
+
+	addr_type = 0;				/* Keep compiler quiet. */
+	*skipp = 0;				/* Default to reading. */
+
+	bm = S2BT(session)->bm;
+
+	/*
+	 * No hazard pointer is held, so only the WT_REF information can be
+	 * examined, not the page itself.
+	 */
+	WT_RET(
+	    __wt_ref_info(session, ref, &ref_addr, &ref_addr_size, &addr_type));
+
+	/*
+	 * Two cases are always read: a reference with no address (the page
+	 * isn't on disk), and an internal page (needed to walk the tree).
+	 */
+	if (ref_addr == NULL || addr_type == WT_CELL_ADDR_INT)
+		return (0);
+
+	/*
+	 * Otherwise ask the block manager whether rewriting the page would
+	 * help, so the I/O is avoided when it would not.
+	 */
+	return (bm->compact_page_skip(
+	    bm, session, ref_addr, ref_addr_size, skipp));
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
new file mode 100644
index 00000000000..0cc79776634
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -0,0 +1,468 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __cursor_fix_append_next --
+ * Return the next entry on the append list.
+ *
+ * If newpage is set, iteration starts at the first append-list entry;
+ * otherwise the list position only advances once the cursor's record
+ * number has caught up with the current entry. Returns 0 with the
+ * record number incremented and a one-byte value set, or WT_NOTFOUND
+ * when the list is exhausted.
+ */
+static inline int
+__cursor_fix_append_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
+ return (WT_NOTFOUND);
+ } else
+ if (cbt->recno >= WT_INSERT_RECNO(cbt->ins) &&
+ (cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL)
+ return (WT_NOTFOUND);
+
+ /*
+ * This code looks different from the cursor-previous code. The append
+ * list appears on the last page of the tree, but it may be preceded by
+ * other rows, which means the cursor's recno will be set to a value and
+ * we simply want to increment it. If the cursor's recno is NOT set,
+ * we're starting our iteration in a tree that has only appended items.
+ * In that case, recno will be 0 and happily enough the increment will
+ * set it to 1, which is correct.
+ */
+ __cursor_set_recno(cbt, cbt->recno + 1);
+
+ /*
+ * Fixed-width column store appends are inherently non-transactional.
+ * Even a non-visible update by a concurrent or aborted transaction
+ * changes the effective end of the data. The effect is subtle because
+ * of the blurring between deleted and empty values, but ideally we
+ * would skip all uncommitted changes at the end of the data. This
+ * doesn't apply to variable-width column stores because the implicitly
+ * created records written by reconciliation are deleted and so can
+ * never be seen by a read.
+ *
+ * The problem is that we don't know at this point whether there may be
+ * multiple uncommitted changes at the end of the data, and it would be
+ * expensive to check every time we hit an aborted update. If an
+ * insert is aborted, we simply return zero (empty), regardless of
+ * whether we are at the end of the data.
+ */
+ if (cbt->recno < WT_INSERT_RECNO(cbt->ins) ||
+ (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
+ cbt->v = 0; /* Gap before the entry, or no update to read. */
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1; /* The value is always a single byte. */
+ return (0);
+}
+
+/*
+ * __cursor_fix_next --
+ * Move to the next, fixed-length column-store item.
+ *
+ * If newpage is set, the cursor is positioned on the page's first record;
+ * otherwise it advances by one record. Returns 0 with a one-byte value
+ * set, or WT_NOTFOUND if the page has no standard records or the cursor
+ * is already at the last one. The value is taken from a matching
+ * insert-list update when __wt_txn_read returns one, otherwise it is
+ * fetched from the page with __bit_getv_recno.
+ */
+static inline int
+__cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_BTREE *btree;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = S2BT(session);
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_fix_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, page->pg_fix_recno);
+ goto new_page;
+ }
+
+ /* Move to the next entry and return the item. */
+ if (cbt->recno >= cbt->last_standard_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno + 1);
+
+new_page:
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
+ if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
+ cbt->ins = NULL; /* Only an exact record-number match counts. */
+ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd == NULL) {
+ cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_var_append_next --
+ * Return the next variable-length entry on the append list.
+ *
+ * If newpage is set, iteration starts at the first append-list entry.
+ * Entries for which __wt_txn_read returns no update, or whose update is
+ * marked deleted, are skipped. Returns 0 with the record number and
+ * value set, or WT_NOTFOUND when the list is exhausted.
+ */
+static inline int
+__cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+ goto new_page; /* Jump into the loop, skipping the advance. */
+ }
+
+ for (;;) {
+ cbt->ins = WT_SKIP_NEXT(cbt->ins);
+new_page: if (cbt->ins == NULL)
+ return (WT_NOTFOUND);
+
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue; /* Nothing to return here, try the next entry. */
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __cursor_var_next --
+ * Move to the next, variable-length column-store item.
+ *
+ * If newpage is set, the cursor is positioned on the page's first record;
+ * otherwise it advances by one record. Records whose update is marked
+ * deleted, or whose on-page cell is missing or deleted, are skipped.
+ * Returns 0 with the value set, WT_NOTFOUND past the page's last
+ * standard record (or if no slot matches the record number), or an
+ * error from __wt_page_cell_data_ref.
+ */
+static inline int
+__cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_COL *cip;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_var_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, page->pg_var_recno);
+ goto new_page; /* Jump into the loop, skipping the increment. */
+ }
+
+ /* Move to the next entry and return the item. */
+ for (;;) {
+ if (cbt->recno >= cbt->last_standard_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno + 1);
+
+new_page: /* Find the matching WT_COL slot. */
+ if ((cip = __col_var_search(page, cbt->recno)) == NULL)
+ return (WT_NOTFOUND);
+ cbt->slot = WT_COL_SLOT(page, cip);
+
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
+ upd = cbt->ins == NULL ?
+ NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ continue; /* Deleted record, move on. */
+
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /*
+ * If we're at the same slot as the last reference and there's
+ * no matching insert list item, re-use the return information
+ * (so encoded items with large repeat counts aren't repeatedly
+ * decoded). Otherwise, unpack the cell and build the return
+ * information.
+ */
+ if (cbt->cip_saved != cip) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ continue;
+ __wt_cell_unpack(cell, &unpack);
+ if (unpack.type == WT_CELL_DEL)
+ continue;
+ WT_RET(__wt_page_cell_data_ref(
+ session, page, &unpack, &cbt->tmp));
+
+ cbt->cip_saved = cip; /* cbt->tmp now caches this slot. */
+ }
+ val->data = cbt->tmp.data;
+ val->size = cbt->tmp.size;
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __cursor_row_next --
+ * Move to the next row-store item.
+ *
+ * If newpage is set, iteration starts with the page's "smallest key"
+ * insert list. Insert-list entries and on-page rows whose update is
+ * marked deleted are skipped, as are insert-list entries for which
+ * __wt_txn_read returns no update. Returns 0 with the key and value
+ * set (on-page rows are returned through __cursor_row_slot_return), or
+ * WT_NOTFOUND at the end of the page.
+ */
+static inline int
+__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_INSERT *ins;
+ WT_ITEM *key, *val;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ key = &cbt->iface.key;
+ val = &cbt->iface.value;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part
+ * of the page we're walking (otherwise switching from next to prev
+ * and vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+ * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
+ * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
+ *
+ * New page configuration.
+ */
+ if (newpage) {
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+ cbt->row_iteration_slot = 1;
+ goto new_insert; /* Jump into the loop, skipping the advance. */
+ }
+
+ /* Move to the next entry and return the item. */
+ for (;;) {
+ /*
+ * Continue traversing any insert list; maintain the insert list
+ * head reference and entry count in case we switch to a cursor
+ * previous movement.
+ */
+ if (cbt->ins != NULL)
+ cbt->ins = WT_SKIP_NEXT(cbt->ins);
+
+new_insert: if ((ins = cbt->ins) != NULL) {
+ if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue; /* Nothing to return here, keep walking. */
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /* Check for the end of the page. */
+ if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1)
+ return (WT_NOTFOUND);
+ ++cbt->row_iteration_slot;
+
+ /*
+ * Odd-numbered slots configure as WT_INSERT_HEAD entries,
+ * even-numbered slots configure as WT_ROW entries.
+ */
+ if (cbt->row_iteration_slot & 0x01) {
+ cbt->ins_head = WT_ROW_INSERT_SLOT(
+ page, cbt->row_iteration_slot / 2 - 1);
+ cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+ goto new_insert;
+ }
+ cbt->ins_head = NULL;
+ cbt->ins = NULL;
+
+ cbt->slot = cbt->row_iteration_slot / 2 - 1; /* Slot 2 is WT_ROW[0]. */
+ rip = &page->pg_row_d[cbt->slot];
+ upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
+ if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ return (__cursor_row_slot_return(cbt, rip, upd));
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_btcur_iterate_setup --
+ *	Prepare a cursor for iteration, typically after a search positioned
+ *	it on a page.  The direction argument is currently ignored.
+ */
+void
+__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next)
+{
+	WT_PAGE *page;
+
+	WT_UNUSED(next);
+
+	/*
+	 * Switching between next and prev calls needs no setup today, so
+	 * both direction flags are set together; the separate flags are kept
+	 * in case that changes.
+	 */
+	F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);
+
+	/*
+	 * Without a search page there's nothing more to do: iteration is
+	 * starting at the beginning or end of the tree, not from a search.
+	 */
+	if (cbt->ref == NULL)
+		return;
+	page = cbt->ref->page;
+
+	if (page->type == WT_PAGE_ROW_LEAF) {
+		/*
+		 * Row-store pages use a single slot name space covering both
+		 * the WT_ROW array and the WT_INSERT_HEAD array: slot 1 is
+		 * the "smallest key insert list", slot 2 is WT_ROW[0], slot 3
+		 * is WT_INSERT_HEAD[0], and so on.  Insert lists are the
+		 * odd-numbered slots, WT_ROW entries the even-numbered ones.
+		 */
+		if (cbt->ins_head == NULL)
+			cbt->row_iteration_slot = (cbt->slot + 1) * 2;
+		else if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(page))
+			cbt->row_iteration_slot = 1;
+		else
+			cbt->row_iteration_slot = (cbt->slot + 1) * 2 + 1;
+		return;
+	}
+
+	/* Column-store pages: calculate the largest record on the page. */
+	if (page->type == WT_PAGE_COL_VAR)
+		cbt->last_standard_recno = __col_var_last_recno(page);
+	else
+		cbt->last_standard_recno = __col_fix_last_recno(page);
+
+	/* Note when the cursor is positioned on the page's append list. */
+	if (cbt->ins_head != NULL && cbt->ins_head == WT_COL_APPEND(page))
+		F_SET(cbt, WT_CBT_ITERATE_APPEND);
+}
+
+/*
+ * __wt_btcur_next --
+ * Move to the next record in the tree.
+ *
+ * Walks the current page (and a column-store page's append list) until
+ * the per-page routine returns something other than WT_NOTFOUND, then
+ * moves on to the next page returned by the tree walk. A non-zero
+ * truncating argument adds WT_READ_TRUNCATE to the tree walk flags.
+ * Returns 0 on success and WT_NOTFOUND when the walk runs out of pages;
+ * on any non-zero return the cursor is reset.
+ */
+int
+__wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+ int newpage;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+ flags = WT_READ_SKIP_INTL; /* Tree walk flags. */
+ if (truncating)
+ LF_SET(WT_READ_TRUNCATE); /* Presumably updates the local "flags" -- confirm. */
+
+ WT_RET(__cursor_func_init(cbt, 0));
+
+ /*
+ * If we aren't already iterating in the right direction, there's
+ * some setup to do.
+ */
+ if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
+ __wt_btcur_iterate_setup(cbt, 1);
+
+ /*
+ * Walk any page we're holding until the underlying call returns not-
+ * found. Then, move to the next page, until we reach the end of the
+ * file.
+ */
+ page = cbt->ref == NULL ? NULL : cbt->ref->page;
+ for (newpage = 0;; newpage = 1) {
+ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_append_next(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_append_next(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret == 0)
+ break;
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND); /* Done with the append list. */
+ if (ret != WT_NOTFOUND)
+ break;
+ } else if (page != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_next(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_next(cbt, newpage);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __cursor_row_next(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret != WT_NOTFOUND) /* Success or a real error: stop. */
+ break;
+
+ /*
+ * The last page in a column-store has appended entries.
+ * We handle it separately from the usual cursor code:
+ * it's only that one page and it's in a simple format.
+ */
+ if (page->type != WT_PAGE_ROW_LEAF &&
+ (cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ continue; /* Re-enter the loop with newpage set to 1. */
+ }
+ }
+
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
+ WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
+
+ page = cbt->ref->page;
+ WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page));
+ }
+
+err: if (ret != 0) /* Any failure, including WT_NOTFOUND, resets the cursor. */
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
new file mode 100644
index 00000000000..8de784d1f1d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -0,0 +1,560 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Walking backwards through skip lists.
+ *
+ * The skip list stack is an array of pointers set up by a search. It points
+ * to the position a node should go in the skip list. In other words, the skip
+ * list search stack always points *after* the search item (that is, into the
+ * search item's next array).
+ *
+ * Helper macros to go from a stack pointer at level i, pointing into a next
+ * array, back to the insert node containing that next array.
+ *
+ * PREV_ITEM evaluates to NULL when the stack pointer is NULL or points at
+ * the list head's own array at that level, that is, when no insert node
+ * contains it. PREV_INS applies PREV_ITEM to a cursor's insert stack.
+ */
+#undef PREV_ITEM
+#define PREV_ITEM(ins_head, insp, i) \
+ (((insp) == &(ins_head)->head[i] || (insp) == NULL) ? NULL : \
+ (WT_INSERT *)((char *)((insp) - (i)) - offsetof(WT_INSERT, next)))
+
+#undef PREV_INS
+#define PREV_INS(cbt, i) \
+ PREV_ITEM((cbt)->ins_head, (cbt)->ins_stack[(i)], (i))
+
+/*
+ * __cursor_skip_prev --
+ * Move back one position in a skip list stack (aka "finger").
+ *
+ * On success cbt->ins is set to the node preceding the current one, or
+ * NULL if there is none, and the cursor's insert and next stacks are
+ * updated to match. Returns 0, or an error from __wt_search_insert.
+ * The whole operation restarts if the list changed underneath it.
+ */
+static inline int
+__cursor_skip_prev(WT_CURSOR_BTREE *cbt)
+{
+ WT_INSERT *current, *ins;
+ WT_ITEM key;
+ WT_SESSION_IMPL *session;
+ int i;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+restart:
+ /*
+ * If the search stack does not point at the current item, fill it in
+ * with a search.
+ */
+ while ((current = cbt->ins) != PREV_INS(cbt, 0)) {
+ if (cbt->btree->type == BTREE_ROW) {
+ key.data = WT_INSERT_KEY(current);
+ key.size = WT_INSERT_KEY_SIZE(current);
+ WT_RET(__wt_search_insert(session, cbt, &key));
+ } else
+ cbt->ins = __col_insert_search(cbt->ins_head,
+ cbt->ins_stack, cbt->next_stack,
+ WT_INSERT_RECNO(current));
+ }
+
+ /*
+ * Find the first node up the search stack that does not move.
+ *
+ * The depth of the current item must be at least this level, since we
+ * see it in that many levels of the stack.
+ *
+ * !!! Watch these loops carefully: they all rely on the value of i,
+ * and the exit conditions to end up with the right values are
+ * non-trivial.
+ */
+ ins = NULL; /* -Wconditional-uninitialized */
+ for (i = 0; i < WT_SKIP_MAXDEPTH - 1; i++)
+ if ((ins = PREV_INS(cbt, i + 1)) != current)
+ break;
+
+ /*
+ * Find a starting point for the new search. That is either at the
+ * non-moving node if we found a valid node, or the beginning of the
+ * next list down that is not the current node.
+ *
+ * Since it is the beginning of a list, and we know the current node
+ * has a skip depth at least this high, any node we find must sort
+ * before the current node.
+ */
+ if (ins == NULL || ins == current)
+ for (; i >= 0; i--) {
+ cbt->ins_stack[i] = NULL;
+ cbt->next_stack[i] = NULL;
+ ins = cbt->ins_head->head[i];
+ if (ins != NULL && ins != current)
+ break;
+ }
+
+ /* Walk any remaining levels until just before the current node. */
+ while (i >= 0) {
+ /*
+ * If we get to the end of a list without finding the current
+ * item, we must have raced with an insert. Restart the search.
+ */
+ if (ins == NULL) {
+ cbt->ins_stack[0] = NULL; /* Clear level 0 before restarting. */
+ cbt->next_stack[0] = NULL;
+ goto restart;
+ }
+ if (ins->next[i] != current) /* Stay at this level */
+ ins = ins->next[i];
+ else { /* Drop down a level */
+ cbt->ins_stack[i] = &ins->next[i];
+ cbt->next_stack[i] = ins->next[i];
+ --i;
+ }
+ }
+
+ /* If we found a previous node, the next one must be current. */
+ if (cbt->ins_stack[0] != NULL && *cbt->ins_stack[0] != current)
+ goto restart;
+
+ cbt->ins = PREV_INS(cbt, 0); /* NULL if current was first on the list. */
+ return (0);
+}
+
+/*
+ * __cursor_fix_append_prev --
+ * Return the previous fixed-length entry on the append list.
+ *
+ * If newpage is set, iteration starts at the last append-list entry and
+ * the record number is taken from it; otherwise the record number is
+ * decremented by one. Returns 0 with a one-byte value set, WT_NOTFOUND
+ * when there is nothing before the cursor on the append list, or an
+ * error from __cursor_skip_prev.
+ */
+static inline int
+__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
+ return (WT_NOTFOUND);
+ } else {
+ /*
+ * Handle the special case of leading implicit records, that is,
+ * there aren't any records in the tree not on the append list,
+ * and the first record on the append list isn't record 1.
+ *
+ * The "right" place to handle this is probably in our caller.
+ * The high-level cursor-previous routine would:
+ * -- call this routine to walk the append list
+ * -- call the routine to walk the standard page items
+ * -- call the tree walk routine looking for a previous page
+ * Each of them returns WT_NOTFOUND, at which point our caller
+ * checks the cursor record number, and if it's larger than 1,
+ * returns the implicit records. Instead, I'm trying to detect
+ * the case here, mostly because I don't want to put that code
+ * into our caller. Anyway, if this code breaks for any reason,
+ * that's the way I'd go.
+ *
+ * If we're not pointing to a WT_INSERT entry, or we can't find
+ * a WT_INSERT record that precedes our record name-space, check
+ * if there are any records on the page. If there aren't, then
+ * we're in the magic zone, keep going until we get to a record
+ * number of 1.
+ */
+ if (cbt->ins != NULL &&
+ cbt->recno <= WT_INSERT_RECNO(cbt->ins))
+ WT_RET(__cursor_skip_prev(cbt));
+ if (cbt->ins == NULL &&
+ (cbt->recno == 1 || __col_fix_last_recno(page) != 0))
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * This code looks different from the cursor-next code. The append
+ * list appears on the last page of the tree and contains the last
+ * records in the tree. If we're iterating through the tree, starting
+ * at the last record in the tree, by definition we're starting a new
+ * iteration and we set the record number to the last record found in
+ * the tree. Otherwise, decrement the record.
+ */
+ if (newpage)
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ else
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+ /*
+ * Fixed-width column store appends are inherently non-transactional.
+ * Even a non-visible update by a concurrent or aborted transaction
+ * changes the effective end of the data. The effect is subtle because
+ * of the blurring between deleted and empty values, but ideally we
+ * would skip all uncommitted changes at the end of the data. This
+ * doesn't apply to variable-width column stores because the implicitly
+ * created records written by reconciliation are deleted and so can
+ * never be seen by a read.
+ */
+ if (cbt->ins == NULL ||
+ cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
+ (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
+ cbt->v = 0; /* Implicit record, gap, or no update to read. */
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1; /* The value is always a single byte. */
+ return (0);
+}
+
+/*
+ * __cursor_fix_prev --
+ * Move to the previous, fixed-length column-store item.
+ *
+ * If newpage is set, the cursor is positioned on the page's last
+ * standard record; otherwise it moves back by one record. Returns 0
+ * with a one-byte value set, or WT_NOTFOUND if the page has no standard
+ * records or the cursor is already at the page's first record. The
+ * value is taken from a matching insert-list update when __wt_txn_read
+ * returns one, otherwise it is fetched from the page with
+ * __bit_getv_recno.
+ */
+static inline int
+__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_BTREE *btree;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ btree = S2BT(session);
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_fix_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->last_standard_recno);
+ goto new_page;
+ }
+
+ /* Move to the previous entry and return the item. */
+ if (cbt->recno == page->pg_fix_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+new_page:
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
+ if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
+ cbt->ins = NULL; /* Only an exact record-number match counts. */
+ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd == NULL) {
+ cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_var_append_prev --
+ * Return the previous variable-length entry on the append list.
+ *
+ * If newpage is set, iteration starts at the last append-list entry.
+ * Entries for which __wt_txn_read returns no update, or whose update is
+ * marked deleted, are skipped. Returns 0 with the record number and
+ * value set, WT_NOTFOUND when the list is exhausted, or an error from
+ * __cursor_skip_prev.
+ */
+static inline int
+__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ goto new_page; /* Jump into the loop, skipping the step back. */
+ }
+
+ for (;;) {
+ WT_RET(__cursor_skip_prev(cbt));
+new_page: if (cbt->ins == NULL)
+ return (WT_NOTFOUND);
+
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue; /* Nothing to return here, try the previous entry. */
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __cursor_var_prev --
+ * Move to the previous, variable-length column-store item.
+ *
+ * If newpage is set, the cursor is positioned on the page's last
+ * standard record; otherwise it moves back by one record. Records
+ * whose update is marked deleted, or whose on-page cell is missing or
+ * deleted, are skipped. Returns 0 with the value set, WT_NOTFOUND
+ * before the page's first record (or if no slot matches the record
+ * number), or an error from __wt_page_cell_data_ref.
+ */
+static inline int
+__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_COL *cip;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_var_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->last_standard_recno);
+ goto new_page; /* Jump into the loop, skipping the decrement. */
+ }
+
+ /* Move to the previous entry and return the item. */
+ for (;;) {
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+new_page: if (cbt->recno < page->pg_var_recno)
+ return (WT_NOTFOUND);
+
+ /* Find the matching WT_COL slot. */
+ if ((cip = __col_var_search(page, cbt->recno)) == NULL)
+ return (WT_NOTFOUND);
+ cbt->slot = WT_COL_SLOT(page, cip);
+
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
+ upd = cbt->ins == NULL ?
+ NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ continue; /* Deleted record, move on. */
+
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /*
+ * If we're at the same slot as the last reference and there's
+ * no matching insert list item, re-use the return information
+ * (so encoded items with large repeat counts aren't repeatedly
+ * decoded). Otherwise, unpack the cell and build the return
+ * information.
+ */
+ if (cbt->cip_saved != cip) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ continue;
+ __wt_cell_unpack(cell, &unpack);
+ if (unpack.type == WT_CELL_DEL)
+ continue;
+ WT_RET(__wt_page_cell_data_ref(
+ session, page, &unpack, &cbt->tmp));
+
+ cbt->cip_saved = cip; /* cbt->tmp now caches this slot. */
+ }
+ val->data = cbt->tmp.data;
+ val->size = cbt->tmp.size;
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __cursor_row_prev --
+ * Move to the previous row-store item.
+ *
+ * If newpage is set, iteration starts with the insert list following the
+ * page's last WT_ROW entry (or the "smallest key" insert list if the
+ * page has no entries). Insert-list entries and on-page rows whose
+ * update is marked deleted are skipped, as are insert-list entries for
+ * which __wt_txn_read returns no update. Returns 0 with the key and
+ * value set (on-page rows are returned through __cursor_row_slot_return),
+ * WT_NOTFOUND at the beginning of the page, or an error from
+ * __wt_row_leaf_keys or __cursor_skip_prev.
+ */
+static inline int
+__cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_INSERT *ins;
+ WT_ITEM *key, *val;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ key = &cbt->iface.key;
+ val = &cbt->iface.value;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part
+ * of the page we're walking (otherwise switching from next to prev
+ * and vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+ * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
+ * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
+ *
+ * New page configuration.
+ */
+ if (newpage) {
+ /*
+ * If we haven't instantiated keys on this page, do so, else it
+ * is a very, very slow traversal.
+ */
+ if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
+ WT_RET(__wt_row_leaf_keys(session, page));
+
+ if (page->pg_row_entries == 0)
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ else
+ cbt->ins_head =
+ WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;
+ goto new_insert; /* Jump into the loop, skipping the step back. */
+ }
+
+ /* Move to the previous entry and return the item. */
+ for (;;) {
+ /*
+ * Continue traversing any insert list. Maintain the reference
+ * to the current insert element in case we switch to a cursor
+ * next movement.
+ */
+ if (cbt->ins != NULL)
+ WT_RET(__cursor_skip_prev(cbt));
+
+new_insert: if ((ins = cbt->ins) != NULL) {
+ if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue; /* Nothing to return here, keep walking. */
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /* Check for the beginning of the page. */
+ if (cbt->row_iteration_slot == 1)
+ return (WT_NOTFOUND);
+ --cbt->row_iteration_slot;
+
+ /*
+ * Odd-numbered slots configure as WT_INSERT_HEAD entries,
+ * even-numbered slots configure as WT_ROW entries.
+ */
+ if (cbt->row_iteration_slot & 0x01) {
+ cbt->ins_head = cbt->row_iteration_slot == 1 ?
+ WT_ROW_INSERT_SMALLEST(page) :
+ WT_ROW_INSERT_SLOT(
+ page, cbt->row_iteration_slot / 2 - 1);
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ goto new_insert;
+ }
+ cbt->ins_head = NULL;
+ cbt->ins = NULL;
+
+ cbt->slot = cbt->row_iteration_slot / 2 - 1; /* Slot 2 is WT_ROW[0]. */
+ rip = &page->pg_row_d[cbt->slot];
+ upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
+ if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ return (__cursor_row_slot_return(cbt, rip, upd));
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_btcur_prev --
+ * Move to the previous record in the tree.
+ *
+ * Walks a column-store page's append list and then the page's standard
+ * items until a per-page routine returns something other than
+ * WT_NOTFOUND, then moves to the previous page returned by the tree
+ * walk. A non-zero truncating argument adds WT_READ_TRUNCATE to the
+ * tree walk flags. Returns 0 on success and WT_NOTFOUND when the walk
+ * runs out of pages; on any non-zero return the cursor is reset.
+ */
+int
+__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+ int newpage;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_prev);
+ WT_STAT_FAST_DATA_INCR(session, cursor_prev);
+
+ flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */
+ if (truncating)
+ LF_SET(WT_READ_TRUNCATE); /* Presumably updates the local "flags" -- confirm. */
+
+ WT_RET(__cursor_func_init(cbt, 0));
+
+ /*
+ * If we aren't already iterating in the right direction, there's
+ * some setup to do.
+ */
+ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
+ __wt_btcur_iterate_setup(cbt, 0);
+
+ /*
+ * Walk any page we're holding until the underlying call returns not-
+ * found. Then, move to the previous page, until we reach the start
+ * of the file.
+ */
+ page = cbt->ref == NULL ? NULL : cbt->ref->page;
+ for (newpage = 0;; newpage = 1) {
+ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_append_prev(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_append_prev(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret == 0)
+ break;
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND); /* Done with the append list. */
+ if (ret != WT_NOTFOUND)
+ break;
+ newpage = 1; /* The standard items below start as a new page. */
+ }
+ if (page != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_prev(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_prev(cbt, newpage);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __cursor_row_prev(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret != WT_NOTFOUND) /* Success or a real error: stop. */
+ break;
+ }
+
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
+ WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
+
+ page = cbt->ref->page;
+ WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page));
+
+ /*
+ * The last page in a column-store has appended entries.
+ * We handle it separately from the usual cursor code:
+ * it's only that one page and it's in a simple format.
+ */
+ if (page->type != WT_PAGE_ROW_LEAF &&
+ (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ }
+
+err: if (ret != 0) /* Any failure, including WT_NOTFOUND, resets the cursor. */
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
new file mode 100644
index 00000000000..5b2d9b055b5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -0,0 +1,1025 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __cursor_size_chk --
+ *	Check the size of an item about to be stored in the tree; returns 0
+ * when the size is acceptable, otherwise reports an error for the item.
+ */
+static inline int
+__cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
+{
+	WT_BTREE *btree;
+	WT_BM *bm;
+	WT_DECL_RET;
+	size_t write_size;
+
+	btree = S2BT(session);
+
+	/* A fixed-length column-store value must be exactly one byte. */
+	if (btree->type == BTREE_COL_FIX) {
+		if (kv->size != 1)
+			WT_RET_MSG(session, EINVAL,
+			    "item size of %" WT_SIZET_FMT " does not match "
+			    "fixed-length file requirement of 1 byte",
+			    kv->size);
+		return (0);
+	}
+
+	/* Items of 1GB or less need no further checking. */
+	if (kv->size <= WT_GIGABYTE)
+		return (0);
+
+	/*
+	 * Larger items face two limits: the largest object the btree accepts,
+	 * and the size the block manager says it can write. The block manager
+	 * is only consulted when the first limit is satisfied.
+	 */
+	bm = btree->bm;
+	write_size = kv->size;
+	ret = write_size > WT_BTREE_MAX_OBJECT_SIZE ?
+	    EINVAL : bm->write_size(bm, session, &write_size);
+	if (ret != 0)
+		WT_RET_MSG(session, ret,
+		    "item size of %" WT_SIZET_FMT " exceeds the maximum "
+		    "supported size",
+		    kv->size);
+	return (0);
+}
+
+/*
+ * __cursor_fix_implicit --
+ *	Return 1 if the tree is a fixed-length column-store and the cursor
+ * does not have WT_CBT_MAX_RECORD set, else 0. Callers use a non-zero
+ * result to treat a missing record as an implicitly existing one.
+ */
+static inline int
+__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
+{
+	/* Only fixed-length column-stores have implicit records. */
+	if (btree->type != BTREE_COL_FIX)
+		return (0);
+
+	return (F_ISSET(cbt, WT_CBT_MAX_RECORD) ? 0 : 1);
+}
+
+/*
+ * __cursor_valid --
+ * Return if the cursor references a valid key/value pair.
+ *
+ * Returns 1 if the position recorded in the btree cursor (cbt->ins, and
+ * cbt->slot or cbt->recno depending on the tree type) refers to a record
+ * that exists and is not deleted, else 0. The cursor must already be
+ * positioned: cbt->ref is dereferenced unconditionally.
+ *
+ * If updp is not NULL, it is set to the visible update found for the
+ * record; it is left NULL when 0 is returned or when the value should be
+ * taken from the on-page object.
+ */
+static inline int
+__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_COL *cip;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ btree = cbt->btree;
+ page = cbt->ref->page;
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ if (updp != NULL)
+ *updp = NULL;
+
+ /*
+ * We may be pointing to an insert object, and we may have a page with
+ * existing entries. Insert objects always have associated update
+ * objects (the value). Any update object may be deleted, or invisible
+ * to us. In the case of an on-page entry, there is by definition a
+ * value that is visible to us, the original page cell.
+ *
+ * If we find a visible update structure, return our caller a reference
+ * to it because we don't want to repeatedly search for the update, it
+ * might suddenly become invisible (imagine a read-uncommitted session
+ * with another session's aborted insert), and we don't want to handle
+ * that potential error every time we look at the value.
+ *
+ * Unfortunately, the objects we might have and their relationships are
+ * different for the underlying page types.
+ *
+ * In the case of row-store, an insert object implies ignoring any page
+ * objects, no insert object can have the same key as an on-page object.
+ * For row-store:
+ * if there's an insert object:
+ * if there's a visible update:
+ * exact match
+ * else
+ * no exact match
+ * else
+ * use the on-page object (which may have an associated
+ * update object that may or may not be visible to us).
+ *
+ * Column-store is more complicated because an insert object can have
+ * the same key as an on-page object: updates to column-store rows
+ * are insert/object pairs, and an invisible update isn't the end as
+ * there may be an on-page object that is visible. This changes the
+ * logic to:
+ * if there's an insert object:
+ * if there's a visible update:
+ * exact match
+ * else if the on-page object's key matches the insert key
+ * use the on-page object
+ * else
+ * use the on-page object
+ *
+ * First, check for an insert object with a visible update (a visible
+ * update that's been deleted is not a valid key/value pair).
+ */
+ if (cbt->ins != NULL &&
+ (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ return (0);
+ if (updp != NULL)
+ *updp = upd;
+ return (1);
+ }
+
+ /*
+ * If we don't have an insert object, or in the case of column-store,
+ * there's an insert object but no update was visible to us and the key
+ * on the page is the same as the insert object's key, and the slot as
+ * set by the search function is valid, we can use the original page
+ * information.
+ */
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ /*
+ * If search returned an insert object, there may or may not be
+ * a matching on-page object, we have to check. Fixed-length
+ * column-store pages don't have slots, but map one-to-one to
+ * keys, check for retrieval past the end of the page.
+ */
+ if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries)
+ return (0);
+
+ /*
+ * Updates aren't stored on the page, an update would have
+ * appeared as an "insert" object; no further checks to do.
+ */
+ break;
+ case BTREE_COL_VAR:
+ /*
+ * If search returned an insert object, there may or may not be
+ * a matching on-page object, we have to check. Variable-length
+ * column-store pages don't map one-to-one to keys, but have
+ * "slots", check if search returned a valid slot.
+ */
+ if (cbt->slot >= page->pg_var_entries)
+ return (0);
+
+ /*
+ * Updates aren't stored on the page, an update would have
+ * appeared as an "insert" object; however, variable-length
+ * column store deletes are written into the backing store,
+ * check the cell for a record already deleted when read.
+ * A slot with no cell at all is also treated as not valid.
+ */
+ cip = &page->pg_var_d[cbt->slot];
+ if ((cell = WT_COL_PTR(page, cip)) == NULL ||
+ __wt_cell_type(cell) == WT_CELL_DEL)
+ return (0);
+ break;
+ case BTREE_ROW:
+ /*
+ * See above: for row-store, no insert object can have the same
+ * key as an on-page object, we're done.
+ */
+ if (cbt->ins != NULL)
+ return (0);
+
+ /*
+ * Check if search returned a valid slot (the failure mode is
+ * an empty page, the search function doesn't check, and so the
+ * more exact test is "page->pg_row_entries == 0", but this test
+ * mirrors the column-store test).
+ */
+ if (cbt->slot >= page->pg_row_entries)
+ return (0);
+
+ /* Updates are stored on the page, check for a delete. */
+ if (page->pg_row_upd != NULL && (upd = __wt_txn_read(
+ session, page->pg_row_upd[cbt->slot])) != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ return (0);
+ if (updp != NULL)
+ *updp = upd;
+ }
+ break;
+ }
+ return (1);
+}
+
+/*
+ * __cursor_col_search --
+ * Column-store search from an application cursor.
+ *
+ * Searches for the record number held in the cursor's public interface
+ * (cbt->iface.recno). The call to __wt_col_search is made inside
+ * WT_WITH_PAGE_INDEX and its result is returned to the caller.
+ */
+static inline int
+__cursor_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+
+ WT_WITH_PAGE_INDEX(session,
+ ret = __wt_col_search(session, cbt->iface.recno, NULL, cbt));
+ return (ret);
+}
+
+/*
+ * __cursor_row_search --
+ * Row-store search from an application cursor.
+ *
+ * Searches for the key held in the cursor's public interface
+ * (cbt->iface.key); the insert argument is passed through unchanged to
+ * __wt_row_search. The call is made inside WT_WITH_PAGE_INDEX and its
+ * result is returned to the caller.
+ */
+static inline int
+__cursor_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int insert)
+{
+ WT_DECL_RET;
+
+ WT_WITH_PAGE_INDEX(session,
+ ret = __wt_row_search(session, &cbt->iface.key, NULL, cbt, insert));
+ return (ret);
+}
+
+/*
+ * __cursor_col_modify --
+ *	Apply a column-store insert, update or delete using the record number
+ * and value currently set in the application cursor.
+ */
+static inline int
+__cursor_col_modify(
+    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+{
+	WT_CURSOR *cursor;
+
+	/* The record number and value come from the public cursor. */
+	cursor = &cbt->iface;
+	return (__wt_col_modify(
+	    session, cbt, cursor->recno, &cursor->value, NULL, is_remove));
+}
+
+/*
+ * __cursor_row_modify --
+ *	Apply a row-store insert, update or delete using the key and value
+ * currently set in the application cursor.
+ */
+static inline int
+__cursor_row_modify(
+    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+{
+	WT_CURSOR *cursor;
+
+	/* The key and value come from the public cursor. */
+	cursor = &cbt->iface;
+	return (__wt_row_modify(
+	    session, cbt, &cursor->key, &cursor->value, NULL, is_remove));
+}
+
+/*
+ * __wt_btcur_reset --
+ * Invalidate the cursor position.
+ *
+ * Bumps the connection-level and data-source-level cursor_reset
+ * statistics, then returns the result of __cursor_reset.
+ */
+int
+__wt_btcur_reset(WT_CURSOR_BTREE *cbt)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_reset);
+ WT_STAT_FAST_DATA_INCR(session, cursor_reset);
+
+ return (__cursor_reset(cbt));
+}
+
+/*
+ * __wt_btcur_search --
+ * Search for a matching record in the tree.
+ *
+ * Looks up the key (row-store) or record number (column-store) set in
+ * the cursor's public interface. On an exact match that __cursor_valid
+ * accepts, the key/value pair is returned through __wt_kv_return. If
+ * there is no such match but __cursor_fix_implicit reports an implicit
+ * record, a single zero byte is returned as the value. Otherwise the
+ * function returns WT_NOTFOUND. Row-store keys are size-checked before
+ * the search. On any non-zero return the cursor is reset.
+ */
+int
+__wt_btcur_search(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+
+ WT_RET(__cursor_func_init(cbt, 1));
+
+ WT_ERR(btree->type == BTREE_ROW ?
+ __cursor_row_search(session, cbt, 0) :
+ __cursor_col_search(session, cbt));
+ if (cbt->compare == 0 && __cursor_valid(cbt, &upd))
+ ret = __wt_kv_return(session, cbt, upd);
+ else if (__cursor_fix_implicit(btree, cbt)) {
+ /*
+ * Creating a record past the end of the tree in a fixed-length
+ * column-store implicitly fills the gap with empty records.
+ * Return such a record as a one-byte value of zero, stored in
+ * the btree cursor itself (cbt->v).
+ */
+ cbt->recno = cursor->recno;
+ cbt->v = 0;
+ cursor->value.data = &cbt->v;
+ cursor->value.size = 1;
+ } else
+ ret = WT_NOTFOUND;
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_search_near --
+ * Search for a record in the tree.
+ *
+ * Positions the cursor on the searched-for record or, failing that, on
+ * a neighboring record. If exactp is not NULL it is written only when
+ * the return value is 0 or WT_NOTFOUND, and receives:
+ * the search's comparison result (cbt->compare) when the position
+ * found by the search is itself a valid record;
+ * 0 for an implicit fixed-length column-store record;
+ * 1 when the cursor was moved to the next record;
+ * -1 when the cursor was moved to the previous record.
+ * On any non-zero return the cursor is reset.
+ */
+int
+__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+ int exact;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+ exact = 0;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search_near);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+
+ WT_RET(__cursor_func_init(cbt, 1));
+
+ /*
+ * Set the "insert" flag for the btree row-store search; we may intend
+ * to position our cursor at the end of the tree, rather than match an
+ * existing record.
+ */
+ WT_ERR(btree->type == BTREE_ROW ?
+ __cursor_row_search(session, cbt, 1) :
+ __cursor_col_search(session, cbt));
+
+ /*
+ * If we find a valid key, return it.
+ *
+ * Else, creating a record past the end of the tree in a fixed-length
+ * column-store implicitly fills the gap with empty records. In this
+ * case, we instantiate the empty record, it's an exact match.
+ *
+ * Else, move to the next key in the tree (bias for prefix searches).
+ * Cursor next skips invalid rows, so we don't have to test for them
+ * again.
+ *
+ * Else, redo the search and move to the previous key in the tree.
+ * Cursor previous skips invalid rows, so we don't have to test for
+ * them again.
+ *
+ * If that fails, quit, there's no record to return.
+ */
+ if (__cursor_valid(cbt, &upd)) {
+ exact = cbt->compare;
+ ret = __wt_kv_return(session, cbt, upd);
+ } else if (__cursor_fix_implicit(btree, cbt)) {
+ cbt->recno = cursor->recno;
+ cbt->v = 0;
+ cursor->value.data = &cbt->v;
+ cursor->value.size = 1;
+ exact = 0;
+ } else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND)
+ exact = 1;
+ else {
+ WT_ERR(btree->type == BTREE_ROW ?
+ __cursor_row_search(session, cbt, 1) :
+ __cursor_col_search(session, cbt));
+ if (__cursor_valid(cbt, &upd)) {
+ exact = cbt->compare;
+ ret = __wt_kv_return(session, cbt, upd);
+ } else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND)
+ exact = -1;
+ }
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
+ *exactp = exact;
+ return (ret);
+}
+
+/*
+ * __wt_btcur_insert --
+ * Insert a record into the tree.
+ *
+ * Inserts the key/value pair (or record number/value pair) set in the
+ * cursor's public interface. Unless WT_CURSTD_OVERWRITE is set on the
+ * cursor, WT_DUPLICATE_KEY is returned when the record already exists.
+ * For column-stores with WT_CURSTD_APPEND set, the application's record
+ * number is ignored and the newly assigned one is stored back in
+ * cbt->iface.recno. The operation is retried from the search whenever
+ * WT_RESTART is seen. The cursor does not stay positioned: on success
+ * __curfile_leave is called, and on failure the cursor is reset.
+ */
+int
+__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCRV(session,
+ cursor_insert_bytes, cursor->key.size + cursor->value.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+ WT_RET(__cursor_size_chk(session, &cursor->value));
+
+ /*
+ * The tree is no longer empty: eviction should pay attention to it,
+ * and it's no longer possible to bulk-load into it.
+ */
+ if (btree->bulk_load_ok) {
+ btree->bulk_load_ok = 0;
+ __wt_btree_evictable(session, 1);
+ }
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ /*
+ * If WT_CURSTD_APPEND is set, insert a new record (ignoring
+ * the application's record number). First we search for the
+ * maximum possible record number so the search ends on the
+ * last page. The real record number is assigned by the
+ * serialized append operation.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ cbt->iface.recno = UINT64_MAX;
+
+ WT_ERR(__cursor_col_search(session, cbt));
+
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ cbt->iface.recno = 0;
+
+ /*
+ * If not overwriting, fail if the key exists. Creating a
+ * record past the end of the tree in a fixed-length
+ * column-store implicitly fills the gap with empty records.
+ * Fail in that case, the record exists.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
+ (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
+ WT_ERR(WT_DUPLICATE_KEY);
+
+ WT_ERR(__cursor_col_modify(session, cbt, 0));
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ cbt->iface.recno = cbt->recno;
+ break;
+ case BTREE_ROW:
+ WT_ERR(__cursor_row_search(session, cbt, 1));
+ /*
+ * If not overwriting, fail if the key exists, else insert the
+ * key/value pair.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ cbt->compare == 0 && __cursor_valid(cbt, NULL))
+ WT_ERR(WT_DUPLICATE_KEY);
+
+ ret = __cursor_row_modify(session, cbt, 0);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+ /* Insert doesn't maintain a position across calls, clear resources. */
+ if (ret == 0)
+ WT_TRET(__curfile_leave(cbt));
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_update_check --
+ * Check whether an update would conflict.
+ *
+ * This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so
+ * they only check for conflicts without updating the tree. It is used to
+ * maintain snapshot isolation for transactions that span multiple chunks
+ * in an LSM tree.
+ *
+ * Only row-store trees are handled: the column-store cases fall through
+ * to WT_ILLEGAL_VALUE_ERR. The search is retried whenever WT_RESTART is
+ * seen. The cursor never stays positioned: __curfile_leave is always
+ * called, and the cursor is also reset on a non-zero return.
+ */
+int
+__wt_btcur_update_check(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cursor = &cbt->iface;
+ btree = cbt->btree;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_ROW:
+ WT_ERR(__cursor_row_search(session, cbt, 1));
+
+ /*
+ * We are only interested in checking for conflicts. Only an
+ * exact match on an insert object is checked here; any other
+ * search result leaves the return value at zero.
+ */
+ if (cbt->compare == 0 && cbt->ins != NULL)
+ ret = __wt_txn_update_check(session, cbt->ins->upd);
+ break;
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+ WT_TRET(__curfile_leave(cbt));
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_remove --
+ * Remove a record from the tree.
+ *
+ * Removes the record identified by the key or record number set in the
+ * cursor's public interface. Returns WT_NOTFOUND when no such record
+ * exists, unless WT_CURSTD_OVERWRITE is set on the cursor, in which case
+ * a missing record is reported as success. An implicit fixed-length
+ * column-store record is reported as removed without modifying the tree.
+ * The operation is retried from the search whenever WT_RESTART is seen,
+ * and the cursor is reset on a non-zero return.
+ */
+int
+__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ERR(__cursor_col_search(session, cbt));
+
+ /* Remove the record if it exists. */
+ if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
+ if (!__cursor_fix_implicit(btree, cbt))
+ WT_ERR(WT_NOTFOUND);
+ /*
+ * Creating a record past the end of the tree in a
+ * fixed-length column-store implicitly fills the
+ * gap with empty records. Return success in that
+ * case, the record was deleted successfully.
+ *
+ * Correct the btree cursor's location: the search
+ * will have pointed us at the previous/next item,
+ * and that's not correct.
+ */
+ cbt->recno = cursor->recno;
+ } else
+ ret = __cursor_col_modify(session, cbt, 1);
+ break;
+ case BTREE_ROW:
+ /* Remove the record if it exists. */
+ WT_ERR(__cursor_row_search(session, cbt, 0));
+ if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
+ WT_ERR(WT_NOTFOUND);
+
+ ret = __cursor_row_modify(session, cbt, 1);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+ /*
+ * If the cursor is configured to overwrite and the record is not
+ * found, that is exactly what we want.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+
+ return (ret);
+}
+
+/*
+ * __wt_btcur_update --
+ * Update a record in the tree.
+ *
+ * Stores the value set in the cursor's public interface under the key
+ * or record number set there. Unless WT_CURSTD_OVERWRITE is set on the
+ * cursor, WT_NOTFOUND is returned when the record does not exist (an
+ * implicit fixed-length column-store record counts as existing). The
+ * operation is retried from the search whenever WT_RESTART is seen. On
+ * success the cursor's key/value are pointed at internal copies via
+ * __wt_kv_return; on a non-zero return the cursor is reset.
+ */
+int
+__wt_btcur_update(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCRV(
+ session, cursor_update_bytes, cursor->value.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+ WT_RET(__cursor_size_chk(session, &cursor->value));
+
+ /*
+ * The tree is no longer empty: eviction should pay attention to it,
+ * and it's no longer possible to bulk-load into it.
+ */
+ if (btree->bulk_load_ok) {
+ btree->bulk_load_ok = 0;
+ __wt_btree_evictable(session, 1);
+ }
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ERR(__cursor_col_search(session, cbt));
+
+ /*
+ * If not overwriting, fail if the key doesn't exist. Update
+ * the record if it exists. Creating a record past the end of
+ * the tree in a fixed-length column-store implicitly fills the
+ * gap with empty records. Update the record in that case, the
+ * record exists.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
+ !__cursor_fix_implicit(btree, cbt))
+ WT_ERR(WT_NOTFOUND);
+ ret = __cursor_col_modify(session, cbt, 0);
+ break;
+ case BTREE_ROW:
+ WT_ERR(__cursor_row_search(session, cbt, 1));
+ /*
+ * If not overwriting, fail if the key does not exist.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (cbt->compare != 0 || !__cursor_valid(cbt, NULL)))
+ WT_ERR(WT_NOTFOUND);
+ ret = __cursor_row_modify(session, cbt, 0);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+
+ /*
+ * If successful, point the cursor at internal copies of the data. We
+ * could shuffle memory in the cursor so the key/value pair are in local
+ * buffer memory, but that's a data copy. We don't want to do another
+ * search (and we might get a different update structure if we race).
+ * To make this work, we add a field to the btree cursor to pass back a
+ * pointer to the modify function's allocated update structure.
+ */
+ if (ret == 0)
+ WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));
+
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_next_random --
+ * Move to a random record in the tree.
+ *
+ * Returns ENOTSUP for anything other than a row-store. Otherwise the
+ * cursor is positioned by __wt_row_random; if that position is a valid
+ * record it is returned, else the function falls back to
+ * __wt_btcur_search_near to find a nearby record. The operation is
+ * counted under the cursor_next statistics. On a non-zero return the
+ * cursor is reset.
+ */
+int
+__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = cbt->btree;
+
+ /*
+ * Only supports row-store: applications can trivially select a random
+ * value from a column-store, if there were any reason to do so.
+ */
+ if (btree->type != BTREE_ROW)
+ WT_RET(ENOTSUP);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+ WT_RET(__cursor_func_init(cbt, 1));
+
+ WT_ERR(__wt_row_random(session, cbt));
+ if (__cursor_valid(cbt, &upd))
+ WT_ERR(__wt_kv_return(session, cbt, upd));
+ else
+ WT_ERR(__wt_btcur_search_near(cbt, 0)); /* 0: null exactp, result not wanted */
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_compare --
+ *	Compare the positions of two cursors, storing a negative, zero or
+ * positive value in cmpp when the first cursor's key sorts before, equal
+ * to or after the second's.
+ */
+int
+__wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
+{
+	WT_BTREE *btree;
+	WT_CURSOR *lhs, *rhs;
+	WT_SESSION_IMPL *session;
+
+	lhs = (WT_CURSOR *)a_arg;
+	rhs = (WT_CURSOR *)b_arg;
+	session = (WT_SESSION_IMPL *)lhs->session;
+	btree = a_arg->btree;
+
+	switch (btree->type) {
+	case BTREE_COL_FIX:
+	case BTREE_COL_VAR:
+		/*
+		 * Use the record numbers in the public cursors rather than
+		 * the btree cursors' internal positions: the public values
+		 * are what the application sees.
+		 */
+		*cmpp = lhs->recno < rhs->recno ? -1 :
+		    (lhs->recno == rhs->recno ? 0 : 1);
+		break;
+	case BTREE_ROW:
+		/* Row-store keys are ordered by the tree's collator. */
+		WT_RET(__wt_compare(
+		    session, btree->collator, &lhs->key, &rhs->key, cmpp));
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+	return (0);
+}
+
+/*
+ * __cursor_equals --
+ *	Return 1 if the two cursors reference the same row, else 0.
+ */
+static int
+__cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
+{
+	switch (a->btree->type) {
+	case BTREE_COL_FIX:
+	case BTREE_COL_VAR:
+		/*
+		 * Use the record numbers in the public cursors rather than
+		 * the btree cursors' internal positions: the public values
+		 * are what the application sees.
+		 */
+		return (((WT_CURSOR *)a)->recno ==
+		    ((WT_CURSOR *)b)->recno ? 1 : 0);
+	case BTREE_ROW:
+		/* Cursors on different page references can't match. */
+		if (a->ref != b->ref)
+			return (0);
+
+		/* With no insert object on either side, compare slots. */
+		if (a->ins == NULL && b->ins == NULL)
+			return (a->slot == b->slot ? 1 : 0);
+
+		/* Otherwise both must reference the same insert object. */
+		return (a->ins == b->ins ? 1 : 0);
+	}
+	return (0);
+}
+
+/*
+ * __cursor_truncate --
+ * Discard a cursor range from row-store or variable-width column-store
+ * tree.
+ *
+ * If start is NULL, records are removed walking backward from stop until
+ * the walk fails. Otherwise records are removed walking forward from
+ * start until start equals stop (when stop is not NULL) or the walk
+ * fails. rmfunc is the per-record remove function and is always called
+ * with its last argument set to 1. The final status is filtered through
+ * WT_RET_NOTFOUND_OK before 0 is returned; presumably that macro treats
+ * WT_NOTFOUND from the walk as success -- confirm against its
+ * definition.
+ */
+static int
+__cursor_truncate(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
+ int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
+{
+ WT_DECL_RET;
+
+ /*
+ * First, call the standard cursor remove method to do a full search and
+ * re-position the cursor because we don't have a saved copy of the
+ * page's write generation information, which we need to remove records.
+ * Once that's done, we can delete records without a full search, unless
+ * we encounter a restart error because the page was modified by some
+ * other thread of control; in that case, repeat the full search to
+ * refresh the page's modification information.
+ *
+ * If this is a row-store, we delete leaf pages having no overflow items
+ * without reading them; for that to work, we have to ensure we read the
+ * page referenced by the ending cursor, since we may be deleting only a
+ * partial page at the end of the truncation. Our caller already fully
+ * instantiated the end cursor, so we know that page is pinned in memory
+ * and we can proceed without concern.
+ */
+ if (start == NULL) {
+ do {
+ WT_RET(__wt_btcur_remove(stop));
+ for (;;) {
+ if ((ret = __wt_btcur_prev(stop, 1)) != 0)
+ break;
+ stop->compare = 0; /* Exact match */
+ if ((ret = rmfunc(session, stop, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ } else {
+ do {
+ WT_RET(__wt_btcur_remove(start));
+ for (;;) {
+ if (stop != NULL &&
+ __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ if ((ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ }
+
+ WT_RET_NOTFOUND_OK(ret);
+ return (0);
+}
+
+/*
+ * __cursor_truncate_fix --
+ * Discard a cursor range from fixed-width column-store tree.
+ *
+ * Walks the range in the same way as __cursor_truncate (backward from
+ * stop when start is NULL, otherwise forward from start), but only calls
+ * rmfunc for records whose one-byte value is non-zero.
+ */
+static int
+__cursor_truncate_fix(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
+ int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
+{
+ WT_DECL_RET;
+ uint8_t *value;
+
+ /*
+ * Handle fixed-length column-store objects separately: for row-store
+ * and variable-length column-store objects we have "deleted" values
+ * and so returned objects actually exist: fixed-length column-store
+ * objects are filled-in if they don't exist, that is, if you create
+ * record 37, records 1-36 magically appear. Those records can't be
+ * deleted, which means we have to ignore already "deleted" records.
+ *
+ * First, call the standard cursor remove method to do a full search and
+ * re-position the cursor because we don't have a saved copy of the
+ * page's write generation information, which we need to remove records.
+ * Once that's done, we can delete records without a full search, unless
+ * we encounter a restart error because the page was modified by some
+ * other thread of control; in that case, repeat the full search to
+ * refresh the page's modification information.
+ */
+ if (start == NULL) {
+ do {
+ WT_RET(__wt_btcur_remove(stop));
+ for (;;) {
+ if ((ret = __wt_btcur_prev(stop, 1)) != 0)
+ break;
+ stop->compare = 0; /* Exact match */
+ value = (uint8_t *)stop->iface.value.data;
+ if (*value != 0 &&
+ (ret = rmfunc(session, stop, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ } else {
+ do {
+ WT_RET(__wt_btcur_remove(start));
+ for (;;) {
+ if (stop != NULL &&
+ __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ value = (uint8_t *)start->iface.value.data;
+ if (*value != 0 &&
+ (ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ }
+
+ WT_RET_NOTFOUND_OK(ret);
+ return (0);
+}
+
+/*
+ * __wt_btcur_range_truncate --
+ * Discard a cursor range from the tree.
+ *
+ * Either start or stop may be NULL, but not both: the session and btree
+ * are taken from whichever cursor is present. When connection logging
+ * is enabled the truncate is logged before any record is removed, and
+ * __wt_txn_truncate_end is called on the way out whether or not the
+ * removal succeeded.
+ */
+int
+__wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (start != NULL) ? start : stop;
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = cbt->btree;
+
+ /*
+ * For recovery, we log the start and stop keys for a truncate
+ * operation, not the individual records removed. On the other hand,
+ * for rollback we need to keep track of all the in-memory operations.
+ *
+ * We deal with this here by logging the truncate range first, then (in
+ * the logging code) disabling writing of the in-memory remove records
+ * to disk.
+ */
+ if (S2C(session)->logging)
+ WT_RET(__wt_txn_truncate_log(session, start, stop));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ WT_ERR(__cursor_truncate_fix(
+ session, start, stop, __cursor_col_modify));
+ break;
+ case BTREE_COL_VAR:
+ WT_ERR(__cursor_truncate(
+ session, start, stop, __cursor_col_modify));
+ break;
+ case BTREE_ROW:
+ /*
+ * The underlying cursor comparison routine requires cursors be
+ * fully instantiated when truncating row-store objects because
+ * it's comparing page and/or skiplist positions, not keys. (Key
+ * comparison would work, it's only that a key comparison would
+ * be relatively expensive. Column-store objects have record
+ * number keys, so the key comparison is cheap.) Cursors may
+ * have only had their keys set, so we must ensure the cursors
+ * are positioned in the tree.
+ */
+ if (start != NULL)
+ WT_ERR(__wt_btcur_search(start));
+ if (stop != NULL)
+ WT_ERR(__wt_btcur_search(stop));
+ WT_ERR(__cursor_truncate(
+ session, start, stop, __cursor_row_modify));
+ break;
+ }
+
+err: if (S2C(session)->logging)
+ WT_TRET(__wt_txn_truncate_end(session));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_close --
+ * Close a btree cursor.
+ *
+ * Calls __curfile_leave on the cursor, then frees the cursor's
+ * search_key and tmp buffers. The buffers are freed even when
+ * __curfile_leave fails; its result is what gets returned.
+ */
+int
+__wt_btcur_close(WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ ret = __curfile_leave(cbt);
+ __wt_buf_free(session, &cbt->search_key);
+ __wt_buf_free(session, &cbt->tmp);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
new file mode 100644
index 00000000000..ebbb335d3a8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -0,0 +1,1104 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * WT_DBG --
+ * We pass around a session handle and output information, group it together.
+ *
+ * __debug_config fills this in: when an output file name is given, fp is
+ * opened and msg stays NULL; otherwise fp stays NULL and msg is allocated so
+ * __dmsg can buffer output until a full line is available.
+ */
+typedef struct {
+ WT_SESSION_IMPL *session; /* Enclosing session */
+
+ /*
+ * When using the standard event handlers, the debugging output has to
+ * do its own message handling because its output isn't line-oriented.
+ */
+ FILE *fp; /* Output file stream */
+ WT_ITEM *msg; /* Buffered message */
+
+ WT_ITEM *tmp; /* Temporary space */
+} WT_DBG;
+
+static const /* Output separator */
+ char * const sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
+
+static int __debug_cell(WT_DBG *, const WT_PAGE_HEADER *, WT_CELL_UNPACK *);
+static int __debug_cell_data(
+ WT_DBG *, WT_PAGE *, int type, const char *, WT_CELL_UNPACK *);
+static void __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, int);
+static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
+static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *);
+static void __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *);
+static void __debug_item(WT_DBG *, const char *, const void *, size_t);
+static int __debug_page(WT_DBG *, WT_PAGE *, uint32_t);
+static void __debug_page_col_fix(WT_DBG *, WT_PAGE *);
+static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t);
+static int __debug_page_col_var(WT_DBG *, WT_PAGE *);
+static int __debug_page_metadata(WT_DBG *, WT_PAGE *);
+static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
+static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
+static int __debug_ref(WT_DBG *, WT_REF *);
+static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
+static int __debug_tree(WT_SESSION_IMPL *, WT_PAGE *, const char *, uint32_t);
+static void __debug_update(WT_DBG *, WT_UPDATE *, int);
+static void __dmsg(WT_DBG *, const char *, ...)
+ WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
+static void __dmsg_wrapup(WT_DBG *);
+
+/*
+ * __wt_debug_set_verbose --
+ *	Set verbose flags from the debugger.
+ *
+ *	The caller's string is wrapped as "verbose=[...]" in a fixed 256-byte
+ *	buffer and handed to __wt_verbose_config as a NULL-terminated,
+ *	single-entry configuration array; that function's result is returned.
+ */
+int
+__wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v)
+{
+	char config[256];
+	const char *cfg[2];
+
+	snprintf(config, sizeof(config), "verbose=[%s]", v);
+
+	cfg[0] = config;
+	cfg[1] = NULL;
+	return (__wt_verbose_config(session, cfg));
+}
+
+/*
+ * __debug_hex_byte --
+ *	Output a single byte as '#' followed by two lower-case hex digits.
+ */
+static inline void
+__debug_hex_byte(WT_DBG *ds, uint8_t v)
+{
+	static const char digits[] = "0123456789abcdef";
+	char high, low;
+
+	/* v is 8 bits wide, so shifting right by four leaves the high nibble. */
+	high = digits[v >> 4];
+	low = digits[v & 0x0f];
+
+	__dmsg(ds, "#%c%c", high, low);
+}
+
+/*
+ * __debug_config --
+ *	Configure debugging output.
+ *
+ *	Zeroes the WT_DBG structure, records the session and allocates the
+ *	temporary scratch buffer. If ofile is NULL a message buffer is
+ *	allocated (output goes through the event handler); otherwise ofile is
+ *	opened for writing and set to line-buffered mode.
+ *
+ *	Returns 0 on success. On failure nothing remains allocated: callers
+ *	return immediately without calling __dmsg_wrapup, so the temporary
+ *	buffer is released here before the error is returned.
+ */
+static int
+__debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile)
+{
+	WT_DECL_RET;
+
+	memset(ds, 0, sizeof(WT_DBG));
+
+	ds->session = session;
+
+	WT_RET(__wt_scr_alloc(session, 512, &ds->tmp));
+
+	/*
+	 * If we weren't given a file, we use the default event handler, and
+	 * we'll have to buffer messages.
+	 */
+	if (ofile == NULL) {
+		if ((ret = __wt_scr_alloc(session, 512, &ds->msg)) != 0)
+			__wt_scr_free(&ds->tmp);
+		return (ret);
+	}
+
+	/* If we're using a file, flush on each line. */
+	if ((ds->fp = fopen(ofile, "w")) == NULL) {
+		/* Capture the error before cleanup can disturb errno. */
+		ret = __wt_errno();
+		__wt_scr_free(&ds->tmp);
+		WT_RET_MSG(session, ret, "%s", ofile);
+	}
+
+	(void)setvbuf(ds->fp, NULL, _IOLBF, 0);
+	return (0);
+}
+
+/*
+ * __dmsg_wrapup --
+ * Flush any remaining output, release resources.
+ *
+ * Frees the temporary scratch buffer, passes any text still held in the
+ * message buffer to __wt_msg before freeing that buffer, and closes the
+ * output file if one was opened.
+ */
+static void
+__dmsg_wrapup(WT_DBG *ds)
+{
+ WT_SESSION_IMPL *session;
+ WT_ITEM *msg;
+
+ session = ds->session;
+ msg = ds->msg;
+
+ __wt_scr_free(&ds->tmp);
+
+ /*
+ * Discard the buffer -- it shouldn't have anything in it, but might
+ * as well be cautious.
+ */
+ if (msg != NULL) {
+ if (msg->size != 0)
+ (void)__wt_msg(session, "%s", (char *)msg->mem);
+ __wt_scr_free(&ds->msg);
+ }
+
+ /* Close any file we opened. */
+ if (ds->fp != NULL)
+ (void)fclose(ds->fp);
+}
+
+/*
+ * __dmsg --
+ *	Debug message.
+ *
+ *	printf-style output routine used by all of the dump functions. When a
+ *	file stream is configured the text is written to it directly. When it
+ *	is not, the text is appended to the message buffer and the buffer is
+ *	handed to __wt_msg (minus its trailing newline) once it ends in one.
+ *
+ *	Errors are not reported to the caller; on a formatting or allocation
+ *	failure the current chunk is dropped.
+ */
+static void
+__dmsg(WT_DBG *ds, const char *fmt, ...)
+{
+	va_list ap;
+	WT_ITEM *msg;
+	WT_SESSION_IMPL *session;
+	size_t len, space;
+	int n;
+	char *p;
+
+	session = ds->session;
+
+	/*
+	 * Debug output chunks are not necessarily terminated with a newline
+	 * character. It's easy if we're dumping to a stream, but if we're
+	 * dumping to an event handler, which is line-oriented, we must buffer
+	 * the output chunk, and pass it to the event handler once we see a
+	 * terminating newline.
+	 */
+	if (ds->fp == NULL) {
+		msg = ds->msg;
+		for (;;) {
+			p = (char *)msg->mem + msg->size;
+			space = msg->memsize - msg->size;
+			va_start(ap, fmt);
+			n = vsnprintf(p, space, fmt, ap);
+			va_end(ap);
+
+			/*
+			 * A negative return is a formatting error; converting
+			 * it to size_t would look like an enormous length and
+			 * drive a bogus buffer grow. Drop the chunk, keeping
+			 * the text already buffered nul-terminated.
+			 */
+			if (n < 0) {
+				if (space != 0)
+					*p = '\0';
+				return;
+			}
+			len = (size_t)n;
+
+			/* Check if there was enough space. */
+			if (len < space) {
+				msg->size += len;
+				break;
+			}
+
+			/*
+			 * There's not much to do on error without checking for
+			 * an error return on every single printf. Anyway, it's
+			 * pretty unlikely and this is debugging output, I'm not
+			 * going to worry about it.
+			 */
+			if (__wt_buf_grow(
+			    session, msg, msg->memsize + len + 128) != 0)
+				return;
+		}
+
+		/*
+		 * An empty buffer has no last byte to inspect: check the size
+		 * first so we never read the byte before the buffer.
+		 */
+		if (msg->size != 0 &&
+		    ((uint8_t *)msg->mem)[msg->size - 1] == '\n') {
+			((uint8_t *)msg->mem)[msg->size - 1] = '\0';
+			(void)__wt_msg(session, "%s", (char *)msg->mem);
+			msg->size = 0;
+		}
+	} else {
+		va_start(ap, fmt);
+		(void)vfprintf(ds->fp, fmt, ap);
+		va_end(ap);
+	}
+}
+
+/*
+ * __wt_debug_addr_print --
+ * Print out an address.
+ *
+ * The address is formatted by __wt_addr_string into a scratch buffer and
+ * written to stderr followed by a newline. Returns 0, or the error from
+ * allocating the scratch buffer.
+ */
+int
+__wt_debug_addr_print(
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_DECL_ITEM(buf);
+
+ WT_RET(__wt_scr_alloc(session, 128, &buf));
+ fprintf(stderr, "%s\n",
+ __wt_addr_string(session, addr, addr_size, buf));
+ __wt_scr_free(&buf);
+
+ return (0);
+}
+
+/*
+ * __wt_debug_addr --
+ * Read and dump a disk page in debugging mode, using an addr/size pair.
+ *
+ * The block is read with the read method of the block manager found at
+ * S2BT(session)->bm, into a scratch buffer, and the buffer's memory is then
+ * passed to __wt_debug_disk along with ofile. Returns 0 on success or the
+ * first error encountered.
+ */
+int
+__wt_debug_addr(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, const char *ofile)
+{
+ WT_BM *bm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ bm = S2BT(session)->bm;
+
+ WT_RET(__wt_scr_alloc(session, 1024, &buf));
+ WT_ERR(bm->read(bm, session, buf, addr, addr_size));
+ ret = __wt_debug_disk(session, buf->mem, ofile);
+
+ /* The scratch buffer is released on both the success and error paths. */
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_debug_offset_blind --
+ * Read and dump a disk page in debugging mode, using a file offset.
+ *
+ * Only the offset is supplied; the block is read by
+ * __wt_block_read_off_blind into a scratch buffer, which is then passed to
+ * __wt_debug_disk along with ofile. Returns 0 on success or the first error
+ * encountered.
+ */
+int
+__wt_debug_offset_blind(
+ WT_SESSION_IMPL *session, wt_off_t offset, const char *ofile)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ /*
+ * This routine depends on the default block manager's view of files,
+ * where an address consists of a file offset, length, and checksum.
+ * This is for debugging only. Other block managers might not see a
+ * file or address the same way, that's why there's no block manager
+ * method.
+ */
+ WT_RET(__wt_scr_alloc(session, 1024, &buf));
+ WT_ERR(__wt_block_read_off_blind(
+ session, S2BT(session)->bm->block, buf, offset));
+ ret = __wt_debug_disk(session, buf->mem, ofile);
+
+ /* The scratch buffer is released on both the success and error paths. */
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_debug_offset --
+ * Read and dump a disk page in debugging mode, using a file
+ * offset/size/checksum triplet.
+ *
+ * The triplet is packed into a local address cookie, the block is read with
+ * __wt_bt_read, and the result is passed to __wt_debug_disk along with
+ * ofile. Returns 0 on success or the first error encountered.
+ */
+int
+__wt_debug_offset(WT_SESSION_IMPL *session,
+ wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE], *endp;
+
+ /*
+ * This routine depends on the default block manager's view of files,
+ * where an address consists of a file offset, length, and checksum.
+ * This is for debugging only: other block managers might not see a
+ * file or address the same way, that's why there's no block manager
+ * method.
+ *
+ * Convert the triplet into an address structure.
+ */
+ endp = addr;
+ WT_RET(__wt_block_addr_to_buffer(
+ S2BT(session)->bm->block, &endp, offset, size, cksum));
+
+ /*
+ * Read the address through the btree I/O functions (so the block is
+ * decompressed as necessary).
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ /* The cookie's length is the distance endp was advanced from addr. */
+ WT_ERR(__wt_bt_read(session, buf, addr, WT_PTRDIFF(endp, addr)));
+ ret = __wt_debug_disk(session, buf->mem, ofile);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_debug_disk --
+ *	Dump a disk page in debugging mode.
+ *
+ *	Prints a one-line header describing the page (type, plus starting
+ *	record number, entry count or data length as appropriate), then the
+ *	page contents for fixed-length column-store pages and cell-based
+ *	pages. Output goes to ofile if it is non-NULL, otherwise through the
+ *	session's message handler.
+ *
+ *	Returns 0 on success, an error for an unknown page type, or the first
+ *	error encountered while dumping.
+ */
+int
+__wt_debug_disk(
+    WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
+{
+	WT_DBG *ds, _ds;
+	WT_DECL_RET;
+
+	/*
+	 * Reject unknown page types before configuring the output: the
+	 * illegal-value macro leaves the function directly, and doing that
+	 * after __debug_config would skip __dmsg_wrapup and leak the scratch
+	 * buffers and any open output file.
+	 */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_OVFL:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+
+	ds = &_ds;
+	WT_RET(__debug_config(session, ds, ofile));
+
+	__dmsg(ds, "%s page", __wt_page_type_string(dsk->type));
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_VAR:
+		__dmsg(ds, ", recno %" PRIu64, dsk->recno);
+		/* FALLTHROUGH */
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		__dmsg(ds, ", entries %" PRIu32 "\n", dsk->u.entries);
+		break;
+	case WT_PAGE_OVFL:
+		__dmsg(ds, ", datalen %" PRIu32 "\n", dsk->u.datalen);
+		break;
+	default:			/* Rejected above. */
+		break;
+	}
+
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+		__debug_dsk_col_fix(ds, dsk);
+		break;
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		ret = __debug_dsk_cell(ds, dsk);
+		break;
+	default:			/* Overflow pages: header only. */
+		break;
+	}
+
+	__dmsg_wrapup(ds);
+
+	return (ret);
+}
+
+/*
+ * __debug_dsk_col_fix --
+ * Dump a WT_PAGE_COL_FIX page.
+ *
+ * Each value visited by WT_FIX_FOREACH is printed on its own line as a
+ * tab followed by the value's hex byte in braces.
+ */
+static void
+__debug_dsk_col_fix(WT_DBG *ds, const WT_PAGE_HEADER *dsk)
+{
+ WT_BTREE *btree;
+ uint32_t i;
+ uint8_t v;
+
+ btree = S2BT(ds->session);
+
+ WT_FIX_FOREACH(btree, dsk, v, i) {
+ __dmsg(ds, "\t{");
+ __debug_hex_byte(ds, v);
+ __dmsg(ds, "}\n");
+ }
+}
+
+/*
+ * __debug_dsk_cell --
+ * Dump a page of WT_CELL's.
+ *
+ * Every cell visited by WT_CELL_FOREACH is unpacked and passed to
+ * __debug_cell; the first error stops the walk and is returned.
+ */
+static int
+__debug_dsk_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t i;
+
+ btree = S2BT(ds->session);
+ unpack = &_unpack;
+
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ WT_RET(__debug_cell(ds, dsk, unpack));
+ }
+ return (0);
+}
+
+/*
+ * __debug_tree_shape_info --
+ *	Pretty-print information about a page.
+ *
+ *	Formats the page's memory footprint in parentheses: whole gigabytes
+ *	with a "G" suffix, whole megabytes with an "M" suffix, or the plain
+ *	byte count for anything smaller. The string lives in a function-local
+ *	static buffer, so each call overwrites the previous result.
+ */
+static char *
+__debug_tree_shape_info(WT_PAGE *page)
+{
+	static char buf[32];
+	uint64_t footprint;
+
+	footprint = page->memory_footprint;
+
+	if (footprint < WT_MEGABYTE)
+		snprintf(buf, sizeof(buf), "(%" PRIu64 ")", footprint);
+	else if (footprint < WT_GIGABYTE)
+		snprintf(buf, sizeof(buf),
+		    "(%" PRIu64 "M)", footprint / WT_MEGABYTE);
+	else
+		snprintf(buf, sizeof(buf),
+		    "(%" PRIu64 "G)", footprint / WT_GIGABYTE);
+	return (buf);
+}
+
+/*
+ * __debug_tree_shape_worker --
+ * Dump information about the current page and descend.
+ *
+ * Internal pages are printed as "I" and all other pages as "L", each
+ * followed by the string from __debug_tree_shape_info. For an internal
+ * page, the function recurses into every child whose reference state is
+ * WT_REF_MEM, increasing the indentation level by three per level.
+ */
+static void
+__debug_tree_shape_worker(WT_DBG *ds, WT_PAGE *page, int level)
+{
+ WT_REF *ref;
+ WT_SESSION_IMPL *session;
+
+ session = ds->session;
+
+ if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) {
+ /* "%*s" pads the single space out to a field of "level" characters. */
+ __dmsg(ds, "%*s" "I" "%s\n",
+ level, " ", __debug_tree_shape_info(page));
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ if (ref->state == WT_REF_MEM)
+ __debug_tree_shape_worker(
+ ds, ref->page, level + 3);
+ } WT_INTL_FOREACH_END;
+ } else
+ __dmsg(ds, "%*s" "L" "%s\n",
+ level, " ", __debug_tree_shape_info(page));
+}
+
+/*
+ * __wt_debug_tree_shape --
+ * Dump the shape of the in-memory tree.
+ *
+ * Output is configured from ofile by __debug_config, the tree is walked
+ * starting at page (or at the root if page is NULL), and the output is
+ * wrapped up. Returns 0, or the error from configuring the output.
+ */
+int
+__wt_debug_tree_shape(
+ WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ WT_DBG *ds, _ds;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ /* A NULL page starts at the top of the tree -- it's a convenience. */
+ if (page == NULL)
+ page = S2BT(session)->root.page;
+
+ __debug_tree_shape_worker(ds, page, 0);
+
+ __dmsg_wrapup(ds);
+ return (0);
+}
+
+#define WT_DEBUG_TREE_LEAF 0x01 /* Debug leaf pages */
+#define WT_DEBUG_TREE_WALK 0x02 /* Descend the tree */
+
+/*
+ * __wt_debug_tree_all --
+ *	Dump the in-memory information for a tree, including leaf pages.
+ *
+ *	Calls __debug_tree with both the leaf-dump and tree-walk flags set
+ *	and returns its result.
+ */
+int
+__wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+	uint32_t dump_flags;
+
+	dump_flags = WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK;
+	return (__debug_tree(session, page, ofile, dump_flags));
+}
+
+/*
+ * __wt_debug_tree --
+ *	Dump the in-memory information for a tree, not including leaf pages.
+ *
+ *	Calls __debug_tree with only the tree-walk flag set and returns its
+ *	result.
+ */
+int
+__wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+	uint32_t dump_flags;
+
+	dump_flags = WT_DEBUG_TREE_WALK;
+	return (__debug_tree(session, page, ofile, dump_flags));
+}
+
+/*
+ * __wt_debug_page --
+ * Dump the in-memory information for a page.
+ *
+ * The page is dumped with the leaf flag set and without the tree-walk flag.
+ * Returns the error from configuring the output, or the result of
+ * __debug_page.
+ */
+int
+__wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ WT_DBG *ds, _ds;
+ WT_DECL_RET;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ ret = __debug_page(ds, page, WT_DEBUG_TREE_LEAF);
+
+ /* Wrap up the output whether or not the dump succeeded. */
+ __dmsg_wrapup(ds);
+
+ return (ret);
+}
+
+/*
+ * __debug_tree --
+ * Dump the in-memory information for a tree.
+ *
+ * Shared implementation behind __wt_debug_tree and __wt_debug_tree_all;
+ * flags is a combination of the WT_DEBUG_TREE_* values and is passed
+ * through to __debug_page. Returns the error from configuring the output,
+ * or the result of __debug_page.
+ */
+static int
+__debug_tree(
+ WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile, uint32_t flags)
+{
+ WT_DBG *ds, _ds;
+ WT_DECL_RET;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ /* A NULL page starts at the top of the tree -- it's a convenience. */
+ if (page == NULL)
+ page = S2BT(session)->root.page;
+
+ ret = __debug_page(ds, page, flags);
+
+ /* Wrap up the output whether or not the dump succeeded. */
+ __dmsg_wrapup(ds);
+
+ return (ret);
+}
+
+/*
+ * __debug_page --
+ * Dump the in-memory information for an in-memory page.
+ *
+ * The page's metadata is always dumped. Leaf page contents are dumped only
+ * when WT_DEBUG_TREE_LEAF is set in flags; internal pages are always dumped
+ * and are passed flags so they can decide whether to descend. Returns 0 on
+ * success or the first error encountered.
+ */
+static int
+__debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+ WT_SESSION_IMPL *session;
+
+ session = ds->session;
+
+ /* Dump the page metadata. */
+ WT_RET(__debug_page_metadata(ds, page));
+
+ /* Dump the page. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (LF_ISSET(WT_DEBUG_TREE_LEAF))
+ __debug_page_col_fix(ds, page);
+ break;
+ case WT_PAGE_COL_INT:
+ WT_RET(__debug_page_col_int(ds, page, flags));
+ break;
+ case WT_PAGE_COL_VAR:
+ if (LF_ISSET(WT_DEBUG_TREE_LEAF))
+ WT_RET(__debug_page_col_var(ds, page));
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_RET(__debug_page_row_int(ds, page, flags));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (LF_ISSET(WT_DEBUG_TREE_LEAF))
+ WT_RET(__debug_page_row_leaf(ds, page));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * __debug_page_metadata --
+ * Dump an in-memory page's metadata.
+ *
+ * Prints the page's address, its starting record number for column-store
+ * pages, its type, disk image pointer, entry count, dirty/clean state and
+ * the names of any page flags that are set. If the page has a modify
+ * structure, its reconciliation state and write generation are appended.
+ */
+static int
+__debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_PAGE_INDEX *pindex;
+ WT_PAGE_MODIFY *mod;
+ WT_SESSION_IMPL *session;
+ uint32_t entries;
+
+ session = ds->session;
+ mod = page->modify;
+
+ __dmsg(ds, "%p", page);
+
+ /* Where the entry count is stored depends on the page type. */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ __dmsg(ds, " recno %" PRIu64, page->pg_intl_recno);
+ pindex = WT_INTL_INDEX_COPY(page);
+ entries = pindex->entries;
+ break;
+ case WT_PAGE_COL_FIX:
+ __dmsg(ds, " recno %" PRIu64, page->pg_fix_recno);
+ entries = page->pg_fix_entries;
+ break;
+ case WT_PAGE_COL_VAR:
+ __dmsg(ds, " recno %" PRIu64, page->pg_var_recno);
+ entries = page->pg_var_entries;
+ break;
+ case WT_PAGE_ROW_INT:
+ pindex = WT_INTL_INDEX_COPY(page);
+ entries = pindex->entries;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ entries = page->pg_row_entries;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ __dmsg(ds, ": %s\n", __wt_page_type_string(page->type));
+ __dmsg(ds, "\t" "disk %p, entries %" PRIu32, page->dsk, entries);
+ __dmsg(ds, "%s", __wt_page_is_modified(page) ? ", dirty" : ", clean");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
+ __dmsg(ds, ", keys-built");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
+ __dmsg(ds, ", disk-alloc");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
+ __dmsg(ds, ", disk-mapped");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ __dmsg(ds, ", evict-lru");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING))
+ __dmsg(ds, ", scanning");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING))
+ __dmsg(ds, ", splitting");
+
+ /* The remaining fields exist only if the page has a modify structure. */
+ if (mod != NULL)
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ __dmsg(ds, ", empty");
+ break;
+ case WT_PM_REC_MULTIBLOCK:
+ __dmsg(ds, ", multiblock");
+ break;
+ case WT_PM_REC_REPLACE:
+ __dmsg(ds, ", replaced");
+ break;
+ case 0:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ if (mod != NULL)
+ __dmsg(ds, ", write generation=%" PRIu32, mod->write_gen);
+ __dmsg(ds, "\n");
+
+ return (0);
+}
+
+/*
+ * __debug_page_col_fix --
+ * Dump an in-memory WT_PAGE_COL_FIX page.
+ *
+ * If the page has a disk image, each on-disk value is printed with its
+ * record number, followed by the matching entry from the update list when
+ * there is one. The complete update and append lists are then dumped, each
+ * preceded by the separator line.
+ */
+static void
+__debug_page_col_fix(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_INSERT *ins;
+ const WT_PAGE_HEADER *dsk;
+ WT_SESSION_IMPL *session;
+ uint64_t recno;
+ uint32_t i;
+ uint8_t v;
+
+ session = ds->session;
+ btree = S2BT(session);
+ dsk = page->dsk;
+ recno = page->pg_fix_recno;
+
+ if (dsk != NULL) {
+ /* Step through the update list in parallel with the on-disk values. */
+ ins = WT_SKIP_FIRST(WT_COL_UPDATE_SINGLE(page));
+ WT_FIX_FOREACH(btree, dsk, v, i) {
+ __dmsg(ds, "\t%" PRIu64 "\t{", recno);
+ __debug_hex_byte(ds, v);
+ __dmsg(ds, "}\n");
+
+ /* Check for a match on the update list. */
+ if (ins != NULL && WT_INSERT_RECNO(ins) == recno) {
+ __dmsg(ds,
+ "\tupdate %" PRIu64 "\n",
+ WT_INSERT_RECNO(ins));
+ __debug_update(ds, ins->upd, 1);
+ ins = WT_SKIP_NEXT(ins);
+ }
+ ++recno;
+ }
+ }
+
+ if (WT_COL_UPDATE_SINGLE(page) != NULL) {
+ __dmsg(ds, "%s", sep);
+ __debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", 1);
+ }
+ if (WT_COL_APPEND(page) != NULL) {
+ __dmsg(ds, "%s", sep);
+ __debug_col_skip(ds, WT_COL_APPEND(page), "append", 1);
+ }
+}
+
+/*
+ * __debug_page_col_int --
+ * Dump an in-memory WT_PAGE_COL_INT page.
+ *
+ * The first pass prints each child reference's record number and state.
+ * If WT_DEBUG_TREE_WALK is set in flags, a second pass dumps every child
+ * page whose reference state is WT_REF_MEM. Returns 0 on success or the
+ * first error encountered.
+ */
+static int
+__debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+ WT_REF *ref;
+ WT_SESSION_IMPL *session;
+
+ session = ds->session;
+
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __dmsg(ds, "\trecno %" PRIu64 "\n", ref->key.recno);
+ WT_RET(__debug_ref(ds, ref));
+ } WT_INTL_FOREACH_END;
+
+ if (LF_ISSET(WT_DEBUG_TREE_WALK))
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ if (ref->state == WT_REF_MEM) {
+ __dmsg(ds, "\n");
+ WT_RET(__debug_page(ds, ref->page, flags));
+ }
+ } WT_INTL_FOREACH_END;
+
+ return (0);
+}
+
+/*
+ * __debug_page_col_var --
+ *	Dump an in-memory WT_PAGE_COL_VAR page.
+ *
+ *	For each entry on the page, prints the entry's data tagged with its
+ *	starting record number and repeat count, followed by any update list
+ *	attached to the entry. The append list, if any, is dumped last after
+ *	the separator line. Returns 0 on success or the first error
+ *	encountered.
+ */
+static int
+__debug_page_col_var(WT_DBG *ds, WT_PAGE *page)
+{
+	WT_CELL *cell;
+	WT_CELL_UNPACK *unpack, _unpack;
+	WT_COL *cip;
+	WT_INSERT_HEAD *update;
+	uint64_t recno, rle;
+	uint32_t i;
+	char tag[64];
+
+	recno = page->pg_var_recno;
+
+	WT_COL_FOREACH(page, cip, i) {
+		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+			/* No cell: the dump routine reports it as deleted. */
+			unpack = NULL;
+			rle = 1;
+		} else {
+			/*
+			 * Point at the local structure on every iteration: a
+			 * previous entry without a cell leaves unpack NULL,
+			 * and unpacking through it would be a NULL
+			 * dereference.
+			 */
+			unpack = &_unpack;
+			__wt_cell_unpack(cell, unpack);
+			rle = __wt_cell_rle(unpack);
+		}
+		snprintf(tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle);
+		WT_RET(
+		    __debug_cell_data(ds, page, WT_PAGE_COL_VAR, tag, unpack));
+
+		if ((update = WT_COL_UPDATE(page, cip)) != NULL)
+			__debug_col_skip(ds, update, "update", 0);
+		recno += rle;
+	}
+
+	if (WT_COL_APPEND(page) != NULL) {
+		__dmsg(ds, "%s", sep);
+		__debug_col_skip(ds, WT_COL_APPEND(page), "append", 0);
+	}
+
+	return (0);
+}
+
+/*
+ * __debug_page_row_int --
+ * Dump an in-memory WT_PAGE_ROW_INT page.
+ *
+ * The first pass prints each child reference's key and state. If
+ * WT_DEBUG_TREE_WALK is set in flags, a second pass dumps every child page
+ * whose reference state is WT_REF_MEM. Returns 0 on success or the first
+ * error encountered.
+ */
+static int
+__debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+ WT_REF *ref;
+ WT_SESSION_IMPL *session;
+ size_t len;
+ uint8_t *p;
+
+ session = ds->session;
+
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __wt_ref_key(page, ref, &p, &len);
+ __debug_item(ds, "K", p, len);
+ WT_RET(__debug_ref(ds, ref));
+ } WT_INTL_FOREACH_END;
+
+ if (LF_ISSET(WT_DEBUG_TREE_WALK))
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ if (ref->state == WT_REF_MEM) {
+ __dmsg(ds, "\n");
+ WT_RET(__debug_page(ds, ref->page, flags));
+ }
+ } WT_INTL_FOREACH_END;
+ return (0);
+}
+
+/*
+ * __debug_page_row_leaf --
+ *	Dump an in-memory WT_PAGE_ROW_LEAF page.
+ *
+ *	Prints the insert list that sorts before the first on-page key, then
+ *	for every on-page row its key, its value (or an empty value when the
+ *	row has no value cell), its update list and the insert list that
+ *	follows it. Returns 0 on success or the first error encountered; the
+ *	scratch buffer used for keys is released on every path.
+ */
+static int
+__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
+{
+	WT_CELL *cell;
+	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_ITEM(key);
+	WT_DECL_RET;
+	WT_INSERT_HEAD *insert;
+	WT_ROW *rip;
+	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
+	uint32_t i;
+
+	session = ds->session;
+	unpack = &_unpack;
+	WT_RET(__wt_scr_alloc(session, 256, &key));
+
+	/*
+	 * Dump any K/V pairs inserted into the page before the first from-disk
+	 * key on the page.
+	 */
+	if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
+		__debug_row_skip(ds, insert);
+
+	/* Dump the page's K/V pairs. */
+	WT_ROW_FOREACH(page, rip, i) {
+		/*
+		 * The key buffer is allocated by now, so errors must go
+		 * through the cleanup label rather than return directly.
+		 */
+		WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+		__debug_item(ds, "K", key->data, key->size);
+
+		if ((cell = __wt_row_leaf_value_cell(page, rip, NULL)) == NULL)
+			__dmsg(ds, "\tV {}\n");
+		else {
+			__wt_cell_unpack(cell, unpack);
+			WT_ERR(__debug_cell_data(
+			    ds, page, WT_PAGE_ROW_LEAF, "V", unpack));
+		}
+
+		if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
+			__debug_update(ds, upd, 0);
+
+		if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
+			__debug_row_skip(ds, insert);
+	}
+
+err:	__wt_scr_free(&key);
+	return (ret);
+}
+
+/*
+ * __debug_col_skip --
+ * Dump a column-store skiplist.
+ *
+ * For each entry on the list, prints the tag and the entry's record number,
+ * then the entry's update list. A non-zero hexbyte is passed through to
+ * __debug_update, which then prints values as a single hex byte.
+ */
+static void
+__debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, int hexbyte)
+{
+ WT_INSERT *ins;
+
+ WT_SKIP_FOREACH(ins, head) {
+ __dmsg(ds,
+ "\t%s %" PRIu64 "\n", tag, WT_INSERT_RECNO(ins));
+ __debug_update(ds, ins->upd, hexbyte);
+ }
+}
+
+/*
+ * __debug_row_skip --
+ * Dump an insert list.
+ *
+ * For each entry on the list, prints the entry's key tagged "insert",
+ * then the entry's update list.
+ */
+static void
+__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
+{
+ WT_INSERT *ins;
+
+ WT_SKIP_FOREACH(ins, head) {
+ __debug_item(ds,
+ "insert", WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins));
+ __debug_update(ds, ins->upd, 0);
+ }
+}
+
+/*
+ * __debug_update --
+ *	Dump an update list.
+ *
+ *	Walks the list through the next pointers. Deleted updates are printed
+ *	as "value {deleted}"; otherwise the value is printed either as a
+ *	single hex byte (hexbyte non-zero) or as a "value" item of the
+ *	update's size.
+ */
+static void
+__debug_update(WT_DBG *ds, WT_UPDATE *upd, int hexbyte)
+{
+	WT_UPDATE *cur;
+
+	for (cur = upd; cur != NULL; cur = cur->next) {
+		if (WT_UPDATE_DELETED_ISSET(cur)) {
+			__dmsg(ds, "\tvalue {deleted}\n");
+			continue;
+		}
+
+		if (!hexbyte) {
+			__debug_item(ds,
+			    "value", WT_UPDATE_DATA(cur), cur->size);
+			continue;
+		}
+
+		/* Only the first byte of the value is shown in hex mode. */
+		__dmsg(ds, "\t{");
+		__debug_hex_byte(ds, ((uint8_t *)WT_UPDATE_DATA(cur))[0]);
+		__dmsg(ds, "}\n");
+	}
+}
+
+/*
+ * __debug_ref --
+ * Dump a WT_REF structure.
+ *
+ * Prints the reference's state by name (with the page pointer for the
+ * locked and memory states), followed by its address as formatted by
+ * __wt_addr_string. Returns 0 on success, or an error for an unknown state
+ * or a failure to get the reference's address.
+ */
+static int
+__debug_ref(WT_DBG *ds, WT_REF *ref)
+{
+ WT_SESSION_IMPL *session;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ session = ds->session;
+
+ __dmsg(ds, "\t");
+ switch (ref->state) {
+ case WT_REF_DISK:
+ __dmsg(ds, "disk");
+ break;
+ case WT_REF_DELETED:
+ __dmsg(ds, "deleted");
+ break;
+ case WT_REF_LOCKED:
+ __dmsg(ds, "locked %p", ref->page);
+ break;
+ case WT_REF_MEM:
+ __dmsg(ds, "memory %p", ref->page);
+ break;
+ case WT_REF_READING:
+ __dmsg(ds, "reading");
+ break;
+ case WT_REF_SPLIT:
+ __dmsg(ds, "split");
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* The address string is built in the WT_DBG temporary buffer. */
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __dmsg(ds, " %s\n",
+ __wt_addr_string(session, addr, addr_size, ds->tmp));
+
+ return (0);
+}
+
+/*
+ * __debug_cell --
+ * Dump a single unpacked WT_CELL.
+ *
+ * Prints the cell's type name and length, then information that depends on
+ * the type of the disk page holding the cell (record number, repeat count
+ * or key prefix), then the address for address and overflow cells, and
+ * finally the cell's data via __debug_cell_data.
+ */
+static int
+__debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *type;
+
+ session = ds->session;
+
+ __dmsg(ds, "\t%s: len %" PRIu32,
+ __wt_cell_type_string(unpack->raw), unpack->size);
+
+ /* Dump cell's per-disk page type information. */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ switch (unpack->type) {
+ case WT_CELL_VALUE:
+ __dmsg(ds, ", recno: %" PRIu64, unpack->v);
+ break;
+ }
+ break;
+ case WT_PAGE_COL_VAR:
+ switch (unpack->type) {
+ case WT_CELL_DEL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_OVFL_RM:
+ __dmsg(ds, ", rle: %" PRIu64, __wt_cell_rle(unpack));
+ break;
+ }
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ __dmsg(ds, ", pfx: %" PRIu8, unpack->prefix);
+ break;
+ }
+ break;
+ }
+
+ /* Dump addresses. */
+ switch (unpack->raw) {
+ case WT_CELL_ADDR_DEL:
+ type = "addr/del";
+ goto addr;
+ case WT_CELL_ADDR_INT:
+ type = "addr/int";
+ goto addr;
+ case WT_CELL_ADDR_LEAF:
+ type = "addr/leaf";
+ goto addr;
+ case WT_CELL_ADDR_LEAF_NO:
+ type = "addr/leaf-no";
+ goto addr;
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_OVFL_RM:
+ type = "ovfl";
+ /* All address and overflow cases share the printing code below. */
+addr: WT_RET(__wt_scr_alloc(session, 128, &buf));
+ __dmsg(ds, ", %s %s", type,
+ __wt_addr_string(session, unpack->data, unpack->size, buf));
+ __wt_scr_free(&buf);
+ /*
+ * NOTE(review): ret is never assigned in this function, so this
+ * check looks like a no-op (assuming WT_DECL_RET initializes ret
+ * to zero -- confirm).
+ */
+ WT_RET(ret);
+ break;
+ }
+ __dmsg(ds, "\n");
+
+ return (__debug_cell_data(ds, NULL, dsk->type, NULL, unpack));
+}
+
+/*
+ * __debug_cell_data --
+ * Dump a single cell's data in debugging mode.
+ *
+ * A NULL unpack is printed as "deleted". Address, deleted and removed
+ * overflow cells are printed as their type name. Key and value cells have
+ * their data fetched into a scratch buffer and printed; the data is read
+ * with the disk-image routine when page is NULL and with the in-memory page
+ * routine otherwise. tag may be NULL. Returns 0 on success, or an error for
+ * an unknown cell type or a failure to fetch the data.
+ */
+static int
+__debug_cell_data(WT_DBG *ds,
+ WT_PAGE *page, int page_type, const char *tag, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *p;
+
+ session = ds->session;
+
+ /*
+ * Column-store references to deleted cells return a NULL cell
+ * reference.
+ */
+ if (unpack == NULL) {
+ __debug_item(ds, tag, "deleted", strlen("deleted"));
+ return (0);
+ }
+
+ switch (unpack->raw) {
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_DEL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL_RM:
+ p = __wt_cell_type_string(unpack->raw);
+ __debug_item(ds, tag, p, strlen(p));
+ break;
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_PFX:
+ case WT_CELL_KEY_SHORT:
+ case WT_CELL_KEY_SHORT_PFX:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_COPY:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ WT_RET(__wt_scr_alloc(session, 256, &buf));
+ ret = page == NULL ?
+ __wt_dsk_cell_data_ref(session, page_type, unpack, buf) :
+ __wt_page_cell_data_ref(session, page, unpack, buf);
+ /* Print only on success; the buffer is freed either way. */
+ if (ret == 0)
+ __debug_item(ds, tag, buf->data, buf->size);
+ __wt_scr_free(&buf);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (ret);
+}
+
+/*
+ * __debug_item --
+ *	Dump a single data/size pair, with an optional tag.
+ *
+ *	The output line is a tab, the tag and a space (both omitted when tag
+ *	is NULL), then the data in braces. Printable bytes are written as-is
+ *	and all others in the '#'-prefixed hex form.
+ */
+static void
+__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size)
+{
+	const uint8_t *bytes;
+	const char *label, *gap;
+	size_t off;
+	int ch;
+
+	if (tag == NULL) {
+		label = "";
+		gap = "";
+	} else {
+		label = tag;
+		gap = " ";
+	}
+	__dmsg(ds, "\t%s%s{", label, gap);
+
+	bytes = data_arg;
+	for (off = 0; off < size; ++off) {
+		ch = bytes[off];
+		if (!isprint(ch)) {
+			__debug_hex_byte(ds, bytes[off]);
+			continue;
+		}
+		__dmsg(ds, "%c", ch);
+	}
+
+	__dmsg(ds, "}\n");
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
new file mode 100644
index 00000000000..2fc1b0d5460
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -0,0 +1,339 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Fast-delete support.
+ *
+ * This file contains most of the code that allows WiredTiger to delete pages
+ * of data without reading them into the cache. (This feature is currently
+ * only available for row-store objects.)
+ *
+ * The way cursor truncate works in a row-store object is it explicitly reads
+ * the first and last pages of the truncate range, then walks the tree with a
+ * flag so the cursor walk code marks any page within the range, that hasn't
+ * yet been read and which has no overflow items, as deleted, by changing the
+ * WT_REF state to WT_REF_DELETED. Pages already in the cache or with overflow
+ * items, have their rows updated/deleted individually. The transaction for the
+ * delete operation is stored in memory referenced by the WT_REF.page_del field.
+ *
+ * Future cursor walks of the tree will skip the deleted page based on the
+ * transaction stored for the delete, but it gets more complicated if a read is
+ * done using a random key, or a cursor walk is done with a transaction where
+ * the delete is not visible. In those cases, we read the original contents of
+ * the page. The page-read code notices a deleted page is being read, and as
+ * part of the read instantiates the contents of the page, creating a WT_UPDATE
+ * with a deleted operation, in the same transaction that deleted the page. In
+ * other words, the read process makes it appear as if the page was read and
+ * each individual row deleted, exactly as would have happened if the page had
+ * been in the cache all along.
+ *
+ * There's an additional complication to support rollback of the page delete.
+ * When the page was marked deleted, a pointer to the WT_REF was saved in the
+ * deleting session's transaction list and the delete is unrolled by resetting
+ * the WT_REF_DELETED state back to WT_REF_DISK. However, if the page has been
+ * instantiated by some reading thread, that's not enough, each individual row
+ * on the page must have the delete operation reset. If the page split, the
+ * WT_UPDATE lists might have been saved/restored during reconciliation and
+ * appear on multiple pages, and the WT_REF stored in the deleting session's
+ * transaction list is no longer useful. For this reason, when the page is
+ * instantiated by a read, a list of the WT_UPDATE structures on the page is
+ * stored in the WT_REF.page_del field, with the transaction ID, that way the
+ * session unrolling the delete can find all of the WT_UPDATE structures that
+ * require update.
+ *
+ * One final note: pages can also be marked deleted if emptied and evicted. In
+ * that case, the WT_REF state will be set to WT_REF_DELETED but there will not
+ * be any associated WT_REF.page_del field. These pages are always skipped
+ * during cursor traversal (the page could not have been evicted if there were
+ * updates that weren't globally visible), and if read is forced to instantiate
+ * such a page, it simply creates an empty page from scratch.
+ */
+
+/*
+ * __wt_delete_page --
+ * If deleting a range, try to delete the page without instantiating it.
+ *
+ * On return, *skipp is 1 if the reference was switched to WT_REF_DELETED
+ * (the fast-delete succeeded) and 0 otherwise. When the fast-delete is
+ * simply not possible (the reference isn't in the WT_REF_DISK state, or
+ * its address isn't an on-page WT_CELL_ADDR_LEAF_NO cell) the function
+ * returns without having set an error. If a helper call fails, any
+ * page_del allocation is freed, the reference is restored to WT_REF_DISK
+ * and the helper's error is returned.
+ */
+int
+__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+
+ *skipp = 0;
+
+ /*
+ * Atomically switch the page's state to lock it. If the page is not
+ * on-disk, other threads may be using it, no fast delete.
+ *
+ * Possible optimization: if the page is already deleted and the delete
+ * is visible to us (the delete has been committed), we could skip the
+ * page instead of instantiating it and figuring out there are no rows
+ * in the page. While that's a huge amount of work to no purpose, it's
+ * unclear optimizing for overlapping range deletes is worth the effort.
+ */
+ if (ref->state != WT_REF_DISK ||
+ !WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_LOCKED))
+ return (0);
+
+ /*
+ * We cannot fast-delete pages that have overflow key/value items as
+ * the overflow blocks have to be discarded. The way we figure that
+ * out is to check the on-page cell type for the page, cells for leaf
+ * pages that have no overflow items are special.
+ *
+ * In some cases, the reference address may not reference an on-page
+ * cell (for example, some combination of page splits), in which case
+ * we can't check the original cell value and we fail.
+ *
+ * To look at an on-page cell, we need to look at the parent page, and
+ * that's dangerous, our parent page could change without warning if
+ * the parent page were to split, deepening the tree. It's safe: the
+ * page's reference will always point to some valid page, and if we find
+ * any problems we simply fail the fast-delete optimization.
+ *
+ * !!!
+ * I doubt it's worth the effort, but we could copy the cell's type into
+ * the reference structure, and then we wouldn't need an on-page cell.
+ */
+ parent = ref->home;
+ if (__wt_off_page(parent, ref->addr) ||
+ __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
+ goto err; /* ret is unset here: unlock only, no error. */
+
+ /*
+ * This action dirties the parent page: mark it dirty now, there's no
+ * future reconciliation of the child leaf page that will dirty it as
+ * we write the tree.
+ */
+ WT_ERR(__wt_page_parent_modify_set(session, ref, 0));
+
+ /*
+ * Record the change in the transaction structure and set the change's
+ * transaction ID.
+ */
+ WT_ERR(__wt_calloc_def(session, 1, &ref->page_del));
+ ref->page_del->txnid = session->txn.id;
+
+ WT_ERR(__wt_txn_modify_ref(session, ref));
+
+ *skipp = 1;
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ return (0);
+
+err: __wt_free(session, ref->page_del);
+
+ /*
+ * Restore the page to on-disk status, we'll have to instantiate it.
+ */
+ WT_PUBLISH(ref->state, WT_REF_DISK);
+ return (ret);
+}
+
+/*
+ * __wt_delete_page_rollback --
+ * Abort pages that were deleted without being instantiated.
+ *
+ * Loops, yielding between attempts, until the reference is resolved one
+ * of two ways: the state is atomically switched from WT_REF_DELETED back
+ * to WT_REF_DISK, or the page is found in memory (WT_REF_MEM or
+ * WT_REF_SPLIT), in which case every update on the WT_REF.page_del update
+ * list is marked WT_TXN_ABORTED and the list and page_del structure are
+ * freed.
+ */
+void
+__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_UPDATE **upd;
+
+ /*
+ * If the page is still "deleted", it's as we left it, reset the state
+ * to on-disk and we're done. Otherwise, we expect the page is either
+ * instantiated or being instantiated. Loop because it's possible for
+ * the page to return to the deleted state if instantiation fails.
+ */
+ for (;; __wt_yield())
+ switch (ref->state) {
+ case WT_REF_DISK:
+ case WT_REF_READING:
+ WT_ASSERT(session, 0); /* Impossible, assert */
+ break;
+ case WT_REF_DELETED:
+ /*
+ * If the page is still "deleted", it's as we left it,
+ * reset the state.
+ *
+ * A failed compare-and-swap means the state changed
+ * underneath us: go around the loop again.
+ */
+ if (WT_ATOMIC_CAS4(
+ ref->state, WT_REF_DELETED, WT_REF_DISK))
+ return;
+ break;
+ case WT_REF_LOCKED:
+ /*
+ * A possible state, the page is being instantiated.
+ */
+ break;
+ case WT_REF_MEM:
+ case WT_REF_SPLIT:
+ /*
+ * We can't use the normal read path to get a copy of
+ * the page because the session may have closed the
+ * cursor, we no longer have the reference to the tree
+ * required for a hazard pointer. We're safe because
+ * with unresolved transactions, the page isn't going
+ * anywhere.
+ *
+ * The page is in an in-memory state, walk the list of
+ * update structures and abort them.
+ *
+ * The walk stops at the first NULL slot in the list.
+ */
+ for (upd =
+ ref->page_del->update_list; *upd != NULL; ++upd)
+ (*upd)->txnid = WT_TXN_ABORTED;
+
+ /*
+ * Discard the memory, the transaction can't abort
+ * twice.
+ */
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ return;
+ }
+}
+
+/*
+ * __wt_delete_page_skip --
+ * If iterating a cursor, skip deleted pages that are visible to us.
+ *
+ * Returns 1 if the page can be skipped and 0 if not. The latter includes
+ * the case where the reference could not be locked because its state was
+ * no longer WT_REF_DELETED.
+ */
+int
+__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ int skip;
+
+ /*
+ * Deleted pages come from two sources: either it's a fast-delete as
+ * described above, or the page has been emptied by other operations
+ * and eviction deleted it.
+ *
+ * In both cases, the WT_REF state will be WT_REF_DELETED. In the case
+ * of a fast-delete page, there will be a WT_PAGE_DELETED structure with
+ * the transaction ID of the transaction that deleted the page, and the
+ * page is visible if that transaction ID is visible. In the case of an
+ * empty page, there will be no WT_PAGE_DELETED structure and the delete
+ * is by definition visible, eviction could not have deleted the page if
+ * there were changes on it that were not globally visible.
+ *
+ * We're here because we found a WT_REF state set to WT_REF_DELETED. It
+ * is possible the page is being read into memory right now, though, and
+ * the page could switch to an in-memory state at any time. Lock down
+ * the structure, just to be safe.
+ */
+ if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ return (0);
+
+ skip = ref->page_del == NULL ||
+ __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0;
+
+ WT_PUBLISH(ref->state, WT_REF_DELETED); /* Unlock: restore the state. */
+ return (skip);
+}
+
+/*
+ * __wt_delete_page_instantiate --
+ * Instantiate an entirely deleted row-store leaf page.
+ *
+ * Builds the page's update array with one deleted WT_UPDATE per row
+ * entry. If the reference has a WT_PAGE_DELETED structure, each update
+ * takes that structure's transaction ID and is also recorded in its
+ * update_list, which is allocated with one slot more than the number of
+ * entries. Otherwise the updates use WT_TXN_NONE. Returns 0 on success;
+ * on failure after the update_list allocation, that list is freed before
+ * the error is returned.
+ */
+int
+__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_DELETED *page_del;
+ WT_UPDATE **upd_array, *upd;
+ uint32_t i;
+
+ btree = S2BT(session);
+ page = ref->page;
+ page_del = ref->page_del;
+
+ /*
+ * Give the page a modify structure.
+ *
+ * If the tree is already dirty and so will be written, mark the page
+ * dirty. (We'd like to free the deleted pages, but if the handle is
+ * read-only or if the application never modifies the tree, we're not
+ * able to do so.)
+ */
+ if (btree->modified) {
+ WT_RET(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
+ }
+
+ /*
+ * An operation is accessing a "deleted" page, and we're building an
+ * in-memory version of the page (making it look like all entries in
+ * the page were individually updated by a remove operation). There
+ * are two cases where we end up here:
+ *
+ * First, a running transaction used a truncate call to delete the page
+ * without reading it, in which case the page reference includes a
+ * structure with a transaction ID; the page we're building might split
+ * in the future, so we update that structure to include references to
+ * all of the update structures we create, so the transaction can abort.
+ *
+ * Second, a truncate call deleted a page and the truncate committed,
+ * but an older transaction in the system forced us to keep the old
+ * version of the page around, then we crashed and recovered, and now
+ * we're being forced to read that page.
+ *
+ * In the first case, we have a page reference structure, in the
+ * second, we don't.
+ *
+ * Allocate the per-reference update array; in the case of instantiating
+ * a page, deleted by a running transaction that might eventually abort,
+ * we need a list of the update structures so we can do that abort. The
+ * hard case is if a page splits: the update structures might be moved
+ * to different pages, and we still have to find them all for an abort.
+ */
+
+ if (page_del != NULL)
+ WT_RET(__wt_calloc_def(
+ session, page->pg_row_entries + 1, &page_del->update_list));
+
+ /* Allocate the per-page update array. */
+ WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array));
+ page->pg_row_upd = upd_array;
+
+ /*
+ * Fill in the per-reference update array with references to update
+ * structures, fill in the per-page update array with references to
+ * deleted items.
+ */
+ for (i = 0; i < page->pg_row_entries; ++i) {
+ WT_ERR(__wt_calloc_def(session, 1, &upd));
+ WT_UPDATE_DELETED_SET(upd);
+
+ if (page_del == NULL)
+ upd->txnid = WT_TXN_NONE; /* Globally visible */
+ else {
+ upd->txnid = page_del->txnid;
+ page_del->update_list[i] = upd;
+ }
+
+ upd->next = upd_array[i];
+ upd_array[i] = upd;
+ }
+
+ __wt_cache_page_inmem_incr(session, page,
+ page->pg_row_entries * (sizeof(WT_UPDATE *) + sizeof(WT_UPDATE)));
+
+ return (0);
+
+err: /*
+ * There's no need to free the page update structures on error, our
+ * caller will discard the page and do that work for us. We could
+ * similarly leave the per-reference update array alone because it
+ * won't ever be used by any page that's not in-memory, but cleaning
+ * it up makes sense, especially if we come back in to this function
+ * attempting to instantiate this page again.
+ */
+ if (page_del != NULL)
+ __wt_free(session, page_del->update_list);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
new file mode 100644
index 00000000000..a162e2dc841
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -0,0 +1,422 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __free_page_modify(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_col_var(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_int(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t);
+static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *);
+static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t);
+static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *);
+
+/*
+ * __wt_ref_out --
+ * Discard an in-memory page, freeing all memory associated with it.
+ *
+ * A wrapper around __wt_page_out taking the WT_REF rather than the page:
+ * it asserts the reference is not the btree's evict_ref, then discards
+ * the page through the reference's page field.
+ */
+void
+__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ /*
+ * A version of the page-out function that allows us to make additional
+ * diagnostic checks.
+ */
+ WT_ASSERT(session, S2BT(session)->evict_ref != ref);
+
+ __wt_page_out(session, &ref->page);
+}
+
+/*
+ * __wt_page_out --
+ * Discard an in-memory page, freeing all memory associated with it.
+ *
+ * The caller's pointer (*pagep) is set to NULL before anything else is
+ * done. For internal pages, any page linked through mod_root_split is
+ * discarded recursively. Cache accounting is updated, and then, unless
+ * the connection has WT_CONN_LEAK_MEMORY set, the modify structure, the
+ * type-specific memory, any disk image and the page itself are released.
+ */
+void
+__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
+{
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ WT_PAGE_MODIFY *mod;
+
+ /*
+ * Kill our caller's reference, do our best to catch races.
+ */
+ page = *pagep;
+ *pagep = NULL;
+
+ /*
+ * We should never discard a dirty page, the file's current eviction
+ * point or a page queued for LRU eviction.
+ */
+ WT_ASSERT(session, !__wt_page_is_modified(page));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING));
+
+#ifdef HAVE_DIAGNOSTIC
+ {
+ WT_HAZARD *hp;
+ int i;
+ /*
+ * Make sure no other thread has a hazard pointer on the page we are
+ * about to discard. This is complicated by the fact that readers
+ * publish their hazard pointer before re-checking the page state, so
+ * our check can race with readers without indicating a real problem.
+ * Wait for up to a second for hazard pointers to be cleared.
+ */
+ for (hp = NULL, i = 0; i < 100; i++) {
+ if ((hp = __wt_page_hazard_check(session, page)) == NULL)
+ break;
+ __wt_sleep(0, 10000);
+ }
+ if (hp != NULL)
+ __wt_errx(session,
+ "discarded page has hazard pointer: (%p: %s, line %d)",
+ hp->page, hp->file, hp->line);
+ WT_ASSERT(session, hp == NULL);
+ }
+#endif
+
+ /*
+ * If a root page split, there may be one or more pages linked from the
+ * page; walk the list, discarding pages.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ mod = page->modify;
+ if (mod != NULL && mod->mod_root_split != NULL)
+ __wt_page_out(session, &mod->mod_root_split);
+ break;
+ }
+
+ /* Update the cache's information. */
+ __wt_cache_page_evict(session, page);
+
+ /*
+ * If discarding the page as part of process exit, the application may
+ * configure to leak the memory rather than do the work.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
+ return;
+
+ /* Free the page modification information. */
+ if (page->modify != NULL)
+ __free_page_modify(session, page);
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ __free_page_int(session, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ __free_page_col_var(session, page);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ __free_page_row_leaf(session, page);
+ break;
+ }
+
+ /*
+ * Discard any disk image: how it is released depends on which of the
+ * WT_PAGE_DISK_ALLOC and WT_PAGE_DISK_MAPPED flags is set on the page.
+ */
+ dsk = (WT_PAGE_HEADER *)page->dsk;
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
+ __wt_overwrite_and_free_len(session, dsk, dsk->mem_size);
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
+ (void)__wt_mmap_discard(session, dsk, dsk->mem_size);
+
+ __wt_overwrite_and_free(session, page);
+}
+
+/*
+ * __free_page_modify --
+ * Discard the page's associated modification structures.
+ *
+ * Frees, in order: the reconciliation results selected by the
+ * WT_PM_REC_MASK bits (a list of replacement blocks or a single
+ * replacement address); for column-store pages, the append list and the
+ * insert/update array; the overflow tracking structures; and finally the
+ * WT_PAGE_MODIFY structure itself. The caller is expected to have checked
+ * that page->modify is not NULL.
+ */
+static void
+__free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_INSERT_HEAD *append;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ mod = page->modify;
+
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_MULTIBLOCK:
+ /* Free list of replacement blocks. */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ __wt_free(session, multi->key.ikey);
+ break;
+ }
+ __wt_free(session, multi->skip);
+ __wt_free(session, multi->skip_dsk);
+ __wt_free(session, multi->addr.addr);
+ }
+ __wt_free(session, mod->mod_multi);
+ break;
+ case WT_PM_REC_REPLACE:
+ /*
+ * Discard any replacement address: this memory is usually moved
+ * into the parent's WT_REF, but at the root that can't happen.
+ */
+ __wt_free(session, mod->mod_replace.addr);
+ break;
+ }
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ /* Free the append array. */
+ if ((append = WT_COL_APPEND(page)) != NULL) {
+ __free_skip_list(session, WT_SKIP_FIRST(append));
+ __wt_free(session, append);
+ __wt_free(session, mod->mod_append);
+ }
+
+ /*
+ * Free the insert/update array: a single entry is passed for
+ * fixed-length pages, one per on-page entry for variable-length
+ * pages.
+ */
+ if (mod->mod_update != NULL)
+ __free_skip_array(session, mod->mod_update,
+ page->type ==
+ WT_PAGE_COL_FIX ? 1 : page->pg_var_entries);
+ break;
+ }
+
+ /* Free the overflow on-page, reuse and transaction-cache skiplists. */
+ __wt_ovfl_reuse_free(session, page);
+ __wt_ovfl_txnc_free(session, page);
+ __wt_ovfl_discard_free(session, page);
+
+ __wt_free(session, page->modify->ovfl_track);
+
+ __wt_free(session, page->modify);
+}
+
+/*
+ * __free_page_int --
+ * Discard a WT_PAGE_COL_INT or WT_PAGE_ROW_INT page.
+ *
+ * Frees the page's index and its WT_REF structures; free_pages is passed
+ * as 0, so pages referenced from the index are not themselves discarded.
+ */
+static void
+__free_page_int(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ __wt_free_ref_index(session, page, WT_INTL_INDEX_COPY(page), 0);
+}
+
+/*
+ * __wt_free_ref --
+ * Discard the contents of a WT_REF structure (optionally including the
+ * pages it references).
+ *
+ * A NULL ref is ignored. The page argument is the page holding the
+ * reference: its type decides whether an instantiated key is looked for,
+ * and it is used to decide whether the address is off-page and so needs
+ * freeing. When free_pages is non-zero, any page the reference points to
+ * is marked clean and discarded first. The WT_REF itself is freed last.
+ */
+void
+__wt_free_ref(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages)
+{
+ WT_IKEY *ikey;
+
+ if (ref == NULL)
+ return;
+
+ /*
+ * Optionally free the referenced pages. (The path to free referenced
+ * page is used for error cleanup, no instantiated and then discarded
+ * page should have WT_REF entries with real pages. The page may have
+ * been marked dirty as well; page discard checks for that, so we mark
+ * it clean explicitly.)
+ */
+ if (free_pages && ref->page != NULL) {
+ if (ref->page->modify != NULL) {
+ ref->page->modify->write_gen = 0;
+ __wt_cache_dirty_decr(session, ref->page);
+ }
+ __wt_page_out(session, &ref->page);
+ }
+
+ /* Free any key allocation. */
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
+ __wt_free(session, ikey);
+ break;
+ }
+
+ /* Free any address allocation. */
+ if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+
+ /* Free any page-deleted information. */
+ if (ref->page_del != NULL) {
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ }
+
+ __wt_overwrite_and_free(session, ref);
+}
+
+/*
+ * __wt_free_ref_index --
+ *	Discard a page index and its references: every WT_REF in the index is
+ *	released through __wt_free_ref (which also discards the referenced
+ *	pages when free_pages is set), then the index itself is freed.
+ */
+void
+__wt_free_ref_index(WT_SESSION_IMPL *session,
+    WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages)
+{
+	uint32_t slot;
+
+	/* Without an index there is nothing to release. */
+	if (pindex != NULL) {
+		slot = 0;
+		while (slot < pindex->entries) {
+			__wt_free_ref(
+			    session, page, pindex->index[slot], free_pages);
+			++slot;
+		}
+		__wt_free(session, pindex);
+	}
+}
+
+/*
+ * __free_page_col_var --
+ * Discard a WT_PAGE_COL_VAR page.
+ *
+ * The only memory released here is the page's pg_var_repeats array.
+ */
+static void
+__free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ /* Free the RLE lookup array. */
+ __wt_free(session, page->pg_var_repeats);
+}
+
+/*
+ * __free_page_row_leaf --
+ * Discard a WT_PAGE_ROW_LEAF page.
+ *
+ * Frees any instantiated keys found while walking the page's rows, then
+ * the insert array (pg_row_ins) and the update array (pg_row_upd) if
+ * they exist.
+ */
+static void
+__free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_IKEY *ikey;
+ WT_ROW *rip;
+ uint32_t i;
+ void *copy;
+
+ /*
+ * Free the in-memory index array.
+ *
+ * For each entry, see if the key was an allocation (that is, if it
+ * points somewhere other than the original page), and if so, free
+ * the memory.
+ *
+ * Only the instantiated-key result of the lookup is requested; the
+ * other outputs are passed as NULL and the return value is ignored.
+ */
+ WT_ROW_FOREACH(page, rip, i) {
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, NULL, NULL, NULL);
+ if (ikey != NULL)
+ __wt_free(session, ikey);
+ }
+
+ /*
+ * Free the insert array.
+ *
+ * Row-store tables have one additional slot in the insert array (the
+ * insert array has an extra slot to hold keys that sort before keys
+ * found on the original page).
+ */
+ if (page->pg_row_ins != NULL)
+ __free_skip_array(
+ session, page->pg_row_ins, page->pg_row_entries + 1);
+
+ /* Free the update array. */
+ if (page->pg_row_upd != NULL)
+ __free_update(session, page->pg_row_upd, page->pg_row_entries);
+}
+
+/*
+ * __free_skip_array --
+ *	Release every skip list anchored in an array of insert heads, each
+ *	non-NULL head structure, and finally the array itself.
+ */
+static void
+__free_skip_array(
+    WT_SESSION_IMPL *session, WT_INSERT_HEAD **head_arg, uint32_t entries)
+{
+	uint32_t slot;
+
+	/* Empty slots have no list to walk and no header to release. */
+	for (slot = 0; slot < entries; ++slot) {
+		if (head_arg[slot] == NULL)
+			continue;
+		__free_skip_list(session, WT_SKIP_FIRST(head_arg[slot]));
+		__wt_free(session, head_arg[slot]);
+	}
+
+	/* With the lists gone, release the array of headers. */
+	__wt_free(session, head_arg);
+}
+
+/*
+ * __free_skip_list --
+ *	Walk a forward-linked list of WT_INSERT structures, freeing each
+ *	structure together with the chain of WT_UPDATE structures hanging
+ *	off it.
+ */
+static void
+__free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins)
+{
+	WT_INSERT *successor;
+
+	while (ins != NULL) {
+		/* Release the updates while the insert is still valid. */
+		__free_update_list(session, ins->upd);
+
+		/* Fetch the link before the insert itself is freed. */
+		successor = WT_SKIP_NEXT(ins);
+		__wt_free(session, ins);
+		ins = successor;
+	}
+}
+
+/*
+ * __free_update --
+ *	Discard the update array: free the WT_UPDATE list anchored in each
+ *	non-NULL slot, then the array itself.
+ */
+static void
+__free_update(
+    WT_SESSION_IMPL *session, WT_UPDATE **update_head, uint32_t entries)
+{
+	uint32_t slot;
+
+	/* Slots that were never updated are NULL and are skipped. */
+	for (slot = 0; slot < entries; ++slot) {
+		if (update_head[slot] == NULL)
+			continue;
+		__free_update_list(session, update_head[slot]);
+	}
+
+	/* Release the array once its lists are gone. */
+	__wt_free(session, update_head);
+}
+
+/*
+ * __free_update_list --
+ *	Walk a forward-linked list of WT_UPDATE structures, freeing each one.
+ *	Diagnostic builds assert that every update being freed is either
+ *	aborted, visible to all, or being discarded by force.
+ */
+static void
+__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+	WT_UPDATE *following;
+
+	while (upd != NULL) {
+		/* Everything we free should be visible to everyone. */
+		WT_ASSERT(session,
+		    F_ISSET(session, WT_SESSION_DISCARD_FORCE) ||
+		    upd->txnid == WT_TXN_ABORTED ||
+		    __wt_txn_visible_all(session, upd->txnid));
+
+		/* Save the link before the structure is released. */
+		following = upd->next;
+		__wt_free(session, upd);
+		upd = following;
+	}
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_evict.c b/src/third_party/wiredtiger/src/btree/bt_evict.c
new file mode 100644
index 00000000000..ff049553c7f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_evict.c
@@ -0,0 +1,1297 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __evict_clear_walks(WT_SESSION_IMPL *);
+static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *);
+static int __evict_lru(WT_SESSION_IMPL *, uint32_t);
+static int __evict_lru_cmp(const void *, const void *);
+static int __evict_lru_pages(WT_SESSION_IMPL *, int);
+static int __evict_pass(WT_SESSION_IMPL *);
+static int __evict_walk(WT_SESSION_IMPL *, uint32_t *, uint32_t);
+static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t);
+static void *__evict_worker(void *);
+
+/*
+ * __evict_read_gen --
+ *	Return the read generation used to order an eviction entry: the
+ *	page's read generation plus its tree's eviction priority, with an
+ *	extra skew added for internal pages. Empty slots get UINT64_MAX.
+ */
+static inline uint64_t
+__evict_read_gen(const WT_EVICT_ENTRY *entry)
+{
+	WT_PAGE *page;
+	uint64_t adjusted;
+
+	if (entry->ref != NULL) {
+		page = entry->ref->page;
+		adjusted = page->read_gen + entry->btree->evict_priority;
+
+		/* Leaf pages are preferred: penalize internal pages. */
+		switch (page->type) {
+		case WT_PAGE_COL_INT:
+		case WT_PAGE_ROW_INT:
+			adjusted += WT_EVICT_INT_SKEW;
+			break;
+		default:
+			break;
+		}
+		return (adjusted);
+	}
+
+	/* Never prioritize empty slots. */
+	return (UINT64_MAX);
+}
+
+/*
+ * __evict_lru_cmp --
+ *	Qsort comparison function for the eviction array: orders entries by
+ *	ascending adjusted read generation.
+ */
+static int
+__evict_lru_cmp(const void *a, const void *b)
+{
+	uint64_t gen_a, gen_b;
+
+	gen_a = __evict_read_gen(a);
+	gen_b = __evict_read_gen(b);
+
+	if (gen_a < gen_b)
+		return (-1);
+	if (gen_a > gen_b)
+		return (1);
+	return (0);
+}
+
+/*
+ * __evict_list_clear --
+ * Clear an entry in the LRU eviction list.
+ *
+ * If the slot references a page, the page's WT_PAGE_EVICT_LRU flag is
+ * cleared (it is asserted to be set). The slot's ref is then set to NULL
+ * and its btree field to WT_DEBUG_POINT.
+ */
+static inline void
+__evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e)
+{
+ if (e->ref != NULL) {
+ WT_ASSERT(session,
+ F_ISSET_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU));
+ F_CLR_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU);
+ }
+ e->ref = NULL;
+ e->btree = WT_DEBUG_POINT;
+}
+
+/*
+ * __wt_evict_list_clear_page --
+ * Make sure a page is not in the LRU eviction list. This is called from
+ * the page eviction code to make sure there is no attempt to evict a child
+ * page multiple times.
+ *
+ * The reference must be the root or be in the WT_REF_LOCKED state
+ * (asserted). If the page's WT_PAGE_EVICT_LRU flag is set, the eviction
+ * array is searched under the cache's evict_lock and the matching slot,
+ * if found, is cleared.
+ */
+void
+__wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ uint32_t i, elem;
+
+ WT_ASSERT(session,
+ __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED);
+
+ /* Fast path: if the page isn't on the queue, don't bother searching. */
+ if (!F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU))
+ return;
+
+ cache = S2C(session)->cache;
+ __wt_spin_lock(session, &cache->evict_lock);
+
+ elem = cache->evict_max;
+ for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ if (evict->ref == ref) {
+ __evict_list_clear(session, evict);
+ break;
+ }
+
+ WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
+
+ __wt_spin_unlock(session, &cache->evict_lock);
+}
+
+/*
+ * __wt_evict_server_wake --
+ * Wake the eviction server thread.
+ *
+ * When WT_VERB_EVICTSERVER verbose output is enabled, first logs the
+ * cache bytes in use against the connection's cache size, both in
+ * megabytes. Returns the result of signalling the cache's evict_cond
+ * condition variable, or the error from the verbose call if that fails.
+ */
+int
+__wt_evict_server_wake(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) {
+ uint64_t bytes_inuse, bytes_max;
+
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ bytes_max = conn->cache_size;
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "waking, bytes inuse %s max (%" PRIu64
+ "MB %s %" PRIu64 "MB)",
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ bytes_inuse / WT_MEGABYTE,
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ bytes_max / WT_MEGABYTE));
+ }
+
+ return (__wt_cond_signal(session, cache->evict_cond));
+}
+
+/*
+ * __evict_server --
+ * Thread to evict pages from the cache.
+ *
+ * The argument is the server's WT_SESSION_IMPL. While the connection's
+ * WT_CONN_EVICTION_RUN flag is set, each iteration runs an eviction pass,
+ * stops one worker thread if more than evict_workers_min are running,
+ * clears the cache's WT_EVICT_ACTIVE flag and waits on evict_cond. On an
+ * error exit WT_PANIC_MSG is invoked; on a clean exit, messages are
+ * logged if the in-memory and evicted page/byte counts differ or if any
+ * dirty pages/bytes remain. Always returns NULL.
+ */
+static void *
+__evict_server(void *arg)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_EVICT_WORKER *worker;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+ cache = conn->cache;
+
+ while (F_ISSET(conn, WT_CONN_EVICTION_RUN)) {
+ /* Evict pages from the cache as needed. */
+ WT_ERR(__evict_pass(session));
+
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ break;
+
+ /*
+ * If we have caught up and there are more than the minimum
+ * number of eviction workers running, shut one down.
+ *
+ * The worker stopped is the most recently started one: its run
+ * flag is cleared, the workers' condition variable is signalled
+ * and the thread is joined.
+ */
+ if (conn->evict_workers > conn->evict_workers_min) {
+ WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "Stopping evict worker: %"PRIu32"\n",
+ conn->evict_workers));
+ worker = &conn->evict_workctx[--conn->evict_workers];
+ F_CLR(worker, WT_EVICT_WORKER_RUN);
+ WT_TRET(__wt_cond_signal(
+ session, cache->evict_waiter_cond));
+ WT_TRET(__wt_thread_join(session, worker->tid));
+ /*
+ * Flag errors here with a message, but don't shut down
+ * the eviction server - that's fatal.
+ */
+ WT_ASSERT(session, ret == 0);
+ if (ret != 0) {
+ (void)__wt_msg(session,
+ "Error stopping eviction worker: %d", ret);
+ ret = 0;
+ }
+ }
+ F_CLR(cache, WT_EVICT_ACTIVE);
+ WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"));
+ /* Don't rely on signals: check periodically. */
+ WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000));
+ WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "waking"));
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "exiting"));
+
+err:
+ if (ret != 0) {
+ WT_PANIC_MSG(session, ret, "eviction server error");
+ return (NULL);
+ }
+
+ if (cache->pages_inmem != cache->pages_evict)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64 " pages in "
+ "memory and %" PRIu64 " pages evicted",
+ cache->pages_inmem, cache->pages_evict);
+ if (cache->bytes_inmem != cache->bytes_evict)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64 " bytes in "
+ "memory and %" PRIu64 " bytes evicted",
+ cache->bytes_inmem, cache->bytes_evict);
+ if (cache->bytes_dirty != 0 || cache->pages_dirty != 0)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64
+ " bytes dirty and %" PRIu64 " pages dirty",
+ cache->bytes_dirty, cache->pages_dirty);
+
+ return (NULL);
+}
+
+/*
+ * __wt_evict_create --
+ * Start the eviction server thread.
+ *
+ * Sets WT_CONN_EVICTION_RUN, opens the "eviction-server" internal session
+ * and, when evict_workers_max is non-zero, one "eviction-worker" session
+ * per possible worker, starting threads for the first evict_workers_min
+ * of them. The server thread is started last. Note the session argument
+ * is replaced by the newly opened eviction session for all later calls.
+ * Returns 0 on success or the error from the first failing call; nothing
+ * created before the failure is released here (presumably that is left
+ * to __wt_evict_destroy -- confirm against the callers).
+ */
+int
+__wt_evict_create(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_EVICT_WORKER *workers;
+ u_int i;
+
+ conn = S2C(session);
+
+ /* Set first, the thread might run before we finish up. */
+ F_SET(conn, WT_CONN_EVICTION_RUN);
+
+ /* We need a session handle because we're reading/writing pages. */
+ WT_RET(__wt_open_internal_session(
+ conn, "eviction-server", 0, 0, &conn->evict_session));
+ session = conn->evict_session;
+
+ /*
+ * If there's only a single eviction thread, it may be called upon to
+ * perform slow operations for the block manager. (The flag is not
+ * reset if reconfigured later, but I doubt that's a problem.)
+ */
+ if (conn->evict_workers_max == 0)
+ F_SET(session, WT_SESSION_CAN_WAIT);
+
+ if (conn->evict_workers_max > 0) {
+ WT_RET(__wt_calloc_def(
+ session, conn->evict_workers_max, &workers));
+ conn->evict_workctx = workers;
+
+ for (i = 0; i < conn->evict_workers_max; i++) {
+ WT_RET(__wt_open_internal_session(conn,
+ "eviction-worker", 0, 0, &workers[i].session));
+ workers[i].id = i;
+ F_SET(workers[i].session, WT_SESSION_CAN_WAIT);
+
+ if (i < conn->evict_workers_min) {
+ ++conn->evict_workers;
+ F_SET(&workers[i], WT_EVICT_WORKER_RUN);
+ WT_RET(__wt_thread_create(
+ workers[i].session, &workers[i].tid,
+ __evict_worker, &workers[i]));
+ }
+ }
+ }
+
+ /*
+ * Start the primary eviction server thread after the worker threads
+ * have started to avoid it starting additional worker threads before
+ * the worker's sessions are created.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->evict_tid, __evict_server, session));
+ conn->evict_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_evict_destroy --
+ *	Destroy the eviction server thread.
+ *
+ *	Clears WT_CONN_EVICTION_RUN, signals and joins the running worker
+ *	threads, closes every worker session that was opened and frees the
+ *	worker array, then wakes and joins the server thread (if it was
+ *	started) and closes the server's session (if it was opened). Errors
+ *	do not stop the teardown: WT_TRET accumulates them into the value
+ *	returned.
+ */
+int
+__wt_evict_destroy(WT_SESSION_IMPL *session)
+{
+	WT_CACHE *cache;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_EVICT_WORKER *workers;
+	WT_SESSION *wt_session;
+	u_int i;
+
+	conn = S2C(session);
+	cache = conn->cache;
+	workers = conn->evict_workctx;
+
+	F_CLR(conn, WT_CONN_EVICTION_RUN);
+
+	WT_TRET(__wt_verbose(
+	    session, WT_VERB_EVICTSERVER, "waiting for helper threads"));
+	for (i = 0; i < conn->evict_workers; i++) {
+		WT_TRET(__wt_cond_signal(session, cache->evict_waiter_cond));
+		WT_TRET(__wt_thread_join(session, workers[i].tid));
+	}
+	/* Handle shutdown when cleaning up after a failed open */
+	if (conn->evict_workctx != NULL) {
+		for (i = 0; i < conn->evict_workers_max; i++) {
+			/*
+			 * The worker array is published before the worker
+			 * sessions are opened one at a time, so if an open
+			 * failed the remaining slots have no session: skip
+			 * them rather than calling through a NULL handle.
+			 */
+			if (conn->evict_workctx[i].session == NULL)
+				continue;
+			wt_session = &conn->evict_workctx[i].session->iface;
+			WT_TRET(wt_session->close(wt_session, NULL));
+		}
+		__wt_free(session, conn->evict_workctx);
+	}
+
+	/* The server thread only exists if creation got that far. */
+	if (conn->evict_tid_set) {
+		WT_TRET(__wt_evict_server_wake(session));
+		WT_TRET(__wt_thread_join(session, conn->evict_tid));
+		conn->evict_tid_set = 0;
+	}
+
+	if (conn->evict_session != NULL) {
+		wt_session = &conn->evict_session->iface;
+		WT_TRET(wt_session->close(wt_session, NULL));
+
+		conn->evict_session = NULL;
+	}
+
+	return (ret);
+}
+
+/*
+ * __evict_worker --
+ * Thread to help evict pages from the cache.
+ *
+ * The argument is the WT_EVICT_WORKER describing this thread. The thread
+ * loops while both the connection's WT_CONN_EVICTION_RUN flag and the
+ * worker's WT_EVICT_WORKER_RUN flag are set. An error from any call in
+ * the loop jumps to the err label, where it is reported; the thread
+ * always returns NULL.
+ */
+static void *
+__evict_worker(void *arg)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_EVICT_WORKER *worker;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+
+ worker = arg;
+ session = worker->session;
+ conn = S2C(session);
+ cache = conn->cache;
+
+ while (F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
+ F_ISSET(worker, WT_EVICT_WORKER_RUN)) {
+ /*
+ * Don't spin in a busy loop if there is no work to do: when
+ * __evict_has_work reports no flags, wait on the eviction waiter
+ * condition with a timeout of 10000 (presumably microseconds,
+ * confirm against __wt_cond_wait); otherwise evict pages from
+ * the LRU queue, passing is_app as 1.
+ */
+ WT_ERR(__evict_has_work(session, &flags));
+ if (flags == 0)
+ WT_ERR(__wt_cond_wait(
+ session, cache->evict_waiter_cond, 10000));
+ else
+ WT_ERR(__evict_lru_pages(session, 1));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "cache eviction helper error");
+ }
+
+ WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER, "helper exiting"));
+
+ return (NULL);
+}
+
+/*
+ * __evict_has_work --
+ * Find out if there is eviction work to be done.
+ *
+ * *flagsp is always written. It is zero when the connection's
+ * WT_CONN_EVICTION_RUN flag is not set. Otherwise it holds
+ * WT_EVICT_PASS_ALL if the bytes in use exceed the eviction target, else
+ * WT_EVICT_PASS_DIRTY if the dirty bytes exceed the dirty target, and
+ * additionally WT_EVICT_PASS_AGGRESSIVE whenever the cache is flagged
+ * WT_EVICT_STUCK (even if neither target is exceeded). The function
+ * itself always returns 0.
+ */
+static int
+__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ uint32_t flags;
+ uint64_t bytes_inuse, bytes_max, dirty_inuse;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ flags = 0;
+ *flagsp = 0;
+
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ return (0);
+
+ /*
+ * Figure out whether the cache usage exceeds either the eviction
+ * target or the dirty target. Both targets are applied below as
+ * percentages of the configured cache size.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ dirty_inuse = cache->bytes_dirty;
+ bytes_max = conn->cache_size;
+
+ /* Check to see if the eviction server should run. */
+ if (bytes_inuse > (cache->eviction_target * bytes_max) / 100)
+ LF_SET(WT_EVICT_PASS_ALL);
+ else if (dirty_inuse >
+ (cache->eviction_dirty_target * bytes_max) / 100)
+ /* Ignore clean pages unless the cache is too large */
+ LF_SET(WT_EVICT_PASS_DIRTY);
+
+ if (F_ISSET(cache, WT_EVICT_STUCK))
+ LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+
+ *flagsp = flags;
+ return (0);
+}
+
+/*
+ * __evict_pass --
+ * Evict pages from memory.
+ *
+ * Repeatedly calls __evict_lru until __evict_has_work reports no flags,
+ * or until no progress is being made and the cache is (or becomes)
+ * flagged WT_EVICT_STUCK. On each iteration it also services pending
+ * WT_EVICT_CLEAR_WALKS requests and, while the cache is over its
+ * configured size, starts one more worker thread if fewer than
+ * evict_workers_max are running. The loop counter drives escalation:
+ * past 10 iterations the pass becomes aggressive, at exactly 100 the
+ * cache is marked stuck, and it is reset to zero whenever
+ * WT_EVICT_NO_PROGRESS is found clear after a call to __evict_lru.
+ */
+static int
+__evict_pass(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_EVICT_WORKER *worker;
+ int loop;
+ uint32_t flags;
+ uint64_t bytes_inuse;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ /* Evict pages from the cache. */
+ for (loop = 0;; loop++) {
+ /*
+ * If there is a request to clear eviction walks, do that now,
+ * before checking if the cache is full. Threads waiting for the
+ * clear are woken through the eviction waiter condition.
+ */
+ if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS)) {
+ F_CLR(cache, WT_EVICT_CLEAR_WALKS);
+ WT_RET(__evict_clear_walks(session));
+ WT_RET(__wt_cond_signal(
+ session, cache->evict_waiter_cond));
+ }
+
+ WT_RET(__evict_has_work(session, &flags));
+ if (flags == 0)
+ break;
+
+ if (loop > 10)
+ LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ /*
+ * When the cache is full, track whether pages are being
+ * evicted. This will be cleared by the next thread to
+ * successfully evict a page.
+ */
+ if (bytes_inuse > conn->cache_size) {
+ F_SET(cache, WT_EVICT_NO_PROGRESS);
+ } else
+ F_CLR(cache, WT_EVICT_NO_PROGRESS);
+
+ /* Start a worker if we have capacity and the cache is full. */
+ if (bytes_inuse > conn->cache_size &&
+ conn->evict_workers < conn->evict_workers_max) {
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "Starting evict worker: %"PRIu32"\n",
+ conn->evict_workers));
+ worker = &conn->evict_workctx[conn->evict_workers++];
+ F_SET(worker, WT_EVICT_WORKER_RUN);
+ WT_RET(__wt_thread_create(session,
+ &worker->tid, __evict_worker, worker));
+ }
+
+ F_SET(cache, WT_EVICT_ACTIVE);
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "Eviction pass with: Max: %" PRIu64
+ " In use: %" PRIu64 " Dirty: %" PRIu64,
+ conn->cache_size, bytes_inuse, cache->bytes_dirty));
+
+ WT_RET(__evict_lru(session, flags));
+
+ /*
+ * If we're making progress, keep going; if we're not making
+ * any progress at all, mark the cache "stuck" and go back to
+ * sleep, it's not something we can fix.
+ */
+ if (F_ISSET(cache, WT_EVICT_NO_PROGRESS)) {
+ if (F_ISSET(cache, WT_EVICT_STUCK))
+ break;
+ if (loop == 100) {
+ F_SET(cache, WT_EVICT_STUCK);
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_slow);
+ WT_RET(__wt_verbose(
+ session, WT_VERB_EVICTSERVER,
+ "unable to reach eviction goal"));
+ break;
+ }
+ } else
+ loop = 0;
+ }
+ return (0);
+}
+
+/*
+ * __evict_clear_walks --
+ * Clear the eviction walk points for all files.
+ *
+ * Also resets cache->evict_file_next. For every open "file:" handle that
+ * has a saved evict_ref, the reference is cleared and then released with
+ * __wt_page_release while session->dhandle is temporarily pointed at
+ * that handle. Release results are collected with WT_TRET and returned
+ * after the whole list has been visited.
+ */
+static int
+__evict_clear_walks(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_REF *ref;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ cache->evict_file_next = NULL;
+
+ /*
+ * Lock the dhandle list so sweeping cannot change the pointers out
+ * from under us.
+ *
+ * NOTE: we don't hold the schema lock, so we have to take care
+ * that the handles we see are open and valid.
+ */
+ __wt_spin_lock(session, &conn->dhandle_lock);
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ /* Ignore non-file handles, or handles that aren't open. */
+ if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ continue;
+
+ btree = dhandle->handle;
+ session->dhandle = dhandle;
+ if ((ref = btree->evict_ref) != NULL) {
+ /*
+ * Clear evict_ref first, in case releasing it forces
+ * eviction (we assert that we never try to evict the
+ * current eviction walk point).
+ */
+ btree->evict_ref = NULL;
+ WT_TRET(__wt_page_release(session, ref, 0));
+ }
+ session->dhandle = NULL;
+ }
+
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+
+ return (ret);
+}
+
+/*
+ * __evict_tree_walk_clear --
+ * Clear the tree's current eviction point, acquiring the eviction lock.
+ *
+ * While the session's btree still has an evict_ref, this sets
+ * WT_EVICT_CLEAR_WALKS on the cache and waits on the eviction waiter
+ * condition with a timeout of 100000 before re-checking; the clearing
+ * itself is done by whoever services that flag. A failed wait is
+ * returned to the caller; otherwise the function returns 0.
+ *
+ * NOTE(review): no lock is taken in this function as written, despite
+ * the summary line above -- confirm whether the summary is stale.
+ */
+static int
+__evict_tree_walk_clear(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ while (btree->evict_ref != NULL) {
+ F_SET(cache, WT_EVICT_CLEAR_WALKS);
+ WT_RET(__wt_cond_wait(
+ session, cache->evict_waiter_cond, 100000));
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_evict_page --
+ * Evict a given page.
+ *
+ * Refreshes the oldest transaction ID, switches the session's
+ * transaction isolation to TXN_ISO_EVICTION for the duration of the
+ * __wt_rec_evict call, then restores the saved isolation. The return
+ * value is the result of __wt_rec_evict.
+ */
+int
+__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_ISOLATION saved_iso;
+
+ /*
+ * We have to take care when evicting pages not to write a change that:
+ * (a) is not yet committed; or
+ * (b) is committed more recently than an in-progress checkpoint.
+ *
+ * We handle both of these cases by setting up the transaction context
+ * before evicting, using a special "eviction" isolation level, where
+ * only globally visible updates can be evicted.
+ */
+ __wt_txn_update_oldest(session);
+ txn = &session->txn;
+ saved_iso = txn->isolation;
+ txn->isolation = TXN_ISO_EVICTION;
+
+ /*
+ * Sanity check: if a transaction has updates, its updates should not
+ * be visible to eviction.
+ */
+ WT_ASSERT(session,
+ !F_ISSET(txn, TXN_HAS_ID) || !__wt_txn_visible(session, txn->id));
+
+ ret = __wt_rec_evict(session, ref, 0);
+ txn->isolation = saved_iso;
+
+ return (ret);
+}
+
+/*
+ * __wt_evict_file_exclusive_on --
+ * Get exclusive eviction access to a file and discard any of the file's
+ * blocks queued for eviction.
+ *
+ * Operates on the session's current btree. The steps are, in order: set
+ * WT_BTREE_NO_EVICTION under the walk lock, wait for the tree's eviction
+ * walk point to be cleared, remove the tree's entries from the candidate
+ * list under the evict lock, and finally yield until the tree's
+ * evict_busy count drops to zero. Returns 0 unless clearing the walk
+ * point fails.
+ */
+int
+__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ u_int i, elem;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ /*
+ * Hold the walk lock to set the "no eviction" flag: no new pages from
+ * the file will be queued for eviction after this point.
+ */
+ __wt_spin_lock(session, &cache->evict_walk_lock);
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+ __wt_spin_unlock(session, &cache->evict_walk_lock);
+
+ /* Clear any existing LRU eviction walk for the file. */
+ WT_RET(__evict_tree_walk_clear(session));
+
+ /* Hold the evict lock to remove any queued pages from this file. */
+ __wt_spin_lock(session, &cache->evict_lock);
+
+ /*
+ * The eviction candidate list might reference pages from the file,
+ * clear it. Every slot up to evict_max (the highest slot recorded as
+ * used) is checked, not just the current candidates.
+ */
+ elem = cache->evict_max;
+ for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ if (evict->btree == btree)
+ __evict_list_clear(session, evict);
+ __wt_spin_unlock(session, &cache->evict_lock);
+
+ /*
+ * We have disabled further eviction: wait for concurrent LRU eviction
+ * activity to drain.
+ */
+ while (btree->evict_busy > 0)
+ __wt_yield();
+
+ return (0);
+}
+
+/*
+ * __wt_evict_file_exclusive_off --
+ * Release exclusive eviction access to a file.
+ *
+ * Asserts that the session's btree has no eviction walk point and then
+ * clears WT_BTREE_NO_EVICTION on it.
+ *
+ * NOTE(review): the flag is cleared here without taking evict_walk_lock,
+ * whereas __wt_evict_file_exclusive_on sets it under that lock -- confirm
+ * this asymmetry is intended.
+ */
+void
+__wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ WT_ASSERT(session, btree->evict_ref == NULL);
+
+ F_CLR(btree, WT_BTREE_NO_EVICTION);
+}
+
+/*
+ * __evict_lru_pages --
+ * Drain the LRU queue, evicting each queued page that can be evicted.
+ */
+static int
+__evict_lru_pages(WT_SESSION_IMPL *session, int is_app)
+{
+ WT_DECL_RET;
+
+ /*
+ * Keep pulling pages off the queue for as long as eviction succeeds
+ * or reports EBUSY (the page was unavailable, so move on to the next
+ * one); any other result ends the loop.
+ */
+ for (;;) {
+ ret = __wt_evict_lru_page(session, is_app);
+ if (ret != 0 && ret != EBUSY)
+ break;
+ }
+
+ /* Running out of queued pages is the normal way out, not an error. */
+ if (ret == WT_NOTFOUND)
+ return (0);
+ return (ret);
+}
+
+/*
+ * __evict_lru --
+ * Evict pages from the cache based on their read generation.
+ *
+ * Refills the candidate array via __evict_walk, then, under the evict
+ * lock, sorts it with __evict_lru_cmp, trims trailing empty slots,
+ * chooses how many leading entries count as candidates, discards
+ * entries beyond WT_EVICT_WALK_BASE and points evict_current at the
+ * start of the array. After dropping the lock it signals the eviction
+ * waiter condition and either evicts pages itself or, when more than
+ * one worker is running, leaves the eviction to other threads.
+ */
+static int
+__evict_lru(WT_SESSION_IMPL *session, uint32_t flags)
+{
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ uint64_t cutoff;
+ uint32_t candidates, entries, i;
+
+ cache = S2C(session)->cache;
+
+ /* Get some more pages to consider for eviction. */
+ WT_RET(__evict_walk(session, &entries, flags));
+
+ /* Sort the list into LRU order and restart. */
+ __wt_spin_lock(session, &cache->evict_lock);
+
+ qsort(cache->evict,
+ entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
+
+ while (entries > 0 && cache->evict[entries - 1].ref == NULL)
+ --entries;
+
+ cache->evict_entries = entries;
+
+ if (entries == 0) {
+ /*
+ * If there are no entries, there cannot be any candidates.
+ * Make sure application threads don't read past the end of the
+ * candidate list, or they may race with the next walk.
+ */
+ cache->evict_candidates = 0;
+ cache->evict_current = NULL;
+ __wt_spin_unlock(session, &cache->evict_lock);
+ return (0);
+ }
+
+ WT_ASSERT(session, cache->evict[0].ref != NULL);
+
+ /*
+ * Find the bottom 25% of read generations: the cutoff is the value
+ * one quarter of the way from the first entry's read generation to
+ * the last entry's.
+ */
+ cutoff = (3 * __evict_read_gen(&cache->evict[0]) +
+ __evict_read_gen(&cache->evict[entries - 1])) / 4;
+
+ /*
+ * Don't take less than 10% or more than 50% of entries, regardless.
+ * That said, if there is only one entry, which is normal when
+ * populating an empty file, don't exclude it.
+ */
+ for (candidates = 1 + entries / 10;
+ candidates < entries / 2;
+ candidates++)
+ if (__evict_read_gen(&cache->evict[candidates]) > cutoff)
+ break;
+ cache->evict_candidates = candidates;
+
+ /* If we have more than the minimum number of entries, clear them. */
+ if (cache->evict_entries > WT_EVICT_WALK_BASE) {
+ for (i = WT_EVICT_WALK_BASE, evict = cache->evict + i;
+ i < cache->evict_entries;
+ i++, evict++)
+ __evict_list_clear(session, evict);
+ cache->evict_entries = WT_EVICT_WALK_BASE;
+ }
+
+ cache->evict_current = cache->evict;
+ __wt_spin_unlock(session, &cache->evict_lock);
+
+ /*
+ * The eviction server thread doesn't do any actual eviction if there
+ * are multiple eviction workers running.
+ */
+ WT_RET(__wt_cond_signal(session, cache->evict_waiter_cond));
+
+ if (S2C(session)->evict_workers > 1) {
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_server_not_evicting);
+ /*
+ * If there are candidates queued, give other threads a chance
+ * to access them before gathering more.
+ */
+ if (candidates > 10 && cache->evict_current != NULL)
+ __wt_yield();
+ } else {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_server_evicting);
+ WT_RET(__evict_lru_pages(session, 0));
+ }
+
+ return (0);
+}
+
+/*
+ * __evict_walk --
+ * Fill in the array by walking the next set of pages.
+ *
+ * Starting at slot cache->evict_entries, visits the open "file:" handles
+ * on the connection's handle list (resuming at cache->evict_file_next if
+ * set) and asks __evict_walk_file to add candidates from each, until
+ * WT_EVICT_WALK_INCR new slots are filled, an error occurs, or the list
+ * has been walked 10 times. On return *entriesp holds the first unused
+ * slot and cache->evict_file_next the handle to resume from. The flags
+ * are passed through to __evict_walk_file and are also tested here for
+ * WT_EVICT_PASS_AGGRESSIVE.
+ */
+static int
+__evict_walk(WT_SESSION_IMPL *session, u_int *entriesp, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ u_int max_entries, old_slot, retries, slot;
+
+ conn = S2C(session);
+ cache = S2C(session)->cache;
+ retries = 0;
+
+ /* Increment the shared read generation. */
+ __wt_cache_read_gen_incr(session);
+
+ /*
+ * Update the oldest ID: we use it to decide whether pages are
+ * candidates for eviction. Without this, if all threads are blocked
+ * after a long-running transaction (such as a checkpoint) completes,
+ * we may never start evicting again.
+ */
+ __wt_txn_update_oldest(session);
+
+ /*
+ * Set the starting slot in the queue and the maximum pages added
+ * per walk.
+ */
+ slot = cache->evict_entries;
+ max_entries = slot + WT_EVICT_WALK_INCR;
+ if (cache->evict_current == NULL)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
+ else
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_not_empty);
+
+ /*
+ * Lock the dhandle list so sweeping cannot change the pointers out
+ * from under us.
+ *
+ * NOTE: we don't hold the schema lock, so we have to take care
+ * that the handles we see are open and valid.
+ */
+ __wt_spin_lock(session, &conn->dhandle_lock);
+
+retry: SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ /* Ignore non-file handles, or handles that aren't open. */
+ if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ continue;
+
+ /*
+ * Each time we reenter this function, start at the next handle
+ * on the list.
+ */
+ if (cache->evict_file_next != NULL &&
+ cache->evict_file_next != dhandle)
+ continue;
+ cache->evict_file_next = NULL;
+
+ /* Skip files that don't allow eviction. */
+ btree = dhandle->handle;
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ continue;
+
+ /*
+ * Also skip files that are configured to stick in cache until
+ * we get aggressive.
+ */
+ if (btree->evict_priority != 0 &&
+ !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ continue;
+
+ /*
+ * If we are filling the queue, skip files that haven't been
+ * useful in the past.
+ */
+ if (btree->evict_walk_period != 0 &&
+ cache->evict_entries >= WT_EVICT_WALK_INCR &&
+ btree->evict_walk_skips++ < btree->evict_walk_period)
+ continue;
+ btree->evict_walk_skips = 0;
+ old_slot = slot;
+
+ __wt_spin_lock(session, &cache->evict_walk_lock);
+
+ /*
+ * Re-check the "no eviction" flag -- it is used to enforce
+ * exclusive access when a handle is being closed.
+ */
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ WT_WITH_BTREE(session, btree,
+ ret = __evict_walk_file(session, &slot, flags));
+
+ __wt_spin_unlock(session, &cache->evict_walk_lock);
+
+ /*
+ * If we didn't find enough candidates in the file, skip it
+ * next time. The skip period doubles on each unproductive visit,
+ * starting at 1 and capped at 1000, and is reset to zero as soon
+ * as the file yields a full share of candidates or the queue fills.
+ */
+ if (slot >= old_slot + WT_EVICT_WALK_PER_FILE ||
+ slot >= max_entries)
+ btree->evict_walk_period = 0;
+ else
+ btree->evict_walk_period = WT_MIN(
+ WT_MAX(1, 2 * btree->evict_walk_period), 1000);
+
+ if (ret != 0 || slot >= max_entries)
+ break;
+ }
+
+ /* Walk the list of files a few times if we don't find enough pages. */
+ if (ret == 0 && slot < max_entries && ++retries < 10)
+ goto retry;
+
+ /*
+ * Remember the file we should visit first, next loop. If the list
+ * walk ran to completion dhandle is NULL here, so the next walk
+ * starts from the head of the list.
+ */
+ if (dhandle != NULL)
+ dhandle = SLIST_NEXT(dhandle, l);
+ cache->evict_file_next = dhandle;
+
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+
+ *entriesp = slot;
+ return (ret);
+}
+
+/*
+ * __evict_init_candidate --
+ * Initialize a WT_EVICT_ENTRY structure with a given page.
+ *
+ * The entry must point into the cache->evict array: its slot number is
+ * computed from the pointer difference and used to grow
+ * cache->evict_max. Any reference already held by the entry is cleared
+ * first. The entry then records the reference and the session's current
+ * btree, and the page is flagged WT_PAGE_EVICT_LRU.
+ */
+static void
+__evict_init_candidate(
+ WT_SESSION_IMPL *session, WT_EVICT_ENTRY *evict, WT_REF *ref)
+{
+ WT_CACHE *cache;
+ u_int slot;
+
+ cache = S2C(session)->cache;
+
+ /* Keep track of the maximum slot we are using. */
+ slot = (u_int)(evict - cache->evict);
+ if (slot >= cache->evict_max)
+ cache->evict_max = slot + 1;
+
+ if (evict->ref != NULL)
+ __evict_list_clear(session, evict);
+ evict->ref = ref;
+ evict->btree = S2BT(session);
+
+ /* Mark the page on the list */
+ F_SET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU);
+}
+
+/*
+ * __evict_walk_file --
+ * Get a few page eviction candidates from a single underlying file.
+ *
+ * Walks the session's current btree from its saved evict_ref, adding
+ * suitable pages to the candidate array starting at slot *slotp, for at
+ * most WT_EVICT_WALK_PER_FILE entries and never past the end of the
+ * array. On a normal return *slotp is advanced by the number of entries
+ * added and the walk statistic is updated. A WT_NOTFOUND result from
+ * the tree walk is converted to success.
+ *
+ * NOTE(review): the __wt_verbose call inside the loop is wrapped in
+ * WT_RET; if that returns directly on failure, entries already added in
+ * this call would not be reflected in *slotp -- confirm.
+ */
+static int
+__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ WT_EVICT_ENTRY *end, *evict, *start;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ uint64_t pages_walked;
+ uint32_t walk_flags;
+ int internal_pages, modified, restarts;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+ start = cache->evict + *slotp;
+ end = WT_MIN(start + WT_EVICT_WALK_PER_FILE,
+ cache->evict + cache->evict_slots);
+
+ walk_flags =
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
+ /*
+ * Get some more eviction candidate pages. The tree walk itself is
+ * the loop's step expression, so every "continue" below advances to
+ * the next page and counts it in pages_walked.
+ */
+ for (evict = start, pages_walked = 0, internal_pages = restarts = 0;
+ evict < end && (ret == 0 || ret == WT_NOTFOUND);
+ ret = __wt_tree_walk(session, &btree->evict_ref, walk_flags),
+ ++pages_walked) {
+ if (btree->evict_ref == NULL) {
+ /*
+ * Take care with terminating this loop.
+ *
+ * Don't make an extra call to __wt_tree_walk: that will
+ * leave a page pinned, which may prevent any work from
+ * being done.
+ */
+ if (++restarts == 2)
+ break;
+ continue;
+ }
+
+ /* Ignore root pages entirely. */
+ if (__wt_ref_is_root(btree->evict_ref))
+ continue;
+ page = btree->evict_ref->page;
+
+ /*
+ * Use the EVICT_LRU flag to avoid putting pages onto the list
+ * multiple times.
+ */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ continue;
+
+ /* Limit internal pages to 50% unless we get aggressive. */
+ if ((page->type == WT_PAGE_COL_INT ||
+ page->type == WT_PAGE_ROW_INT) &&
+ ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 &&
+ !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ break;
+
+ /*
+ * If this page has never been considered for eviction,
+ * set its read generation to a little bit in the
+ * future and move on, give readers a chance to start
+ * updating the read generation.
+ */
+ if (page->read_gen == WT_READGEN_NOTSET) {
+ page->read_gen = __wt_cache_read_gen_set(session);
+ continue;
+ }
+
+ /*
+ * If the file is being checkpointed, there's a period of time
+ * where we can't discard dirty pages because of possible races
+ * with the checkpointing thread.
+ */
+ modified = __wt_page_is_modified(page);
+ if (modified && btree->checkpointing)
+ continue;
+
+ /* Optionally ignore clean pages. */
+ if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
+ continue;
+
+ /*
+ * If the page is clean but has modifications that appear too
+ * new to evict, skip it.
+ */
+ mod = page->modify;
+ if (!modified && mod != NULL &&
+ !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ continue;
+
+ /*
+ * If the oldest transaction hasn't changed since the
+ * last time this page was written, it's unlikely that
+ * we can make progress. Similarly, if the most recent
+ * update on the page is not yet globally visible,
+ * eviction will fail. These heuristics attempt to
+ * avoid repeated attempts to evict the same page.
+ *
+ * That said, if eviction is stuck, or the file is
+ * being checkpointed, try anyway: maybe a transaction
+ * that was running last time we wrote the page has
+ * since rolled back, or we can help get the checkpoint
+ * completed sooner.
+ */
+ if (modified && !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) &&
+ !btree->checkpointing &&
+ (mod->disk_snap_min == S2C(session)->txn_global.oldest_id ||
+ !__wt_txn_visible_all(session, mod->update_txn)))
+ continue;
+
+ WT_ASSERT(session, evict->ref == NULL);
+ __evict_init_candidate(session, evict, btree->evict_ref);
+ ++evict;
+
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "select: %p, size %" PRIu64, page, page->memory_footprint));
+ }
+
+ /* If the walk was interrupted by a locked page, that's okay. */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ *slotp += (u_int)(evict - start);
+ WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked);
+ return (ret);
+}
+
+/*
+ * __evict_get_ref --
+ * Get a page for eviction.
+ *
+ * On success returns 0 with *btreep and *refp set: the reference has
+ * been switched from WT_REF_MEM to WT_REF_LOCKED, the btree's
+ * evict_busy count has been incremented and the queue entry has been
+ * cleared. Returns WT_NOTFOUND (with both outputs NULL) when no page is
+ * available, and WT_ROLLBACK for an application thread that is the
+ * oldest transaction while the cache is flagged WT_EVICT_STUCK.
+ */
+static int
+__evict_get_ref(
+ WT_SESSION_IMPL *session, int is_app, WT_BTREE **btreep, WT_REF **refp)
+{
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ uint32_t candidates;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ cache = S2C(session)->cache;
+ *btreep = NULL;
+ *refp = NULL;
+
+ /*
+ * A pathological case: if we're the oldest transaction in the system
+ * and the eviction server is stuck trying to find space, abort the
+ * transaction to give up all hazard pointers before trying again.
+ */
+ if (is_app && F_ISSET(cache, WT_EVICT_STUCK) &&
+ __wt_txn_am_oldest(session)) {
+ F_CLR(cache, WT_EVICT_STUCK);
+ WT_STAT_FAST_CONN_INCR(session, txn_fail_cache);
+ return (WT_ROLLBACK);
+ }
+
+ /*
+ * Avoid the LRU lock if no pages are available. If there are pages
+ * available, spin until we get the lock. If this function returns
+ * without getting a page to evict, application threads assume there
+ * are no more pages available and will attempt to wake the eviction
+ * server.
+ */
+ for (;;) {
+ if (cache->evict_current == NULL)
+ return (WT_NOTFOUND);
+ if (__wt_spin_trylock(session, &cache->evict_lock, &id) == 0)
+ break;
+ __wt_yield();
+ }
+
+ /*
+ * The eviction server only tries to evict half of the pages before
+ * looking for more.
+ */
+ candidates = cache->evict_candidates;
+ if (!is_app && candidates > 1)
+ candidates /= 2;
+
+ /* Get the next page queued for eviction. */
+ while ((evict = cache->evict_current) != NULL &&
+ evict < cache->evict + candidates && evict->ref != NULL) {
+ WT_ASSERT(session, evict->btree != NULL);
+
+ /* Move to the next item. */
+ ++cache->evict_current;
+
+ /*
+ * Lock the page while holding the eviction mutex to prevent
+ * multiple attempts to evict it. For pages that are already
+ * being evicted, this operation will fail and we will move on.
+ */
+ if (!WT_ATOMIC_CAS4(
+ evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
+ __evict_list_clear(session, evict);
+ continue;
+ }
+
+ /*
+ * Increment the busy count in the btree handle to prevent it
+ * from being closed under us.
+ */
+ (void)WT_ATOMIC_ADD4(evict->btree->evict_busy, 1);
+
+ *btreep = evict->btree;
+ *refp = evict->ref;
+
+ /*
+ * Remove the entry so we never try to reconcile the same page
+ * on reconciliation error.
+ */
+ __evict_list_clear(session, evict);
+ break;
+ }
+
+ /*
+ * Clear the current pointer if there are no more candidates. This
+ * test uses the full cache->evict_candidates count, not the halved
+ * local limit applied to the eviction server above.
+ */
+ if (evict >= cache->evict + cache->evict_candidates)
+ cache->evict_current = NULL;
+ __wt_spin_unlock(session, &cache->evict_lock);
+
+ return ((*refp == NULL) ? WT_NOTFOUND : 0);
+}
+
+/*
+ * __wt_evict_lru_page --
+ * Called by both eviction and application threads to evict a page.
+ *
+ * Takes the next page from the LRU queue via __evict_get_ref, bumps its
+ * read generation, evicts it with the owning btree set as the session's
+ * current tree, and then drops the btree's evict_busy count. Any error
+ * from __evict_get_ref or from the eviction is returned; on success the
+ * cache's WT_EVICT_NO_PROGRESS and WT_EVICT_STUCK flags are cleared.
+ */
+int
+__wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_REF *ref;
+
+ WT_RET(__evict_get_ref(session, is_app, &btree, &ref));
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+
+ /*
+ * In case something goes wrong, don't pick the same set of pages every
+ * time.
+ *
+ * We used to bump the page's read generation only if eviction failed,
+ * but that isn't safe: at that point, eviction has already unlocked
+ * the page and some other thread may have evicted it by the time we
+ * look at it.
+ *
+ * Pages whose read generation is WT_READGEN_OLDEST are left alone.
+ */
+ page = ref->page;
+ if (page->read_gen != WT_READGEN_OLDEST)
+ page->read_gen = __wt_cache_read_gen_set(session);
+
+ WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref));
+
+ (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+
+ WT_RET(ret);
+
+ cache = S2C(session)->cache;
+ if (F_ISSET(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK))
+ F_CLR(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK);
+
+ return (ret);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_cache_dump --
+ * Dump debugging information to stdout about the size of the files in the
+ * cache.
+ *
+ * NOTE: this function is not called anywhere, it is intended to be called
+ * from a debugger.
+ *
+ * For each open "file:" handle that allows eviction, the in-cache pages
+ * are walked and one line is printed with the internal and leaf page
+ * counts plus the total and dirty footprint in megabytes (byte counts
+ * shifted right by 20). A final line compares the summed total with the
+ * cache's tracked bytes in use.
+ *
+ * NOTE(review): the handle list is walked here without taking
+ * dhandle_lock, unlike the eviction walks in this file -- presumably
+ * acceptable for a debugger-only helper, confirm.
+ */
+void
+__wt_cache_dump(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_REF *next_walk;
+ WT_PAGE *page;
+ uint64_t file_intl_pages, file_leaf_pages;
+ uint64_t file_bytes, file_dirty, total_bytes;
+
+ conn = S2C(session);
+ total_bytes = 0;
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ continue;
+
+ btree = dhandle->handle;
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ continue;
+
+ file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0;
+ next_walk = NULL;
+ session->dhandle = dhandle;
+ while (__wt_tree_walk(session,
+ &next_walk, WT_READ_CACHE | WT_READ_NO_WAIT) == 0 &&
+ next_walk != NULL) {
+ page = next_walk->page;
+ if (page->type == WT_PAGE_COL_INT ||
+ page->type == WT_PAGE_ROW_INT)
+ ++file_intl_pages;
+ else
+ ++file_leaf_pages;
+ file_bytes += page->memory_footprint;
+ if (__wt_page_is_modified(page))
+ file_dirty += page->memory_footprint;
+ }
+ session->dhandle = NULL;
+
+ printf("cache dump: %s [%s]:"
+ " %" PRIu64 " intl pages, %" PRIu64 " leaf pages,"
+ " %" PRIu64 "MB, %" PRIu64 "MB dirty\n",
+ dhandle->name, dhandle->checkpoint,
+ file_intl_pages, file_leaf_pages,
+ file_bytes >> 20, file_dirty >> 20);
+
+ total_bytes += file_bytes;
+ }
+ printf("cache dump: total found = %" PRIu64 "MB"
+ " vs tracked inuse %" PRIu64 "MB\n",
+ total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20);
+ fflush(stdout);
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
new file mode 100644
index 00000000000..a21d6d277d3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -0,0 +1,770 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
+static int __btree_get_last_recno(WT_SESSION_IMPL *);
+static int __btree_page_sizes(WT_SESSION_IMPL *);
+static int __btree_preload(WT_SESSION_IMPL *);
+static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int);
+
+static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t);
+static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int);
+
+/*
+ * __wt_btree_open --
+ * Open a Btree.
+ *
+ * Operates on the session's current data handle: looks up the checkpoint
+ * for the handle's name/checkpoint pair, configures the WT_BTREE, opens the
+ * block manager for the "file:" URI and, unless the btree is flagged for
+ * salvage, upgrade or verify, loads the checkpoint and opens either the
+ * on-disk tree or an empty tree.
+ *
+ * op_cfg is only consulted for the "force" setting when the btree is
+ * flagged WT_BTREE_SALVAGE.
+ *
+ * A failure of the checkpoint lookup returns immediately. Any later failure
+ * jumps to the err label, where the btree is closed; the checkpoint
+ * information is freed on both the success and the err paths. The function
+ * returns ret.
+ */
+int
+__wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT ckpt;
+ WT_CONFIG_ITEM cval;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ size_t root_addr_size;
+ uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int creation, forced_salvage, readonly;
+ const char *filename;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ /* Checkpoint files are readonly. */
+ readonly = dhandle->checkpoint == NULL ? 0 : 1;
+
+ /* Get the checkpoint information for this name/checkpoint pair. */
+ WT_CLEAR(ckpt);
+ WT_RET(__wt_meta_checkpoint(
+ session, dhandle->name, dhandle->checkpoint, &ckpt));
+
+ /*
+ * Bulk-load is only permitted on newly created files, not any empty
+ * file -- see the checkpoint code for a discussion.
+ *
+ * An empty raw checkpoint cookie is what marks the file as newly
+ * created here.
+ */
+ creation = ckpt.raw.size == 0;
+ if (!creation && F_ISSET(btree, WT_BTREE_BULK))
+ WT_ERR_MSG(session, EINVAL,
+ "bulk-load is only supported on newly created objects");
+
+ /* Handle salvage configuration. */
+ forced_salvage = 0;
+ if (F_ISSET(btree, WT_BTREE_SALVAGE)) {
+ WT_ERR(__wt_config_gets(session, op_cfg, "force", &cval));
+ forced_salvage = (cval.val != 0);
+ }
+
+ /* Initialize and configure the WT_BTREE structure. */
+ WT_ERR(__btree_conf(session, &ckpt));
+
+ /* Connect to the underlying block manager. */
+ filename = dhandle->name;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");
+
+ WT_ERR(__wt_block_manager_open(session, filename, dhandle->cfg,
+ forced_salvage, readonly, btree->allocsize, &btree->bm));
+ bm = btree->bm;
+
+ /*
+ * !!!
+ * As part of block-manager configuration, we need to return the maximum
+ * sized address cookie that a block manager will ever return. There's
+ * a limit of WT_BTREE_MAX_ADDR_COOKIE, but at 255B, it's too large for
+ * a Btree with 512B internal pages. The default block manager packs
+ * a wt_off_t and 2 uint32_t's into its cookie, so there's no problem
+ * now, but when we create a block manager extension API, we need some
+ * way to consider the block manager's maximum cookie size versus the
+ * minimum Btree internal node size.
+ */
+ btree->block_header = bm->block_header(bm);
+
+ /*
+ * Open the specified checkpoint unless it's a special command (special
+ * commands are responsible for loading their own checkpoints, if any).
+ */
+ if (!F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+ /*
+ * There are two reasons to load an empty tree rather than a
+ * checkpoint: either there is no checkpoint (the file is
+ * being created), or the load call returns no root page (the
+ * checkpoint is for an empty file).
+ */
+ WT_ERR(bm->checkpoint_load(bm, session,
+ ckpt.raw.data, ckpt.raw.size,
+ root_addr, &root_addr_size, readonly));
+ if (creation || root_addr_size == 0)
+ WT_ERR(__btree_tree_open_empty(
+ session, creation, readonly));
+ else {
+ WT_ERR(__wt_btree_tree_open(
+ session, root_addr, root_addr_size));
+
+ /* Warm the cache, if possible. */
+ WT_ERR(__btree_preload(session));
+
+ /* Get the last record number in a column-store file. */
+ if (btree->type != BTREE_ROW)
+ WT_ERR(__btree_get_last_recno(session));
+ }
+ }
+
+ /*
+ * The close below is guarded by "if (0)" so straight-line (successful)
+ * execution skips it; it only runs when control jumps to the err label.
+ */
+ if (0) {
+err: WT_TRET(__wt_btree_close(session));
+ }
+ /* Runs on both the success and the err paths. */
+ __wt_meta_checkpoint_free(session, &ckpt);
+
+ return (ret);
+}
+
+/*
+ * __wt_btree_close --
+ * Close a Btree.
+ *
+ * Tears down the session's current btree: unloads the checkpoint and closes
+ * the block manager (when one is attached), closes the Huffman tables,
+ * destroys the overflow and flush locks, frees the key/value format strings
+ * and terminates the collator if the btree owns it.
+ *
+ * Every step is attempted even if an earlier one failed; failures are
+ * collected through WT_TRET into ret, which is returned.
+ */
+int
+__wt_btree_close(WT_SESSION_IMPL *session)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ if ((bm = btree->bm) != NULL) {
+ /* Unload the checkpoint, unless it's a special command. */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ !F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+ WT_TRET(bm->checkpoint_unload(bm, session));
+
+ /* Close the underlying block manager reference. */
+ WT_TRET(bm->close(bm, session));
+
+ /* Clear the reference so a second close skips this branch. */
+ btree->bm = NULL;
+ }
+
+ /* Close the Huffman tree. */
+ __wt_btree_huffman_close(session);
+
+ /* Destroy locks. */
+ WT_TRET(__wt_rwlock_destroy(session, &btree->ovfl_lock));
+ __wt_spin_destroy(session, &btree->flush_lock);
+
+ /* Free allocated memory. */
+ __wt_free(session, btree->key_format);
+ __wt_free(session, btree->value_format);
+
+ /* Only terminate a collator this btree owns. */
+ if (btree->collator_owned) {
+ if (btree->collator->terminate != NULL)
+ WT_TRET(btree->collator->terminate(
+ btree->collator, &session->iface));
+ btree->collator_owned = 0;
+ }
+ btree->collator = NULL;
+
+ btree->bulk_load_ok = 0;
+
+ return (ret);
+}
+
+/*
+ * __btree_conf --
+ *    Configure a WT_BTREE structure.
+ *
+ * Reads the data handle's configuration into the session's WT_BTREE: file
+ * ID, key and value formats (which also select row-store, variable-length
+ * or fixed-length column-store), collator and key gap, page sizes, eviction
+ * flags, checksum mode, Huffman tables, reconciliation settings and the
+ * block compressor.  It then initializes the btree's locks and statistics
+ * and takes the write generation from ckpt.
+ *
+ * Returns 0 on success.  A failing step returns its error immediately
+ * without undoing earlier steps; __wt_btree_open, the caller in this file,
+ * closes the btree when this function fails.
+ */
+static int
+__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
+{
+    WT_BTREE *btree;
+    WT_CONFIG_ITEM cval;
+    WT_CONNECTION_IMPL *conn;
+    WT_NAMED_COMPRESSOR *ncomp;
+    int64_t maj_version, min_version;
+    uint32_t bitcnt;
+    int fixed;
+    const char **cfg;
+
+    btree = S2BT(session);
+    conn = S2C(session);
+    cfg = btree->dhandle->cfg;
+
+    /* Dump out format information. */
+    if (WT_VERBOSE_ISSET(session, WT_VERB_VERSION)) {
+        WT_RET(__wt_config_gets(session, cfg, "version.major", &cval));
+        maj_version = cval.val;
+        WT_RET(__wt_config_gets(session, cfg, "version.minor", &cval));
+        min_version = cval.val;
+        /* The values are int64_t, print them with a signed conversion. */
+        WT_RET(__wt_verbose(session, WT_VERB_VERSION,
+            "%" PRId64 ".%" PRId64, maj_version, min_version));
+    }
+
+    /* Get the file ID. */
+    WT_RET(__wt_config_gets(session, cfg, "id", &cval));
+    btree->id = (uint32_t)cval.val;
+
+    /* Validate file types and check the data format plan. */
+    WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
+    WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+    /* A key format of "r" selects a column-store, anything else a row-store. */
+    if (WT_STRING_MATCH("r", cval.str, cval.len))
+        btree->type = BTREE_COL_VAR;
+    else
+        btree->type = BTREE_ROW;
+    WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->key_format));
+
+    WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
+    WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+    WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format));
+
+    /* Row-store key comparison and key gap for prefix compression. */
+    if (btree->type == BTREE_ROW) {
+        WT_RET(__wt_collator_config(
+            session, cfg, &btree->collator, &btree->collator_owned));
+
+        WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval));
+        btree->key_gap = (uint32_t)cval.val;
+    }
+
+    /*
+     * Column-store: check for fixed-size data.
+     *
+     * On this path cval still holds the value format: the row-store
+     * branch above, the only intervening code that overwrites cval, is
+     * not taken for a column-store.
+     */
+    if (btree->type == BTREE_COL_VAR) {
+        WT_RET(__wt_struct_check(
+            session, cval.str, cval.len, &fixed, &bitcnt));
+        if (fixed) {
+            if (bitcnt == 0 || bitcnt > 8)
+                WT_RET_MSG(session, EINVAL,
+                    "fixed-width field sizes must be greater "
+                    "than 0 and less than or equal to 8");
+            btree->bitcnt = (uint8_t)bitcnt;
+            btree->type = BTREE_COL_FIX;
+        }
+    }
+
+    /* Page sizes */
+    WT_RET(__btree_page_sizes(session));
+
+    /* Eviction; the metadata file is never evicted. */
+    if (WT_IS_METADATA(btree->dhandle))
+        F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
+    else {
+        WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
+        if (cval.val)
+            F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
+        else
+            F_CLR(btree, WT_BTREE_NO_EVICTION);
+    }
+
+    /* Checksums: anything other than "on" or "off" means uncompressed-only. */
+    WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
+    if (WT_STRING_MATCH("on", cval.str, cval.len))
+        btree->checksum = CKSUM_ON;
+    else if (WT_STRING_MATCH("off", cval.str, cval.len))
+        btree->checksum = CKSUM_OFF;
+    else
+        btree->checksum = CKSUM_UNCOMPRESSED;
+
+    /* Huffman encoding */
+    WT_RET(__wt_btree_huffman_open(session));
+
+    /*
+     * Reconciliation configuration:
+     *    Block compression (all)
+     *    Dictionary compression (variable-length column-store, row-store)
+     *    Page-split percentage
+     *    Prefix compression (row-store)
+     *    Suffix compression (row-store)
+     */
+    switch (btree->type) {
+    case BTREE_COL_FIX:
+        break;
+    case BTREE_ROW:
+        WT_RET(__wt_config_gets(
+            session, cfg, "internal_key_truncate", &cval));
+        btree->internal_key_truncate = cval.val == 0 ? 0 : 1;
+
+        WT_RET(__wt_config_gets(
+            session, cfg, "prefix_compression", &cval));
+        btree->prefix_compression = cval.val == 0 ? 0 : 1;
+        WT_RET(__wt_config_gets(
+            session, cfg, "prefix_compression_min", &cval));
+        btree->prefix_compression_min = (u_int)cval.val;
+        /* FALLTHROUGH */
+    case BTREE_COL_VAR:
+        WT_RET(__wt_config_gets(session, cfg, "dictionary", &cval));
+        btree->dictionary = (u_int)cval.val;
+        break;
+    }
+
+    /* Look the configured block compressor up by name on the connection. */
+    WT_RET(__wt_config_gets(session, cfg, "block_compressor", &cval));
+    if (cval.len > 0) {
+        TAILQ_FOREACH(ncomp, &conn->compqh, q)
+            if (WT_STRING_MATCH(ncomp->name, cval.str, cval.len)) {
+                btree->compressor = ncomp->compressor;
+                break;
+            }
+        if (btree->compressor == NULL)
+            WT_RET_MSG(session, EINVAL,
+                "unknown block compressor '%.*s'",
+                (int)cval.len, cval.str);
+    }
+
+    /* Initialize locks. */
+    WT_RET(__wt_rwlock_alloc(
+        session, &btree->ovfl_lock, "btree overflow lock"));
+    WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock"));
+
+    __wt_stat_init_dsrc_stats(&btree->dhandle->stats);
+
+    btree->write_gen = ckpt->write_gen;     /* Write generation */
+    btree->modified = 0;                    /* Clean */
+
+    return (0);
+}
+
+/*
+ * __wt_root_ref_init --
+ *    Reset a root reference and tie it to its root page.
+ *
+ * The reference is zeroed, pointed at the page and marked WT_REF_MEM; its
+ * record-number key is 1 when is_recno is non-zero and 0 otherwise.  The
+ * page's parent reference is pointed back at root_ref.
+ */
+void
+__wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno)
+{
+    /* Start from an all-zero reference. */
+    memset(root_ref, 0, sizeof(WT_REF));
+
+    /* Reference -> page. */
+    root_ref->page = root;
+    root_ref->state = WT_REF_MEM;
+    if (is_recno)
+        root_ref->key.recno = 1;
+    else
+        root_ref->key.recno = 0;
+
+    /* Page -> reference. */
+    root->pg_intl_parent_ref = root_ref;
+}
+
+/*
+ * __wt_btree_tree_open --
+ * Read in a tree from disk.
+ *
+ * Reads the block identified by the address cookie (addr, addr_size),
+ * builds the in-memory page from it and installs that page as the root of
+ * the session's btree. If the read or the page build fails, the read buffer
+ * is freed and the error is returned.
+ */
+int
+__wt_btree_tree_open(
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_ITEM dsk;
+ WT_PAGE *page;
+
+ btree = S2BT(session);
+
+ /*
+ * A buffer into which we read a root page; don't use a scratch buffer,
+ * the buffer's allocated memory becomes the persistent in-memory page.
+ */
+ WT_CLEAR(dsk);
+
+ /*
+ * Read the page, then build the in-memory version of the page.
+ *
+ * The flag passed to __wt_page_inmem depends on WT_DATA_IN_ITEM(&dsk):
+ * WT_PAGE_DISK_ALLOC when it is true, WT_PAGE_DISK_MAPPED otherwise.
+ */
+ WT_ERR(__wt_bt_read(session, &dsk, addr, addr_size));
+ WT_ERR(__wt_page_inmem(session, NULL, dsk.data,
+ WT_DATA_IN_ITEM(&dsk) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED , &page));
+
+ /* Finish initializing the root, root reference links. */
+ __wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW);
+
+ /* The buffer is only freed when control jumps to the err label. */
+ if (0) {
+err: __wt_buf_free(session, &dsk);
+ }
+ return (ret);
+}
+
+/*
+ * __btree_tree_open_empty --
+ * Create an empty in-memory tree.
+ *
+ * Builds a root internal page with a single child reference pointing at a
+ * new, empty leaf page, then installs the root in the session's btree.
+ *
+ * creation: non-zero when the file is newly created; the btree is then
+ * marked as eligible for bulk-load and eviction is turned off for it.
+ * readonly: non-zero for a checkpoint handle; the leaf page is only marked
+ * dirty when this is zero.
+ *
+ * Returns 0 on success. On error, whichever of the leaf and root pages were
+ * allocated are discarded and the error is returned.
+ */
+static int
+__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *root, *leaf;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref;
+
+ btree = S2BT(session);
+ root = leaf = NULL;
+
+ /*
+ * Newly created objects can be used for cursor inserts or for bulk
+ * loads; set a flag that's cleared when a row is inserted into the
+ * tree. Objects being bulk-loaded cannot be evicted, we set it
+ * globally, there's no point in searching empty trees for eviction.
+ */
+ if (creation) {
+ btree->bulk_load_ok = 1;
+ __wt_btree_evictable(session, 0);
+ }
+
+ /*
+ * A note about empty trees: the initial tree is a root page and a leaf
+ * page. We need a pair of pages instead of just a single page because
+ * we can reconcile the leaf page while the root stays pinned in memory.
+ * If the pair is evicted without being modified, that's OK, nothing is
+ * ever written.
+ *
+ * Create the root and leaf pages.
+ *
+ * !!!
+ * Be cautious about changing the order of updates in this code: to call
+ * __wt_page_out on error, we require a correct page setup at each point
+ * where we might fail.
+ */
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ERR(
+ __wt_page_alloc(session, WT_PAGE_COL_INT, 1, 1, 1, &root));
+ root->pg_intl_parent_ref = &btree->root;
+
+ pindex = WT_INTL_INDEX_COPY(root);
+ ref = pindex->index[0];
+ ref->home = root;
+ WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
+ ref->page = leaf;
+ ref->addr = NULL;
+ ref->state = WT_REF_MEM;
+ /* Column-store: the single child is keyed by record number 1. */
+ ref->key.recno = 1;
+ break;
+ case BTREE_ROW:
+ WT_ERR(
+ __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, 1, 1, &root));
+ root->pg_intl_parent_ref = &btree->root;
+
+ pindex = WT_INTL_INDEX_COPY(root);
+ ref = pindex->index[0];
+ ref->home = root;
+ WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
+ ref->page = leaf;
+ ref->addr = NULL;
+ ref->state = WT_REF_MEM;
+ /* Row-store: the single child is keyed by a one-byte "" key. */
+ WT_ERR(__wt_row_ikey_incr(
+ session, root, 0, "", 1, &ref->key.ikey));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Mark the leaf page dirty: we didn't create an entirely valid root
+ * page (specifically, the root page's disk address isn't set, and it's
+ * the act of reconciling the leaf page that makes it work, we don't
+ * try and use the original disk address of modified pages). We could
+ * get around that by leaving the leaf page clean and building a better
+ * root page, but then we get into trouble because a checkpoint marks
+ * the root page dirty to force a write, and without reconciling the
+ * leaf page we won't realize there's no records to write, we'll write
+ * a root page, which isn't correct for an empty tree.
+ *
+ * Earlier versions of this code kept the leaf page clean, but with the
+ * "empty" flag set in the leaf page's modification structure; in that
+ * case, checkpoints works (forced reconciliation of a root with a
+ * single "empty" page wouldn't write any blocks). That version had
+ * memory leaks because the eviction code didn't correctly handle pages
+ * that were "clean" (and so never reconciled), yet "modified" with an
+ * "empty" flag. The goal of this code is to mimic a real tree that
+ * simply has no records, for whatever reason, and trust reconciliation
+ * to figure out it's empty and not write any blocks.
+ *
+ * We do not set the tree's modified flag because the checkpoint code
+ * skips unmodified files in closing checkpoints (checkpoints that
+ * don't require a write unless the file is actually dirty). There's
+ * no need to reconcile this file unless the application does a real
+ * checkpoint or it's actually modified.
+ *
+ * Only do this for a live tree, not for checkpoints. If we open an
+ * empty checkpoint, the leaf page cannot be dirty or eviction may try
+ * to write it, which will fail because checkpoints are read-only.
+ */
+ if (!readonly) {
+ WT_ERR(__wt_page_modify_init(session, leaf));
+ __wt_page_only_modify_set(session, leaf);
+ }
+
+ /* Finish initializing the root, root reference links. */
+ __wt_root_ref_init(&btree->root, root, btree->type != BTREE_ROW);
+
+ return (0);
+
+ /* Error path: discard the leaf first, then the root. */
+err: if (leaf != NULL)
+ __wt_page_out(session, &leaf);
+ if (root != NULL)
+ __wt_page_out(session, &root);
+ return (ret);
+}
+
+/*
+ * __wt_btree_new_leaf_page --
+ *    Allocate an empty leaf page matching the session's btree type and hand
+ * it back through pagep.
+ *
+ * Fixed-length and variable-length column-stores get WT_PAGE_COL_FIX and
+ * WT_PAGE_COL_VAR pages starting at record number 1, row-stores get a
+ * WT_PAGE_ROW_LEAF page.  Returns the allocation's result.
+ */
+int
+__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep)
+{
+    switch (S2BT(session)->type) {
+    case BTREE_COL_FIX:
+        return (
+            __wt_page_alloc(session, WT_PAGE_COL_FIX, 1, 0, 1, pagep));
+    case BTREE_COL_VAR:
+        return (
+            __wt_page_alloc(session, WT_PAGE_COL_VAR, 1, 0, 1, pagep));
+    case BTREE_ROW:
+        return (
+            __wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 1, pagep));
+    WT_ILLEGAL_VALUE(session);
+    }
+
+    /* Only reached if the illegal-value case falls out of the switch. */
+    return (0);
+}
+
+/*
+ * __wt_btree_evictable --
+ *    Turn eviction on or off for the session's btree.
+ *
+ * A non-zero "on" clears WT_BTREE_NO_EVICTION, except for the metadata
+ * file, which always keeps the flag; a zero "on" sets the flag.
+ */
+void
+__wt_btree_evictable(WT_SESSION_IMPL *session, int on)
+{
+    WT_BTREE *tree;
+
+    tree = S2BT(session);
+
+    /* Eviction stays off when asked for, and always for the metadata. */
+    if (!on || WT_IS_METADATA(tree->dhandle))
+        F_SET(tree, WT_BTREE_NO_EVICTION);
+    else
+        F_CLR(tree, WT_BTREE_NO_EVICTION);
+}
+
+/*
+ * __btree_preload --
+ *    Ask the block manager to pre-load the pages referenced from the root.
+ *
+ * Each reference on the root page that has an address is passed to the
+ * block manager's preload method; the first failure is returned.
+ */
+static int
+__btree_preload(WT_SESSION_IMPL *session)
+{
+    WT_BM *block_mgr;
+    WT_BTREE *tree;
+    WT_REF *child;
+    size_t cookie_size;
+    const uint8_t *cookie;
+
+    tree = S2BT(session);
+    block_mgr = tree->bm;
+
+    /* Visit the root's children, that is, the second level of the tree. */
+    WT_INTL_FOREACH_BEGIN(session, tree->root.page, child) {
+        WT_RET(__wt_ref_info(
+            session, child, &cookie, &cookie_size, NULL));
+        /* References without an address have nothing to load. */
+        if (cookie != NULL)
+            WT_RET(block_mgr->preload(
+                block_mgr, session, cookie, cookie_size));
+    } WT_INTL_FOREACH_END;
+    return (0);
+}
+
+/*
+ * __btree_get_last_recno --
+ *    Record the column-store's last record number in the btree.
+ *
+ * Starts a WT_READ_PREV tree walk from no position and takes the last
+ * record number from the page it lands on.  Returns WT_NOTFOUND if the walk
+ * yields no page, otherwise the result of releasing that page.
+ */
+static int
+__btree_get_last_recno(WT_SESSION_IMPL *session)
+{
+    WT_BTREE *tree;
+    WT_PAGE *last_page;
+    WT_REF *walk_ref;
+
+    tree = S2BT(session);
+
+    walk_ref = NULL;
+    WT_RET(__wt_tree_walk(session, &walk_ref, WT_READ_PREV));
+    if (walk_ref == NULL)
+        return (WT_NOTFOUND);
+
+    /* Variable-length and fixed-length pages count records differently. */
+    last_page = walk_ref->page;
+    if (last_page->type == WT_PAGE_COL_VAR)
+        tree->last_recno = __col_var_last_recno(last_page);
+    else
+        tree->last_recno = __col_fix_last_recno(last_page);
+
+    return (__wt_page_release(session, walk_ref, 0));
+}
+
+/*
+ * __btree_page_sizes --
+ *    Verify the page sizes.  Some of these sizes are automatically checked
+ * using limits defined in the API, don't duplicate the logic here.
+ *
+ * Reads the allocation size, the maximum internal/leaf page and item sizes,
+ * the split percentage and the maximum in-memory page size from the data
+ * handle's configuration into the session's btree, fills in default item
+ * sizes, and checks that the sizes are consistent with each other.
+ *
+ * Returns 0 if the configuration is usable, EINVAL (with a message) if the
+ * allocation size is not a power of two, a page size is not a multiple of
+ * the allocation size, or a page cannot hold two maximum-sized items, or
+ * the error from a failed configuration lookup.
+ */
+static int
+__btree_page_sizes(WT_SESSION_IMPL *session)
+{
+    WT_BTREE *btree;
+    WT_CONFIG_ITEM cval;
+    uint64_t cache_size;
+    uint32_t intl_split_size, leaf_split_size;
+    const char **cfg;
+
+    btree = S2BT(session);
+    cfg = btree->dhandle->cfg;
+
+    WT_RET(__wt_direct_io_size_check(
+        session, cfg, "allocation_size", &btree->allocsize));
+    WT_RET(__wt_direct_io_size_check(
+        session, cfg, "internal_page_max", &btree->maxintlpage));
+    WT_RET(__wt_config_gets(session, cfg, "internal_item_max", &cval));
+    btree->maxintlitem = (uint32_t)cval.val;
+    WT_RET(__wt_direct_io_size_check(
+        session, cfg, "leaf_page_max", &btree->maxleafpage));
+    WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval));
+    btree->maxleafitem = (uint32_t)cval.val;
+
+    WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
+    btree->split_pct = (int)cval.val;
+
+    /*
+     * When a page is forced to split, we want at least 50 entries on its
+     * parent.
+     *
+     * Widen the leaf page size before multiplying: it is a 32-bit value,
+     * and 50 times a large leaf page size would otherwise wrap before the
+     * comparison is made.
+     */
+    WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
+    btree->maxmempage =
+        WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage);
+
+    /*
+     * Don't let pages grow to more than half the cache size.  Otherwise,
+     * with very small caches, we can end up in a situation where nothing
+     * can be evicted.  Take care getting the cache size: with a shared
+     * cache, it may not have been set.
+     */
+    cache_size = S2C(session)->cache_size;
+    if (cache_size > 0)
+        btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 2);
+
+    /* Allocation sizes must be a power-of-two, nothing else makes sense. */
+    if (!__wt_ispo2(btree->allocsize))
+        WT_RET_MSG(session,
+            EINVAL, "the allocation size must be a power of two");
+
+    /* All page sizes must be in units of the allocation size. */
+    if (btree->maxintlpage < btree->allocsize ||
+        btree->maxintlpage % btree->allocsize != 0 ||
+        btree->maxleafpage < btree->allocsize ||
+        btree->maxleafpage % btree->allocsize != 0)
+        WT_RET_MSG(session, EINVAL,
+            "page sizes must be a multiple of the page allocation "
+            "size (%" PRIu32 "B)", btree->allocsize);
+
+    /*
+     * Set the split percentage: reconciliation splits to a smaller-than-
+     * maximum page size so we don't split every time a new entry is added.
+     */
+    intl_split_size = __wt_split_page_size(btree, btree->maxintlpage);
+    leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage);
+
+    /*
+     * Default values for internal and leaf page items: make sure at least
+     * 8 items fit on split pages.
+     */
+    if (btree->maxintlitem == 0)
+        btree->maxintlitem = intl_split_size / 8;
+    if (btree->maxleafitem == 0)
+        btree->maxleafitem = leaf_split_size / 8;
+
+    /*
+     * If raw compression is configured, the application owns page layout,
+     * it's not our problem.  Hopefully the application chose well.
+     *
+     * NOTE(review): __btree_conf calls this function before it looks up
+     * the "block_compressor" setting, so on that path btree->compressor
+     * holds whatever value it had beforehand -- confirm this test sees
+     * the intended compressor.
+     */
+    if (btree->compressor != NULL &&
+        btree->compressor->compress_raw != NULL)
+        return (0);
+
+    /* Check we can fit at least 2 items on a page. */
+    if (btree->maxintlitem > btree->maxintlpage / 2)
+        return (pse1(session, "internal",
+            btree->maxintlpage, btree->maxintlitem));
+    if (btree->maxleafitem > btree->maxleafpage / 2)
+        return (pse1(session, "leaf",
+            btree->maxleafpage, btree->maxleafitem));
+
+    /*
+     * Take into account the size of a split page:
+     *
+     * Make it a separate error message so it's clear what went wrong.
+     */
+    if (btree->maxintlitem > intl_split_size / 2)
+        return (pse2(session, "internal",
+            btree->maxintlpage, btree->maxintlitem, btree->split_pct));
+    if (btree->maxleafitem > leaf_split_size / 2)
+        return (pse2(session, "leaf",
+            btree->maxleafpage, btree->maxleafitem, btree->split_pct));
+
+    return (0);
+}
+
+/*
+ * __wt_split_page_size --
+ *    Compute the size pages are split to: the btree's split percentage of
+ * the maximum page size, so a page isn't split again as soon as one more
+ * entry is added.
+ *
+ * The percentage is normally rounded to an allocation unit so no space is
+ * wasted on write.  When that rounding yields exactly one allocation unit,
+ * the unrounded percentage of the maximum page size is returned instead.
+ */
+uint32_t
+__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
+{
+    uintmax_t pct_bytes;
+    uint32_t aligned_bytes;
+
+    /* Do the arithmetic in the widest type so the product can't overflow. */
+    pct_bytes = ((uintmax_t)maxpagesize * (u_int)btree->split_pct) / 100;
+
+    aligned_bytes = (uint32_t)WT_ALIGN(pct_bytes, btree->allocsize);
+    if (aligned_bytes != btree->allocsize)
+        return (aligned_bytes);
+
+    /* Rounding collapsed to a single allocation unit: don't round. */
+    return ((uint32_t)pct_bytes);
+}
+
+/*
+ * pse1 --
+ * Page size error message 1.
+ *
+ * Reports, with EINVAL, that a page of the named type ("internal" or "leaf"
+ * at the call sites in this file) is too small to hold two items of the
+ * maximum item size. max is the page size and ovfl the maximum item size,
+ * both in bytes.
+ */
+static int
+pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl)
+{
+ /* No return statement: presumably WT_RET_MSG returns -- confirm. */
+ WT_RET_MSG(session, EINVAL,
+ "%s page size (%" PRIu32 "B) too small for the maximum item size "
+ "(%" PRIu32 "B); the page must be able to hold at least 2 items",
+ type, max, ovfl);
+}
+
+/*
+ * pse2 --
+ * Page size error message 2.
+ *
+ * Reports, with EINVAL, that a split page of the named type ("internal" or
+ * "leaf" at the call sites in this file) is too small to hold two items of
+ * the maximum item size. max is the page size and ovfl the maximum item
+ * size, both in bytes; pct is the split percentage.
+ */
+static int
+pse2(WT_SESSION_IMPL *session,
+ const char *type, uint32_t max, uint32_t ovfl, int pct)
+{
+ /* No return statement: presumably WT_RET_MSG returns -- confirm. */
+ WT_RET_MSG(session, EINVAL,
+ "%s page size (%" PRIu32 "B) too small for the maximum item size "
+ "(%" PRIu32 "B), because of the split percentage (%d %%); a split "
+ "page must be able to hold at least 2 items",
+ type, max, ovfl, pct);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c
new file mode 100644
index 00000000000..aa6e7c36451
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * 7-bit ASCII, with English language frequencies.
+ *
+ * Based on "Case-sensitive letter and bigram frequency counts from large-scale
+ * English corpora"
+ * Michael N. Jones and D.J.K. Mewhort
+ * Queen's University, Kingston, Ontario, Canada
+ * Behavior Research Methods, Instruments, & Computers 2004, 36 (3), 388-396
+ *
+ * Additionally supports space and tab characters; space is the most common
+ * character in text where it occurs, and tab appears about as frequently as
+ * 'a' and 'n' in text where it occurs.
+ */
+/* One entry of a Huffman frequency table: a symbol and its frequency. */
+struct __wt_huffman_table {
+ uint32_t symbol; /* Symbol value, e.g. a character code. */
+ uint32_t frequency; /* Frequency count for the symbol. */
+};
+static const struct __wt_huffman_table __wt_huffman_nytenglish[] = {
+ /* nul */ { 0x00, 0 }, /* For an escape character. */
+ /* ht */ { 0x09, 5263779 },
+ /* sp */ { 0x20, 8000000 },
+ /* ! */ { 0x21, 2178 },
+ /* " */ { 0x22, 284671 },
+ /* # */ { 0x23, 10 },
+ /* $ */ { 0x24, 51572 },
+ /* % */ { 0x25, 1993 },
+ /* & */ { 0x26, 6523 },
+ /* ' */ { 0x27, 204497 },
+ /* ( */ { 0x28, 53398 },
+ /* ) */ { 0x29, 53735 },
+ /* * */ { 0x2a, 20716 },
+ /* + */ { 0x2b, 309 },
+ /* , */ { 0x2c, 984969 },
+ /* - */ { 0x2d, 252302 },
+ /* . */ { 0x2e, 946136 },
+ /* / */ { 0x2f, 8161 },
+ /* 0 */ { 0x30, 546233 },
+ /* 1 */ { 0x31, 460946 },
+ /* 2 */ { 0x32, 333499 },
+ /* 3 */ { 0x33, 187606 },
+ /* 4 */ { 0x34, 192528 },
+ /* 5 */ { 0x35, 374413 },
+ /* 6 */ { 0x36, 153865 },
+ /* 7 */ { 0x37, 120094 },
+ /* 8 */ { 0x38, 182627 },
+ /* 9 */ { 0x39, 282364 },
+ /* : */ { 0x3a, 54036 },
+ /* ; */ { 0x3b, 36727 },
+ /* < */ { 0x3c, 82 },
+ /* = */ { 0x3d, 22 },
+ /* > */ { 0x3e, 83 },
+ /* ? */ { 0x3f, 12357 },
+ /* @ */ { 0x40, 1 },
+ /* A */ { 0x41, 280937 },
+ /* B */ { 0x42, 169474 },
+ /* C */ { 0x43, 229363 },
+ /* D */ { 0x44, 129632 },
+ /* E */ { 0x45, 138443 },
+ /* F */ { 0x46, 100751 },
+ /* G */ { 0x47, 93212 },
+ /* H */ { 0x48, 123632 },
+ /* I */ { 0x49, 223312 },
+ /* J */ { 0x4a, 78706 },
+ /* K */ { 0x4b, 46580 },
+ /* L */ { 0x4c, 106984 },
+ /* M */ { 0x4d, 259474 },
+ /* N */ { 0x4e, 205409 },
+ /* O */ { 0x4f, 105700 },
+ /* P */ { 0x50, 144239 },
+ /* Q */ { 0x51, 11659 },
+ /* R */ { 0x52, 146448 },
+ /* S */ { 0x53, 304971 },
+ /* T */ { 0x54, 325462 },
+ /* U */ { 0x55, 57488 },
+ /* V */ { 0x56, 31053 },
+ /* W */ { 0x57, 107195 },
+ /* X */ { 0x58, 7578 },
+ /* Y */ { 0x59, 94297 },
+ /* Z */ { 0x5a, 5610 },
+ /* [ */ { 0x5b, 1 },
+ /* \ */ { 0x5c, 1 },
+ /* ] */ { 0x5d, 1 },
+ /* ^ */ { 0x5e, 1 },
+ /* _ */ { 0x5f, 1 },
+ /* ` */ { 0x60, 1 },
+ /* a */ { 0x61, 5263779 },
+ /* b */ { 0x62, 866156 },
+ /* c */ { 0x63, 1960412 },
+ /* d */ { 0x64, 2369820 },
+ /* e */ { 0x65, 7741842 },
+ /* f */ { 0x66, 1296925 },
+ /* g */ { 0x67, 1206747 },
+ /* h */ { 0x68, 2955858 },
+ /* i */ { 0x69, 4527332 },
+ /* j */ { 0x6a, 65856 },
+ /* k */ { 0x6b, 460788 },
+ /* l */ { 0x6c, 2553152 },
+ /* m */ { 0x6d, 1467376 },
+ /* n */ { 0x6e, 4535545 },
+ /* o */ { 0x6f, 4729266 },
+ /* p */ { 0x70, 1255579 },
+ /* q */ { 0x71, 54221 },
+ /* r */ { 0x72, 4137949 },
+ /* s */ { 0x73, 4186210 },
+ /* t */ { 0x74, 5507692 },
+ /* u */ { 0x75, 1613323 },
+ /* v */ { 0x76, 653370 },
+ /* w */ { 0x77, 1015656 },
+ /* x */ { 0x78, 123577 },
+ /* y */ { 0x79, 1062040 },
+ /* z */ { 0x7a, 66423 },
+ /* { */ { 0x7b, 1 },
+ /* | */ { 0x7c, 1 },
+ /* } */ { 0x7d, 1 },
+ /* ~ */ { 0x7e, 1 }
+};
+
+static int __wt_huffman_read(WT_SESSION_IMPL *,
+ WT_CONFIG_ITEM *, struct __wt_huffman_table **, u_int *, u_int *);
+
+/*
+ * __huffman_conf_is_english --
+ *    Return non-zero if a configuration value is exactly the word "english",
+ * compared without regard to case.  An empty value never matches.
+ */
+static int
+__huffman_conf_is_english(const WT_CONFIG_ITEM *conf)
+{
+    return (conf->len == strlen("english") &&
+        strncasecmp(conf->str, "english", conf->len) == 0);
+}
+
+/*
+ * __huffman_conf_open --
+ *    Open the Huffman table named by one non-empty configuration value,
+ * either the built-in English table or a table read from a file, storing
+ * the result through huffmanp.
+ */
+static int
+__huffman_conf_open(
+    WT_SESSION_IMPL *session, WT_CONFIG_ITEM *conf, void *huffmanp)
+{
+    struct __wt_huffman_table *table;
+    struct __wt_huffman_table copy[WT_ELEMENTS(__wt_huffman_nytenglish)];
+    WT_DECL_RET;
+    u_int entries, numbytes;
+
+    if (__huffman_conf_is_english(conf)) {
+        /* Hand a private copy of the constant table to the open. */
+        memcpy(copy,
+            __wt_huffman_nytenglish, sizeof(__wt_huffman_nytenglish));
+        return (__wt_huffman_open(session, copy,
+            WT_ELEMENTS(__wt_huffman_nytenglish), 1, huffmanp));
+    }
+
+    WT_RET(__wt_huffman_read(
+        session, conf, &table, &entries, &numbytes));
+    ret = __wt_huffman_open(session, table, entries, numbytes, huffmanp);
+    __wt_free(session, table);
+    return (ret);
+}
+
+/*
+ * __wt_btree_huffman_open --
+ *    Configure Huffman encoding for the tree.
+ *
+ * Reads the "huffman_key" and "huffman_value" settings of the session's
+ * btree.  A setting that is empty leaves that side un-encoded; "english"
+ * selects the built-in table; anything else is handed to
+ * __wt_huffman_read.  When both settings name the same table, it is opened
+ * once and shared between keys and values.
+ *
+ * Returns 0 on success, EINVAL if the tree type cannot be Huffman encoded
+ * as configured, or the error from a failed lookup, read or open.
+ */
+int
+__wt_btree_huffman_open(WT_SESSION_IMPL *session)
+{
+    WT_BTREE *btree;
+    WT_CONFIG_ITEM key_conf, value_conf;
+    const char **cfg;
+
+    btree = S2BT(session);
+    cfg = btree->dhandle->cfg;
+
+    WT_RET(__wt_config_gets(session, cfg, "huffman_key", &key_conf));
+    WT_RET(__wt_config_gets(session, cfg, "huffman_value", &value_conf));
+    if (key_conf.len == 0 && value_conf.len == 0)
+        return (0);
+
+    switch (btree->type) {          /* Check file type compatibility. */
+    case BTREE_COL_FIX:
+        WT_RET_MSG(session, EINVAL,
+            "fixed-size column-store files may not be Huffman encoded");
+    case BTREE_COL_VAR:
+        if (key_conf.len != 0)
+            WT_RET_MSG(session, EINVAL,
+                "the keys of variable-length column-store files "
+                "may not be Huffman encoded");
+        break;
+    case BTREE_ROW:
+        break;
+    }
+
+    /*
+     * Keys.  An empty setting must be skipped explicitly: a zero-length
+     * comparison against "english" always succeeds, which would silently
+     * turn encoding on for a side that was never configured.
+     */
+    if (key_conf.len != 0)
+        WT_RET(__huffman_conf_open(
+            session, &key_conf, &btree->huffman_key));
+
+    /* Values. */
+    if (value_conf.len == 0)
+        return (0);
+
+    /* Check for a shared key/value table. */
+    if (key_conf.len != 0 &&
+        ((__huffman_conf_is_english(&key_conf) &&
+        __huffman_conf_is_english(&value_conf)) ||
+        (key_conf.len == value_conf.len &&
+        memcmp(key_conf.str, value_conf.str, key_conf.len) == 0))) {
+        btree->huffman_value = btree->huffman_key;
+        return (0);
+    }
+
+    return (__huffman_conf_open(
+        session, &value_conf, &btree->huffman_value));
+}
+
+/*
+ * __wt_huffman_read --
+ *    Read a Huffman table from a file.
+ *
+ * The configuration value in ip is "utf8" or "utf16" (compared without
+ * regard to case) immediately followed by the name of the file to read.
+ * Each entry in the file is a pair of unsigned integers: a symbol and its
+ * frequency.
+ *
+ * On success returns 0, stores the allocated table in *tablep (the caller
+ * frees it), the number of entries read in *entriesp and the bytes per
+ * symbol (1 or 2) in *numbytesp.  On failure returns a non-zero error,
+ * frees the table and leaves *tablep NULL.
+ */
+static int
+__wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
+    struct __wt_huffman_table **tablep, u_int *entriesp, u_int *numbytesp)
+{
+    struct __wt_huffman_table *table, *tp;
+    FILE *fp;
+    WT_DECL_RET;
+    uint64_t symbol, frequency;
+    u_int entries, lineno;
+    char *file;
+
+    *tablep = NULL;
+    *entriesp = *numbytesp = 0;
+
+    fp = NULL;
+    file = NULL;
+    table = NULL;
+
+    /*
+     * UTF-8 table is 256 bytes, with a range of 0-255.
+     * UTF-16 is 128KB (2 * 65536) bytes, with a range of 0-65535.
+     */
+    if (strncasecmp(ip->str, "utf8", 4) == 0) {
+        entries = UINT8_MAX;
+        *numbytesp = 1;
+        WT_ERR(__wt_calloc_def(session, entries, &table));
+
+        if (ip->len == 4)
+            WT_ERR_MSG(session, EINVAL,
+                "no Huffman table file name specified");
+        /*
+         * The zero-filled buffer is longer than the name copied into
+         * it, so the file name is nul-terminated.
+         */
+        WT_ERR(__wt_calloc_def(session, ip->len, &file));
+        memcpy(file, ip->str + 4, ip->len - 4);
+    } else if (strncasecmp(ip->str, "utf16", 5) == 0) {
+        entries = UINT16_MAX;
+        *numbytesp = 2;
+        WT_ERR(__wt_calloc_def(session, entries, &table));
+
+        if (ip->len == 5)
+            WT_ERR_MSG(session, EINVAL,
+                "no Huffman table file name specified");
+        WT_ERR(__wt_calloc_def(session, ip->len, &file));
+        memcpy(file, ip->str + 5, ip->len - 5);
+    } else {
+        WT_ERR_MSG(session, EINVAL,
+            "unknown Huffman configuration value %.*s",
+            (int)ip->len, ip->str);
+    }
+
+    if ((fp = fopen(file, "r")) == NULL)
+        WT_ERR_MSG(session, __wt_errno(),
+            "unable to read Huffman table file %.*s",
+            (int)ip->len, ip->str);
+
+    /*
+     * The entry count is checked before the entry is stored, so tp never
+     * moves past the end of the allocated table.
+     */
+    for (tp = table, lineno = 1; (ret =
+        fscanf(fp, "%" SCNu64 " %" SCNu64, &symbol, &frequency)) != EOF;
+        ++tp, ++lineno) {
+        if (lineno > entries)
+            WT_ERR_MSG(session, EINVAL,
+                "Huffman table file %.*s is corrupted, "
+                "more than %" PRIu32 " entries",
+                (int)ip->len, ip->str, entries);
+        if (ret != 2)
+            WT_ERR_MSG(session, EINVAL,
+                "line %u of Huffman table file %.*s is corrupted: "
+                "expected two unsigned integral values",
+                lineno, (int)ip->len, ip->str);
+        if (symbol > entries)
+            WT_ERR_MSG(session, EINVAL,
+                "line %u of Huffman table file %.*s is corrupted: "
+                "symbol larger than maximum value of %u",
+                lineno, (int)ip->len, ip->str, entries);
+        if (frequency > UINT32_MAX)
+            WT_ERR_MSG(session, EINVAL,
+                "line %u of Huffman table file %.*s is corrupted: "
+                "frequency larger than maximum value of %" PRIu32,
+                lineno, (int)ip->len, ip->str, UINT32_MAX);
+
+        tp->symbol = (uint32_t)symbol;
+        tp->frequency = (uint32_t)frequency;
+    }
+
+    /*
+     * The loop ends with ret holding fscanf's EOF value, which must not
+     * be returned as this function's result.  fscanf returns EOF for a
+     * read error as well as for end-of-file, so check which it was.
+     */
+    if (ferror(fp))
+        WT_ERR_MSG(session, __wt_errno(),
+            "error reading Huffman table file %.*s",
+            (int)ip->len, ip->str);
+    ret = 0;
+
+    *entriesp = lineno - 1;
+    *tablep = table;
+
+    /* The table is only freed when control jumps to the err label. */
+    if (0) {
+err:    __wt_free(session, table);
+    }
+    if (fp != NULL)
+        (void)fclose(fp);
+    __wt_free(session, file);
+    return (ret);
+}
+
+/*
+ * __wt_btree_huffman_close --
+ *    Release the btree's key and value Huffman tables and clear both
+ * pointers.
+ */
+void
+__wt_btree_huffman_close(WT_SESSION_IMPL *session)
+{
+    WT_BTREE *tree;
+
+    tree = S2BT(session);
+
+    /*
+     * Keys and values may share one table: drop the value's pointer to
+     * it so the table is only closed once, through the key.
+     */
+    if (tree->huffman_value == tree->huffman_key)
+        tree->huffman_value = NULL;
+
+    if (tree->huffman_key != NULL) {
+        __wt_huffman_close(session, tree->huffman_key);
+        tree->huffman_key = NULL;
+    }
+    if (tree->huffman_value != NULL) {
+        __wt_huffman_close(session, tree->huffman_value);
+        tree->huffman_value = NULL;
+    }
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c
new file mode 100644
index 00000000000..ccc67c994dc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_io.c
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bt_read --
+ *	Read a cookie referenced block into a buffer.
+ *
+ *	The block identified by addr/addr_size is read through the tree's block
+ *	manager. If the page header says the block is compressed it is
+ *	decompressed into the caller's buffer, otherwise the image is placed in
+ *	the caller's buffer as read. When the handle is a verify handle the
+ *	resulting image is also verified. Errors from the block manager, the
+ *	compressor, the buffer routines or verification are returned.
+ */
+int
+__wt_bt_read(WT_SESSION_IMPL *session,
+    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+	WT_BM *bm;
+	WT_BTREE *btree;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	const WT_PAGE_HEADER *dsk;
+	size_t mem_size, result_len;
+	int compressed;
+
+	btree = S2BT(session);
+	bm = btree->bm;
+
+	/*
+	 * If anticipating a compressed block, read into a scratch buffer and
+	 * decompress into the caller's buffer. Else, read directly into the
+	 * caller's buffer.
+	 */
+	if (btree->compressor == NULL) {
+		WT_RET(bm->read(bm, session, buf, addr, addr_size));
+		dsk = buf->data;
+	} else {
+		WT_RET(__wt_scr_alloc(session, 0, &tmp));
+		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
+		dsk = tmp->data;
+	}
+
+	/*
+	 * If the block is compressed, copy the skipped bytes of the original
+	 * image into place, then decompress.
+	 */
+	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
+		if (btree->compressor == NULL ||
+		    btree->compressor->decompress == NULL)
+			WT_ERR_MSG(session, WT_ERROR,
+			    "read compressed block where no compression engine "
+			    "configured");
+
+		/*
+		 * We're allocating the exact number of bytes we're expecting
+		 * from decompression.
+		 */
+		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));
+
+		/*
+		 * Note the source length is NOT the number of compressed bytes,
+		 * it's the length of the block we just read (minus the skipped
+		 * bytes). We don't store the number of compressed bytes: some
+		 * compression engines need that length stored externally, they
+		 * don't have markers in the stream to signal the end of the
+		 * compressed bytes. Those engines must store the compressed
+		 * byte length somehow, see the snappy compression extension for
+		 * an example.
+		 */
+		memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP);
+		ret = btree->compressor->decompress(
+		    btree->compressor, &session->iface,
+		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
+		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
+		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
+		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);
+
+		/*
+		 * If checksums were turned off because we're depending on the
+		 * decompression to fail on any corrupted data, we'll end up
+		 * here after corruption happens. If we're salvaging the file,
+		 * it's OK, otherwise it's really, really bad.
+		 */
+		if (ret != 0 ||
+		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
+			WT_ERR(
+			    F_ISSET(btree, WT_BTREE_VERIFY) ||
+			    F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
+			    WT_ERROR :
+			    __wt_illegal_value(session, btree->dhandle->name));
+	} else
+		if (btree->compressor == NULL)
+			buf->size = dsk->mem_size;
+		else
+			/*
+			 * We guessed wrong: there was a compressor, but this
+			 * block was not compressed, and now the page is in the
+			 * wrong buffer and the buffer may be of the wrong size.
+			 * This should be rare, but happens with small blocks
+			 * that aren't worth compressing.
+			 */
+			WT_ERR(__wt_buf_set(
+			    session, buf, tmp->data, dsk->mem_size));
+
+	/*
+	 * Snapshot what the statistics below need from the page header. When a
+	 * compressor is configured the header lives in the scratch buffer, and
+	 * the verify path below reuses that scratch buffer for the address
+	 * string, after which the header must not be read through dsk.
+	 */
+	compressed = F_ISSET(dsk, WT_PAGE_COMPRESSED) ? 1 : 0;
+	mem_size = dsk->mem_size;
+
+	/* If the handle is a verify handle, verify the physical page. */
+	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
+		if (tmp == NULL)
+			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
+		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
+	}
+
+	WT_STAT_FAST_CONN_INCR(session, cache_read);
+	WT_STAT_FAST_DATA_INCR(session, cache_read);
+	if (compressed)
+		WT_STAT_FAST_DATA_INCR(session, compress_read);
+	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, mem_size);
+	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, mem_size);
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __wt_bt_write --
+ * Write a buffer into a block, returning the block's addr/size and
+ * checksum.
+ *
+ * The buffer holds a page's disk image. Unless the caller passes a
+ * non-zero compressed argument, the image is compressed when the tree has
+ * a compressor with a compress method, the image is larger than the
+ * tree's allocation size, and the compressed result saves at least one
+ * allocation unit; in every other case the caller's image is written
+ * unchanged.
+ *
+ * Checkpoint writes (checkpoint == 1) go through the block manager's
+ * checkpoint method and must pass NULL for addr and addr_sizep; all other
+ * writes go through the block manager's write method and must pass
+ * non-NULL addr and addr_sizep.
+ *
+ * Before writing, WT_PAGE_COMPRESSED is set in the header of a compressed
+ * image and the header is stamped with the tree's incremented write
+ * generation. Errors from the compressor, the scratch buffer allocation
+ * or the block manager are returned to the caller.
+ */
+int
+__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
+ uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_ITEM *ip;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER *dsk;
+ size_t len, src_len, dst_len, result_len, size;
+ int data_cksum, compression_failed;
+ uint8_t *src, *dst;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /* Checkpoint calls are different than standard calls. */
+ WT_ASSERT(session,
+ (checkpoint == 0 && addr != NULL && addr_sizep != NULL) ||
+ (checkpoint == 1 && addr == NULL && addr_sizep == NULL));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * We're passed a table's disk image. Decompress if necessary and
+ * verify the image. Always check the in-memory length for accuracy.
+ */
+ dsk = buf->mem;
+ WT_ASSERT(session, dsk->u.entries != 0);
+ if (compressed) {
+ WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));
+
+ memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
+ WT_ERR(btree->compressor->decompress(
+ btree->compressor, &session->iface,
+ (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
+ buf->size - WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
+ tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
+ &result_len));
+ WT_ASSERT(session,
+ dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
+ tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
+ ip = tmp;
+ } else {
+ WT_ASSERT(session, dsk->mem_size == buf->size);
+ ip = buf;
+ }
+ WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
+ __wt_scr_free(&tmp);
+#endif
+
+ /*
+ * Optionally stream-compress the data, but don't compress blocks that
+ * are already as small as they're going to get.
+ */
+ if (btree->compressor == NULL ||
+ btree->compressor->compress == NULL || compressed)
+ ip = buf;
+ else if (buf->size <= btree->allocsize) {
+ ip = buf;
+ WT_STAT_FAST_DATA_INCR(session, compress_write_too_small);
+ } else {
+ /* Skip the header bytes of the source data. */
+ src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
+ src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;
+
+ /*
+ * Compute the size needed for the destination buffer. We only
+ * allocate enough memory for a copy of the original by default,
+ * if any compressed version is bigger than the original, we
+ * won't use it. However, some compression engines (snappy is
+ * one example), may need more memory because they don't stop
+ * just because there's no more memory into which to compress.
+ */
+ if (btree->compressor->pre_size == NULL)
+ len = src_len;
+ else
+ WT_ERR(btree->compressor->pre_size(btree->compressor,
+ &session->iface, src, src_len, &len));
+
+ size = len + WT_BLOCK_COMPRESS_SKIP;
+ WT_ERR(bm->write_size(bm, session, &size));
+ WT_ERR(__wt_scr_alloc(session, size, &tmp));
+
+ /* Skip the header bytes of the destination data. */
+ dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
+ dst_len = len;
+
+ compression_failed = 0;
+ WT_ERR(btree->compressor->compress(btree->compressor,
+ &session->iface,
+ src, src_len,
+ dst, dst_len,
+ &result_len, &compression_failed));
+ result_len += WT_BLOCK_COMPRESS_SKIP;
+
+ /*
+ * If compression fails, or doesn't gain us at least one unit of
+ * allocation, fallback to the original version. This isn't
+ * unexpected: if compression doesn't work for some chunk of
+ * data for some reason (noting likely additional format/header
+ * information which compressed output requires), it just means
+ * the uncompressed version is as good as it gets, and that's
+ * what we use.
+ */
+ if (compression_failed ||
+ buf->size / btree->allocsize ==
+ result_len / btree->allocsize) {
+ ip = buf;
+ WT_STAT_FAST_DATA_INCR(session, compress_write_fail);
+ } else {
+ compressed = 1;
+ WT_STAT_FAST_DATA_INCR(session, compress_write);
+
+ /*
+ * Copy in the skipped header bytes, set the final data
+ * size.
+ */
+ memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
+ tmp->size = result_len;
+ ip = tmp;
+ }
+ }
+ dsk = ip->mem;
+
+ /* If the buffer is compressed, set the flag. */
+ if (compressed)
+ F_SET(dsk, WT_PAGE_COMPRESSED);
+
+ /*
+ * We increment the block's write generation so it's easy to identify
+ * newer versions of blocks during salvage. (It's common in WiredTiger,
+ * at least for the default block manager, for multiple blocks to be
+ * internally consistent with identical first and last keys, so we need
+ * a way to know the most recent state of the block. We could check
+ * which leaf is referenced by a valid internal page, but that implies
+ * salvaging internal pages, which I don't want to do, and it's not
+ * as good anyway, because the internal page may not have been written
+ * after the leaf page was updated. So, write generations it is.
+ *
+ * Nothing is locked at this point but two versions of a page with the
+ * same generation is pretty unlikely, and if we did, they're going to
+ * be roughly identical for the purposes of salvage, anyway.
+ */
+ dsk->write_gen = ++btree->write_gen;
+
+ /*
+ * Checksum the data if the buffer isn't compressed or checksums are
+ * configured.
+ */
+ switch (btree->checksum) {
+ case CKSUM_ON:
+ data_cksum = 1;
+ break;
+ case CKSUM_OFF:
+ data_cksum = 0;
+ break;
+ case CKSUM_UNCOMPRESSED:
+ default:
+ data_cksum = !compressed;
+ break;
+ }
+
+ /* Call the block manager to write the block. */
+ WT_ERR(checkpoint ?
+ bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
+ bm->write(bm, session, ip, addr, addr_sizep, data_cksum));
+
+ WT_STAT_FAST_CONN_INCR(session, cache_write);
+ WT_STAT_FAST_DATA_INCR(session, cache_write);
+ WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, ip->size);
+ WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, ip->size);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_misc.c b/src/third_party/wiredtiger/src/btree/bt_misc.c
new file mode 100644
index 00000000000..cba1c0c61aa
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_misc.c
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_type_string --
+ *	Map a page type to a printable name; types that are not recognized
+ *	map to "unknown".
+ */
+const char *
+__wt_page_type_string(u_int type)
+{
+	const char *name;
+
+	switch (type) {
+	case WT_PAGE_INVALID:
+		name = "invalid";
+		break;
+	case WT_PAGE_BLOCK_MANAGER:
+		name = "block manager";
+		break;
+	case WT_PAGE_COL_FIX:
+		name = "column-store fixed-length leaf";
+		break;
+	case WT_PAGE_COL_INT:
+		name = "column-store internal";
+		break;
+	case WT_PAGE_COL_VAR:
+		name = "column-store variable-length leaf";
+		break;
+	case WT_PAGE_OVFL:
+		name = "overflow";
+		break;
+	case WT_PAGE_ROW_INT:
+		name = "row-store internal";
+		break;
+	case WT_PAGE_ROW_LEAF:
+		name = "row-store leaf";
+		break;
+	default:
+		name = "unknown";
+		break;
+	}
+	return (name);
+}
+
+/*
+ * __wt_cell_type_string --
+ *	Map a cell type to a printable name; types that are not recognized
+ *	map to "unknown".
+ */
+const char *
+__wt_cell_type_string(uint8_t type)
+{
+	const char *name;
+
+	switch (type) {
+	case WT_CELL_ADDR_DEL:
+		name = "addr/del";
+		break;
+	case WT_CELL_ADDR_INT:
+		name = "addr/int";
+		break;
+	case WT_CELL_ADDR_LEAF:
+		name = "addr/leaf";
+		break;
+	case WT_CELL_ADDR_LEAF_NO:
+		name = "addr/leaf-no";
+		break;
+	case WT_CELL_DEL:
+		name = "deleted";
+		break;
+	case WT_CELL_KEY:
+		name = "key";
+		break;
+	case WT_CELL_KEY_PFX:
+		name = "key/pfx";
+		break;
+	case WT_CELL_KEY_OVFL:
+		name = "key/ovfl";
+		break;
+	case WT_CELL_KEY_SHORT:
+		name = "key/short";
+		break;
+	case WT_CELL_KEY_SHORT_PFX:
+		name = "key/short,pfx";
+		break;
+	case WT_CELL_KEY_OVFL_RM:
+		name = "key/ovfl,rm";
+		break;
+	case WT_CELL_VALUE:
+		name = "value";
+		break;
+	case WT_CELL_VALUE_COPY:
+		name = "value/copy";
+		break;
+	case WT_CELL_VALUE_OVFL:
+		name = "value/ovfl";
+		break;
+	case WT_CELL_VALUE_OVFL_RM:
+		name = "value/ovfl,rm";
+		break;
+	case WT_CELL_VALUE_SHORT:
+		name = "value/short";
+		break;
+	default:
+		name = "unknown";
+		break;
+	}
+	return (name);
+}
+
+/*
+ * __wt_page_addr_string --
+ *	Load a buffer with a printable, nul-terminated representation of the
+ *	address of the page a reference describes; the root reference is
+ *	reported as "[Root]".
+ */
+const char *
+__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf)
+{
+	const uint8_t *cookie;
+	size_t cookie_len;
+
+	/* Anything but the root: look up the address and format it. */
+	if (!__wt_ref_is_root(ref)) {
+		(void)__wt_ref_info(session, ref, &cookie, &cookie_len, NULL);
+		return (__wt_addr_string(session, cookie, cookie_len, buf));
+	}
+
+	/* The root gets a fixed label. */
+	buf->data = "[Root]";
+	buf->size = strlen("[Root]");
+	return (buf->data);
+}
+
+/*
+ * __wt_addr_string --
+ *	Load a buffer with a printable, nul-terminated representation of an
+ *	address: "[NoAddr]" when there is no address, "[Error]" when the block
+ *	manager cannot format it.
+ */
+const char *
+__wt_addr_string(WT_SESSION_IMPL *session,
+    const uint8_t *addr, size_t addr_size, WT_ITEM *buf)
+{
+	WT_BM *bm;
+	const char *label;
+
+	bm = S2BT(session)->bm;
+
+	/* Pick a fixed label when the block manager's string isn't usable. */
+	label = NULL;
+	if (addr == NULL)
+		label = "[NoAddr]";
+	else if (bm->addr_string(bm, session, buf, addr, addr_size) != 0)
+		label = "[Error]";
+
+	if (label != NULL) {
+		buf->data = label;
+		buf->size = strlen(label);
+	}
+	return (buf->data);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
new file mode 100644
index 00000000000..4cd317f1e8f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __ovfl_read --
+ *	Read an overflow item's block and point the caller's item at the
+ *	overflow data that follows the page header.
+ */
+static int
+__ovfl_read(WT_SESSION_IMPL *session,
+    const uint8_t *addr, size_t addr_size, WT_ITEM *store)
+{
+	const WT_PAGE_HEADER *hdr;
+	WT_BTREE *btree;
+
+	/*
+	 * The read is synchronous: fetch the whole block through the btree
+	 * read path before touching the caller's item any further.
+	 */
+	WT_RET(__wt_bt_read(session, store, addr, addr_size));
+
+	/*
+	 * The block starts with a page header; step past it to the data and
+	 * take the data's length from the header.
+	 */
+	btree = S2BT(session);
+	hdr = store->data;
+	store->size = hdr->u.datalen;
+	store->data = WT_PAGE_HEADER_BYTE(btree, hdr);
+
+	WT_STAT_FAST_DATA_INCR(session, cache_read_overflow);
+
+	return (0);
+}
+
+/*
+ * __wt_ovfl_read --
+ *	Bring an overflow item into memory, either from disk or, for removed
+ *	overflow values, from the page's cache of such values.
+ */
+int
+__wt_ovfl_read(WT_SESSION_IMPL *session,
+    WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+	WT_DECL_RET;
+
+	if (page != NULL) {
+		/*
+		 * A page was supplied, so the on-page cell may have been reset
+		 * to WT_CELL_VALUE_OVFL_RM by reconciliation while a reader
+		 * could still need the value. Take the overflow read lock and
+		 * test the raw cell type again while holding it.
+		 */
+		WT_RET(__wt_readlock(session, S2BT(session)->ovfl_lock));
+		if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM)
+			ret = __wt_ovfl_txnc_search(
+			    page, unpack->data, unpack->size, store);
+		else
+			ret = __ovfl_read(
+			    session, unpack->data, unpack->size, store);
+		WT_TRET(__wt_readunlock(session, S2BT(session)->ovfl_lock));
+
+		return (ret);
+	}
+
+	/*
+	 * No page: nothing to lock and no cache to search, so removed
+	 * overflow cells are not a concern; read straight from disk.
+	 */
+	return (__ovfl_read(session, unpack->data, unpack->size, store));
+}
+
+/*
+ * __ovfl_cache_col_visible --
+ *	column-store: return 1 if the overflow value covers a single record
+ *	whose update is globally visible, 0 otherwise.
+ */
+static int
+__ovfl_cache_col_visible(
+    WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
+{
+	/*
+	 * One overflow value may stand for a run of records (say 100-200),
+	 * each replaced separately and each with its own update and
+	 * transaction ID. Working out global visibility for every record and
+	 * every reader isn't practical, so only the simple case is tested: a
+	 * run length of one. Anything else is reported as not visible and the
+	 * caller caches the value.
+	 */
+	if (__wt_cell_rle(unpack) != 1)
+		return (0);
+
+	/* Sanity: upd should always be set. */
+	if (upd == NULL)
+		return (0);
+
+	return (__wt_txn_visible_all(session, upd->txnid) ? 1 : 0);
+}
+
+/*
+ * __ovfl_cache_row_visible --
+ *	row-store: return 1 if any update on the row's update list is globally
+ *	visible, 0 otherwise.
+ */
+static int
+__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
+{
+	WT_UPDATE *cur;
+
+	/* Walk the row's update chain, stopping at the first visible entry. */
+	cur = WT_ROW_UPDATE(page, rip);
+	while (cur != NULL) {
+		if (__wt_txn_visible_all(session, cur->txnid))
+			return (1);
+		cur = cur->next;
+	}
+
+	return (0);
+}
+
+/*
+ * __ovfl_cache --
+ *	Read a deleted overflow value from disk and add it to the page's cache
+ *	of such values, keyed by the value's address.
+ */
+static int
+__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
+{
+	WT_DECL_ITEM(value);
+	WT_DECL_RET;
+
+	WT_RET(__wt_scr_alloc(session, 1024, &value));
+
+	/*
+	 * The unpacked cell's data/size pair is the overflow item's address:
+	 * read the value through it, then enter the value into the cache
+	 * under that same address.
+	 */
+	WT_ERR(__ovfl_read(session, unpack->data, unpack->size, value));
+	WT_ERR(__wt_ovfl_txnc_add(session,
+	    page, unpack->data, unpack->size, value->data, value->size));
+
+err:	__wt_scr_free(&value);
+	return (ret);
+}
+
+/*
+ * __wt_ovfl_cache --
+ * Handle deletion of an overflow value.
+ *
+ * The cookie argument is handed to the page-type specific visibility
+ * check: it is passed as the WT_UPDATE for variable-length column-store
+ * pages and as the WT_ROW for row-store leaf pages; any other page type
+ * is an illegal value. vpack is the unpacked on-page overflow value cell.
+ *
+ * If no globally visible update is found, the overflow value is read and
+ * cached before the cell is queued for discard. Returns any error from
+ * caching the value or from queueing the discard.
+ */
+int
+__wt_ovfl_cache(WT_SESSION_IMPL *session,
+ WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack)
+{
+ int visible;
+
+ /*
+ * This function solves a problem in reconciliation. The scenario is:
+ * - reconciling a leaf page that references an overflow item
+ * - the item is updated and the update committed
+ * - a checkpoint runs, freeing the backing overflow blocks
+ * - a snapshot transaction wants the original version of the item
+ *
+ * In summary, we may need the original version of an overflow item for
+ * a snapshot transaction after the item was deleted from a page that's
+ * subsequently been checkpointed, where the checkpoint must know about
+ * the freed blocks. We don't have any way to delay a free of the
+ * underlying blocks until a particular set of transactions exit (and
+ * this shouldn't be a common scenario), so cache the overflow value in
+ * memory.
+ *
+ * This gets hard because the snapshot transaction reader might:
+ * - search the WT_UPDATE list and not find a useful entry
+ * - read the overflow value's address from the on-page cell
+ * - go to sleep
+ * - checkpoint runs, caches the overflow value, frees the blocks
+ * - another thread allocates and overwrites the blocks
+ * - the reader wakes up and reads the wrong value
+ *
+ * Use a read/write lock and the on-page cell to fix the problem: hold
+ * a write lock when changing the cell type from WT_CELL_VALUE_OVFL to
+ * WT_CELL_VALUE_OVFL_RM and hold a read lock when reading an overflow
+ * item.
+ *
+ * The read/write lock is per btree, but it could be per page or even
+ * per overflow item. We don't do any of that because overflow values
+ * are supposed to be rare and we shouldn't see contention for the lock.
+ *
+ * Check for a globally visible update. If there is a globally visible
+ * update, we don't need to cache the item because it's not possible for
+ * a running thread to have moved past it.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_VAR:
+ visible = __ovfl_cache_col_visible(session, cookie, vpack);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ visible = __ovfl_cache_row_visible(session, page, cookie);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * If there's no globally visible update, there's a reader in the system
+ * that might try and read the old value, cache it.
+ */
+ if (!visible) {
+ WT_RET(__ovfl_cache(session, page, vpack));
+ WT_STAT_FAST_DATA_INCR(session, cache_overflow_value);
+ }
+
+ /*
+ * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the
+ * underlying overflow value's blocks to be freed when reconciliation
+ * completes.
+ */
+ return (__wt_ovfl_discard_add(session, page, vpack->cell));
+}
+
+/*
+ * __wt_ovfl_discard --
+ *	Discard an on-page overflow value, and reset the page's cell.
+ *
+ *	The cell must be an overflow key or overflow value cell. Its type is
+ *	reset to the matching "removed" type while holding the btree's overflow
+ *	write lock, then the backing blocks are freed through the block
+ *	manager. Any other cell type is reported as an illegal value: the lock
+ *	is released and no blocks are freed. Returns the first error seen.
+ */
+int
+__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
+{
+	WT_BM *bm;
+	WT_BTREE *btree;
+	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_RET;
+
+	btree = S2BT(session);
+	bm = btree->bm;
+	unpack = &_unpack;
+
+	__wt_cell_unpack(cell, unpack);
+
+	/*
+	 * Finally remove overflow key/value objects, called when reconciliation
+	 * finishes after successfully writing a page.
+	 *
+	 * Keys must have already been instantiated and value objects must have
+	 * already been cached (if they might potentially still be read by any
+	 * running transaction).
+	 *
+	 * Acquire the overflow lock to avoid racing with a thread reading the
+	 * backing overflow blocks.
+	 */
+	WT_RET(__wt_writelock(session, btree->ovfl_lock));
+
+	switch (unpack->raw) {
+	case WT_CELL_KEY_OVFL:
+		__wt_cell_type_reset(session,
+		    unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
+		break;
+	case WT_CELL_VALUE_OVFL:
+		__wt_cell_type_reset(session,
+		    unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
+		break;
+	default:
+		/*
+		 * Unexpected cell type: don't return with the overflow write
+		 * lock still held, and don't free blocks for a cell that isn't
+		 * a live overflow item.
+		 */
+		ret = __wt_illegal_value(session, NULL);
+		WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));
+		return (ret);
+	}
+
+	WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));
+
+	/* Free the backing disk blocks. */
+	WT_TRET(bm->free(bm, session, unpack->data, unpack->size));
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
new file mode 100644
index 00000000000..c5f24c06286
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -0,0 +1,734 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *);
+static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *);
+static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
+static int __inmem_row_leaf_entries(
+ WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *);
+
+/*
+ * __evict_force_check --
+ *	Decide whether a page should be forcibly evicted; when it should, mark
+ *	the page with the oldest read generation and return 1, else return 0.
+ */
+static int
+__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_BTREE *btree;
+
+	btree = S2BT(session);
+
+	/* The common case: the page is under the tree's size limit. */
+	if (page->memory_footprint < btree->maxmempage)
+		return (0);
+
+	/* Only leaf pages are candidates. */
+	switch (page->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_ROW_LEAF:
+		break;
+	default:
+		return (0);
+	}
+
+	/*
+	 * Skip trees with eviction turned off (rare), and pages that have
+	 * never been modified (unlikely for a page this large, but check).
+	 */
+	if (F_ISSET(btree, WT_BTREE_NO_EVICTION) || page->modify == NULL)
+		return (0);
+
+	/* Trigger eviction on the next page release. */
+	page->read_gen = WT_READGEN_OLDEST;
+
+	return (1);
+}
+
+/*
+ * __wt_page_in_func --
+ * Acquire a hazard pointer to a page; if the page is not in-memory,
+ * read it from the disk and build an in-memory version.
+ *
+ * Loops on the reference's state until a hazard pointer is acquired,
+ * yielding between attempts while the page is busy. Returns:
+ * - 0 once a hazard pointer is held on the in-memory page;
+ * - WT_NOTFOUND if WT_READ_CACHE is set and the page is on disk, deleted
+ * or being read, or if WT_READ_NO_WAIT is set and the page is being
+ * read or is locked;
+ * - WT_RESTART if the reference is in the split state;
+ * - any error from the cache-full check, the cache read, setting the
+ * hazard pointer, releasing the page or the autocommit check.
+ *
+ * Unless WT_READ_NO_EVICT is set, a page that matches the forced-eviction
+ * criteria is released and the lookup retried, at most 10 times per call.
+ * When built with HAVE_DIAGNOSTIC the caller's file and line are passed
+ * on to the hazard pointer code.
+ */
+int
+__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ int busy, force_attempts, oldgen;
+
+ for (force_attempts = oldgen = 0;;) {
+ switch (ref->state) {
+ case WT_REF_DISK:
+ case WT_REF_DELETED:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+
+ /*
+ * The page isn't in memory, attempt to read it.
+ * Make sure there is space in the cache.
+ */
+ WT_RET(__wt_cache_full_check(session));
+ WT_RET(__wt_cache_read(session, ref));
+ oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
+ F_ISSET(session, WT_SESSION_NO_CACHE);
+ continue;
+ case WT_REF_READING:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+ /* FALLTHROUGH */
+ case WT_REF_LOCKED:
+ if (LF_ISSET(WT_READ_NO_WAIT))
+ return (WT_NOTFOUND);
+ /* The page is busy -- wait. */
+ break;
+ case WT_REF_SPLIT:
+ return (WT_RESTART);
+ case WT_REF_MEM:
+ /*
+ * The page is in memory: get a hazard pointer, update
+ * the page's LRU and return. The expected reason we
+ * can't get a hazard pointer is because the page is
+ * being evicted; yield and try again.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(
+ __wt_hazard_set(session, ref, &busy, file, line));
+#else
+ WT_RET(__wt_hazard_set(session, ref, &busy));
+#endif
+ if (busy)
+ break;
+
+ page = ref->page;
+ WT_ASSERT(session, page != NULL);
+
+ /* Forcibly evict pages that are too big. */
+ if (!LF_ISSET(WT_READ_NO_EVICT) &&
+ force_attempts < 10 &&
+ __evict_force_check(session, page)) {
+ ++force_attempts;
+ WT_RET(__wt_page_release(session, ref, flags));
+ break;
+ }
+
+ /* Check if we need an autocommit transaction. */
+ if ((ret = __wt_txn_autocommit_check(session)) != 0) {
+ WT_TRET(__wt_hazard_clear(session, page));
+ return (ret);
+ }
+
+ /*
+ * If we read the page and we are configured to not
+ * trash the cache, set the oldest read generation so
+ * the page is forcibly evicted as soon as possible.
+ *
+ * Otherwise, update the page's read generation.
+ */
+ if (oldgen && page->read_gen == WT_READGEN_NOTSET)
+ page->read_gen = WT_READGEN_OLDEST;
+ else if (!LF_ISSET(WT_READ_NO_GEN) &&
+ page->read_gen < __wt_cache_read_gen(session))
+ page->read_gen =
+ __wt_cache_read_gen_set(session);
+
+ return (0);
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* We failed to get the page -- yield before retrying. */
+ __wt_yield();
+ }
+}
+
+/*
+ * __wt_page_alloc --
+ * Create or read a page into the cache.
+ *
+ * Allocates a zeroed WT_PAGE of the given type. For variable-length
+ * column-store and row-store leaf pages, the WT_COL or WT_ROW array for
+ * alloc_entries entries is part of the same allocation and starts
+ * directly after the WT_PAGE. Internal pages get a separately allocated
+ * WT_PAGE_INDEX with alloc_entries slots and, if alloc_refs is non-zero,
+ * a newly allocated WT_REF in every slot. recno is stored as the page's
+ * starting record number for every type except row-store leaf pages.
+ *
+ * The page's read generation starts as WT_READGEN_NOTSET, the bytes
+ * allocated are added to the page's in-memory accounting and the cache's
+ * count of in-memory pages is incremented. *pagep is set to NULL on
+ * entry and to the new page on success; if an internal page's index or
+ * references cannot be allocated, everything allocated so far is freed
+ * and the error returned.
+ */
+int
+__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
+ uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep)
+{
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ size_t size;
+ uint32_t i;
+ void *p;
+
+ *pagep = NULL;
+
+ cache = S2C(session)->cache;
+ page = NULL;
+
+ size = sizeof(WT_PAGE);
+ switch (type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Variable-length column-store leaf page: allocate memory to
+ * describe the page's contents with the initial allocation.
+ */
+ size += alloc_entries * sizeof(WT_COL);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Row-store leaf page: allocate memory to describe the page's
+ * contents with the initial allocation.
+ */
+ size += alloc_entries * sizeof(WT_ROW);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__wt_calloc(session, 1, size, &page));
+
+ page->type = type;
+ page->read_gen = WT_READGEN_NOTSET;
+
+ switch (type) {
+ case WT_PAGE_COL_FIX:
+ page->pg_fix_recno = recno;
+ page->pg_fix_entries = alloc_entries;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ page->pg_intl_recno = recno;
+
+ /*
+ * Internal pages have an array of references to objects so they
+ * can split. Allocate the array of references and optionally,
+ * the objects to which they point.
+ */
+ WT_ERR(__wt_calloc(session, 1,
+ sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *),
+ &p));
+ size +=
+ sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *);
+ pindex = p;
+ pindex->index = (WT_REF **)((WT_PAGE_INDEX *)p + 1);
+ pindex->entries = alloc_entries;
+ WT_INTL_INDEX_SET(page, pindex);
+ if (alloc_refs)
+ for (i = 0; i < pindex->entries; ++i) {
+ WT_ERR(__wt_calloc_def(
+ session, 1, &pindex->index[i]));
+ size += sizeof(WT_REF);
+ }
+ if (0) {
+err: if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) {
+ for (i = 0; i < pindex->entries; ++i)
+ __wt_free(session, pindex->index[i]);
+ __wt_free(session, pindex);
+ }
+ __wt_free(session, page);
+ return (ret);
+ }
+ break;
+ case WT_PAGE_COL_VAR:
+ page->pg_var_recno = recno;
+ page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE));
+ page->pg_var_entries = alloc_entries;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE));
+ page->pg_row_entries = alloc_entries;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Increment the cache statistics. */
+ __wt_cache_page_inmem_incr(session, page, size);
+ (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1);
+
+ *pagep = page;
+ return (0);
+}
+
+/*
+ * __wt_page_inmem --
+ * Build in-memory page information.
+ *
+ * image is the page's disk image, starting with a WT_PAGE_HEADER. The
+ * number of in-memory entries is derived from the header (walking the
+ * image for row-store leaf pages when neither empty-value flag is set),
+ * a WT_PAGE is allocated and pointed at the image, flags are set on the
+ * page, and the type-specific in-memory index is built.
+ *
+ * If WT_PAGE_DISK_ALLOC is set in flags, the image's in-memory size is
+ * included in the bytes charged to the page. If ref is non-NULL it is
+ * pointed at the new page, and internal pages record ref as their parent
+ * reference.
+ *
+ * *pagep is set to NULL on entry and to the new page on success. If
+ * building the index fails, the page is discarded with __wt_page_out and
+ * the error returned.
+ */
+int
+__wt_page_inmem(WT_SESSION_IMPL *session,
+ WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ const WT_PAGE_HEADER *dsk;
+ uint32_t alloc_entries;
+ size_t size;
+
+ *pagep = NULL;
+
+ dsk = image;
+ alloc_entries = 0;
+
+ /*
+ * Figure out how many underlying objects the page references so we can
+ * allocate them along with the page.
+ */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store leaf page entries map one-to-one to the number
+ * of physical entries on the page (each physical entry is a
+ * value item).
+ *
+ * Column-store internal page entries map one-to-one to the
+ * number of physical entries on the page (each entry is a
+ * location cookie).
+ */
+ alloc_entries = dsk->u.entries;
+ break;
+ case WT_PAGE_ROW_INT:
+ /*
+ * Row-store internal page entries map one-to-two to the number
+ * of physical entries on the page (each entry is a key and
+ * location cookie pair).
+ */
+ alloc_entries = dsk->u.entries / 2;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * If the "no empty values" flag is set, row-store leaf page
+ * entries map one-to-one to the number of physical entries
+ * on the page (each physical entry is a key or value item).
+ * If that flag is not set, there are more keys than values,
+ * we have to walk the page to figure it out.
+ */
+ if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
+ alloc_entries = dsk->u.entries;
+ else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
+ alloc_entries = dsk->u.entries / 2;
+ else
+ WT_RET(__inmem_row_leaf_entries(
+ session, dsk, &alloc_entries));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Allocate and initialize a new WT_PAGE. */
+ WT_RET(__wt_page_alloc(
+ session, dsk->type, dsk->recno, alloc_entries, 1, &page));
+ page->dsk = dsk;
+ F_SET_ATOMIC(page, flags);
+
+ /*
+ * Track the memory allocated to build this page so we can update the
+ * cache statistics in a single call.
+ */
+ size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ __inmem_col_fix(session, page);
+ break;
+ case WT_PAGE_COL_INT:
+ __inmem_col_int(session, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__inmem_col_var(session, page, &size));
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_ERR(__inmem_row_int(session, page, &size));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__inmem_row_leaf(session, page));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Update the page's in-memory size and the cache statistics. */
+ __wt_cache_page_inmem_incr(session, page, size);
+
+ /* Link the new internal page to the parent. */
+ if (ref != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ page->pg_intl_parent_ref = ref;
+ break;
+ }
+ ref->page = page;
+ }
+
+ *pagep = page;
+ return (0);
+
+err: __wt_page_out(session, &page);
+ return (ret);
+}
+
+/*
+ * __inmem_col_fix --
+ *	Fixed-length column-store leaf pages: point the page's bit-field at
+ *	the first byte after the header in the disk image.
+ */
+static void
+__inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_BTREE *btree = S2BT(session);
+	const WT_PAGE_HEADER *image = page->dsk;
+
+	page->pg_fix_bitf = WT_PAGE_HEADER_BYTE(btree, image);
+}
+
+/*
+ * __inmem_col_int --
+ *	Column-store internal pages: fill in the page's child references from
+ *	the cells of the disk image.
+ */
+static void
+__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_BTREE *btree;
+	WT_CELL *cell;
+	WT_CELL_UNPACK *unpack, _unpack;
+	const WT_PAGE_HEADER *dsk;
+	WT_PAGE_INDEX *pindex;
+	WT_REF *child;
+	uint32_t i, slot;
+
+	btree = S2BT(session);
+	dsk = page->dsk;
+	unpack = &_unpack;
+
+	/*
+	 * Each cell on the page is an on-page value item (WT_CELL_VALUE)
+	 * describing one child: pair the cells, in order, with the slots of
+	 * the page's index.
+	 */
+	pindex = WT_INTL_INDEX_COPY(page);
+	slot = 0;
+	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+		child = pindex->index[slot++];
+
+		__wt_cell_unpack(cell, unpack);
+		child->home = page;
+		child->addr = cell;
+		child->key.recno = unpack->v;
+	}
+}
+
+/*
+ * __inmem_col_var_repeats --
+ *	Report, through np, how many cells on the page have a run-length
+ *	greater than one.  Always returns 0.
+ */
+static int
+__inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np)
+{
+	WT_BTREE *btree;
+	WT_CELL *cell;
+	WT_CELL_UNPACK *unpack, _unpack;
+	const WT_PAGE_HEADER *dsk;
+	uint32_t i, repeat_cnt;
+
+	unpack = &_unpack;
+	dsk = page->dsk;
+	btree = S2BT(session);
+
+	/* Tally the cells that will need a slot in the repeats array. */
+	repeat_cnt = 0;
+	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+		__wt_cell_unpack(cell, unpack);
+		if (__wt_cell_rle(unpack) <= 1)
+			continue;
+		++repeat_cnt;
+	}
+
+	*np = repeat_cnt;
+	return (0);
+}
+
+/*
+ * __inmem_col_var --
+ * Build in-memory index for variable-length, data-only leaf pages in
+ * column-store trees.
+ *
+ * Each cell's page offset is stored in the next page->pg_var_d slot. Cells
+ * with a run-length greater than one are additionally recorded (slot index,
+ * starting record number and run-length) in the page's repeats array. That
+ * array is allocated the first time such a cell is seen, sized from a
+ * separate counting pass over the page, and the bytes allocated for it are
+ * added to *sizep. Returns 0 on success; a failure of the counting pass or
+ * of the allocation is returned to the caller.
+ */
+static int
+__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
+{
+ WT_BTREE *btree;
+ WT_COL *cip;
+ WT_COL_RLE *repeats;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ uint64_t recno, rle;
+ size_t bytes_allocated;
+ uint32_t i, indx, n, repeat_off;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ recno = page->pg_var_recno; /* Record number of the first cell. */
+
+ repeats = NULL; /* NULL until the first repeated cell is found. */
+ repeat_off = 0;
+ unpack = &_unpack;
+ bytes_allocated = 0;
+
+ /*
+ * Walk the page, building references: the page contains unsorted value
+ * items. The value items are on-page (WT_CELL_VALUE), overflow items
+ * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL).
+ */
+ indx = 0;
+ cip = page->pg_var_d;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ WT_COL_PTR_SET(cip, WT_PAGE_DISK_OFFSET(page, cell));
+ cip++;
+
+ /*
+ * Add records with repeat counts greater than 1 to an array we
+ * use for fast lookups. The first entry we find needing the
+ * repeats array triggers a re-walk from the start of the page
+ * to determine the size of the array.
+ */
+ rle = __wt_cell_rle(unpack);
+ if (rle > 1) {
+ if (repeats == NULL) {
+ WT_RET(
+ __inmem_col_var_repeats(session, page, &n));
+ WT_RET(__wt_realloc_def(session,
+ &bytes_allocated, n + 1, &repeats));
+
+ page->pg_var_repeats = repeats;
+ page->pg_var_nrepeats = n;
+ *sizep += bytes_allocated;
+ }
+ repeats[repeat_off].indx = indx;
+ repeats[repeat_off].recno = recno;
+ repeats[repeat_off++].rle = rle;
+ }
+ indx++;
+ recno += rle; /* Each cell covers rle consecutive records. */
+ }
+
+ return (0);
+}
+
+/*
+ * __inmem_row_int --
+ * Build in-memory index for row-store internal pages.
+ *
+ * Key cells set the key of the current WT_REF: on-page keys are referenced
+ * in place, overflow keys are read into a scratch buffer and instantiated,
+ * with sizeof(WT_IKEY) plus the key size added to *sizep. Address cells set
+ * the WT_REF's address and advance to the next WT_REF; a deleted-address
+ * cell also puts the WT_REF into the WT_REF_DELETED state. The scratch
+ * buffer is released on both the success and the error path, and the
+ * function returns whatever is in ret at that point.
+ */
+static int
+__inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(current);
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref, **refp;
+ uint32_t i;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ dsk = page->dsk;
+
+ WT_RET(__wt_scr_alloc(session, 0, &current));
+
+ /*
+ * Walk the page, instantiating keys: the page contains sorted key and
+ * location cookie pairs. Keys are on-page/overflow items and location
+ * cookies are WT_CELL_ADDR_XXX items.
+ */
+ pindex = WT_INTL_INDEX_COPY(page);
+ refp = pindex->index;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ref = *refp; /* Only advanced once an address cell is seen. */
+ ref->home = page;
+
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ /*
+ * Note: we don't Huffman encode internal page keys,
+ * there's no decoding work to do.
+ */
+ __wt_ref_key_onpage_set(page, ref, unpack);
+ break;
+ case WT_CELL_KEY_OVFL:
+ /* Instantiate any overflow records. */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, page->type, unpack, current));
+
+ WT_ERR(__wt_row_ikey_incr(session, page,
+ WT_PAGE_DISK_OFFSET(page, cell),
+ current->data, current->size, &ref->key.ikey));
+
+ *sizep += sizeof(WT_IKEY) + current->size;
+ break;
+ case WT_CELL_ADDR_DEL:
+ /*
+ * A cell may reference a deleted leaf page: if a leaf
+ * page was deleted without being read (fast truncate),
+ * and the deletion committed, but older transactions
+ * in the system required the previous version of the
+ * page to remain available, a special deleted-address
+ * type cell is written. The only reason we'd ever see
+ * that cell on a page we're reading is if we crashed
+ * and recovered (otherwise a version of the page w/o
+ * that cell would have eventually been written). If we
+ * crash and recover to a page with a deleted-address
+ * cell, we want to discard the page from the backing
+ * store (it was never discarded), and, of course, by
+ * definition no earlier transaction will ever need it.
+ *
+ * Re-create the state of a deleted page.
+ */
+ ref->addr = cell;
+ ref->state = WT_REF_DELETED;
+ ++refp;
+
+ /*
+ * If the tree is already dirty and so will be written,
+ * mark the page dirty. (We want to free the deleted
+ * pages, but if the handle is read-only or if the
+ * application never modifies the tree, we're not able
+ * to do so.)
+ */
+ if (btree->modified) {
+ WT_ERR(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
+ }
+ break;
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ ref->addr = cell;
+ ++refp;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+err: __wt_scr_free(&current); /* Reached on success as well as on error. */
+ return (ret);
+}
+
+/*
+ * __inmem_row_leaf_entries --
+ *	Count the key cells on a row-store leaf disk image and hand the
+ *	count back through nindxp.
+ */
+static int
+__inmem_row_leaf_entries(
+    WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint32_t *nindxp)
+{
+	WT_BTREE *btree;
+	WT_CELL *cell;
+	WT_CELL_UNPACK *unpack, _unpack;
+	uint32_t i, key_cnt;
+
+	unpack = &_unpack;
+	btree = S2BT(session);
+
+	/*
+	 * A row-store leaf page is a sequence of key/data pairs: every key is
+	 * an on-page (WT_CELL_KEY) or overflow (WT_CELL_KEY_OVFL) cell, and is
+	 * followed by at most one on-page (WT_CELL_VALUE) or overflow
+	 * (WT_CELL_VALUE_OVFL) data cell.
+	 *
+	 * The number of index entries is therefore somewhere between half and
+	 * all of the physical cells.  Rather than over-allocate for workloads
+	 * where every key has a data item, count the keys up front so the
+	 * caller can allocate exactly that many indices.
+	 */
+	key_cnt = 0;
+	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+		__wt_cell_unpack(cell, unpack);
+		switch (unpack->type) {
+		case WT_CELL_VALUE:
+		case WT_CELL_VALUE_OVFL:
+			/* Data cells don't get an index entry of their own. */
+			break;
+		case WT_CELL_KEY:
+		case WT_CELL_KEY_OVFL:
+			++key_cnt;
+			break;
+		WT_ILLEGAL_VALUE(session);
+		}
+	}
+
+	*nindxp = key_cnt;
+	return (0);
+}
+
+/*
+ * __inmem_row_leaf --
+ * Build in-memory index for row-store leaf pages.
+ *
+ * Each key cell takes the next WT_ROW slot: the slot references the key
+ * bytes directly when the tree has no Huffman key encoding and the key has
+ * no prefix compression, otherwise it references the cell. An on-page
+ * value cell is attached to the slot of the key before it, unless the tree
+ * Huffman-encodes values. Overflow value cells are not recorded here.
+ */
+static int
+__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ WT_ROW *rip;
+ uint32_t i;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ unpack = &_unpack;
+
+ /* Walk the page, building indices. */
+ rip = page->pg_row_d;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY_OVFL:
+ __wt_row_leaf_key_set_cell(page, rip, cell);
+ ++rip;
+ break;
+ case WT_CELL_KEY:
+ /*
+ * Simple keys without compression (not Huffman encoded
+ * or prefix compressed), can be directly referenced on
+ * the page to avoid repeatedly unpacking their cells.
+ */
+ if (!btree->huffman_key && unpack->prefix == 0)
+ __wt_row_leaf_key_set(page, rip, unpack);
+ else
+ __wt_row_leaf_key_set_cell(page, rip, cell);
+ ++rip;
+ break;
+ case WT_CELL_VALUE:
+ /*
+ * Simple values without compression can be directly
+ * referenced on the page to avoid repeatedly unpacking
+ * their cells.
+ *
+ * rip was already advanced past the key this value
+ * belongs to, hence rip - 1. NOTE(review): assumes a
+ * value cell never precedes the first key on the page;
+ * presumably guaranteed by disk verification, confirm.
+ */
+ if (!btree->huffman_value)
+ __wt_row_leaf_value_set(page, rip - 1, unpack);
+ break;
+ case WT_CELL_VALUE_OVFL:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ }
+
+ /*
+ * We do not currently instantiate keys on leaf pages when the page is
+ * loaded, they're instantiated on demand.
+ */
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
new file mode 100644
index 00000000000..9cd6f8310af
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_read --
+ * Read a page from the file.
+ *
+ * Work is only done if this call moves the WT_REF from WT_REF_DISK to
+ * WT_REF_READING, or from WT_REF_DELETED to WT_REF_LOCKED; if neither
+ * compare-and-swap succeeds the function returns 0 without reading
+ * anything. On success the WT_REF is published as WT_REF_MEM. On failure
+ * the WT_REF's previous state is restored and the error is returned.
+ */
+int
+__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_ITEM tmp;
+ WT_PAGE *page;
+ WT_PAGE_STATE previous_state;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ page = NULL;
+
+ /*
+ * Don't pass an allocated buffer to the underlying block read function,
+ * force allocation of new memory of the appropriate size.
+ */
+ WT_CLEAR(tmp);
+
+ /*
+ * Attempt to set the state to WT_REF_READING for normal reads, or
+ * WT_REF_LOCKED, for deleted pages. If successful, we've won the
+ * race, read the page.
+ */
+ if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
+ previous_state = WT_REF_DISK;
+ else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ previous_state = WT_REF_DELETED;
+ else
+ return (0);
+
+ /*
+ * Get the address: if there is no address, the page was deleted, but a
+ * subsequent search or insert is forcing re-creation of the name space.
+ * Otherwise, there's an address, read the backing disk page and build
+ * an in-memory version of the page.
+ */
+ WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ if (addr == NULL) {
+ WT_ASSERT(session, previous_state == WT_REF_DELETED);
+
+ WT_ERR(__wt_btree_new_leaf_page(session, &page));
+ ref->page = page;
+ } else {
+ /* Read the backing disk page. */
+ WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
+
+ /* Build the in-memory version of the page. */
+ WT_ERR(__wt_page_inmem(session, ref, tmp.data,
+ WT_DATA_IN_ITEM(&tmp) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
+
+ /* If the page was deleted, instantiate that information. */
+ if (previous_state == WT_REF_DELETED)
+ WT_ERR(__wt_delete_page_instantiate(session, ref));
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_READ,
+ "page %p: %s", page, __wt_page_type_string(page->type)));
+
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+ return (0);
+
+err: /*
+ * If the function building an in-memory version of the page failed,
+ * it discarded the page, but not the disk image. Discard the page
+ * and separately discard the disk image in all cases.
+ *
+ * NOTE(review): this label is also reached when a step after a
+ * successful __wt_page_inmem fails (instantiating the delete
+ * information, the verbose message). Confirm an allocated disk
+ * image can't be released by both __wt_ref_out and __wt_buf_free
+ * in that case.
+ */
+ if (ref->page != NULL)
+ __wt_ref_out(session, ref);
+ WT_PUBLISH(ref->state, previous_state);
+
+ __wt_buf_free(session, &tmp);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
new file mode 100644
index 00000000000..25b4bfc3005
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -0,0 +1,116 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_kv_return --
+ * Return a page referenced key/value pair to the application.
+ *
+ * The page is the one referenced by the btree cursor. For column-store
+ * pages the interface cursor's record number is copied from the btree
+ * cursor; for row-store leaf pages the key is taken from the cursor's
+ * insert item, from the saved search key on an exact match, or from the
+ * page. If upd is non-NULL its data becomes the value (referenced, not
+ * copied); otherwise the value is taken from the page, and an absent
+ * row-store value is returned as a zero-length value.
+ */
+int
+__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_CURSOR *cursor;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ uint8_t v;
+
+ btree = S2BT(session);
+
+ page = cbt->ref->page;
+ cursor = &cbt->iface;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * The interface cursor's record has usually been set, but that
+ * isn't universally true, specifically, cursor.search_near may
+ * call here without first setting the interface cursor.
+ */
+ cursor->recno = cbt->recno;
+
+ /* If the cursor references a WT_UPDATE item, return it. */
+ if (upd != NULL) {
+ cursor->value.data = WT_UPDATE_DATA(upd);
+ cursor->value.size = upd->size;
+ return (0);
+ }
+
+ /* Take the value from the original page. */
+ v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
+ return (__wt_buf_set(session, &cursor->value, &v, 1));
+ case WT_PAGE_COL_VAR:
+ /*
+ * The interface cursor's record has usually been set, but that
+ * isn't universally true, specifically, cursor.search_near may
+ * call here without first setting the interface cursor.
+ */
+ cursor->recno = cbt->recno;
+
+ /* If the cursor references a WT_UPDATE item, return it. */
+ if (upd != NULL) {
+ cursor->value.data = WT_UPDATE_DATA(upd);
+ cursor->value.size = upd->size;
+ return (0);
+ }
+
+ /* Take the value from the original page cell. */
+ cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ rip = &page->pg_row_d[cbt->slot];
+
+ /*
+ * If the cursor references a WT_INSERT item, take its key.
+ * Else, if we have an exact match, we copied the key in the
+ * search function, take it from there.
+ * If we don't have an exact match, take the key from the
+ * original page.
+ */
+ if (cbt->ins != NULL) {
+ cursor->key.data = WT_INSERT_KEY(cbt->ins);
+ cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
+ } else if (cbt->compare == 0) {
+ cursor->key.data = cbt->search_key.data;
+ cursor->key.size = cbt->search_key.size;
+ } else
+ WT_RET(__wt_row_leaf_key(
+ session, page, rip, &cursor->key, 0));
+
+ /* If the cursor references a WT_UPDATE item, return it. */
+ if (upd != NULL) {
+ cursor->value.data = WT_UPDATE_DATA(upd);
+ cursor->value.size = upd->size;
+ return (0);
+ }
+
+ /* Simple values have their location encoded in the WT_ROW. */
+ if (__wt_row_leaf_value(page, rip, &cursor->value))
+ return (0);
+
+ /*
+ * Take the value from the original page cell (which may be
+ * empty).
+ */
+ if ((cell =
+ __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
+ cursor->value.size = 0;
+ return (0);
+ }
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * The value is an on-page cell, unpack and expand it as necessary.
+ * Only the variable-length column-store and row-store leaf cases
+ * reach this point, every other path has already returned.
+ */
+ __wt_cell_unpack(cell, &unpack);
+ WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
new file mode 100644
index 00000000000..10366e91a0e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -0,0 +1,2520 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+struct __wt_stuff; typedef struct __wt_stuff WT_STUFF;
+struct __wt_track; typedef struct __wt_track WT_TRACK;
+struct __wt_track_shared; typedef struct __wt_track_shared WT_TRACK_SHARED;
+
+/*
+ * There's a bunch of stuff we pass around during salvage, group it together
+ * to make the code prettier.
+ *
+ * One of these lives on the stack of the top-level salvage function for the
+ * duration of a salvage run; the pages and ovfl arrays hold one WT_TRACK
+ * pointer per tracked leaf or overflow chunk.
+ */
+struct __wt_stuff {
+ WT_SESSION_IMPL *session; /* Salvage session */
+
+ WT_TRACK **pages; /* Pages */
+ uint32_t pages_next; /* Next empty slot */
+ size_t pages_allocated; /* Bytes allocated */
+
+ WT_TRACK **ovfl; /* Overflow pages */
+ uint32_t ovfl_next; /* Next empty slot */
+ size_t ovfl_allocated; /* Bytes allocated */
+
+ WT_REF root_ref; /* Created root page */
+
+ uint8_t page_type; /* Page type */
+
+ /* If need to free blocks backing merged page ranges. */
+ int merge_free;
+
+ WT_ITEM *tmp1; /* Verbose print buffer */
+ WT_ITEM *tmp2; /* Verbose print buffer */
+
+ uint64_t fcnt; /* Progress counter */
+};
+
+/*
+ * WT_TRACK_SHARED --
+ * Information shared between pages being merged.
+ *
+ * The reference count starts at 1 when the tracking entry for a page is
+ * initialized and is incremented each time a chunk is split off that
+ * shares this structure.
+ */
+struct __wt_track_shared {
+ uint32_t ref; /* Reference count */
+
+ /*
+ * Physical information about the file block.
+ */
+ WT_ADDR addr; /* Page address */
+ uint32_t size; /* Page size */
+ uint64_t gen; /* Page generation */
+
+ /*
+ * Pages that reference overflow pages contain a list of the overflow
+ * pages they reference. We start out with a list of addresses, and
+ * convert to overflow array slots during the reconciliation of page
+ * references to overflow records.
+ */
+ WT_ADDR *ovfl_addr; /* Overflow pages by address */
+ uint32_t *ovfl_slot; /* Overflow pages by slot */
+ uint32_t ovfl_cnt; /* Overflow reference count */
+};
+
+/*
+ * WT_TRACK --
+ * Structure to track chunks, one per chunk; we start out with a chunk per
+ * page (either leaf or overflow), but when we find overlapping key ranges, we
+ * split the leaf page chunks up, one chunk for each unique key range.
+ *
+ * The trk_XXX macros are shorthand for fields that live in the shared
+ * structure, so they dereference the shared pointer and must not be used
+ * while it is NULL. The row_XXX and col_XXX macros name members of the
+ * row-store/column-store union.
+ */
+struct __wt_track {
+#define trk_addr shared->addr.addr
+#define trk_addr_size shared->addr.size
+#define trk_gen shared->gen
+#define trk_ovfl_addr shared->ovfl_addr
+#define trk_ovfl_cnt shared->ovfl_cnt
+#define trk_ovfl_slot shared->ovfl_slot
+#define trk_size shared->size
+ WT_TRACK_SHARED *shared; /* Shared information */
+
+ WT_STUFF *ss; /* Enclosing stuff */
+
+ union {
+ struct {
+#undef row_start
+#define row_start u.row._row_start
+ WT_ITEM _row_start; /* Row-store start range */
+#undef row_stop
+#define row_stop u.row._row_stop
+ WT_ITEM _row_stop; /* Row-store stop range */
+ } row;
+
+ struct {
+#undef col_start
+#define col_start u.col._col_start
+ uint64_t _col_start; /* Col-store start range */
+#undef col_stop
+#define col_stop u.col._col_stop
+ uint64_t _col_stop; /* Col-store stop range */
+#undef col_missing
+#define col_missing u.col._col_missing
+ uint64_t _col_missing; /* Col-store missing range */
+ } col;
+ } u;
+
+#define WT_TRACK_CHECK_START 0x01 /* Row: initial key updated */
+#define WT_TRACK_CHECK_STOP 0x02 /* Row: last key updated */
+#define WT_TRACK_MERGE 0x04 /* Page requires merging */
+#define WT_TRACK_OVFL_REFD 0x08 /* Overflow page referenced */
+ u_int flags; /* Bitwise OR of the WT_TRACK_XXX values above */
+};
+
+static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *);
+static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *);
+static int __slvg_col_ovfl(
+ WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint64_t, uint64_t);
+static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_col_range_overlap(
+ WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *);
+static void __slvg_col_trk_update_start(uint32_t, WT_STUFF *);
+static int __slvg_merge_block_free(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_ovfl_compare(const void *, const void *);
+static int __slvg_ovfl_discard(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_ovfl_reconcile(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_ovfl_ref(WT_SESSION_IMPL *, WT_TRACK *, int);
+static int __slvg_ovfl_ref_all(WT_SESSION_IMPL *, WT_TRACK *);
+static int __slvg_read(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_row_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *);
+static int __slvg_row_build_leaf(
+ WT_SESSION_IMPL *, WT_TRACK *, WT_REF *, WT_STUFF *);
+static int __slvg_row_ovfl(
+ WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint32_t, uint32_t);
+static int __slvg_row_range(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_row_range_overlap(
+ WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *);
+static int __slvg_row_trk_update_start(
+ WT_SESSION_IMPL *, WT_ITEM *, uint32_t, WT_STUFF *);
+static int __slvg_trk_compare_addr(const void *, const void *);
+static int __slvg_trk_compare_gen(const void *, const void *);
+static int __slvg_trk_compare_key(const void *, const void *);
+static int __slvg_trk_free(WT_SESSION_IMPL *, WT_TRACK **, int);
+static void __slvg_trk_free_addr(WT_SESSION_IMPL *, WT_TRACK *);
+static int __slvg_trk_init(WT_SESSION_IMPL *, uint8_t *,
+ size_t, uint32_t, uint64_t, WT_STUFF *, WT_TRACK **);
+static int __slvg_trk_leaf(WT_SESSION_IMPL *,
+ const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *);
+static int __slvg_trk_leaf_ovfl(
+ WT_SESSION_IMPL *, const WT_PAGE_HEADER *, WT_TRACK *);
+static int __slvg_trk_ovfl(WT_SESSION_IMPL *,
+ const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *);
+static int __slvg_trk_split(WT_SESSION_IMPL *, WT_TRACK *, WT_TRACK **);
+
+/*
+ * __wt_bt_salvage --
+ * Salvage a Btree.
+ *
+ * Runs the numbered steps documented in the body, in order; the cfg
+ * argument is unused. A failure in any step jumps to the cleanup label,
+ * which notifies the block manager that salvage has ended, discards any
+ * root page still referenced, releases the tracking memory and the scratch
+ * buffers, and makes a final progress report. ckptbase is installed as the
+ * tree's checkpoint list only while the new root page is being evicted.
+ * Returns the accumulated return value.
+ */
+int
+__wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_STUFF *ss, stuff;
+ uint32_t i, leaf_cnt;
+
+ WT_UNUSED(cfg);
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ WT_CLEAR(stuff);
+ ss = &stuff;
+ ss->session = session;
+ ss->page_type = WT_PAGE_INVALID; /* Set from the first leaf page read. */
+
+ /* Allocate temporary buffers. */
+ WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp1));
+ WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2));
+
+ /*
+ * Step 1:
+ * Inform the underlying block manager that we're salvaging the file.
+ */
+ WT_ERR(bm->salvage_start(bm, session));
+
+ /*
+ * Step 2:
+ * Read the file and build in-memory structures that reference any leaf
+ * or overflow page. Any pages other than leaf or overflow pages are
+ * added to the free list.
+ *
+ * Turn off read checksum and verification error messages while we're
+ * reading the file, we expect to see corrupted blocks.
+ */
+ F_SET(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ ret = __slvg_read(session, ss);
+ F_CLR(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ WT_ERR(ret);
+
+ /*
+ * Step 3:
+ * Discard any page referencing a non-existent overflow page. We do
+ * this before checking overlapping key ranges on the grounds that a
+ * bad key range we can use is better than a terrific key range that
+ * references pages we don't have. On the other hand, we subsequently
+ * discard key ranges where there are better overlapping ranges, and
+ * it would be better if we let the availability of an overflow value
+ * inform our choices as to the key ranges we select, ideally on a
+ * per-key basis.
+ *
+ * A complicating problem is found in variable-length column-store
+ * objects, where we potentially split key ranges within RLE units.
+ * For example, if there's a page with rows 15-20 and we later find
+ * row 17 with a larger LSN, the range splits into 3 chunks, 15-16,
+ * 17, and 18-20. If rows 15-20 were originally a single value (an
+ * RLE of 6), and that record is an overflow record, we end up with
+ * two chunks, both of which want to reference the same overflow value.
+ *
+ * Instead of the approach just described, we're first discarding any
+ * pages referencing non-existent overflow pages, then we're reviewing
+ * our key ranges and discarding any that overlap. We're doing it that
+ * way for a few reasons: absent corruption, missing overflow items are
+ * strong arguments the page was replaced (on the other hand, some kind
+ * of file corruption is probably why we're here); it's a significant
+ * amount of additional complexity to simultaneously juggle overlapping
+ * ranges and missing overflow items; finally, real-world applications
+ * usually don't have a lot of overflow items, as WiredTiger supports
+ * very large page sizes, overflow items shouldn't be common.
+ *
+ * Step 4:
+ * Add unreferenced overflow page blocks to the free list so they are
+ * reused immediately.
+ */
+ if (ss->ovfl_next != 0) {
+ WT_ERR(__slvg_ovfl_reconcile(session, ss));
+ WT_ERR(__slvg_ovfl_discard(session, ss));
+ }
+
+ /*
+ * Step 5:
+ * Walk the list of pages looking for overlapping ranges to resolve.
+ * If we find a range that needs to be resolved, set a global flag
+ * and a per WT_TRACK flag on the pages requiring modification.
+ *
+ * This requires sorting the page list by key, and secondarily by LSN.
+ *
+ * !!!
+ * It's vanishingly unlikely and probably impossible for fixed-length
+ * column-store files to have overlapping key ranges. It's possible
+ * for an entire key range to go missing (if a page is corrupted and
+ * lost), but because pages can't split, it shouldn't be possible to
+ * find pages where the key ranges overlap. That said, we check for
+ * it and clean up after it in reconciliation because it doesn't cost
+ * much and future column-store formats or operations might allow for
+ * fixed-length format ranges to overlap during salvage, and I don't
+ * want to have to retrofit the code later.
+ */
+ qsort(ss->pages,
+ (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_key);
+ if (ss->page_type == WT_PAGE_ROW_LEAF)
+ WT_ERR(__slvg_row_range(session, ss));
+ else
+ WT_ERR(__slvg_col_range(session, ss));
+
+ /*
+ * Step 6:
+ * We may have lost key ranges in column-store databases, that is, some
+ * part of the record number space is gone. Look for missing ranges.
+ */
+ switch (ss->page_type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__slvg_col_range_missing(session, ss));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ break;
+ }
+
+ /*
+ * Step 7:
+ * Build an internal page that references all of the leaf pages,
+ * and write it, as well as any merged pages, to the file.
+ *
+ * Count how many leaf pages we have (we could track this during the
+ * array shuffling/splitting, but that's a lot harder).
+ */
+ for (leaf_cnt = i = 0; i < ss->pages_next; ++i)
+ if (ss->pages[i] != NULL)
+ ++leaf_cnt;
+ if (leaf_cnt != 0)
+ switch (ss->page_type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(
+ __slvg_col_build_internal(session, leaf_cnt, ss));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(
+ __slvg_row_build_internal(session, leaf_cnt, ss));
+ break;
+ }
+
+ /*
+ * Step 8:
+ * If we had to merge key ranges, we have to do a final pass through
+ * the leaf page array and discard file pages used during key merges.
+ * We can't do it earlier: if we free'd the leaf pages we're merging as
+ * we merged them, the write of subsequent leaf pages or the internal
+ * page might allocate those free'd file blocks, and if the salvage run
+ * subsequently fails, we'd have overwritten pages used to construct the
+ * final key range. In other words, if the salvage run fails, we don't
+ * want to overwrite data the next salvage run might need.
+ */
+ if (ss->merge_free)
+ WT_ERR(__slvg_merge_block_free(session, ss));
+
+ /*
+ * Step 9:
+ * Evict the newly created root page, creating a checkpoint.
+ */
+ if (ss->root_ref.page != NULL) {
+ btree->ckpt = ckptbase;
+ ret = __wt_rec_evict(session, &ss->root_ref, 1);
+ ss->root_ref.page = NULL;
+ btree->ckpt = NULL;
+ }
+
+ /*
+ * Step 10:
+ * Inform the underlying block manager that we're done.
+ *
+ * NOTE(review): this label is also the target when the scratch buffer
+ * allocations or salvage_start itself failed, so salvage_end can run
+ * without a successful salvage_start; confirm the block manager
+ * tolerates that.
+ */
+err: WT_TRET(bm->salvage_end(bm, session));
+
+ /* Discard any root page we created. */
+ if (ss->root_ref.page != NULL)
+ __wt_ref_out(session, &ss->root_ref);
+
+ /* Discard the leaf and overflow page memory. */
+ WT_TRET(__slvg_cleanup(session, ss));
+
+ /* Discard temporary buffers. */
+ __wt_scr_free(&ss->tmp1);
+ __wt_scr_free(&ss->tmp2);
+
+ /* Wrap up reporting. */
+ WT_TRET(__wt_progress(session, NULL, ss->fcnt));
+
+ return (ret);
+}
+
+/*
+ * __slvg_read --
+ * Read the file and build a table of the pages we can use.
+ *
+ * Loops over the block addresses handed out by the block manager until it
+ * reports end-of-file. A block whose read fails with WT_ERROR is reported
+ * to the block manager as invalid and skipped; any other read error ends
+ * the walk. Block-manager and internal pages, and pages that fail disk
+ * verification, are freed back to the block manager. Leaf pages and
+ * overflow pages are handed to the tracking functions. Finding leaf pages
+ * of more than one type is an error. The two scratch buffers are released
+ * on every exit path.
+ */
+static int
+__slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_BM *bm;
+ WT_DECL_ITEM(as);
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ size_t addr_size;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int eof, valid;
+
+ bm = S2BT(session)->bm;
+ WT_ERR(__wt_scr_alloc(session, 0, &as)); /* Printable block address. */
+ WT_ERR(__wt_scr_alloc(session, 0, &buf)); /* Block contents. */
+
+ for (;;) {
+ /* Get the next block address from the block manager. */
+ WT_ERR(bm->salvage_next(bm, session, addr, &addr_size, &eof));
+ if (eof)
+ break;
+
+ /* Report progress every 10 chunks. */
+ if (++ss->fcnt % 10 == 0)
+ WT_ERR(__wt_progress(session, NULL, ss->fcnt));
+
+ /*
+ * Read (and potentially decompress) the block; the underlying
+ * block manager might return only good blocks if checksums are
+ * configured, or both good and bad blocks if we're relying on
+ * compression.
+ *
+ * Report the block's status to the block manager.
+ */
+ if ((ret = __wt_bt_read(session, buf, addr, addr_size)) == 0)
+ valid = 1;
+ else {
+ valid = 0;
+ if (ret == WT_ERROR) /* Treated as a bad block, not a failure. */
+ ret = 0;
+ WT_ERR(ret);
+ }
+ WT_ERR(bm->salvage_valid(bm, session, addr, addr_size, valid));
+ if (!valid)
+ continue;
+
+ /* Create a printable version of the address. */
+ WT_ERR(bm->addr_string(bm, session, as, addr, addr_size));
+
+ /*
+ * Make sure it's an expected page type for the file.
+ *
+ * We only care about leaf and overflow pages from here on out;
+ * discard all of the others. We put them on the free list now,
+ * because we might as well overwrite them, we want the file to
+ * grow as little as possible, or shrink, and future salvage
+ * calls don't need them either.
+ */
+ dsk = buf->data;
+ switch (dsk->type) {
+ case WT_PAGE_BLOCK_MANAGER:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s page ignored %s",
+ __wt_page_type_string(dsk->type),
+ (const char *)as->data));
+ WT_ERR(bm->free(bm, session, addr, addr_size));
+ continue;
+ }
+
+ /*
+ * Verify the page. It's unlikely a page could have a valid
+ * checksum and still be broken, but paranoia is healthy in
+ * salvage. Regardless, verify does return failure because
+ * it detects failures we'd expect to see in a corrupted file,
+ * like overflow references past the end of the file or
+ * overflow references to non-existent pages, might as well
+ * discard these pages now.
+ */
+ if (__wt_verify_dsk(session, as->data, buf) != 0) {
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s page failed verify %s",
+ __wt_page_type_string(dsk->type),
+ (const char *)as->data));
+ WT_ERR(bm->free(bm, session, addr, addr_size));
+ continue;
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "tracking %s page, generation %" PRIu64 " %s",
+ __wt_page_type_string(dsk->type), dsk->write_gen,
+ (const char *)as->data));
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ if (ss->page_type == WT_PAGE_INVALID)
+ ss->page_type = dsk->type;
+ if (ss->page_type != dsk->type)
+ WT_ERR_MSG(session, WT_ERROR,
+ "file contains multiple file formats (both "
+ "%s and %s), and cannot be salvaged",
+ __wt_page_type_string(ss->page_type),
+ __wt_page_type_string(dsk->type));
+
+ WT_ERR(__slvg_trk_leaf(
+ session, dsk, addr, addr_size, ss));
+ break;
+ case WT_PAGE_OVFL:
+ WT_ERR(__slvg_trk_ovfl(
+ session, dsk, addr, addr_size, ss));
+ break;
+ }
+ }
+
+err: __wt_scr_free(&as);
+ __wt_scr_free(&buf);
+
+ return (ret);
+}
+
+/*
+ * __slvg_trk_init --
+ *	Initialize tracking information for a page.
+ *
+ *	Allocates a WT_TRACK and its WT_TRACK_SHARED (reference count 1),
+ *	stores a copy of the block address along with the page's size and
+ *	generation, and returns the new entry through retp.  On failure
+ *	everything allocated here is released and the error is returned;
+ *	retp is not written.
+ */
+static int
+__slvg_trk_init(WT_SESSION_IMPL *session,
+    uint8_t *addr, size_t addr_size,
+    uint32_t size, uint64_t gen, WT_STUFF *ss, WT_TRACK **retp)
+{
+	WT_DECL_RET;
+	WT_TRACK *trk;
+
+	WT_RET(__wt_calloc_def(session, 1, &trk));
+	WT_ERR(__wt_calloc_def(session, 1, &trk->shared));
+	trk->shared->ref = 1;
+
+	trk->ss = ss;
+	WT_ERR(__wt_strndup(session, addr, addr_size, &trk->trk_addr));
+	trk->trk_addr_size = (uint8_t)addr_size;
+	trk->trk_size = size;
+	trk->trk_gen = gen;
+
+	*retp = trk;
+	return (0);
+
+err:	/*
+	 * The address copy lives inside the shared structure (trk_addr is
+	 * shorthand for shared->addr.addr), so it can only be released if
+	 * the shared structure was allocated: when that allocation is the
+	 * one that failed, trk->shared is still NULL from the calloc of trk
+	 * and must not be dereferenced.
+	 */
+	if (trk->shared != NULL)
+		__wt_free(session, trk->trk_addr);
+	__wt_free(session, trk->shared);
+	__wt_free(session, trk);
+	return (ret);
+}
+
+/*
+ * __slvg_trk_split --
+ *	Create a second tracking entry for the chunk tracked by orig.  The
+ *	new entry shares orig's WT_TRACK_SHARED and enclosing WT_STUFF, and
+ *	is returned through newp.
+ */
+static int
+__slvg_trk_split(WT_SESSION_IMPL *session, WT_TRACK *orig, WT_TRACK **newp)
+{
+	WT_TRACK *split;
+
+	WT_RET(__wt_calloc_def(session, 1, &split));
+
+	split->ss = orig->ss;
+
+	/* Both entries now point at the shared information: count the new one. */
+	split->shared = orig->shared;
+	split->shared->ref += 1;
+
+	*newp = split;
+	return (0);
+}
+
+/*
+ * __slvg_trk_leaf --
+ * Track a leaf page.
+ *
+ * Grows the ss->pages array if needed, allocates a WT_TRACK for the page
+ * described by the disk header and address cookie, records the page's
+ * start/stop keys (record numbers for column-store pages, copied keys for
+ * row-store pages), collects any overflow references, and appends the
+ * WT_TRACK to ss->pages.
+ */
+static int
+__slvg_trk_leaf(WT_SESSION_IMPL *session,
+ const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_TRACK *trk;
+ uint64_t stop_recno;
+ uint32_t i;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ page = NULL;
+ trk = NULL;
+
+ /* Re-allocate the array of pages, as necessary. */
+ WT_RET(__wt_realloc_def(
+ session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
+
+ /* Allocate a WT_TRACK entry for this new page and fill it in. */
+ WT_RET(__slvg_trk_init(
+ session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk));
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * Column-store fixed-sized format: start and stop keys can be
+ * taken from the block's header, and doesn't contain overflow
+ * items.
+ *
+ * NOTE(review): the stop calculation assumes the page has at
+ * least one entry -- confirm an empty page can't reach here.
+ */
+ trk->col_start = dsk->recno;
+ trk->col_stop = dsk->recno + (dsk->u.entries - 1);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s records %" PRIu64 "-%" PRIu64,
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ trk->col_start, trk->col_stop));
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store variable-length format: the start key can be
+ * taken from the block's header, stop key requires walking
+ * the page.
+ */
+ stop_recno = dsk->recno;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ /* Each cell accounts for its run-length count of records. */
+ stop_recno += __wt_cell_rle(unpack);
+ }
+
+ trk->col_start = dsk->recno;
+ /* stop_recno is one past the last record, the stop key is inclusive. */
+ trk->col_stop = stop_recno - 1;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s records %" PRIu64 "-%" PRIu64,
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ trk->col_start, trk->col_stop));
+
+ /* Column-store pages can contain overflow items. */
+ WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Row-store format: copy the first and last keys on the page.
+ * Keys are prefix-compressed, the simplest and slowest thing
+ * to do is instantiate the in-memory page, then instantiate
+ * and copy the full keys, then free the page. We do this
+ * on every leaf page, and if you need to speed up the salvage,
+ * it's probably a great place to start.
+ *
+ * NOTE(review): indexing the last key assumes the page has at
+ * least one row entry -- confirm.
+ */
+ WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, &page));
+ WT_ERR(__wt_row_leaf_key_copy(session,
+ page, &page->pg_row_d[0], &trk->row_start));
+ WT_ERR(__wt_row_leaf_key_copy(session, page,
+ &page->pg_row_d[page->pg_row_entries - 1], &trk->row_stop));
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
+ /*
+ * tmp1 holds the printable key and tmp2 the address string,
+ * so both can appear in the same message.
+ */
+ WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
+ trk->row_start.data, trk->row_start.size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s start key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp2),
+ (int)ss->tmp1->size, (char *)ss->tmp1->data));
+ WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
+ trk->row_stop.data, trk->row_stop.size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s stop key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp2),
+ (int)ss->tmp1->size, (char *)ss->tmp1->data));
+ }
+
+ /* Row-store pages can contain overflow items. */
+ WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk));
+ break;
+ }
+ /* Success: the page array takes ownership of the WT_TRACK. */
+ ss->pages[ss->pages_next++] = trk;
+
+ /*
+ * The error label lives inside an if (0) block so the success path
+ * skips it and both paths share the page cleanup below.
+ *
+ * NOTE(review): only the WT_TRACK itself is freed on error; the shared
+ * structure and address copy allocated by __slvg_trk_init (and any keys
+ * or overflow addresses attached since) don't appear to be released
+ * here -- confirm whether this leaks.
+ */
+ if (0) {
+err: __wt_free(session, trk);
+ }
+ if (page != NULL)
+ __wt_page_out(session, &page);
+ return (ret);
+}
+
+/*
+ * __slvg_trk_ovfl --
+ *     Track an overflow page.
+ *
+ * Builds a WT_TRACK from the page's disk header and address cookie and appends
+ * it to the salvage overflow array, growing the array first if required.
+ */
+static int
+__slvg_trk_ovfl(WT_SESSION_IMPL *session,
+    const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss)
+{
+    WT_TRACK *ovfl_trk;
+
+    /* Make sure there's a free slot in the overflow array. */
+    WT_RET(__wt_realloc_def(
+        session, &ss->ovfl_allocated, ss->ovfl_next + 1, &ss->ovfl));
+
+    /* Record the page's location, size and write generation. */
+    WT_RET(__slvg_trk_init(session,
+        addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &ovfl_trk));
+
+    ss->ovfl[ss->ovfl_next] = ovfl_trk;
+    ++ss->ovfl_next;
+
+    return (0);
+}
+
+/*
+ * __slvg_trk_leaf_ovfl --
+ * Search a leaf page for overflow items.
+ *
+ * Walks the cells of the on-disk page and, if any are overflow cells, stores
+ * a copy of each overflow address in trk->trk_ovfl_addr and the count in
+ * trk->trk_ovfl_cnt. Returns 0 without allocating if the page has no
+ * overflow cells.
+ */
+static int
+__slvg_trk_leaf_ovfl(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_TRACK *trk)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t i, ovfl_cnt;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+
+ /*
+ * Two passes: count the overflow items, then copy them into an
+ * allocated array.
+ */
+ ovfl_cnt = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->ovfl)
+ ++ovfl_cnt;
+ }
+ if (ovfl_cnt == 0)
+ return (0);
+
+ /* Allocate room for the array of overflow addresses and fill it in. */
+ WT_RET(__wt_calloc_def(session, ovfl_cnt, &trk->trk_ovfl_addr));
+ trk->trk_ovfl_cnt = ovfl_cnt;
+
+ /* Reuse the counter as the index of the next array slot to fill. */
+ ovfl_cnt = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->ovfl) {
+ WT_RET(__wt_strndup(session, unpack->data,
+ unpack->size, &trk->trk_ovfl_addr[ovfl_cnt].addr));
+ trk->trk_ovfl_addr[ovfl_cnt].size =
+ (uint8_t)unpack->size;
+
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s overflow reference %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
+ __wt_addr_string(session,
+ unpack->data, unpack->size, trk->ss->tmp2)));
+
+ /* Stop early once every counted overflow cell is copied. */
+ if (++ovfl_cnt == trk->trk_ovfl_cnt)
+ break;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __slvg_col_range --
+ * Figure out the leaf pages we need and free the leaf pages we don't.
+ *
+ * When pages split, the key range is split across multiple pages. If not all
+ * of the old versions of the page are overwritten, or not all of the new pages
+ * are written, or some of the pages are corrupted, salvage will read different
+ * pages with overlapping key ranges, at different LSNs.
+ *
+ * We salvage all of the key ranges we find, at the latest LSN value: this means
+ * we may resurrect pages of deleted items, as page deletion doesn't write leaf
+ * pages and salvage will read and instantiate the contents of an old version of
+ * the deleted page.
+ *
+ * The leaf page array is sorted in key order, and secondarily on LSN: what this
+ * means is that for each new key range, the first page we find is the best page
+ * for that key. The process is to walk forward from each page until we reach
+ * a page with a starting key after the current page's stopping key.
+ *
+ * For each such page, check to see if it overlaps the current page's key range.
+ * If it does, resolve the overlap. Because WiredTiger rarely splits pages,
+ * overlap resolution usually means discarding a page because the key ranges
+ * are the same, and one of the pages is simply an old version of the other.
+ *
+ * However, it's possible more complex resolution is necessary. For example,
+ * here's an improbably complex list of page ranges and LSNs:
+ *
+ * Page Range LSN
+ * 30 A-G 3
+ * 31 C-D 4
+ * 32 B-C 5
+ * 33 C-F 6
+ * 34 C-D 7
+ * 35 F-M 8
+ * 36 H-O 9
+ *
+ * We walk forward from each page reviewing all other pages in the array that
+ * overlap the range. For each overlap, the current or the overlapping
+ * page is updated so the page with the most recent information for any range
+ * "owns" that range. Here's an example for page 30.
+ *
+ * Review page 31: because page 31 has the range C-D and a higher LSN than page
+ * 30, page 30 would "split" into two ranges, A-C and E-G, conceding the C-D
+ * range to page 31. The new track element would be inserted into array with
+ * the following result:
+ *
+ * Page Range LSN
+ * 30 A-C 3 << Changed WT_TRACK element
+ * 31 C-D 4
+ * 32 B-C 5
+ * 33 C-F 6
+ * 34 C-D 7
+ * 30 E-G 3 << New WT_TRACK element
+ * 35 F-M 8
+ * 36 H-O 9
+ *
+ * Continue the review of the first element, using its new values.
+ *
+ * Review page 32: because page 32 has the range B-C and a higher LSN than page
+ * 30, page 30's A-C range would be truncated, conceding the B-C range to page
+ * 32.
+ * 30 A-B 3
+ * E-G 3
+ * 31 C-D 4
+ * 32 B-C 5
+ * 33 C-F 6
+ * 34 C-D 7
+ *
+ * Review page 33: because page 33 has a starting key (C) past page 30's ending
+ * key (B), we stop evaluating page 30's A-B range, as there can be no further
+ * overlaps.
+ *
+ * This process is repeated for each page in the array.
+ *
+ * When page 33 is processed, we'd discover that page 33's C-F range overlaps
+ * page 30's E-G range, and page 30's E-G range would be updated, conceding the
+ * E-F range to page 33.
+ *
+ * This is not computationally expensive because we don't walk far forward in
+ * the leaf array because it's sorted by starting key, and because WiredTiger
+ * splits are rare, the chance of finding the kind of range overlap requiring
+ * re-sorting the array is small.
+ */
+static int
+__slvg_col_range(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *jtrk;
+ uint32_t i, j;
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ *
+ * Walk the page array looking for overlapping key ranges, adjusting
+ * the ranges based on the LSN until there are no overlaps.
+ *
+ * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
+ * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
+ * PLUS OFFSET.
+ */
+ for (i = 0; i < ss->pages_next; ++i) {
+ /* Skip slots emptied by earlier overlap resolution. */
+ if (ss->pages[i] == NULL)
+ continue;
+
+ /* Check for pages that overlap our page. */
+ for (j = i + 1; j < ss->pages_next; ++j) {
+ if (ss->pages[j] == NULL)
+ continue;
+ /*
+ * We're done if this page starts after our stop, no
+ * subsequent pages can overlap our page.
+ */
+ if (ss->pages[j]->col_start >
+ ss->pages[i]->col_stop)
+ break;
+
+ /*
+ * There's an overlap, fix it up. Remember which entry
+ * occupied slot j so a re-sort can be detected afterward.
+ */
+ jtrk = ss->pages[j];
+ WT_RET(__slvg_col_range_overlap(session, i, j, ss));
+
+ /*
+ * If the overlap resolution changed the entry's start
+ * key, the entry might have moved and the page array
+ * re-sorted, and pages[j] would reference a different
+ * page. We don't move forward if that happened, we
+ * re-process the slot again (by decrementing j before
+ * the loop's increment).
+ *
+ * j is always at least i + 1 here, so the decrement
+ * cannot wrap.
+ */
+ if (ss->pages[j] != NULL && jtrk != ss->pages[j])
+ --j;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __slvg_col_range_overlap --
+ * Two column-store key ranges overlap, deal with it.
+ *
+ * a_slot and b_slot are indexes into ss->pages, with a_slot earlier in the
+ * array. Depending on how the ranges overlap and which page has the higher
+ * generation, either b's entry is discarded, one entry's range is trimmed,
+ * or a's entry is split into two entries around b.
+ */
+static int
+__slvg_col_range_overlap(
+ WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss)
+{
+ WT_TRACK *a_trk, *b_trk, *new;
+ uint32_t i;
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ */
+ /*
+ * These point at the WT_TRACK structures, not at array slots, so they
+ * remain usable after the array is shifted or re-sorted below.
+ */
+ a_trk = ss->pages[a_slot];
+ b_trk = ss->pages[b_slot];
+
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s range overlap",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+
+ /*
+ * The key ranges of two WT_TRACK pages in the array overlap -- choose
+ * the ranges we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ *                 AAAAAAAAAAAAAAAAAA
+ * #1              BBBBBBBBBBBBBBBBBB          pages are the same
+ * #2      BBBBBBBBBBBBB                       overlaps the beginning
+ * #3                        BBBBBBBBBBBBBBBB  overlaps the end
+ * #4              BBBBB                       B is a prefix of A
+ * #5                    BBBBBB                B is middle of A
+ * #6                      BBBBBBBBBB          B is a suffix of A
+ *
+ * and:
+ *
+ *                 BBBBBBBBBBBBBBBBBB
+ * #7      AAAAAAAAAAAAA                       same as #3
+ * #8                        AAAAAAAAAAAAAAAA  same as #2
+ * #9              AAAAA                       A is a prefix of B
+ * #10                   AAAAAA                A is middle of B
+ * #11                     AAAAAAAAAA          A is a suffix of B
+ *
+ * Note the leaf page array was sorted by key and a_trk appears earlier
+ * in the array than b_trk, so cases #2/8, #10 and #11 are impossible.
+ *
+ * Finally, there's one additional complicating factor -- final ranges
+ * are assigned based on the page's LSN.
+ */
+ /* Case #2/8, #10, #11 */
+ if (a_trk->col_start > b_trk->col_start)
+ WT_PANIC_RET(
+ session, EINVAL, "unexpected merge array sort order");
+
+ if (a_trk->col_start == b_trk->col_start) { /* Case #1, #4 and #9 */
+ /*
+ * The secondary sort of the leaf page array was the page's LSN,
+ * in high-to-low order, which means a_trk has a higher LSN, and
+ * is more desirable, than b_trk. In cases #1 and #4 and #9,
+ * where the start of the range is the same for the two pages,
+ * this simplifies things, it guarantees a_trk has a higher LSN
+ * than b_trk.
+ */
+ if (a_trk->col_stop >= b_trk->col_stop)
+ /*
+ * Case #1, #4: a_trk is a superset of b_trk, and a_trk
+ * is more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #9: b_trk is a superset of a_trk, but a_trk is more
+ * desirable: keep both but delete a_trk's key range from
+ * b_trk.
+ */
+ b_trk->col_start = a_trk->col_stop + 1;
+ /* b's start key changed, so it may need to move in the array. */
+ __slvg_col_trk_update_start(b_slot, ss);
+ F_SET(b_trk, WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (a_trk->col_stop == b_trk->col_stop) { /* Case #6 */
+ if (a_trk->trk_gen > b_trk->trk_gen)
+ /*
+ * Case #6: a_trk is a superset of b_trk and a_trk is
+ * more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #6: a_trk is a superset of b_trk, but b_trk is more
+ * desirable: keep both but delete b_trk's key range from a_trk.
+ */
+ a_trk->col_stop = b_trk->col_start - 1;
+ F_SET(a_trk, WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (a_trk->col_stop < b_trk->col_stop) { /* Case #3/7 */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+ /*
+ * Case #3/7: a_trk is more desirable, delete a_trk's
+ * key range from b_trk;
+ */
+ b_trk->col_start = a_trk->col_stop + 1;
+ __slvg_col_trk_update_start(b_slot, ss);
+ F_SET(b_trk, WT_TRACK_MERGE);
+ } else {
+ /*
+ * Case #3/7: b_trk is more desirable, delete b_trk's
+ * key range from a_trk;
+ */
+ a_trk->col_stop = b_trk->col_start - 1;
+ F_SET(a_trk, WT_TRACK_MERGE);
+ }
+ goto merge;
+ }
+
+ /*
+ * Case #5: a_trk is a superset of b_trk and a_trk is more desirable --
+ * discard b_trk.
+ */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+delete_b: /*
+ * After page and overflow reconciliation, one (and only one)
+ * page can reference an overflow record. But, if we split a
+ * page into multiple chunks, any of the chunks might own any
+ * of the backing overflow records, so overflow records won't
+ * normally be discarded until after the merge phase completes.
+ * (The merge phase is where the final pages are written, and
+ * we figure out which overflow records are actually used.)
+ * If freeing a chunk and there are no other references to the
+ * underlying shared information, the overflow records must be
+ * useless, discard them to keep the final file size small.
+ */
+ if (b_trk->shared->ref == 1)
+ for (i = 0; i < b_trk->trk_ovfl_cnt; ++i)
+ WT_RET(__slvg_trk_free(session,
+ &ss->ovfl[b_trk->trk_ovfl_slot[i]], 1));
+ return (__slvg_trk_free(session, &ss->pages[b_slot], 1));
+ }
+
+ /*
+ * Case #5: b_trk is more desirable and is a middle chunk of a_trk.
+ * Split a_trk into two parts, the key range before b_trk and the
+ * key range after b_trk.
+ */
+ WT_RET(__slvg_trk_split(session, a_trk, &new));
+
+ /*
+ * Second, reallocate the array of pages if necessary, and then insert
+ * the new element into the array after the existing element (that's
+ * probably wrong, but we'll fix it up in a second).
+ *
+ * NOTE(review): if this reallocation fails, the chunk just created by
+ * __slvg_trk_split (and the shared reference it took) doesn't appear
+ * to be released -- confirm whether this leaks.
+ */
+ WT_RET(__wt_realloc_def(
+ session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
+ /*
+ * Shift every entry from a_slot onward up by one; slot a_slot + 1 then
+ * holds a duplicate of a_trk's pointer, which is overwritten with the
+ * new chunk.
+ */
+ memmove(ss->pages + a_slot + 1, ss->pages + a_slot,
+ (ss->pages_next - a_slot) * sizeof(*ss->pages));
+ ss->pages[a_slot + 1] = new;
+ ++ss->pages_next;
+
+ /*
+ * Third, set its start key to be the first key after the stop key of
+ * the middle chunk (that's b_trk), and its stop key to be the stop key
+ * of the original chunk, and call __slvg_col_trk_update_start. That
+ * function will re-sort the WT_TRACK array as necessary to move our
+ * new entry into the right sorted location.
+ */
+ new->col_start = b_trk->col_stop + 1;
+ new->col_stop = a_trk->col_stop;
+ __slvg_col_trk_update_start(a_slot + 1, ss);
+
+ /*
+ * Fourth, set the original WT_TRACK information to reference only
+ * the initial key space in the page, that is, everything up to the
+ * starting key of the middle chunk (that's b_trk).
+ */
+ a_trk->col_stop = b_trk->col_start - 1;
+
+ F_SET(new, WT_TRACK_MERGE);
+ F_SET(a_trk, WT_TRACK_MERGE);
+
+merge: WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s require merge",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+ return (0);
+}
+
+/*
+ * __slvg_col_trk_update_start --
+ *     Restore the page array's sort order after a column-store entry's start
+ * key has been moved forward by overlap resolution.
+ */
+static void
+__slvg_col_trk_update_start(uint32_t slot, WT_STUFF *ss)
+{
+    WT_TRACK *moved;
+    uint32_t cnt, end;
+
+    moved = ss->pages[slot];
+
+    /*
+     * Trimming the front of an entry's range can leave it out of place.
+     * For example, a page is split and only some of the resulting pages
+     * overwrite it: the old page initially sorts next to the new page that
+     * shares its start key, but once that overlap is resolved the old
+     * page's start key moves forward and it may belong after entries that
+     * used to follow it.
+     *
+     * The caller has already updated the entry's start key. Find the run of
+     * slots that could be affected: it extends until the first entry that
+     * starts after this entry's stop key (empty slots are stepped over).
+     */
+    end = slot + 1;
+    while (end < ss->pages_next &&
+        (ss->pages[end] == NULL ||
+        ss->pages[end]->col_start <= moved->col_stop))
+        ++end;
+
+    /* A run of a single entry is trivially sorted. */
+    cnt = end - slot;
+    if (cnt > 1)
+        qsort(ss->pages + slot, (size_t)cnt,
+            sizeof(WT_TRACK *), __slvg_trk_compare_key);
+}
+
+/*
+ * __slvg_col_range_missing --
+ *     Detect missing ranges from column-store files.
+ *
+ * Walks the page array in order; any entry that doesn't start immediately
+ * after the previous entry's stop key is tagged with the first missing record
+ * number and flagged for merge.
+ */
+static int
+__slvg_col_range_missing(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+    WT_TRACK *trk;
+    uint64_t expected, prev_stop;
+    uint32_t slot;
+
+    prev_stop = 0;
+    for (slot = 0; slot < ss->pages_next; ++slot) {
+        trk = ss->pages[slot];
+        if (trk == NULL)
+            continue;
+
+        /* The record number this page should start with. */
+        expected = prev_stop + 1;
+        if (trk->col_start != expected) {
+            WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+                "%s column-store missing range from %"
+                PRIu64 " to %" PRIu64 " inclusive",
+                __wt_addr_string(session,
+                trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+                expected, trk->col_start - 1));
+
+            /*
+             * Deleted items have to be instantiated for the
+             * missing record range when the page is rebuilt.
+             */
+            trk->col_missing = expected;
+            F_SET(trk, WT_TRACK_MERGE);
+        }
+        prev_stop = trk->col_stop;
+    }
+    return (0);
+}
+
+/*
+ * __slvg_modify_init --
+ *     Initialize a salvage page's modification information: set up the page's
+ * modify structure, then call __wt_page_modify_set on the page.
+ */
+static int
+__slvg_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+    int ret;
+
+    /* Nothing more to do if the modify structure can't be created. */
+    if ((ret = __wt_page_modify_init(session, page)) != 0)
+        return (ret);
+
+    __wt_page_modify_set(session, page);
+    return (0);
+}
+
+/*
+ * __slvg_col_build_internal --
+ *     Build a column-store in-memory page that references all of the leaf
+ * pages we've found.
+ *
+ * Allocates a column-store internal page with leaf_cnt slots, fills one
+ * WT_REF per remaining entry in ss->pages (address copy, starting record
+ * number, on-disk state), and installs the page as the salvage root. Entries
+ * flagged for merge have a replacement leaf page written; others only have
+ * their overflow records marked as referenced. On error the internal page is
+ * discarded.
+ */
+static int
+__slvg_col_build_internal(
+    WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
+{
+    WT_ADDR *addr;
+    WT_DECL_RET;
+    WT_PAGE *page;
+    WT_PAGE_INDEX *pindex;
+    WT_REF *ref, **refp;
+    WT_TRACK *trk;
+    uint32_t i;
+
+    addr = NULL;
+
+    /* Allocate a column-store root (internal) page and fill it in. */
+    WT_RET(
+        __wt_page_alloc(session, WT_PAGE_COL_INT, 1, leaf_cnt, 1, &page));
+    WT_ERR(__slvg_modify_init(session, page));
+
+    pindex = WT_INTL_INDEX_COPY(page);
+    for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) {
+        if ((trk = ss->pages[i]) == NULL)
+            continue;
+
+        /* Each surviving leaf page consumes the next index slot. */
+        ref = *refp++;
+        ref->home = page;
+        ref->page = NULL;
+
+        /*
+         * Build the address in a local so the error path can free it
+         * if the copy fails; once attached to the reference, clear
+         * the local so it isn't freed twice.
+         */
+        WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+        WT_ERR(__wt_strndup(
+            session, trk->trk_addr, trk->trk_addr_size, &addr->addr));
+        addr->size = trk->trk_addr_size;
+        addr->type =
+            trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF;
+        ref->addr = addr;
+        addr = NULL;
+
+        ref->key.recno = trk->col_start;
+        ref->state = WT_REF_DISK;
+
+        /*
+         * If the page's key range is unmodified from when we read it
+         * (in other words, we didn't merge part of this page with
+         * another page), we can use the page without change, and the
+         * only thing we need to do is mark all overflow records the
+         * page references as in-use.
+         *
+         * If we did merge with another page, we have to build a page
+         * reflecting the updated key range. Note, that requires an
+         * additional pass to free the merge page's backing blocks.
+         */
+        if (F_ISSET(trk, WT_TRACK_MERGE)) {
+            ss->merge_free = 1;
+
+            WT_ERR(__slvg_col_build_leaf(session, trk, ref));
+        } else
+            WT_ERR(__slvg_ovfl_ref_all(session, trk));
+
+        /*
+         * No need to advance ref here: it's reloaded from the index
+         * array at the top of the loop.
+         */
+    }
+
+    __wt_root_ref_init(&ss->root_ref, page, 1);
+
+    if (0) {
+err:
+        if (addr != NULL)
+            __wt_free(session, addr);
+        __wt_page_out(session, &page);
+    }
+    return (ret);
+}
+
+/*
+ * __slvg_col_build_leaf --
+ * Build a column-store leaf page for a merged page.
+ *
+ * Reads the original page through the reference, computes how many leading
+ * records to skip and how many to take based on the WT_TRACK's (possibly
+ * trimmed) range, writes a replacement page via reconciliation driven by a
+ * salvage cookie, then releases and evicts the in-memory page.
+ */
+static int
+__slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
+{
+ WT_COL *save_col_var;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SALVAGE_COOKIE *cookie, _cookie;
+ uint64_t skip, take;
+ uint32_t *entriesp, save_entries;
+
+ cookie = &_cookie;
+ WT_CLEAR(*cookie);
+
+ /* Get the original page, including the full in-memory setup. */
+ WT_RET(__wt_page_in(session, ref, 0));
+ page = ref->page;
+
+ /* The entry count lives in a different field for each page type. */
+ entriesp = page->type == WT_PAGE_COL_VAR ?
+ &page->pg_var_entries : &page->pg_fix_entries;
+
+ /*
+ * Save the page's entry array and count so they can be restored after
+ * the write below (presumably reconciliation with a salvage cookie can
+ * change them -- confirm).
+ */
+ save_col_var = page->pg_var_d;
+ save_entries = *entriesp;
+
+ /*
+ * Calculate the number of K/V entries we are going to skip, and
+ * the total number of K/V entries we'll take from this page.
+ */
+ cookie->skip = skip = trk->col_start - page->pg_var_recno;
+ cookie->take = take = (trk->col_stop - trk->col_start) + 1;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding first %" PRIu64 " records, "
+ "then taking %" PRIu64 " records",
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
+ skip, take));
+
+ /* Set the referenced flag on overflow pages we're using. */
+ if (page->type == WT_PAGE_COL_VAR && trk->trk_ovfl_cnt != 0)
+ WT_ERR(__slvg_col_ovfl(session, trk, page, skip, take));
+
+ /*
+ * If we're missing some part of the range, the real start range is in
+ * trk->col_missing, else, it's in trk->col_start. Update the parent's
+ * reference as well as the page itself.
+ */
+ if (trk->col_missing == 0)
+ page->pg_var_recno = trk->col_start;
+ else {
+ page->pg_var_recno = trk->col_missing;
+ /* Number of records to insert ahead of the page's first record. */
+ cookie->missing = trk->col_start - trk->col_missing;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge inserting %" PRIu64 " missing records",
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
+ cookie->missing));
+ }
+ ref->key.recno = page->pg_var_recno;
+
+ /*
+ * We can't discard the original blocks associated with this page now.
+ * (The problem is we don't want to overwrite any original information
+ * until the salvage run succeeds -- if we free the blocks now, the next
+ * merge page we write might allocate those blocks and overwrite them,
+ * and should the salvage run eventually fail, the original information
+ * would have been lost.) Clear the reference addr so eviction doesn't
+ * free the underlying blocks.
+ */
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ ref->addr = NULL;
+
+ /* Write the new version of the leaf page to disk. */
+ WT_ERR(__slvg_modify_init(session, page));
+ WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+
+ /* Reset the page. */
+ page->pg_var_d = save_col_var;
+ *entriesp = save_entries;
+
+ /* Evict only if the release succeeded; either error is returned. */
+ ret = __wt_page_release(session, ref, 0);
+ if (ret == 0)
+ ret = __wt_rec_evict(session, ref, 1);
+
+ /*
+ * Error path: release the page, keeping the original error.
+ *
+ * NOTE(review): the saved entry array and count are not restored on
+ * this path -- confirm that is acceptable.
+ */
+ if (0) {
+err: WT_TRET(__wt_page_release(session, ref, 0));
+ }
+
+ return (ret);
+}
+
+/*
+ * __slvg_col_ovfl_single --
+ *     Locate the overflow record an unpacked cell refers to in the merge
+ * page's overflow list and mark it as referenced.
+ *
+ * Returns whatever __slvg_ovfl_ref returns for the matching record; panics
+ * with EINVAL if no record in the list has the cell's address.
+ */
+static int
+__slvg_col_ovfl_single(
+    WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack)
+{
+    WT_TRACK *cand;
+    uint32_t n;
+
+    /*
+     * Compare the cell's address cookie against each overflow record this
+     * page references; exactly one is expected to match.
+     */
+    for (n = 0; n < trk->trk_ovfl_cnt; ++n) {
+        cand = trk->ss->ovfl[trk->trk_ovfl_slot[n]];
+
+        /* Different lengths can't match, skip the byte comparison. */
+        if (unpack->size != cand->trk_addr_size)
+            continue;
+        if (memcmp(unpack->data, cand->trk_addr, unpack->size) != 0)
+            continue;
+
+        return (__slvg_ovfl_ref(session, cand, 0));
+    }
+
+    WT_PANIC_RET(session,
+        EINVAL, "overflow record at column-store page merge not found");
+}
+
+/*
+ * __slvg_col_ovfl --
+ * Mark overflow items referenced by the merged page.
+ *
+ * skip is the number of leading records being discarded from the page and
+ * take the number of records being kept; overflow values falling in the
+ * kept range are marked as referenced, or converted to deleted cells if the
+ * overflow record can't be referenced.
+ */
+static int
+__slvg_col_ovfl(WT_SESSION_IMPL *session,
+ WT_TRACK *trk, WT_PAGE *page, uint64_t skip, uint64_t take)
+{
+ WT_CELL_UNPACK unpack;
+ WT_CELL *cell;
+ WT_COL *cip;
+ WT_DECL_RET;
+ uint64_t recno, start, stop;
+ uint32_t i;
+
+ /*
+ * Merging a variable-length column-store page, and we took some number
+ * of records, figure out which (if any) overflow records we used.
+ */
+ recno = page->pg_var_recno;
+ start = recno + skip;
+ stop = (recno + skip + take) - 1;
+
+ WT_COL_FOREACH(page, cip, i) {
+ cell = WT_COL_PTR(page, cip);
+ __wt_cell_unpack(cell, &unpack);
+ /* Advance past every record this (possibly run-length) cell covers. */
+ recno += __wt_cell_rle(&unpack);
+
+ /*
+ * I keep getting this calculation wrong, so here's the logic.
+ * Start is the first record we want, stop is the last record
+ * we want. The record number has already been incremented one
+ * past the maximum record number for this page entry, that is,
+ * it's set to the first record number for the next page entry.
+ * The test of start should be greater-than (not greater-than-
+ * or-equal), because of that increment, if the record number
+ * equals start, we want the next record, not this one. The
+ * test against stop is greater-than, not greater-than-or-equal
+ * because stop is the last record wanted, if the record number
+ * equals stop, we want the next record.
+ */
+ if (recno > start && unpack.type == WT_CELL_VALUE_OVFL) {
+ ret = __slvg_col_ovfl_single(session, trk, &unpack);
+
+ /*
+ * When handling overlapping ranges on variable-length
+ * column-store leaf pages, we split ranges without
+ * considering if we were splitting RLE units. (See
+ * note at the beginning of this file for explanation
+ * of the overall process.) If the RLE unit was on-page,
+ * we can simply write it again. If the RLE unit was an
+ * overflow value that's already been used by another
+ * row (from some other page created by a range split),
+ * there's not much to do, this row can't reference an
+ * overflow record we don't have: delete the row.
+ */
+ if (ret == EBUSY) {
+ __wt_cell_type_reset(session,
+ cell, WT_CELL_VALUE_OVFL, WT_CELL_DEL);
+ ret = 0;
+ }
+ /* Any error other than EBUSY is returned to the caller. */
+ WT_RET(ret);
+ }
+ /* Past the last record being kept, nothing further can be used. */
+ if (recno > stop)
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __slvg_row_range --
+ * Figure out the leaf pages we need and discard everything else. At the
+ * same time, tag the overflow pages they reference.
+ *
+ * Walks the sorted page array and, for each entry, resolves overlaps with
+ * the following entries whose start key is not past this entry's stop key,
+ * using the btree's collator for key comparison.
+ *
+ * NOTE(review): no overflow tagging is visible in this function itself;
+ * presumably it happens in the overlap resolution or a later pass -- confirm.
+ */
+static int
+__slvg_row_range(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *jtrk;
+ WT_BTREE *btree;
+ uint32_t i, j;
+ int cmp;
+
+ btree = S2BT(session);
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ *
+ * Walk the page array looking for overlapping key ranges, adjusting
+ * the ranges based on the LSN until there are no overlaps.
+ *
+ * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
+ * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
+ * PLUS OFFSET.
+ */
+ for (i = 0; i < ss->pages_next; ++i) {
+ /* Skip slots emptied by earlier overlap resolution. */
+ if (ss->pages[i] == NULL)
+ continue;
+
+ /* Check for pages that overlap our page. */
+ for (j = i + 1; j < ss->pages_next; ++j) {
+ if (ss->pages[j] == NULL)
+ continue;
+ /*
+ * We're done if this page starts after our stop, no
+ * subsequent pages can overlap our page.
+ */
+ WT_RET(__wt_compare(session, btree->collator,
+ &ss->pages[j]->row_start, &ss->pages[i]->row_stop,
+ &cmp));
+ if (cmp > 0)
+ break;
+
+ /*
+ * There's an overlap, fix it up. Remember which entry
+ * occupied slot j so a re-sort can be detected afterward.
+ */
+ jtrk = ss->pages[j];
+ WT_RET(__slvg_row_range_overlap(session, i, j, ss));
+
+ /*
+ * If the overlap resolution changed the entry's start
+ * key, the entry might have moved and the page array
+ * re-sorted, and pages[j] would reference a different
+ * page. We don't move forward if that happened, we
+ * re-process the slot again (by decrementing j before
+ * the loop's increment).
+ *
+ * j is always at least i + 1 here, so the decrement
+ * cannot wrap.
+ */
+ if (ss->pages[j] != NULL && jtrk != ss->pages[j])
+ --j;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __slvg_row_range_overlap --
+ * Two row-store key ranges overlap, deal with it.
+ */
+static int
+__slvg_row_range_overlap(
+ WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_TRACK *a_trk, *b_trk, *new;
+ uint32_t i;
+ int start_cmp, stop_cmp;
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ */
+ btree = S2BT(session);
+
+ a_trk = ss->pages[a_slot];
+ b_trk = ss->pages[b_slot];
+
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s range overlap",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+
+ /*
+ * The key ranges of two WT_TRACK pages in the array overlap -- choose
+ * the ranges we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB pages are the same
+ * #2 BBBBBBBBBBBBB overlaps the beginning
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #8 AAAAAAAAAAAAAAAA same as #2
+ * #9 AAAAA A is a prefix of B
+ * #10 AAAAAA A is middle of B
+ * #11 AAAAAAAAAA A is a suffix of B
+ *
+ * Note the leaf page array was sorted by key and a_trk appears earlier
+ * in the array than b_trk, so cases #2/8, #10 and #11 are impossible.
+ *
+ * Finally, there's one additional complicating factor -- final ranges
+ * are assigned based on the page's LSN.
+ */
+#define A_TRK_START (&a_trk->row_start)
+#define A_TRK_STOP (&a_trk->row_stop)
+#define B_TRK_START (&b_trk->row_start)
+#define B_TRK_STOP (&b_trk->row_stop)
+#define SLOT_START(i) (&ss->pages[i]->row_start)
+#define __slvg_key_copy(session, dst, src) \
+ __wt_buf_set(session, dst, (src)->data, (src)->size)
+
+ WT_RET(__wt_compare(
+ session, btree->collator, A_TRK_START, B_TRK_START, &start_cmp));
+ WT_RET(__wt_compare(
+ session, btree->collator, A_TRK_STOP, B_TRK_STOP, &stop_cmp));
+
+ if (start_cmp > 0) /* Case #2/8, #10, #11 */
+ WT_PANIC_RET(
+ session, EINVAL, "unexpected merge array sort order");
+
+ if (start_cmp == 0) { /* Case #1, #4, #9 */
+ /*
+ * The secondary sort of the leaf page array was the page's LSN,
+ * in high-to-low order, which means a_trk has a higher LSN, and
+ * is more desirable, than b_trk. In cases #1 and #4 and #9,
+ * where the start of the range is the same for the two pages,
+ * this simplifies things, it guarantees a_trk has a higher LSN
+ * than b_trk.
+ */
+ if (stop_cmp >= 0)
+ /*
+ * Case #1, #4: a_trk is a superset of b_trk, and a_trk
+ * is more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #9: b_trk is a superset of a_trk, but a_trk is more
+ * desirable: keep both but delete a_trk's key range from
+ * b_trk.
+ */
+ WT_RET(__slvg_row_trk_update_start(
+ session, A_TRK_STOP, b_slot, ss));
+ F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (stop_cmp == 0) { /* Case #6 */
+ if (a_trk->trk_gen > b_trk->trk_gen)
+ /*
+ * Case #6: a_trk is a superset of b_trk and a_trk is
+ * more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #6: a_trk is a superset of b_trk, but b_trk is more
+ * desirable: keep both but delete b_trk's key range from a_trk.
+ */
+ WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START));
+ F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (stop_cmp < 0) { /* Case #3/7 */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+ /*
+ * Case #3/7: a_trk is more desirable, delete a_trk's
+ * key range from b_trk;
+ */
+ WT_RET(__slvg_row_trk_update_start(
+ session, A_TRK_STOP, b_slot, ss));
+ F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE);
+ } else {
+ /*
+ * Case #3/7: b_trk is more desirable, delete b_trk's
+ * key range from a_trk;
+ */
+ WT_RET(__slvg_key_copy(
+ session, A_TRK_STOP, B_TRK_START));
+ F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE);
+ }
+ goto merge;
+ }
+
+ /*
+ * Case #5: a_trk is a superset of b_trk and a_trk is more desirable --
+ * discard b_trk.
+ */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+delete_b: /*
+ * After page and overflow reconciliation, one (and only one)
+ * page can reference an overflow record. But, if we split a
+ * page into multiple chunks, any of the chunks might own any
+ * of the backing overflow records, so overflow records won't
+ * normally be discarded until after the merge phase completes.
+ * (The merge phase is where the final pages are written, and
+ * we figure out which overflow records are actually used.)
+ * If freeing a chunk and there are no other references to the
+ * underlying shared information, the overflow records must be
+ * useless, discard them to keep the final file size small.
+ */
+ if (b_trk->shared->ref == 1)
+ for (i = 0; i < b_trk->trk_ovfl_cnt; ++i)
+ WT_RET(__slvg_trk_free(session,
+ &ss->ovfl[b_trk->trk_ovfl_slot[i]], 1));
+ return (__slvg_trk_free(session, &ss->pages[b_slot], 1));
+ }
+
+ /*
+ * Case #5: b_trk is more desirable and is a middle chunk of a_trk.
+ * Split a_trk into two parts, the key range before b_trk and the
+ * key range after b_trk.
+ */
+ WT_RET(__slvg_trk_split(session, a_trk, &new));
+
+ /*
+ * Second, reallocate the array of pages if necessary, and then insert
+ * the new element into the array after the existing element (that's
+ * probably wrong, but we'll fix it up in a second).
+ */
+ WT_RET(__wt_realloc_def(
+ session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
+ memmove(ss->pages + a_slot + 1, ss->pages + a_slot,
+ (ss->pages_next - a_slot) * sizeof(*ss->pages));
+ ss->pages[a_slot + 1] = new;
+ ++ss->pages_next;
+
+ /*
+ * Third, set its stop key to be the stop key of the original chunk,
+ * and call __slvg_row_trk_update_start. That function will both set
+ * the start key to be the first key after the stop key of the middle
+ * chunk (that's b_trk), and re-sort the WT_TRACK array as necessary to
+ * move our new entry into the right sorted location.
+ */
+ WT_RET(__slvg_key_copy(session, &new->row_stop, A_TRK_STOP));
+ WT_RET(
+ __slvg_row_trk_update_start(session, B_TRK_STOP, a_slot + 1, ss));
+
+ /*
+ * Fourth, set the original WT_TRACK information to reference only
+ * the initial key space in the page, that is, everything up to the
+ * starting key of the middle chunk (that's b_trk).
+ */
+ WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START));
+ F_SET(new, WT_TRACK_CHECK_START);
+ F_SET(a_trk, WT_TRACK_CHECK_STOP);
+
+ F_SET(new, WT_TRACK_MERGE);
+ F_SET(a_trk, WT_TRACK_MERGE);
+
+merge: WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s require merge",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+ return (0);
+}
+
+/*
+ * __slvg_row_trk_update_start --
+ * Update a row-store page's start key after an overlap.
+ *
+ * The entry updated is ss->pages[slot]. Its page is read back in and walked
+ * for the first key sorting after "stop"; that key becomes the entry's new
+ * row_start. Because the entry may then belong later in the array, the run
+ * of entries starting at "slot" whose start keys do not sort after this
+ * entry's stop key is re-sorted with __slvg_trk_compare_key.
+ *
+ * Returns zero on success. Fails if the page cannot be read or instantiated,
+ * if a key comparison fails, or (WT_ERROR) if no key on the page sorts after
+ * "stop".
+ */
+static int
+__slvg_row_trk_update_start(
+ WT_SESSION_IMPL *session, WT_ITEM *stop, uint32_t slot, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(dsk);
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_TRACK *trk;
+ uint32_t i;
+ int cmp, found;
+
+ btree = S2BT(session);
+ page = NULL;
+ found = 0;
+
+ trk = ss->pages[slot];
+
+ /*
+ * If we deleted an initial piece of the WT_TRACK name space, it may no
+ * longer be in the right location.
+ *
+ * For example, imagine page #1 has the key range 30-50, it split, and
+ * we wrote page #2 with key range 30-40, and page #3 key range with
+ * 40-50, where pages #2 and #3 have larger LSNs than page #1. When the
+ * key ranges were sorted, page #2 came first, then page #1 (because of
+ * their earlier start keys than page #3), and page #2 came before page
+ * #1 because of its LSN. When we resolve the overlap between page #2
+ * and page #1, we truncate the initial key range of page #1, and it now
+ * sorts after page #3, because it has the same starting key of 40, and
+ * a lower LSN.
+ *
+ * First, update the WT_TRACK start key based on the specified stop key.
+ *
+ * Read and instantiate the WT_TRACK page (we don't have to verify the
+ * page, nor do we have to be quiet on error, we've already read this
+ * page successfully).
+ */
+ WT_RET(__wt_scr_alloc(session, trk->trk_size, &dsk));
+ WT_ERR(__wt_bt_read(session, dsk, trk->trk_addr, trk->trk_addr_size));
+ WT_ERR(__wt_page_inmem(session, NULL, dsk->mem, 0, &page));
+
+ /*
+ * Walk the page, looking for a key sorting greater than the specified
+ * stop key -- that's our new start key.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &key));
+ WT_ROW_FOREACH(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+ WT_ERR(__wt_compare(session, btree->collator, key, stop, &cmp));
+ if (cmp > 0) {
+ found = 1;
+ break;
+ }
+ }
+
+ /*
+ * We know that at least one key on the page sorts after the specified
+ * stop key, otherwise the page would have entirely overlapped and we
+ * would have discarded it, we wouldn't be here. Therefore, this test
+ * is safe. (But, it never hurts to check.)
+ */
+ WT_ERR_TEST(!found, WT_ERROR);
+ WT_ERR(__slvg_key_copy(session, &trk->row_start, key));
+
+ /*
+ * We may need to re-sort some number of elements in the list. Walk
+ * forward in the list until reaching an entry which cannot overlap
+ * the adjusted entry. If it's more than a single slot, re-sort the
+ * entries.
+ *
+ * Empty (NULL) slots are skipped by the walk but still fall inside the
+ * range handed to qsort.
+ */
+ for (i = slot + 1; i < ss->pages_next; ++i) {
+ if (ss->pages[i] == NULL)
+ continue;
+ WT_ERR(__wt_compare(session,
+ btree->collator, SLOT_START(i), &trk->row_stop, &cmp));
+ if (cmp > 0)
+ break;
+ }
+ i -= slot; /* Entries in the run, counting slot itself. */
+ if (i > 1)
+ qsort(ss->pages + slot, (size_t)i,
+ sizeof(WT_TRACK *), __slvg_trk_compare_key);
+
+err: if (page != NULL) /* Reached on success as well as on error. */
+ __wt_page_out(session, &page);
+ __wt_scr_free(&dsk);
+ __wt_scr_free(&key);
+
+ return (ret);
+}
+
+/*
+ * __slvg_row_build_internal --
+ *	Build a row-store in-memory page that references all of the leaf
+ * pages we've found.
+ *
+ *	A WT_PAGE_ROW_INT page is allocated for leaf_cnt entries and one
+ * reference is filled in for every non-NULL entry of ss->pages, in array
+ * order. On success the page is installed in ss->root_ref; on failure the
+ * page is discarded and the error is returned.
+ */
+static int
+__slvg_row_build_internal(
+    WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
+{
+	WT_ADDR *addr;
+	WT_DECL_RET;
+	WT_PAGE *page;
+	WT_PAGE_INDEX *pindex;
+	WT_REF *ref, **refp;
+	WT_TRACK *trk;
+	uint32_t i;
+
+	addr = NULL;
+
+	/* Allocate a row-store root (internal) page and fill it in. */
+	WT_RET(
+	    __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, leaf_cnt, 1, &page));
+	WT_ERR(__slvg_modify_init(session, page));
+
+	pindex = WT_INTL_INDEX_COPY(page);
+	for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) {
+		if ((trk = ss->pages[i]) == NULL)
+			continue;
+
+		/*
+		 * Each reference is fetched through the page index; refp is
+		 * the only cursor that advances.
+		 */
+		ref = *refp++;
+		ref->home = page;
+		ref->page = NULL;
+
+		/*
+		 * The address is owned by the local variable until it's handed
+		 * to the reference, so the error path can free it.
+		 */
+		WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+		WT_ERR(__wt_strndup(
+		    session, trk->trk_addr, trk->trk_addr_size, &addr->addr));
+		addr->size = trk->trk_addr_size;
+		addr->type =
+		    trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF;
+		ref->addr = addr;
+		addr = NULL;
+
+		__wt_ref_key_clear(ref);
+		ref->state = WT_REF_DISK;
+
+		/*
+		 * If the page's key range is unmodified from when we read it
+		 * (in other words, we didn't merge part of this page with
+		 * another page), we can use the page without change, and the
+		 * only thing we need to do is mark all overflow records the
+		 * page references as in-use.
+		 *
+		 * If we did merge with another page, we have to build a page
+		 * reflecting the updated key range. Note, that requires an
+		 * additional pass to free the merge page's backing blocks.
+		 */
+		if (F_ISSET(trk, WT_TRACK_MERGE)) {
+			ss->merge_free = 1;
+
+			WT_ERR(__slvg_row_build_leaf(session, trk, ref, ss));
+		} else {
+			WT_ERR(__wt_row_ikey_incr(session, page, 0,
+			    trk->row_start.data, trk->row_start.size,
+			    &ref->key.ikey));
+
+			WT_ERR(__slvg_ovfl_ref_all(session, trk));
+		}
+	}
+
+	__wt_root_ref_init(&ss->root_ref, page, 0);
+
+	if (0) {
+err:		if (addr != NULL)
+			__wt_free(session, addr);
+		__wt_page_out(session, &page);
+	}
+	return (ret);
+}
+
+/*
+ * __slvg_row_build_leaf --
+ * Build a row-store leaf page for a merged page.
+ *
+ * The page referenced by "ref" is brought into memory and the leading and/or
+ * trailing keys falling outside trk's [row_start, row_stop) range are counted
+ * (leading keys only if WT_TRACK_CHECK_START is set, trailing keys only if
+ * WT_TRACK_CHECK_STOP is set). The first retained key becomes the reference's
+ * key, the overflow records used by the retained entries are marked, and the
+ * trimmed page is written with __wt_rec_write and evicted with
+ * __wt_rec_evict.
+ */
+static int
+__slvg_row_build_leaf(
+ WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SALVAGE_COOKIE *cookie, _cookie;
+ uint32_t i, skip_start, skip_stop;
+ int cmp;
+
+ btree = S2BT(session);
+ page = NULL;
+
+ cookie = &_cookie;
+ WT_CLEAR(*cookie);
+
+ /* Allocate temporary space in which to instantiate the keys. */
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Get the original page, including the full in-memory setup. */
+ WT_ERR(__wt_page_in(session, ref, 0));
+ page = ref->page;
+
+ /*
+ * Figure out how many page keys we want to take and how many we want
+ * to skip.
+ *
+ * If checking the starting range key, the key we're searching for will
+ * be equal to the starting range key. This is because we figured out
+ * the true merged-page start key as part of discarding initial keys
+ * from the page (see the __slvg_row_range_overlap function, and its
+ * calls to __slvg_row_trk_update_start for more information).
+ *
+ * If checking the stopping range key, we want the keys on the page that
+ * are less-than the stopping range key. This is because we copied a
+ * key from another page to define this page's stop range: that page is
+ * the page that owns the "equal to" range space.
+ */
+ skip_start = skip_stop = 0;
+ if (F_ISSET(trk, WT_TRACK_CHECK_START))
+ WT_ROW_FOREACH(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+
+ /*
+ * >= is correct: see the comment above.
+ */
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &trk->row_start, &cmp));
+ if (cmp >= 0)
+ break;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
+ WT_ERR(__wt_buf_set_printable(session,
+ ss->tmp1, key->data, key->size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding leading key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size,
+ ss->tmp2), (int)ss->tmp1->size,
+ (char *)ss->tmp1->data));
+ }
+ ++skip_start;
+ }
+ if (F_ISSET(trk, WT_TRACK_CHECK_STOP))
+ WT_ROW_FOREACH_REVERSE(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+
+ /*
+ * < is correct: see the comment above.
+ */
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &trk->row_stop, &cmp));
+ if (cmp < 0)
+ break;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
+ WT_ERR(__wt_buf_set_printable(session,
+ ss->tmp1, key->data, key->size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding trailing key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size,
+ ss->tmp2), (int)ss->tmp1->size,
+ (char *)ss->tmp1->data));
+ }
+ ++skip_stop;
+ }
+
+ /* We should have selected some entries, but not the entire page. */
+ WT_ASSERT(session,
+ skip_start + skip_stop > 0 &&
+ skip_start + skip_stop < page->pg_row_entries);
+
+ /*
+ * Take a copy of this page's first key to define the start of
+ * its range. The key may require processing, otherwise, it's
+ * a copy from the page.
+ */
+ rip = page->pg_row_d + skip_start;
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+ WT_ERR(__wt_row_ikey_incr(session,
+ ref->home, 0, key->data, key->size, &ref->key.ikey));
+
+ /*
+ * Set the referenced flag on overflow pages we're using: only the
+ * entries in [skip_start, entries - skip_stop) are examined.
+ */
+ if (trk->trk_ovfl_cnt != 0)
+ WT_ERR(__slvg_row_ovfl(session,
+ trk, page, skip_start, page->pg_row_entries - skip_stop));
+
+ /*
+ * Change the page to reflect the correct record count: there is no
+ * need to copy anything on the page itself, the entries value limits
+ * the number of page items.
+ *
+ * The trailing entries are dropped by shrinking the entry count; the
+ * leading entries are communicated through the salvage cookie handed
+ * to __wt_rec_write below.
+ */
+ page->pg_row_entries -= skip_stop;
+ cookie->skip = skip_start;
+
+ /*
+ * We can't discard the original blocks associated with this page now.
+ * (The problem is we don't want to overwrite any original information
+ * until the salvage run succeeds -- if we free the blocks now, the next
+ * merge page we write might allocate those blocks and overwrite them,
+ * and should the salvage run eventually fail, the original information
+ * would have been lost.) Clear the reference addr so eviction doesn't
+ * free the underlying blocks.
+ */
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ ref->addr = NULL;
+
+ /* Write the new version of the leaf page to disk. */
+ WT_ERR(__slvg_modify_init(session, page));
+ WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+
+ /*
+ * Reset the page.
+ *
+ * NOTE(review): if either call above fails, control reaches the err
+ * label with pg_row_entries still reduced by skip_stop -- confirm that
+ * is harmless for the release path.
+ */
+ page->pg_row_entries += skip_stop;
+
+ /*
+ * Discard our hazard pointer and evict the page, updating the
+ * parent's reference.
+ */
+ ret = __wt_page_release(session, ref, 0);
+ if (ret == 0)
+ ret = __wt_rec_evict(session, ref, 1);
+
+ if (0) {
+err: WT_TRET(__wt_page_release(session, ref, 0));
+ }
+ __wt_scr_free(&key);
+
+ return (ret);
+}
+
+/*
+ * __slvg_row_ovfl_single --
+ *	Mark the overflow record a cell points at as referenced.
+ *
+ *	Cells that are not overflow keys or overflow values are ignored and
+ * zero is returned. Otherwise the cell's address cookie is looked up among
+ * the overflow records tracked for the merge page; a match is handed to
+ * __slvg_ovfl_ref, and failing to find one is a panic.
+ */
+static int
+__slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL *cell)
+{
+	WT_CELL_UNPACK cell_info;
+	WT_TRACK *candidate;
+	uint32_t n;
+
+	/* Nothing to do unless the cell is an overflow key or value. */
+	__wt_cell_unpack(cell, &cell_info);
+	if (!(cell_info.type == WT_CELL_KEY_OVFL ||
+	    cell_info.type == WT_CELL_VALUE_OVFL))
+		return (0);
+
+	/*
+	 * Compare the cell's address against each overflow record the page
+	 * owns: same length first, then same bytes. The first match wins.
+	 */
+	for (n = 0; n < trk->trk_ovfl_cnt; ++n) {
+		candidate = trk->ss->ovfl[trk->trk_ovfl_slot[n]];
+		if (cell_info.size != candidate->trk_addr_size)
+			continue;
+		if (memcmp(cell_info.data,
+		    candidate->trk_addr, cell_info.size) != 0)
+			continue;
+		return (__slvg_ovfl_ref(session, candidate, 1));
+	}
+
+	WT_PANIC_RET(session,
+	    EINVAL, "overflow record at row-store page merge not found");
+}
+
+/*
+ * __slvg_row_ovfl --
+ *	Mark overflow items referenced by the merged page.
+ *
+ *	Only the page entries in the half-open range [start, stop) are
+ * examined; for each one, both its key cell and its value cell (when
+ * present) are passed to __slvg_row_ovfl_single.
+ */
+static int
+__slvg_row_ovfl(WT_SESSION_IMPL *session,
+    WT_TRACK *trk, WT_PAGE *page, uint32_t start, uint32_t stop)
+{
+	WT_CELL *key_cell, *value_cell;
+	WT_ROW *row;
+	uint32_t slot;
+	void *key_ref;
+
+	/*
+	 * We took some number of records from the page being merged: walk
+	 * them, and flag whichever overflow records they use.
+	 */
+	row = page->pg_row_d + start;
+	for (slot = start; slot < stop; ++slot, ++row) {
+		/* The key first... */
+		key_ref = WT_ROW_KEY_COPY(row);
+		(void)__wt_row_leaf_key_info(
+		    page, key_ref, NULL, &key_cell, NULL, NULL);
+		if (key_cell != NULL)
+			WT_RET(__slvg_row_ovfl_single(session, trk, key_cell));
+
+		/* ...then the value. */
+		value_cell = __wt_row_leaf_value_cell(page, row, NULL);
+		if (value_cell != NULL)
+			WT_RET(
+			    __slvg_row_ovfl_single(session, trk, value_cell));
+	}
+	return (0);
+}
+
+/*
+ * __slvg_trk_compare_addr --
+ *	Compare two WT_TRACK array entries by address cookie.
+ *
+ *	This is a qsort comparison routine: a and b each point at a
+ * WT_TRACK pointer. Cookies are ordered by their common leading bytes and
+ * then by length, shorter first; identical cookies compare equal. The
+ * ordering must agree with __slvg_ovfl_compare, which is used to binary
+ * search an array sorted by this routine.
+ */
+static int
+__slvg_trk_compare_addr(const void *a, const void *b)
+{
+	WT_DECL_RET;
+	WT_TRACK *a_trk, *b_trk;
+	size_t len;
+
+	a_trk = *(WT_TRACK **)a;
+	b_trk = *(WT_TRACK **)b;
+
+	/*
+	 * We don't care about the order because these are opaque cookies --
+	 * we're just sorting them so we can binary search instead of linear
+	 * search. What does matter is that the order is a consistent one:
+	 * equal cookies must return zero, and a cookie that is a prefix of
+	 * another must sort on the same side here as it does in the search
+	 * routine.
+	 */
+	len = WT_MIN(a_trk->trk_addr_size, b_trk->trk_addr_size);
+	ret = memcmp(a_trk->trk_addr, b_trk->trk_addr, len);
+	if (ret == 0 && a_trk->trk_addr_size != b_trk->trk_addr_size)
+		ret = a_trk->trk_addr_size < b_trk->trk_addr_size ? -1 : 1;
+	return (ret);
+}
+
+/*
+ * __slvg_ovfl_compare --
+ *	Bsearch comparison routine for the overflow array.
+ *
+ *	The search key (a) is a WT_ADDR, the array element (b) points at a
+ * WT_TRACK pointer. Addresses are ordered by their common leading bytes and
+ * then by length, shorter first.
+ */
+static int
+__slvg_ovfl_compare(const void *a, const void *b)
+{
+	WT_ADDR *key;
+	WT_TRACK *elem;
+	size_t shared;
+	int cmp;
+
+	key = (WT_ADDR *)a;
+	elem = *(WT_TRACK **)b;
+
+	/* A difference in the bytes both addresses have decides it. */
+	shared = WT_MIN(elem->trk_addr_size, key->size);
+	if ((cmp = memcmp(key->addr, elem->trk_addr, shared)) != 0)
+		return (cmp);
+
+	/* Otherwise the lengths decide it. */
+	if (key->size == elem->trk_addr_size)
+		return (0);
+	return (key->size < elem->trk_addr_size ? -1 : 1);
+}
+
+/*
+ * __slvg_ovfl_reconcile --
+ * Review relationships between leaf pages and the overflow pages, delete
+ * leaf pages until there's a one-to-one relationship between leaf and overflow
+ * pages.
+ *
+ * Both ss->pages and ss->ovfl are re-sorted by this function. For each leaf
+ * page that survives, its array of overflow addresses is replaced by an array
+ * of slots into ss->ovfl (trk_ovfl_slot).
+ */
+static int
+__slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_TRACK **searchp, *trk;
+ uint32_t i, j, *slot;
+
+ slot = NULL;
+
+ /*
+ * If an overflow page is referenced more than once, discard leaf pages
+ * with the lowest LSNs until overflow pages are only referenced once.
+ *
+ * This requires sorting the page list by LSN, and the overflow array
+ * by address cookie.
+ */
+ qsort(ss->pages,
+ (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_gen);
+ qsort(ss->ovfl,
+ (size_t)ss->ovfl_next, sizeof(WT_TRACK *), __slvg_trk_compare_addr);
+
+ /*
+ * Walk the list of pages and discard any pages referencing non-existent
+ * overflow pages or referencing overflow pages also referenced by pages
+ * with higher LSNs. The page list was sorted by LSN above, high to
+ * low, so we don't have to do explicit testing of the page LSNs, the
+ * first page to reference an overflow page is the best page to own it.
+ */
+ for (i = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL || trk->trk_ovfl_cnt == 0)
+ continue;
+
+ WT_ERR(__wt_calloc_def(session, trk->trk_ovfl_cnt, &slot));
+ for (j = 0; j < trk->trk_ovfl_cnt; ++j) {
+ addr = &trk->trk_ovfl_addr[j];
+ searchp = bsearch(addr, ss->ovfl, ss->ovfl_next,
+ sizeof(WT_TRACK *), __slvg_ovfl_compare);
+
+ /*
+ * If the overflow page doesn't exist or if another page
+ * has already claimed it, this leaf page isn't usable.
+ */
+ if (searchp != NULL &&
+ !F_ISSET(*searchp, WT_TRACK_OVFL_REFD)) {
+ /*
+ * Convert each block address into a slot in the
+ * list of overflow pages as we go.
+ */
+ slot[j] = (uint32_t)(searchp - ss->ovfl);
+ F_SET(*searchp, WT_TRACK_OVFL_REFD);
+ continue;
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s references unavailable overflow page %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(session,
+ addr->addr, addr->size, ss->tmp2)));
+
+ /*
+ * Clear the "referenced" flag for any overflow pages
+ * already claimed by this leaf page: some other page
+ * might claim them.
+ */
+ while (j > 0)
+ F_CLR(ss->ovfl[slot[--j]], WT_TRACK_OVFL_REFD);
+ trk = NULL; /* Tells the check below the page is gone. */
+ WT_ERR(__slvg_trk_free(session, &ss->pages[i], 1));
+ break;
+ }
+
+ /*
+ * We now have a reference to the overflow WT_TRACK, and so no
+ * longer need the page's address array, discard it. Note, we
+ * potentially freed the WT_TRACK in the loop above, check it's
+ * still valid.
+ *
+ * Either way the local slot array is given up here: it's freed
+ * if the page was discarded, otherwise the page takes it over.
+ */
+ if (trk == NULL)
+ __wt_free(session, slot);
+ else {
+ __slvg_trk_free_addr(session, trk);
+
+ trk->trk_ovfl_slot = slot;
+ slot = NULL;
+ }
+ }
+ return (0);
+
+err: __wt_free(session, slot);
+ return (ret);
+}
+
+/*
+ * __slvg_trk_compare_key --
+ *	Compare two WT_TRACK array entries by key, and secondarily, by LSN.
+ *
+ *	This is a qsort comparison routine: a and b each point at a
+ * WT_TRACK pointer. Empty (NULL) entries sort after all others. Column-store
+ * entries are ordered by starting record number, row-store entries by
+ * starting key using the tree's collator; entries with equal starting keys
+ * are ordered from highest LSN to lowest.
+ */
+static int
+__slvg_trk_compare_key(const void *a, const void *b)
+{
+	WT_SESSION_IMPL *session;
+	WT_TRACK *a_trk, *b_trk;
+	uint64_t a_gen, a_recno, b_gen, b_recno;
+	int cmp;
+
+	a_trk = *(WT_TRACK **)a;
+	b_trk = *(WT_TRACK **)b;
+
+	if (a_trk == NULL)
+		return (b_trk == NULL ? 0 : 1);
+	if (b_trk == NULL)
+		return (-1);
+
+	switch (a_trk->ss->page_type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_VAR:
+		a_recno = a_trk->col_start;
+		b_recno = b_trk->col_start;
+		if (a_recno != b_recno)
+			return (a_recno > b_recno ? 1 : -1);
+		break;
+	case WT_PAGE_ROW_LEAF:
+		/*
+		 * XXX
+		 * __wt_compare can potentially fail, and we're ignoring that
+		 * error because this routine is called as an underlying qsort
+		 * routine.
+		 *
+		 * Because the error is ignored, the result must have a defined
+		 * value even if the comparison never stores one: start from
+		 * "equal", so a failed comparison falls through to the LSN
+		 * ordering rather than reading an indeterminate value.
+		 */
+		cmp = 0;
+		session = a_trk->ss->session;
+		(void)__wt_compare(session, S2BT(session)->collator,
+		    &a_trk->row_start, &b_trk->row_start, &cmp);
+		if (cmp != 0)
+			return (cmp);
+		break;
+	}
+
+	/*
+	 * If the primary keys compare equally, differentiate based on LSN.
+	 * Sort from highest LSN to lowest, that is, the earlier pages in
+	 * the array are more desirable.
+	 */
+	a_gen = a_trk->trk_gen;
+	b_gen = b_trk->trk_gen;
+	return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0));
+}
+
+/*
+ * __slvg_trk_compare_gen --
+ *	Compare two WT_TRACK array entries by LSN.
+ *
+ *	This is a qsort comparison routine: a and b each point at a
+ * WT_TRACK pointer, and neither pointer is checked for NULL.
+ */
+static int
+__slvg_trk_compare_gen(const void *a, const void *b)
+{
+	WT_TRACK *lhs, *rhs;
+	uint64_t lhs_gen, rhs_gen;
+
+	lhs = *(WT_TRACK **)a;
+	rhs = *(WT_TRACK **)b;
+	lhs_gen = lhs->trk_gen;
+	rhs_gen = rhs->trk_gen;
+
+	/*
+	 * The higher the LSN the more desirable the page, and the more
+	 * desirable pages go first: the order is descending.
+	 */
+	if (lhs_gen > rhs_gen)
+		return (-1);
+	if (lhs_gen < rhs_gen)
+		return (1);
+	return (0);
+}
+
+/*
+ * __slvg_merge_block_free --
+ *	Clean up backing file and overflow blocks after the merge phase.
+ *
+ *	Every tracked page flagged WT_TRACK_MERGE is discarded along with its
+ * blocks, then unused overflow records are discarded.
+ */
+static int
+__slvg_merge_block_free(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+	WT_TRACK **trkp;
+	uint32_t slot;
+
+	/* Merged pages: give back the underlying file blocks. */
+	for (slot = 0; slot < ss->pages_next; ++slot) {
+		trkp = &ss->pages[slot];
+		if (*trkp != NULL && F_ISSET(*trkp, WT_TRACK_MERGE))
+			WT_RET(__slvg_trk_free(session, trkp, 1));
+	}
+
+	/* Overflow records nobody ended up using. */
+	return (__slvg_ovfl_discard(session, ss));
+}
+
+/*
+ * __slvg_ovfl_ref --
+ *	Reference an overflow page, checking for multiple references.
+ *
+ *	An unreferenced page is flagged WT_TRACK_OVFL_REFD and zero is
+ * returned. If the page is already referenced, the result depends on
+ * multi_panic: a panic with EINVAL if it's set, EBUSY otherwise.
+ */
+static int
+__slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk, int multi_panic)
+{
+	/* The common case: first reference, claim the page. */
+	if (!F_ISSET(trk, WT_TRACK_OVFL_REFD)) {
+		F_SET(trk, WT_TRACK_OVFL_REFD);
+		return (0);
+	}
+
+	/* Somebody got here first. */
+	if (multi_panic)
+		WT_PANIC_RET(session, EINVAL,
+		    "overflow record unexpectedly referenced multiple times "
+		    "during leaf page merge");
+	return (EBUSY);
+}
+
+/*
+ * __slvg_ovfl_ref_all --
+ *	Reference all of the page's overflow pages.
+ *
+ *	Stops at, and returns, the first error; a page that is already
+ * referenced is treated as a panic.
+ */
+static int
+__slvg_ovfl_ref_all(WT_SESSION_IMPL *session, WT_TRACK *trk)
+{
+	WT_TRACK *ovfl;
+	uint32_t n;
+
+	n = 0;
+	while (n < trk->trk_ovfl_cnt) {
+		/* Map the page's n-th slot to its overflow entry. */
+		ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[n]];
+		WT_RET(__slvg_ovfl_ref(session, ovfl, 1));
+		++n;
+	}
+
+	return (0);
+}
+
+/*
+ * __slvg_ovfl_discard --
+ *	Discard unused overflow pages.
+ *
+ *	Referenced overflow pages are kept, with their reference flag cleared;
+ * unreferenced ones are discarded along with their blocks.
+ */
+static int
+__slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+	WT_TRACK *ovfl;
+	uint32_t slot;
+
+	for (slot = 0; slot < ss->ovfl_next; ++slot) {
+		ovfl = ss->ovfl[slot];
+		if (ovfl == NULL)
+			continue;
+
+		if (!F_ISSET(ovfl, WT_TRACK_OVFL_REFD)) {
+			/*
+			 * Nothing references this overflow page: its file
+			 * blocks go to the free list.
+			 */
+			WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+			    "%s unused overflow page",
+			    __wt_addr_string(session,
+			    ovfl->trk_addr, ovfl->trk_addr_size, ss->tmp1)));
+			WT_RET(__slvg_trk_free(session, &ss->ovfl[slot], 1));
+		} else {
+			/*
+			 * The page is in use. Clear the reference flag (it's
+			 * reused to figure out if the overflow record is
+			 * referenced, but never used, by merged pages).
+			 */
+			F_CLR(ovfl, WT_TRACK_OVFL_REFD);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * __slvg_cleanup --
+ *	Discard memory allocated to the page and overflow arrays.
+ *
+ *	Entries are released without freeing their underlying blocks.
+ */
+static int
+__slvg_cleanup(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+	uint32_t slot;
+
+	/* The leaf page entries, then the array holding them. */
+	for (slot = 0; slot < ss->pages_next; ++slot) {
+		if (ss->pages[slot] == NULL)
+			continue;
+		WT_RET(__slvg_trk_free(session, &ss->pages[slot], 0));
+	}
+	__wt_free(session, ss->pages);
+
+	/* The overflow page entries, then the array holding them. */
+	for (slot = 0; slot < ss->ovfl_next; ++slot) {
+		if (ss->ovfl[slot] == NULL)
+			continue;
+		WT_RET(__slvg_trk_free(session, &ss->ovfl[slot], 0));
+	}
+	__wt_free(session, ss->ovfl);
+
+	return (0);
+}
+
+/*
+ * __slvg_trk_free_addr --
+ *	Discard address information.
+ *
+ *	Frees each overflow address the entry holds and then the address
+ * array itself; does nothing if the entry has no address array.
+ */
+static void
+__slvg_trk_free_addr(WT_SESSION_IMPL *session, WT_TRACK *trk)
+{
+	uint32_t n;
+
+	if (trk->trk_ovfl_addr == NULL)
+		return;
+
+	for (n = 0; n < trk->trk_ovfl_cnt; ++n)
+		__wt_free(session, trk->trk_ovfl_addr[n].addr);
+	__wt_free(session, trk->trk_ovfl_addr);
+}
+
+/*
+ * __slvg_trk_free_block --
+ *	Discard underlying blocks.
+ *
+ *	Reports the discard when salvage verbose messages are configured,
+ * then returns the block manager's result for freeing the entry's address.
+ */
+static int
+__slvg_trk_free_block(WT_SESSION_IMPL *session, WT_TRACK *trk)
+{
+	WT_BM *block_mgr;
+
+	block_mgr = S2BT(session)->bm;
+
+	/*
+	 * Getting here means we tracked this page (or overflow page) for a
+	 * while, but in the end decided against using it.
+	 */
+	WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+	    "%s blocks discarded: discard freed file bytes %" PRIu32,
+	    __wt_addr_string(session,
+	    trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), trk->trk_size));
+
+	return (block_mgr->free(
+	    block_mgr, session, trk->trk_addr, trk->trk_addr_size));
+}
+
+/*
+ * __slvg_trk_free --
+ *	Discard a WT_TRACK structure and (optionally) its underlying blocks.
+ *
+ *	The caller's pointer is always cleared and the structure is always
+ * freed. If this was the last reference to the shared information, that is
+ * freed as well, and, when free_on_last_ref is set, the backing blocks are
+ * given back first. Returns zero, or the error from freeing the blocks.
+ */
+static int
+__slvg_trk_free(WT_SESSION_IMPL *session, WT_TRACK **trkp, int free_on_last_ref)
+{
+	WT_TRACK *trk;
+	int ret;
+
+	ret = 0;
+
+	trk = *trkp;
+	*trkp = NULL;
+
+	/*
+	 * If we're the last user of shared information, clean up.
+	 */
+	WT_ASSERT(session, trk->shared->ref > 0);
+	if (--trk->shared->ref == 0) {
+		/*
+		 * If the free-on-last-ref flag is set, this chunk isn't going
+		 * to use the backing physical blocks. As we're the last user
+		 * of those blocks, nobody is going to use them and they can be
+		 * discarded.
+		 *
+		 * Remember a failure rather than returning immediately: the
+		 * caller's pointer is already gone and the reference count is
+		 * already zero, so nothing else can release this memory.
+		 */
+		if (free_on_last_ref)
+			ret = __slvg_trk_free_block(session, trk);
+
+		__wt_free(session, trk->trk_addr);
+
+		__slvg_trk_free_addr(session, trk);
+
+		__wt_free(session, trk->trk_ovfl_slot);
+
+		__wt_free(session, trk->shared);
+	}
+
+	/* Row-store entries additionally own their start/stop key buffers. */
+	if (trk->ss->page_type == WT_PAGE_ROW_LEAF) {
+		__wt_buf_free(session, &trk->row_start);
+		__wt_buf_free(session, &trk->row_stop);
+	}
+
+	__wt_free(session, trk);
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
new file mode 100644
index 00000000000..3da0bcf346c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page_row_leaf(WT_PAGE *, WT_DSRC_STATS *);
+
+/*
+ * __wt_btree_stat_init --
+ *	Initialize the Btree statistics.
+ *
+ *	The block manager's statistics and the tree's configured limits are
+ * always gathered. Per-page statistics require walking the whole tree and
+ * are gathered only when the cursor has WT_CONN_STAT_ALL set.
+ */
+int
+__wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
+{
+	WT_BM *block_mgr;
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_DSRC_STATS *stats;
+	WT_REF *walk;
+
+	btree = S2BT(session);
+	block_mgr = btree->bm;
+	stats = &btree->dhandle->stats;
+
+	WT_RET(block_mgr->stat(block_mgr, session, stats));
+
+	WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
+	WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
+	WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem);
+	WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
+	WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem);
+	WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);
+
+	/* Everything else is really, really expensive. */
+	if (!F_ISSET(cst, WT_CONN_STAT_ALL))
+		return (0);
+
+	/*
+	 * Visit every page the walk returns; the walk is over when it fails
+	 * or hands back no page.
+	 *
+	 * NOTE(review): a page-level failure returns from inside the walk
+	 * without handing the current reference back to the walk -- confirm
+	 * whether the reference needs to be released first.
+	 */
+	for (walk = NULL;;) {
+		ret = __wt_tree_walk(session, &walk, 0);
+		if (ret != 0 || walk == NULL)
+			break;
+		WT_RET(__stat_page(session, walk->page, stats));
+	}
+
+	/* Running off the end of the tree isn't an error. */
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+	return (ret);
+}
+
+/*
+ * __stat_page --
+ *	Stat any Btree page.
+ *
+ *	Returns zero, or the error from the page-type specific routine or
+ * from the illegal-value handling for an unexpected page type.
+ */
+static int
+__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+	WT_PAGE_INDEX *pindex;
+
+	switch (page->type) {
+	/*
+	 * Internal pages: count the page, and add the number of entries in
+	 * its index to the entry count.
+	 */
+	case WT_PAGE_COL_INT:
+		WT_STAT_INCR(stats, btree_column_internal);
+		pindex = WT_INTL_INDEX_COPY(page);
+		WT_STAT_INCRV(stats, btree_entries, pindex->entries);
+		break;
+	case WT_PAGE_ROW_INT:
+		WT_STAT_INCR(stats, btree_row_internal);
+		pindex = WT_INTL_INDEX_COPY(page);
+		WT_STAT_INCRV(stats, btree_entries, pindex->entries);
+		break;
+
+	/* Overflow pages: all we track is a count of the page type. */
+	case WT_PAGE_OVFL:
+		WT_STAT_INCR(stats, btree_overflow);
+		break;
+
+	/* Leaf pages. */
+	case WT_PAGE_COL_FIX:
+		WT_STAT_INCR(stats, btree_column_fix);
+		WT_STAT_INCRV(stats, btree_entries, page->pg_fix_entries);
+		break;
+	case WT_PAGE_COL_VAR:
+		return (__stat_page_col_var(page, stats));
+	case WT_PAGE_ROW_LEAF:
+		return (__stat_page_row_leaf(page, stats));
+	WT_ILLEGAL_VALUE(session);
+	}
+	return (0);
+}
+
+/*
+ * __stat_page_col_var --
+ *	Stat a WT_PAGE_COL_VAR page.
+ *
+ *	Counts the page, then adjusts the entry and deleted-item counts for
+ * each on-page item and for the updates in its insert list. Always returns
+ * zero.
+ */
+static int
+__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+	WT_CELL *cell;
+	WT_CELL_UNPACK unpack;
+	WT_COL *cip;
+	WT_INSERT *ins;
+	uint32_t slot;
+	int orig_deleted, upd_deleted;
+
+	WT_STAT_INCR(stats, btree_column_variable);
+
+	/*
+	 * Walk the page, counting regular and overflow data items, and checking
+	 * to be sure any updates weren't deletions. If the item was updated,
+	 * assume it was updated by an item of the same size (it's expensive to
+	 * figure out if it will require the same space or not, especially if
+	 * there's Huffman encoding).
+	 */
+	WT_COL_FOREACH(page, cip, slot) {
+		/*
+		 * An item with no cell counts as deleted; any other item
+		 * contributes its run-length to the entry count.
+		 */
+		cell = WT_COL_PTR(page, cip);
+		orig_deleted = cell == NULL ? 1 : 0;
+		if (orig_deleted) {
+			WT_STAT_INCR(stats, btree_column_deleted);
+		} else {
+			__wt_cell_unpack(cell, &unpack);
+			WT_STAT_INCRV(
+			    stats, btree_entries, __wt_cell_rle(&unpack));
+		}
+
+		/*
+		 * Walk the insert list, checking for changes. An update that
+		 * agrees with the original state changes nothing; one that
+		 * disagrees moves a count from one statistic to the other.
+		 */
+		WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
+			upd_deleted = WT_UPDATE_DELETED_ISSET(ins->upd) ? 1 : 0;
+			if (upd_deleted == orig_deleted)
+				continue;
+
+			if (upd_deleted) {
+				WT_STAT_INCR(stats, btree_column_deleted);
+				WT_STAT_DECR(stats, btree_entries);
+			} else {
+				WT_STAT_DECR(stats, btree_column_deleted);
+				WT_STAT_INCR(stats, btree_entries);
+			}
+		}
+	}
+	return (0);
+}
+
+/*
+ * __stat_page_row_leaf --
+ *	Stat a WT_PAGE_ROW_LEAF page.
+ *
+ *	Counts the page and adds the number of K/V pairs on it that are not
+ * deleted to the entry count. Always returns zero.
+ */
+static int
+__stat_page_row_leaf(WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+	WT_INSERT *ins;
+	WT_ROW *rip;
+	WT_UPDATE *upd;
+	uint32_t live, slot;
+
+	WT_STAT_INCR(stats, btree_row_leaf);
+
+	live = 0;
+
+	/*
+	 * First the K/V pairs inserted into the page before the first
+	 * from-disk key on the page.
+	 */
+	WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page)) {
+		if (WT_UPDATE_DELETED_ISSET(ins->upd))
+			continue;
+		++live;
+	}
+
+	WT_ROW_FOREACH(page, rip, slot) {
+		/*
+		 * The from-disk pair itself: it counts unless it has an
+		 * update and that update is a deletion.
+		 */
+		upd = WT_ROW_UPDATE(page, rip);
+		if (!(upd != NULL && WT_UPDATE_DELETED_ISSET(upd)))
+			++live;
+
+		/* And the pairs inserted after it. */
+		WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip)) {
+			if (WT_UPDATE_DELETED_ISSET(ins->upd))
+				continue;
+			++live;
+		}
+	}
+
+	WT_STAT_INCRV(stats, btree_entries, live);
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
new file mode 100644
index 00000000000..607e7919513
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -0,0 +1,373 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __sync_file --
+ * Flush pages for a specific file.
+ *
+ * The syncop argument selects the pass: WT_SYNC_WRITE_LEAVES walks the
+ * tree skipping internal pages and without waiting, reconciling each dirty
+ * page it is handed; WT_SYNC_CHECKPOINT walks the tree and reconciles dirty
+ * leaf and internal pages, except leaf pages it can show were first dirtied
+ * too recently for the checkpoint's snapshot. Any other value is rejected.
+ * Returns 0 on success, otherwise an error code.
+ */
+static int
+__sync_file(WT_SESSION_IMPL *session, int syncop)
+{
+ struct timespec end, start;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *walk;
+ WT_TXN *txn;
+ uint64_t internal_bytes, leaf_bytes;
+ uint64_t internal_pages, leaf_pages;
+ uint32_t flags;
+
+ btree = S2BT(session);
+
+ flags = WT_READ_CACHE | WT_READ_NO_GEN;
+ walk = NULL;
+ txn = &session->txn;
+
+ /* Byte/page totals are only reported in the verbose message below. */
+ internal_bytes = leaf_bytes = 0;
+ internal_pages = leaf_pages = 0;
+ /* No lock is held yet, so a direct return on failure is safe here. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
+ WT_RET(__wt_epoch(session, &start));
+
+ switch (syncop) {
+ case WT_SYNC_WRITE_LEAVES:
+ /*
+ * Write all immediately available, dirty in-cache leaf pages.
+ *
+ * Writing the leaf pages is done without acquiring a high-level
+ * lock, serialize so multiple threads don't walk the tree at
+ * the same time.
+ */
+ if (!btree->modified)
+ return (0);
+ __wt_spin_lock(session, &btree->flush_lock);
+ /* Re-check under the lock; unlock before returning early. */
+ if (!btree->modified) {
+ __wt_spin_unlock(session, &btree->flush_lock);
+ return (0);
+ }
+
+ flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
+ for (walk = NULL;;) {
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
+ if (walk == NULL)
+ break;
+
+ /* Write dirty pages if nobody beat us to it. */
+ page = walk->page;
+ if (__wt_page_is_modified(page)) {
+ if (txn->isolation == TXN_ISO_READ_COMMITTED)
+ __wt_txn_refresh(session, 1);
+ leaf_bytes += page->memory_footprint;
+ ++leaf_pages;
+ WT_ERR(__wt_rec_write(session, walk, NULL, 0));
+ }
+ }
+ break;
+ case WT_SYNC_CHECKPOINT:
+ /*
+ * We cannot check the tree modified flag in the case of a
+ * checkpoint, the checkpoint code has already cleared it.
+ *
+ * Writing the leaf pages is done without acquiring a high-level
+ * lock, serialize so multiple threads don't walk the tree at
+ * the same time. We're holding the schema lock, but need the
+ * lower-level lock as well.
+ */
+ __wt_spin_lock(session, &btree->flush_lock);
+
+ /*
+ * When internal pages are being reconciled by checkpoint their
+ * child pages cannot disappear from underneath them or be split
+ * into them, nor can underlying blocks be freed until the block
+ * lists for the checkpoint are stable. Set the checkpointing
+ * flag to block eviction of dirty pages until the checkpoint's
+ * internal page pass is complete, then wait for any existing
+ * eviction to complete.
+ */
+ btree->checkpointing = 1;
+
+ /*
+ * Exclusive access is turned on and immediately off again; per
+ * the comment above, the purpose is only to wait for existing
+ * eviction to complete.
+ */
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
+ WT_ERR(__wt_evict_file_exclusive_on(session));
+ __wt_evict_file_exclusive_off(session);
+ }
+
+ /* Write all dirty in-cache pages. */
+ flags |= WT_READ_NO_EVICT;
+ for (walk = NULL;;) {
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
+ if (walk == NULL)
+ break;
+
+ /*
+ * Write dirty pages, unless we can be sure they only
+ * became dirty after the checkpoint started.
+ *
+ * We can skip dirty pages if:
+ * (1) they are leaf pages;
+ * (2) there is a snapshot transaction active (which
+ * is the case in ordinary application checkpoints
+ * but not all internal cases); and
+ * (3) the first dirty update on the page is
+ * sufficiently recent that the checkpoint
+ * transaction would skip them.
+ */
+ page = walk->page;
+ /*
+ * mod is only dereferenced after the modified test in the
+ * condition below has passed.
+ */
+ mod = page->modify;
+ if (__wt_page_is_modified(page) &&
+ (WT_PAGE_IS_INTERNAL(page) ||
+ !F_ISSET(txn, TXN_HAS_SNAPSHOT) ||
+ TXNID_LE(mod->first_dirty_txn, txn->snap_max))) {
+ if (WT_PAGE_IS_INTERNAL(page)) {
+ internal_bytes +=
+ page->memory_footprint;
+ ++internal_pages;
+ } else {
+ leaf_bytes += page->memory_footprint;
+ ++leaf_pages;
+ }
+ WT_ERR(__wt_rec_write(session, walk, NULL, 0));
+ }
+ }
+ break;
+ /*
+ * NOTE(review): an unexpected syncop jumps to err without flush_lock
+ * having been acquired, yet the err path unlocks it unconditionally.
+ * The only caller visible in this file passes WT_SYNC_CHECKPOINT or
+ * WT_SYNC_WRITE_LEAVES -- confirm no other caller exists.
+ */
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ WT_ERR(__wt_epoch(session, &end));
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
+ " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
+ " bytes, %" PRIu64 " pages of internal\n\t"
+ "Took: %" PRIu64 "ms",
+ syncop == WT_SYNC_WRITE_LEAVES ?
+ "WRITE_LEAVES" : "CHECKPOINT",
+ leaf_bytes, leaf_pages, internal_bytes, internal_pages,
+ WT_TIMEDIFF(end, start) / WT_MILLION));
+ }
+
+ /* The success path falls through into the cleanup below as well. */
+err: /* On error, clear any left-over tree walk. */
+ if (walk != NULL)
+ WT_TRET(__wt_page_release(session, walk, flags));
+
+ if (txn->isolation == TXN_ISO_READ_COMMITTED && session->ncursors == 0)
+ __wt_txn_release_snapshot(session);
+
+ if (btree->checkpointing) {
+ /*
+ * Clear the checkpoint flag and push the change; not required,
+ * but publishing the change means stalled eviction gets moving
+ * as soon as possible.
+ */
+ btree->checkpointing = 0;
+ WT_FULL_BARRIER();
+
+ /*
+ * Wake the eviction server, in case application threads have
+ * stalled while the eviction server decided it couldn't make
+ * progress. Without this, application threads will be stalled
+ * until the eviction server next wakes.
+ */
+ WT_TRET(__wt_evict_server_wake(session));
+ }
+
+ __wt_spin_unlock(session, &btree->flush_lock);
+
+ /*
+ * Leaves are written before a checkpoint (or as part of a file close,
+ * before checkpointing the file). Start a flush to stable storage,
+ * but don't wait for it.
+ */
+ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
+ WT_RET(btree->bm->sync(btree->bm, session, 1));
+
+ return (ret);
+}
+
+/*
+ * __evict_file --
+ *	Discard pages for a specific file.
+ *
+ * The syncop argument is one of WT_SYNC_CLOSE (reconcile dirty pages, then
+ * evict), WT_SYNC_DISCARD (discard pages, failing with EBUSY if a page holds
+ * an update that is not yet visible to all transactions) or
+ * WT_SYNC_DISCARD_FORCE (discard pages unconditionally).  Any other value is
+ * rejected.  Returns 0 on success, otherwise an error code; on every exit
+ * path after exclusive eviction access was acquired, the left-over tree walk
+ * is released (on error) and exclusive access is dropped.
+ */
+static int
+__evict_file(WT_SESSION_IMPL *session, int syncop)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_PAGE *page;
+	WT_REF *next_ref, *ref;
+	int eviction_enabled;
+
+	btree = S2BT(session);
+	eviction_enabled = !F_ISSET(btree, WT_BTREE_NO_EVICTION);
+
+	/*
+	 * We need exclusive access to the file -- disable ordinary eviction
+	 * and drain any blocks already queued.
+	 */
+	if (eviction_enabled)
+		WT_RET(__wt_evict_file_exclusive_on(session));
+
+	/* Make sure the oldest transaction ID is up-to-date. */
+	__wt_txn_update_oldest(session);
+
+	/* Walk the tree, discarding pages. */
+	next_ref = NULL;
+	WT_ERR(__wt_tree_walk(
+	    session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
+	while ((ref = next_ref) != NULL) {
+		page = ref->page;
+
+		/*
+		 * Eviction can fail when a page in the evicted page's subtree
+		 * switches state.  For example, if we don't evict a page marked
+		 * empty, because we expect it to be merged into its parent, it
+		 * might no longer be empty after it's reconciled, in which case
+		 * eviction of its parent would fail.  We can either walk the
+		 * tree multiple times (until it's finally empty), or reconcile
+		 * each page to get it to its final state before considering if
+		 * it's an eviction target or will be merged into its parent.
+		 *
+		 * Don't limit this test to any particular page type, that tends
+		 * to introduce bugs when the reconciliation of other page types
+		 * changes, and there's no advantage to doing so.
+		 *
+		 * Eviction can also fail because an update cannot be written.
+		 * If sessions have disjoint sets of files open, updates in a
+		 * no-longer-referenced file may not yet be globally visible,
+		 * and the write will fail with EBUSY.  Our caller handles that
+		 * error, retrying later.
+		 */
+		if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
+			WT_ERR(__wt_rec_write(session, ref, NULL, WT_EVICTING));
+
+		/*
+		 * We can't evict the page just returned to us (it marks our
+		 * place in the tree), so move the walk to one page ahead of
+		 * the page being evicted.  Note, we reconciled the returned
+		 * page first: if reconciliation of that page were to change
+		 * the shape of the tree, and we did the next walk call before
+		 * the reconciliation, the next walk call could miss a page in
+		 * the tree.
+		 */
+		WT_ERR(__wt_tree_walk(
+		    session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
+
+		switch (syncop) {
+		case WT_SYNC_CLOSE:
+			/*
+			 * Evict the page.
+			 * Do not attempt to evict pages expected to be merged
+			 * into their parents, with the exception that the root
+			 * page can't be merged, it must be written.
+			 */
+			if (__wt_ref_is_root(ref) ||
+			    page->modify == NULL ||
+			    !F_ISSET(page->modify, WT_PM_REC_EMPTY))
+				WT_ERR(__wt_rec_evict(session, ref, 1));
+			break;
+		case WT_SYNC_DISCARD:
+		case WT_SYNC_DISCARD_FORCE:
+			/*
+			 * Discard the page, whether clean or dirty.
+			 *
+			 * Clean the page, both to keep statistics correct, and
+			 * to let the page-discard function assert no dirty page
+			 * is ever discarded.
+			 */
+			if (__wt_page_is_modified(page)) {
+				page->modify->write_gen = 0;
+				__wt_cache_dirty_decr(session, page);
+			}
+			/*
+			 * If the page contains an update that is too recent to
+			 * evict, stop.  This should never happen during
+			 * connection close, and in other paths our caller
+			 * should be prepared to deal with this case.
+			 *
+			 * Fail through the error path rather than returning
+			 * directly: the walk position held in next_ref must be
+			 * released and exclusive eviction access dropped, or
+			 * the file is left with ordinary eviction disabled.
+			 */
+			if (syncop == WT_SYNC_DISCARD &&
+			    page->modify != NULL &&
+			    !__wt_txn_visible_all(session,
+			    page->modify->rec_max_txn))
+				WT_ERR(EBUSY);
+			if (syncop == WT_SYNC_DISCARD_FORCE)
+				F_SET(session, WT_SESSION_DISCARD_FORCE);
+			__wt_ref_out(session, ref);
+			/*
+			 * In case we don't discard the whole tree, make sure
+			 * that future readers know that the page is no longer
+			 * in cache.
+			 */
+			ref->state = WT_REF_DISK;
+			F_CLR(session, WT_SESSION_DISCARD_FORCE);
+			break;
+		WT_ILLEGAL_VALUE_ERR(session);
+		}
+	}
+
+	/* The success path skips the walk release but not the code after it. */
+	if (0) {
+err:		/* On error, clear any left-over tree walk. */
+		if (next_ref != NULL)
+			WT_TRET(__wt_page_release(
+			    session, next_ref, WT_READ_NO_EVICT));
+	}
+
+	if (eviction_enabled)
+		__wt_evict_file_exclusive_off(session);
+
+	return (ret);
+}
+
+/*
+ * __wt_cache_op --
+ *	Cache operations: dispatch a sync operation to the flush or the
+ * discard implementation.  For checkpoint and close operations the tree's
+ * checkpoint reference is set for the duration of the call and cleared
+ * again before returning, whether or not the operation succeeded.
+ */
+int
+__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
+{
+	WT_DECL_RET;
+	WT_BTREE *btree;
+	int uses_ckpt;
+
+	btree = S2BT(session);
+	uses_ckpt = (op == WT_SYNC_CHECKPOINT || op == WT_SYNC_CLOSE);
+
+	/*
+	 * Set the checkpoint reference for reconciliation; it's ugly, but
+	 * drilling a function parameter path from our callers to the
+	 * reconciliation of the tree's root page is going to be worse.
+	 */
+	if (uses_ckpt) {
+		WT_ASSERT(session, btree->ckpt == NULL);
+		btree->ckpt = ckptbase;
+	}
+
+	switch (op) {
+	case WT_SYNC_CHECKPOINT:
+	case WT_SYNC_WRITE_LEAVES:
+		/* Flush operations. */
+		ret = __sync_file(session, op);
+		break;
+	case WT_SYNC_CLOSE:
+	case WT_SYNC_DISCARD:
+	case WT_SYNC_DISCARD_FORCE:
+		/* Discard operations. */
+		ret = __evict_file(session, op);
+		break;
+	WT_ILLEGAL_VALUE_ERR(session);
+	}
+
+err:	if (uses_ckpt)
+		btree->ckpt = NULL;
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_upgrade.c b/src/third_party/wiredtiger/src/btree/bt_upgrade.c
new file mode 100644
index 00000000000..d65c8793fbb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_upgrade.c
@@ -0,0 +1,22 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_upgrade --
+ *	Upgrade a file.  The configuration is ignored; the only work done is
+ * a single progress report, whose result is returned.
+ */
+int
+__wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_UNUSED(cfg);
+
+	/* There's nothing to upgrade, yet: just report progress. */
+	return (__wt_progress(session, NULL, 1));
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
new file mode 100644
index 00000000000..e7caf02fd2f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -0,0 +1,666 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There's a bunch of stuff we pass around during verification, group it
+ * together to make the code prettier.
+ *
+ * One instance is set up by __wt_verify and handed to every helper; the
+ * per-checkpoint fields are reset by __verify_checkpoint_reset.
+ */
+typedef struct {
+ uint64_t record_total; /* Total record count */
+
+ /*
+ * Row-store key-order tracking: max_key holds the largest key seen so
+ * far and max_addr the printable address of the page it came from. A
+ * max_addr size of 0 doubles as the "no key seen yet" flag.
+ */
+ WT_ITEM *max_key; /* Largest key */
+ WT_ITEM *max_addr; /* Largest key page */
+
+ /* Incremented once per page verified; progress is reported from it. */
+ uint64_t fcnt; /* Progress counter */
+
+ /* Set from the dump_address/dump_pages/dump_blocks configuration. */
+ int dump_address; /* Debugging hooks */
+ int dump_pages;
+ int dump_blocks;
+
+ /* Scratch buffers, used for address strings, key copies and reads. */
+ WT_ITEM *tmp1; /* Temporary buffer */
+ WT_ITEM *tmp2; /* Temporary buffer */
+} WT_VSTUFF;
+
+static void __verify_checkpoint_reset(WT_VSTUFF *);
+static int __verify_config(WT_SESSION_IMPL *, const char *[], WT_VSTUFF *);
+static int __verify_config_offsets(WT_SESSION_IMPL *, const char *[], int *);
+static int __verify_overflow(
+ WT_SESSION_IMPL *, const uint8_t *, size_t, WT_VSTUFF *);
+static int __verify_overflow_cell(
+ WT_SESSION_IMPL *, WT_REF *, int *, WT_VSTUFF *);
+static int __verify_row_int_key_order(
+ WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, uint32_t, WT_VSTUFF *);
+static int __verify_row_leaf_key_order(
+ WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *);
+static int __verify_tree(WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *);
+
+/*
+ * __wt_verify --
+ * Verify a file.
+ *
+ * Parses the verify configuration, then loads each checkpoint of the
+ * file in turn, opens its tree, verifies it and discards it from the cache.
+ * If dump_offsets lists any offsets, those blocks are dumped and nothing is
+ * verified. Returns 0 on success, otherwise an error code.
+ */
+int
+__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT *ckptbase, *ckpt;
+ WT_DECL_RET;
+ WT_VSTUFF *vs, _vstuff;
+ size_t root_addr_size;
+ uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int bm_start, quit;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ /* ckptbase and bm_start tell the cleanup path what needs undoing. */
+ ckptbase = NULL;
+ bm_start = 0;
+
+ /* Clear the structure first: the cleanup path reads its fields. */
+ WT_CLEAR(_vstuff);
+ vs = &_vstuff;
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
+
+ /* Check configuration strings. */
+ WT_ERR(__verify_config(session, cfg, vs));
+
+ /* Optionally dump specific block offsets. */
+ WT_ERR(__verify_config_offsets(session, cfg, &quit));
+ if (quit)
+ goto done;
+
+ /* Get a list of the checkpoints for this file. */
+ WT_ERR(
+ __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));
+
+ /* Inform the underlying block manager we're verifying. */
+ WT_ERR(bm->verify_start(bm, session, ckptbase));
+ bm_start = 1;
+
+ /* Loop through the file's checkpoints, verifying each one. */
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ WT_ERR(__wt_verbose(session, WT_VERB_VERIFY,
+ "%s: checkpoint %s", btree->dhandle->name, ckpt->name));
+
+ /* Fake checkpoints require no work. */
+ if (F_ISSET(ckpt, WT_CKPT_FAKE))
+ continue;
+
+ /* House-keeping between checkpoints. */
+ __verify_checkpoint_reset(vs);
+
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address || vs->dump_blocks || vs->dump_pages)
+ WT_ERR(__wt_msg(session, "%s: checkpoint %s",
+ btree->dhandle->name, ckpt->name));
+#endif
+ /* Load the checkpoint. */
+ WT_ERR(bm->checkpoint_load(bm, session,
+ ckpt->raw.data, ckpt->raw.size,
+ root_addr, &root_addr_size, 1));
+
+ /*
+ * Ignore trees with no root page.
+ * Verify, then discard the checkpoint from the cache.
+ */
+ if (root_addr_size != 0 &&
+ (ret = __wt_btree_tree_open(
+ session, root_addr, root_addr_size)) == 0) {
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * NOTE(review): a failure here jumps to err after the tree
+ * was opened and the checkpoint loaded, skipping the discard
+ * and unload below -- confirm that is acceptable.
+ */
+ if (vs->dump_address ||
+ vs->dump_blocks || vs->dump_pages)
+ WT_ERR(__wt_msg(session, "Root: %s %s",
+ __wt_addr_string(session,
+ root_addr, root_addr_size, vs->tmp1),
+ __wt_page_type_string(
+ btree->root.page->type)));
+#endif
+ /*
+ * Don't jump out on a verify failure: the tree must still be
+ * discarded and the checkpoint unloaded.
+ */
+ ret = __verify_tree(session, &btree->root, vs);
+
+ WT_TRET(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+ }
+
+ /* Unload the checkpoint. */
+ WT_TRET(bm->checkpoint_unload(bm, session));
+ WT_ERR(ret);
+ }
+
+ /* Success, the dump-only early exit and errors share this cleanup. */
+done:
+err: /* Inform the underlying block manager we're done. */
+ if (bm_start)
+ WT_TRET(bm->verify_end(bm, session));
+
+ /* Discard the list of checkpoints. */
+ if (ckptbase != NULL)
+ __wt_meta_ckptlist_free(session, ckptbase);
+
+ /* Wrap up reporting. */
+ WT_TRET(__wt_progress(session, NULL, vs->fcnt));
+
+ /* Free allocated memory. */
+ __wt_scr_free(&vs->max_key);
+ __wt_scr_free(&vs->max_addr);
+ __wt_scr_free(&vs->tmp1);
+ __wt_scr_free(&vs->tmp2);
+
+ return (ret);
+}
+
+/*
+ * __verify_config --
+ *	Debugging: verification supports dumping pages in various formats.
+ * Read the dump_address, dump_blocks and dump_pages settings into the verify
+ * structure as 0/1 flags; in a non-diagnostic build, fail with ENOTSUP if
+ * any of them is set.
+ */
+static int
+__verify_config(WT_SESSION_IMPL *session, const char *cfg[], WT_VSTUFF *vs)
+{
+	WT_CONFIG_ITEM setting;
+
+	WT_RET(__wt_config_gets(session, cfg, "dump_address", &setting));
+	vs->dump_address = (setting.val == 0) ? 0 : 1;
+
+	WT_RET(__wt_config_gets(session, cfg, "dump_blocks", &setting));
+	vs->dump_blocks = (setting.val == 0) ? 0 : 1;
+
+	WT_RET(__wt_config_gets(session, cfg, "dump_pages", &setting));
+	vs->dump_pages = (setting.val == 0) ? 0 : 1;
+
+#ifdef HAVE_DIAGNOSTIC
+	return (0);
+#else
+	/* Dumping needs the diagnostic code; nothing requested is fine. */
+	if (!vs->dump_address && !vs->dump_blocks && !vs->dump_pages)
+		return (0);
+	WT_RET_MSG(session, ENOTSUP,
+	    "the WiredTiger library was not built in diagnostic mode");
+#endif
+}
+
+/*
+ * __verify_config_offsets --
+ *	Debugging: optionally dump specific blocks from the file.
+ *
+ * Walks the dump_offsets configuration list.  *quitp is set to 1 as soon as
+ * the list yields an entry (the caller then skips verification), and left at
+ * 0 otherwise.  Returns EINVAL for a malformed entry, ENOTSUP when an offset
+ * is requested in a non-diagnostic build, the first error from dumping a
+ * block, or 0.
+ */
+static int
+__verify_config_offsets(WT_SESSION_IMPL *session, const char *cfg[], int *quitp)
+{
+	WT_CONFIG list;
+	WT_CONFIG_ITEM cval, k, v;
+	WT_DECL_RET;
+	u_long offset;
+	int dump_ret;
+
+	*quitp = 0;
+	dump_ret = 0;
+
+	WT_RET(__wt_config_gets(session, cfg, "dump_offsets", &cval));
+	WT_RET(__wt_config_subinit(session, &list, &cval));
+	while ((ret = __wt_config_next(&list, &k, &v)) == 0) {
+		/*
+		 * Quit after dumping the requested blocks.  (That's hopefully
+		 * what the user wanted, all of this stuff is just hooked into
+		 * verify because that's where we "dump blocks" for debugging.)
+		 */
+		*quitp = 1;
+		if (v.len != 0 || sscanf(k.str, "%lu", &offset) != 1)
+			WT_RET_MSG(session, EINVAL,
+			    "unexpected dump offset format");
+#if !defined(HAVE_DIAGNOSTIC)
+		WT_RET_MSG(session, ENOTSUP,
+		    "the WiredTiger library was not built in diagnostic mode");
+#else
+		/*
+		 * Keep dumping the remaining offsets after a failure, but
+		 * remember the first error in its own variable: the loop
+		 * condition overwrites ret on every iteration, so an error
+		 * left only in ret would never reach the caller.
+		 */
+		if ((ret = __wt_debug_offset_blind(
+		    session, (wt_off_t)offset, NULL)) != 0 && dump_ret == 0)
+			dump_ret = ret;
+#endif
+	}
+
+	/* Running off the end of the list is the expected way out. */
+	if (ret != WT_NOTFOUND)
+		return (ret);
+	return (dump_ret);
+}
+
+/*
+ * __verify_checkpoint_reset --
+ *	Reset the per-checkpoint verification state before a new checkpoint
+ * is verified.
+ */
+static void
+__verify_checkpoint_reset(WT_VSTUFF *vs)
+{
+	/* The record count starts over for every checkpoint. */
+	vs->record_total = 0;
+
+	/*
+	 * So does key order: a zero data length in the largest-key address
+	 * buffer is the flag for "no key seen yet".
+	 */
+	vs->max_addr->size = 0;
+}
+
+/*
+ * __verify_tree --
+ * Verify a tree, recursively descending through it in depth-first fashion.
+ * The page argument was physically verified (so we know it's correctly formed),
+ * and the in-memory version built. Our job is to check logical relationships
+ * in the page and in the tree.
+ *
+ * The ref argument references the page to verify; vs carries the running
+ * record count, the largest key seen so far, the progress counter and the
+ * scratch buffers. Returns 0 on success, otherwise an error code.
+ */
+static int
+__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
+{
+ WT_BM *bm;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COL *cip;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_REF *child_ref;
+ uint64_t recno;
+ uint32_t entry, i;
+ int found;
+
+ bm = S2BT(session)->bm;
+ page = ref->page;
+
+ unpack = &_unpack;
+ WT_CLEAR(*unpack); /* -Wuninitialized */
+
+ WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_page_type_string(page->type)));
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address)
+ WT_RET(__wt_msg(session, "%s %s",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_page_type_string(page->type)));
+#endif
+
+ /*
+ * The page's physical structure was verified when it was read into
+ * memory by the read server thread, and then the in-memory version
+ * of the page was built. Now we make sure the page and tree are
+ * logically consistent.
+ *
+ * !!!
+ * The problem: (1) the read server has to build the in-memory version
+ * of the page because the read server is the thread that flags when
+ * any thread can access the page in the tree; (2) we can't build the
+ * in-memory version of the page until the physical structure is known
+ * to be OK, so the read server has to verify at least the physical
+ * structure of the page; (3) doing complete page verification requires
+ * reading additional pages (for example, overflow keys imply reading
+ * overflow pages in order to test the key's order in the page); (4)
+ * the read server cannot read additional pages because it will hang
+ * waiting on itself. For this reason, we split page verification
+ * into a physical verification, which allows the in-memory version
+ * of the page to be built, and then a subsequent logical verification
+ * which happens here.
+ *
+ * Report progress every 10 pages.
+ */
+ if (++vs->fcnt % 10 == 0)
+ WT_RET(__wt_progress(session, NULL, vs->fcnt));
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Optionally dump the blocks or page in debugging mode. */
+ if (vs->dump_blocks)
+ WT_RET(__wt_debug_disk(session, page->dsk, NULL));
+ if (vs->dump_pages)
+ WT_RET(__wt_debug_page(session, page, NULL));
+#endif
+
+ /*
+ * Column-store key order checks: check the page's record number and
+ * then update the total record count.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ recno = page->pg_fix_recno;
+ goto recno_chk;
+ case WT_PAGE_COL_INT:
+ recno = page->pg_intl_recno;
+ goto recno_chk;
+ case WT_PAGE_COL_VAR:
+ recno = page->pg_var_recno;
+ /* All three column-store page types share the check below. */
+recno_chk: if (recno != vs->record_total + 1)
+ WT_RET_MSG(session, WT_ERROR,
+ "page at %s has a starting record of %" PRIu64
+ " when the expected starting record is %" PRIu64,
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ recno, vs->record_total + 1);
+ break;
+ }
+ /*
+ * Only leaf pages add to the record total; column-store internal pages
+ * are accounted for through the leaves beneath them.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ vs->record_total += page->pg_fix_entries;
+ break;
+ case WT_PAGE_COL_VAR:
+ /* recno is reused here as this page's record count. */
+ recno = 0;
+ WT_COL_FOREACH(page, cip, i)
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ ++recno;
+ else {
+ __wt_cell_unpack(cell, unpack);
+ recno += __wt_cell_rle(unpack);
+ }
+ vs->record_total += recno;
+ break;
+ }
+
+ /*
+ * Row-store leaf page key order check: it's a depth-first traversal,
+ * the first key on this page should be larger than any key previously
+ * seen.
+ */
+ switch (page->type) {
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__verify_row_leaf_key_order(session, ref, vs));
+ break;
+ }
+
+ /* If it's not the root page, unpack the parent cell. */
+ if (!__wt_ref_is_root(ref)) {
+ __wt_cell_unpack(ref->addr, unpack);
+
+ /* Compare the parent cell against the page type. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (unpack->raw != WT_CELL_ADDR_LEAF_NO)
+ goto celltype_err;
+ break;
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ if (unpack->raw != WT_CELL_ADDR_LEAF &&
+ unpack->raw != WT_CELL_ADDR_LEAF_NO)
+ goto celltype_err;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ if (unpack->raw != WT_CELL_ADDR_INT)
+celltype_err: WT_RET_MSG(session, WT_ERROR,
+ "page at %s, of type %s, is referenced in "
+ "its parent by a cell of type %s",
+ __wt_page_addr_string(
+ session, ref, vs->tmp1),
+ __wt_page_type_string(page->type),
+ __wt_cell_type_string(unpack->raw));
+ break;
+ }
+ }
+
+ /*
+ * Check overflow pages. We check overflow cells separately from other
+ * tests that walk the page as it's simpler, and I don't care much how
+ * fast table verify runs.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__verify_overflow_cell(session, ref, &found, vs));
+ /*
+ * The test below reads the parent cell unpacked above, which is
+ * only done for non-root pages, and only applies to leaf pages.
+ */
+ if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT)
+ break;
+
+ /*
+ * Object if a leaf-no-overflow address cell references a page
+ * with overflow keys, but don't object if a leaf address cell
+ * references a page without overflow keys. Reconciliation
+ * doesn't guarantee every leaf page without overflow items will
+ * be a leaf-no-overflow type.
+ */
+ if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO)
+ WT_RET_MSG(session, WT_ERROR,
+ "page at %s, of type %s and referenced in its "
+ "parent by a cell of type %s, contains overflow "
+ "items",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_page_type_string(page->type),
+ __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO));
+ break;
+ }
+
+ /* Check tree connections and recursively descend the tree. */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ /* For each entry in an internal page, verify the subtree. */
+ entry = 0;
+ WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
+ /*
+ * It's a depth-first traversal: this entry's starting
+ * record number should be 1 more than the total records
+ * reviewed to this point.
+ */
+ ++entry;
+ if (child_ref->key.recno != vs->record_total + 1) {
+ WT_RET_MSG(session, WT_ERROR,
+ "the starting record number in entry %"
+ PRIu32 " of the column internal page at "
+ "%s is %" PRIu64 " and the expected "
+ "starting record number is %" PRIu64,
+ entry,
+ __wt_page_addr_string(
+ session, child_ref, vs->tmp1),
+ child_ref->key.recno,
+ vs->record_total + 1);
+ }
+
+ /* Verify the subtree. */
+ WT_RET(__wt_page_in(session, child_ref, 0));
+ /* Release the child page even if verifying it failed. */
+ ret = __verify_tree(session, child_ref, vs);
+ WT_TRET(__wt_page_release(session, child_ref, 0));
+ WT_RET(ret);
+
+ /* Hand the child's address to the block manager to verify. */
+ __wt_cell_unpack(child_ref->addr, unpack);
+ WT_RET(bm->verify_addr(
+ bm, session, unpack->data, unpack->size));
+ } WT_INTL_FOREACH_END;
+ break;
+ case WT_PAGE_ROW_INT:
+ /* For each entry in an internal page, verify the subtree. */
+ entry = 0;
+ WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
+ /*
+ * It's a depth-first traversal: this entry's starting
+ * key should be larger than the largest key previously
+ * reviewed.
+ *
+ * The 0th key of any internal page is magic, and we
+ * can't test against it.
+ */
+ ++entry;
+ if (entry != 1)
+ WT_RET(__verify_row_int_key_order(
+ session, page, child_ref, entry, vs));
+
+ /* Verify the subtree. */
+ WT_RET(__wt_page_in(session, child_ref, 0));
+ /* Release the child page even if verifying it failed. */
+ ret = __verify_tree(session, child_ref, vs);
+ WT_TRET(__wt_page_release(session, child_ref, 0));
+ WT_RET(ret);
+
+ /* Hand the child's address to the block manager to verify. */
+ __wt_cell_unpack(child_ref->addr, unpack);
+ WT_RET(bm->verify_addr(
+ bm, session, unpack->data, unpack->size));
+ } WT_INTL_FOREACH_END;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __verify_row_int_key_order --
+ *	Check that a key on an internal page sorts strictly after the
+ * largest key seen so far, then make that key (and the address of the page
+ * it references) the new largest key.
+ */
+static int
+__verify_row_int_key_order(WT_SESSION_IMPL *session,
+    WT_PAGE *parent, WT_REF *ref, uint32_t entry, WT_VSTUFF *vs)
+{
+	WT_ITEM key;
+	int order;
+
+	/* The maximum key is set, we updated it from a leaf page first. */
+	WT_ASSERT(session, vs->max_addr->size != 0);
+
+	/* Fetch the internal key for this entry from the parent page. */
+	__wt_ref_key(parent, ref, &key.data, &key.size);
+
+	/* It has to sort strictly after everything seen so far. */
+	WT_RET(__wt_compare(
+	    session, S2BT(session)->collator, &key, vs->max_key, &order));
+	if (order > 0) {
+		/* Remember the key just checked and where it came from. */
+		WT_RET(__wt_buf_set(
+		    session, vs->max_key, key.data, key.size));
+		(void)__wt_page_addr_string(session, ref, vs->max_addr);
+		return (0);
+	}
+
+	WT_RET_MSG(session, WT_ERROR,
+	    "the internal key in entry %" PRIu32 " on the page at %s "
+	    "sorts before the last key appearing on page %s, earlier "
+	    "in the tree",
+	    entry,
+	    __wt_page_addr_string(session, ref, vs->tmp1),
+	    (char *)vs->max_addr->data);
+}
+
+/*
+ * __verify_row_leaf_key_order --
+ *	Compare the first key on a leaf page against the largest key seen so
+ * far, then make the last key on the page (and the page's address) the new
+ * largest key.  Pages without entries are skipped.
+ */
+static int
+__verify_row_leaf_key_order(
+    WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
+{
+	WT_PAGE *page;
+	WT_ROW *first_row, *last_row;
+	int order;
+
+	page = ref->page;
+
+	/*
+	 * If a tree is empty (just created), it won't have keys; if there
+	 * are no keys, we're done.
+	 */
+	if (page->pg_row_entries == 0)
+		return (0);
+
+	first_row = page->pg_row_d;
+	last_row = first_row + (page->pg_row_entries - 1);
+
+	/*
+	 * We visit our first leaf page before setting the maximum key (the 0th
+	 * keys on the internal pages leading to the smallest leaf in the tree
+	 * are all empty entries), so there may be nothing to compare against.
+	 */
+	if (vs->max_addr->size != 0) {
+		WT_RET(__wt_row_leaf_key_copy(
+		    session, page, first_row, vs->tmp1));
+
+		/*
+		 * If the largest key so far came from an internal page, the
+		 * two may compare equal (the internal page key is often a copy
+		 * of the leaf page's first key).
+		 *
+		 * NOTE(review): only "less than" is rejected here, although
+		 * the message also mentions equality; a key equal to the last
+		 * key of a previous leaf page is therefore accepted -- confirm
+		 * that is intended.
+		 */
+		WT_RET(__wt_compare(session,
+		    S2BT(session)->collator, vs->tmp1, vs->max_key, &order));
+		if (order < 0)
+			WT_RET_MSG(session, WT_ERROR,
+			    "the first key on the page at %s sorts equal to or "
+			    "less than a key appearing on the page at %s, "
+			    "earlier in the tree",
+			    __wt_page_addr_string(session, ref, vs->tmp1),
+			    (char *)vs->max_addr->data);
+	}
+
+	/* The last key on this page becomes the largest key seen. */
+	WT_RET(__wt_row_leaf_key_copy(session, page, last_row, vs->max_key));
+	(void)__wt_page_addr_string(session, ref, vs->max_addr);
+
+	return (0);
+}
+
+/*
+ * __verify_overflow_cell --
+ *	Verify any overflow cells on the page.  *found is set to 1 if the
+ * page's disk image holds at least one overflow key or value cell, and to 0
+ * otherwise.
+ */
+static int
+__verify_overflow_cell(
+    WT_SESSION_IMPL *session, WT_REF *ref, int *found, WT_VSTUFF *vs)
+{
+	WT_BTREE *btree;
+	WT_CELL *cell;
+	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_RET;
+	const WT_PAGE_HEADER *dsk;
+	uint32_t remaining, visited;
+
+	*found = 0;
+	unpack = &_unpack;
+
+	/*
+	 * If a tree is empty (just created), it won't have a disk image;
+	 * if there is no disk image, we're done.
+	 */
+	dsk = ref->page->dsk;
+	if (dsk == NULL)
+		return (0);
+
+	/* Walk the disk page, verifying pages referenced by overflow cells. */
+	btree = S2BT(session);
+	visited = 0;
+	WT_CELL_FOREACH(btree, dsk, cell, unpack, remaining) {
+		++visited;
+		__wt_cell_unpack(cell, unpack);
+		if (unpack->type != WT_CELL_KEY_OVFL &&
+		    unpack->type != WT_CELL_VALUE_OVFL)
+			continue;
+
+		*found = 1;
+		ret = __verify_overflow(
+		    session, unpack->data, unpack->size, vs);
+		if (ret == 0)
+			continue;
+
+		/* Report the failing cell by its zero-based position. */
+		WT_RET_MSG(session, ret,
+		    "cell %" PRIu32 " on page at %s references an overflow "
+		    "item at %s "
+		    "that failed verification",
+		    visited - 1,
+		    __wt_page_addr_string(session, ref, vs->tmp1),
+		    __wt_addr_string(
+		    session, unpack->data, unpack->size, vs->tmp2));
+	}
+
+	return (0);
+}
+
+/*
+ * __verify_overflow --
+ *	Read in the page at the given address, check that it is an overflow
+ * page, and pass the address to the block manager's address verification.
+ */
+static int
+__verify_overflow(WT_SESSION_IMPL *session,
+    const uint8_t *addr, size_t addr_size, WT_VSTUFF *vs)
+{
+	WT_BM *bm;
+	const WT_PAGE_HEADER *hdr;
+
+	bm = S2BT(session)->bm;
+
+	/* Read and verify the overflow item. */
+	WT_RET(__wt_bt_read(session, vs->tmp1, addr, addr_size));
+	hdr = vs->tmp1->data;
+
+	/*
+	 * The physical page has already been verified, but we haven't confirmed
+	 * it was an overflow page, only that it was a valid page.  Confirm it's
+	 * the type of page we expected.
+	 */
+	if (hdr->type != WT_PAGE_OVFL)
+		WT_RET_MSG(session, WT_ERROR,
+		    "overflow referenced page at %s is not an overflow page",
+		    __wt_addr_string(session, addr, addr_size, vs->tmp1));
+
+	return (bm->verify_addr(bm, session, addr, addr_size));
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
new file mode 100644
index 00000000000..a14f9f1078e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -0,0 +1,739 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __err_cell_corrupted(WT_SESSION_IMPL *, uint32_t, const char *);
+static int __err_cell_type(
+ WT_SESSION_IMPL *, uint32_t, const char *, uint8_t, uint8_t);
+static int __err_eof(WT_SESSION_IMPL *, uint32_t, const char *);
+static int __verify_dsk_chunk(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, uint32_t);
+static int __verify_dsk_col_fix(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+static int __verify_dsk_col_int(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+static int __verify_dsk_col_var(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+static int __verify_dsk_memsize(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_CELL *);
+static int __verify_dsk_row(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+/*
+ * WT_ERR_VRFY --
+ * Report a verification failure and jump to the enclosing function's
+ * "err" label. The message is passed to __wt_errx unless the session has
+ * the WT_SESSION_SALVAGE_CORRUPT_OK flag set, in which case nothing is
+ * printed. The macro does not assign a return value itself, so the code at
+ * the "err" label is responsible for producing a non-zero result.
+ */
+#define WT_ERR_VRFY(session, ...) do { \
+ if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ __wt_errx(session, __VA_ARGS__); \
+ goto err; \
+} while (0)
+/*
+ * WT_RET_VRFY --
+ * Report a verification failure and return WT_ERROR from the enclosing
+ * function. As with WT_ERR_VRFY, the message is suppressed when the session
+ * has the WT_SESSION_SALVAGE_CORRUPT_OK flag set.
+ */
+#define WT_RET_VRFY(session, ...) do { \
+ if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ __wt_errx(session, __VA_ARGS__); \
+ return (WT_ERROR); \
+} while (0)
+
+/*
+ * __wt_verify_dsk_image --
+ *	Verify a single block as read from disk.
+ *
+ *	addr is a printable name for the block and is only used in error
+ * messages. dsk points at the start of the disk image (its page header).
+ * size is the length of the buffer holding the image; pass 0 when the size
+ * is not known, which skips the check of the bytes following the image.
+ *
+ *	The header's type, record number, flags and unused bytes are checked
+ * first, then the page contents are handed to the verification function for
+ * the page type. Returns 0 if the image is acceptable, WT_ERROR if a header
+ * check fails, otherwise whatever the per-type verification function returns.
+ */
+int
+__wt_verify_dsk_image(WT_SESSION_IMPL *session,
+    const char *addr, const WT_PAGE_HEADER *dsk, size_t size)
+{
+	const uint8_t *p, *end;
+	u_int i;
+	uint8_t flags;
+
+	/* Check the page type. */
+	switch (dsk->type) {
+	case WT_PAGE_BLOCK_MANAGER:
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_OVFL:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		break;
+	case WT_PAGE_INVALID:
+	default:
+		WT_RET_VRFY(session,
+		    "page at %s has an invalid type of %" PRIu32,
+		    addr, dsk->type);
+	}
+
+	/*
+	 * Check the page record number: column-store pages must have a
+	 * non-zero starting record number, every other page type must have a
+	 * record number of zero. WT_RET_VRFY returns, so the first group of
+	 * cases never falls through into the second.
+	 */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_VAR:
+		if (dsk->recno != 0)
+			break;
+		WT_RET_VRFY(session,
+		    "%s page at %s has a record number of zero",
+		    __wt_page_type_string(dsk->type), addr);
+	case WT_PAGE_BLOCK_MANAGER:
+	case WT_PAGE_OVFL:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		if (dsk->recno == 0)
+			break;
+		WT_RET_VRFY(session,
+		    "%s page at %s has a non-zero record number",
+		    __wt_page_type_string(dsk->type), addr);
+	}
+
+	/*
+	 * Check the page flags: clear each flag that is legal for this page
+	 * from a local copy, anything left over is an unknown flag.
+	 */
+	flags = dsk->flags;
+	if (LF_ISSET(WT_PAGE_COMPRESSED))
+		LF_CLR(WT_PAGE_COMPRESSED);
+	if (dsk->type == WT_PAGE_ROW_LEAF) {
+		/* The two "empty values" flags are mutually exclusive. */
+		if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) &&
+		    LF_ISSET(WT_PAGE_EMPTY_V_NONE))
+			WT_RET_VRFY(session,
+			    "page at %s has invalid flags combination: 0x%"
+			    PRIx8,
+			    addr, dsk->flags);
+		if (LF_ISSET(WT_PAGE_EMPTY_V_ALL))
+			LF_CLR(WT_PAGE_EMPTY_V_ALL);
+		if (LF_ISSET(WT_PAGE_EMPTY_V_NONE))
+			LF_CLR(WT_PAGE_EMPTY_V_NONE);
+	}
+	if (flags != 0)
+		WT_RET_VRFY(session,
+		    "page at %s has invalid flags set: 0x%" PRIx8,
+		    addr, flags);
+
+	/*
+	 * Unused bytes: every byte of the header's unused field must be zero.
+	 * The pointer has to advance along with the counter, otherwise only
+	 * the first unused byte is ever examined.
+	 */
+	for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i, ++p)
+		if (*p != '\0')
+			WT_RET_VRFY(session,
+			    "page at %s has non-zero unused page header bytes",
+			    addr);
+
+	/*
+	 * Any bytes after the data chunk should be nul bytes; ignore if the
+	 * size is 0, that allows easy checking of disk images where we don't
+	 * have the size.
+	 */
+	if (size != 0) {
+		p = (uint8_t *)dsk + dsk->mem_size;
+		end = (uint8_t *)dsk + size;
+		for (; p < end; ++p)
+			if (*p != '\0')
+				WT_RET_VRFY(session,
+				    "%s page at %s has non-zero trailing bytes",
+				    __wt_page_type_string(dsk->type), addr);
+	}
+
+	/* Verify the items on the page. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_INT:
+		return (__verify_dsk_col_int(session, addr, dsk));
+	case WT_PAGE_COL_FIX:
+		return (__verify_dsk_col_fix(session, addr, dsk));
+	case WT_PAGE_COL_VAR:
+		return (__verify_dsk_col_var(session, addr, dsk));
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		return (__verify_dsk_row(session, addr, dsk));
+	case WT_PAGE_BLOCK_MANAGER:
+	case WT_PAGE_OVFL:
+		return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
+	WT_ILLEGAL_VALUE(session);
+	}
+	/* NOTREACHED */
+}
+
+/*
+ * __wt_verify_dsk --
+ *	Verify a single Btree page as read from disk: a thin wrapper that
+ * passes the buffer's data and size on to __wt_verify_dsk_image.
+ */
+int
+__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
+{
+	const WT_PAGE_HEADER *image;
+
+	/* The buffer's data is the start of the disk image. */
+	image = buf->data;
+
+	return (__wt_verify_dsk_image(session, addr, image, buf->size));
+}
+
+/*
+ * __verify_dsk_row --
+ * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it.
+ *
+ * Each cell on the page is unpacked and checked for: a legal cell/page type
+ * combination, legal key/value adjacency, a valid address for any off-page
+ * item, a plausible prefix-compression count, and key sort order (using
+ * __wt_compare with the tree's collator). After the walk, the position of
+ * the last cell is checked against the page's memory size and the number of
+ * keys seen is checked against the page's entry count.
+ *
+ * addr is only used in error messages. Returns 0 on success and a non-zero
+ * error code on failure; the three scratch buffers allocated here are freed
+ * before returning on both the success and the error paths.
+ */
+static int
+__verify_dsk_row(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(current);
+ WT_DECL_ITEM(last_ovfl);
+ WT_DECL_ITEM(last_pfx);
+ WT_DECL_RET;
+ WT_ITEM *last;
+ enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
+ void *huffman;
+ uint32_t cell_num, cell_type, i, key_cnt, prefix;
+ uint8_t *end;
+ int cmp;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+ huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &current));
+ WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
+ WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
+ last = last_ovfl;
+
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ last_cell_type = FIRST;
+ cell_num = 0;
+ key_cnt = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+
+ /*
+ * Carefully unpack the cell: the "safe" unpack is given the end of
+ * the page image (dsk + mem_size) and a non-zero return is reported
+ * as a corrupted cell.
+ */
+ if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
+ ret = __err_cell_corrupted(session, cell_num, addr);
+ goto err;
+ }
+
+ /* Check the raw and collapsed cell types. */
+ WT_ERR(__err_cell_type(
+ session, cell_num, addr, unpack->raw, dsk->type));
+ WT_ERR(__err_cell_type(
+ session, cell_num, addr, unpack->type, dsk->type));
+ cell_type = unpack->type;
+
+ /*
+ * Check ordering relationships between the WT_CELL entries.
+ * For row-store internal pages, check for:
+ * two values in a row,
+ * two keys in a row,
+ * a value as the first cell on a page.
+ * For row-store leaf pages, check for:
+ * two values in a row,
+ * a value as the first cell on a page.
+ *
+ * WT_ERR_VRFY jumps to the err label, so the error cases in the
+ * inner switches never fall through into the following case.
+ */
+ switch (cell_type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ ++key_cnt;
+ switch (last_cell_type) {
+ case FIRST:
+ case WAS_VALUE:
+ break;
+ case WAS_KEY:
+ if (dsk->type == WT_PAGE_ROW_LEAF)
+ break;
+ WT_ERR_VRFY(session,
+ "cell %" PRIu32 " on page at %s is the "
+ "first of two adjacent keys",
+ cell_num - 1, addr);
+ }
+ last_cell_type = WAS_KEY;
+ break;
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ switch (last_cell_type) {
+ case FIRST:
+ WT_ERR_VRFY(session,
+ "page at %s begins with a value", addr);
+ case WAS_KEY:
+ break;
+ case WAS_VALUE:
+ WT_ERR_VRFY(session,
+ "cell %" PRIu32 " on page at %s is the "
+ "first of two adjacent values",
+ cell_num - 1, addr);
+ }
+ last_cell_type = WAS_VALUE;
+ break;
+ }
+
+ /*
+ * Check if any referenced item has a valid address. A failure is
+ * reported through the eof label at the end of the function.
+ */
+ switch (cell_type) {
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_VALUE_OVFL:
+ if (!bm->addr_valid(bm,
+ session, unpack->data, unpack->size))
+ goto eof;
+ break;
+ }
+
+ /*
+ * Remaining checks are for key order and prefix compression.
+ * If this cell isn't a key, we're done, move to the next cell.
+ * If this cell is an overflow item, instantiate the key and
+ * compare it with the last key. Otherwise, we have to deal
+ * with prefix compression.
+ */
+ switch (cell_type) {
+ case WT_CELL_KEY:
+ break;
+ case WT_CELL_KEY_OVFL:
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, dsk->type, unpack, current));
+ goto key_compare;
+ default:
+ /* Not a key -- continue with the next cell. */
+ continue;
+ }
+
+ /*
+ * Prefix compression checks.
+ *
+ * Confirm the first non-overflow key on a page has a zero
+ * prefix compression count.
+ */
+ prefix = unpack->prefix;
+ if (last_pfx->size == 0 && prefix != 0)
+ WT_ERR_VRFY(session,
+ "the %" PRIu32 " key on page at %s is the first "
+ "non-overflow key on the page and has a non-zero "
+ "prefix compression value",
+ cell_num, addr);
+
+ /* Confirm the prefix compression count is possible. */
+ if (cell_num > 1 && prefix > last->size)
+ WT_ERR_VRFY(session,
+ "key %" PRIu32 " on page at %s has a prefix "
+ "compression count of %" PRIu32 ", larger than "
+ "the length of the previous key, %" WT_SIZET_FMT,
+ cell_num, addr, prefix, last->size);
+
+ /*
+ * If Huffman decoding required, unpack the cell to build the
+ * key, then resolve the prefix. Else, we can do it faster
+ * internally because we don't have to shuffle memory around as
+ * much.
+ */
+ if (huffman != NULL) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, dsk->type, unpack, current));
+
+ /*
+ * If there's a prefix, make sure there's enough buffer
+ * space, then shift the decoded data past the prefix
+ * and copy the prefix into place. Take care with the
+ * pointers: current->data may be pointing inside the
+ * buffer.
+ */
+ if (prefix != 0) {
+ WT_ERR(__wt_buf_grow(
+ session, current, prefix + current->size));
+ memmove((uint8_t *)current->mem + prefix,
+ current->data, current->size);
+ memcpy(current->mem, last->data, prefix);
+ current->data = current->mem;
+ current->size += prefix;
+ }
+ } else {
+ /*
+ * Get the cell's data/length and make sure we have
+ * enough buffer space.
+ */
+ WT_ERR(__wt_buf_init(
+ session, current, prefix + unpack->size));
+
+ /* Copy the prefix then the data into place. */
+ if (prefix != 0)
+ memcpy(current->mem, last->data, prefix);
+ memcpy((uint8_t *)current->mem + prefix, unpack->data,
+ unpack->size);
+ current->size = prefix + unpack->size;
+ }
+
+key_compare: /*
+ * Compare the current key against the last key.
+ *
+ * Be careful about the 0th key on internal pages: we only store
+ * the first byte and custom collators may not be able to handle
+ * truncated keys.
+ *
+ * A comparison result of zero or greater (last key not less than
+ * the current key) is reported as a sort-order error.
+ */
+ if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
+ (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
+ WT_ERR(__wt_compare(
+ session, btree->collator, last, current, &cmp));
+ if (cmp >= 0)
+ WT_ERR_VRFY(session,
+ "the %" PRIu32 " and %" PRIu32 " keys on "
+ "page at %s are incorrectly sorted",
+ cell_num - 2, cell_num, addr);
+ }
+
+ /*
+ * Swap the buffers: last always references the last key entry,
+ * last_pfx and last_ovfl reference the last prefix-compressed
+ * and last overflow key entries. Current gets pointed to the
+ * buffer we're not using this time around, which is where the
+ * next key goes.
+ */
+ last = current;
+ if (cell_type == WT_CELL_KEY) {
+ current = last_pfx;
+ last_pfx = last;
+ } else {
+ current = last_ovfl;
+ last_ovfl = last;
+ }
+ WT_ASSERT(session, last != current);
+ }
+ WT_ERR(__verify_dsk_memsize(session, addr, dsk, cell));
+
+ /*
+ * On row-store internal pages, and on row-store leaf pages, where the
+ * "no empty values" flag is set, the key count should be equal to half
+ * the number of physical entries. On row-store leaf pages where the
+ * "all empty values" flag is set, the key count should be equal to the
+ * number of physical entries.
+ */
+ if (dsk->type == WT_PAGE_ROW_INT && key_cnt * 2 != dsk->u.entries)
+ WT_ERR_VRFY(session,
+ "%s page at %s has a key count of %" PRIu32 " and a "
+ "physical entry count of %" PRIu32,
+ __wt_page_type_string(dsk->type),
+ addr, key_cnt, dsk->u.entries);
+ if (dsk->type == WT_PAGE_ROW_LEAF &&
+ F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) &&
+ key_cnt != dsk->u.entries)
+ WT_ERR_VRFY(session,
+ "%s page at %s with the 'all empty values' flag set has a "
+ "key count of %" PRIu32 " and a physical entry count of %"
+ PRIu32,
+ __wt_page_type_string(dsk->type),
+ addr, key_cnt, dsk->u.entries);
+ if (dsk->type == WT_PAGE_ROW_LEAF &&
+ F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) &&
+ key_cnt * 2 != dsk->u.entries)
+ WT_ERR_VRFY(session,
+ "%s page at %s with the 'no empty values' flag set has a "
+ "key count of %" PRIu32 " and a physical entry count of %"
+ PRIu32,
+ __wt_page_type_string(dsk->type),
+ addr, key_cnt, dsk->u.entries);
+ /*
+ * The two "if (0)" blocks below are only entered through their labels;
+ * straight-line execution skips both and goes directly to the cleanup.
+ * The eof path takes its return value from __err_eof and also skips the
+ * err block. The err path forces a non-zero return value, because
+ * WT_ERR_VRFY jumps here without setting one.
+ */
+ if (0) {
+eof: ret = __err_eof(session, cell_num, addr);
+ }
+
+ if (0) {
+err: if (ret == 0)
+ ret = WT_ERROR;
+ }
+ __wt_scr_free(&current);
+ __wt_scr_free(&last_pfx);
+ __wt_scr_free(&last_ovfl);
+ return (ret);
+}
+
+/*
+ * __verify_dsk_col_int --
+ *	Verify a WT_PAGE_COL_INT disk page: every cell must unpack cleanly,
+ * have a cell type that is legal for the page type, and carry an address the
+ * block manager accepts; the cells must also end exactly at the page's
+ * memory size.
+ */
+static int
+__verify_dsk_col_int(
+    WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+	WT_BM *block_mgr;
+	WT_BTREE *btree;
+	WT_CELL *cell;
+	WT_CELL_UNPACK cell_unpack, *unpack;
+	uint32_t cell_num, i;
+	uint8_t *page_end;
+
+	btree = S2BT(session);
+	block_mgr = btree->bm;
+	unpack = &cell_unpack;
+	page_end = (uint8_t *)dsk + dsk->mem_size;
+
+	cell_num = 0;
+	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+		++cell_num;
+
+		/* Unpack with bounds checking against the end of the page. */
+		if (__wt_cell_unpack_safe(cell, unpack, page_end) != 0)
+			return (__err_cell_corrupted(session, cell_num, addr));
+
+		/* Both the raw and the collapsed cell type must be legal. */
+		WT_RET(__err_cell_type(
+		    session, cell_num, addr, unpack->raw, dsk->type));
+		WT_RET(__err_cell_type(
+		    session, cell_num, addr, unpack->type, dsk->type));
+
+		/* The referenced address must be valid for the block manager. */
+		if (!block_mgr->addr_valid(
+		    block_mgr, session, unpack->data, unpack->size))
+			return (__err_eof(session, cell_num, addr));
+	}
+
+	/* The final cell position must line up with the page's memory size. */
+	return (__verify_dsk_memsize(session, addr, dsk, cell));
+}
+
+/*
+ * __verify_dsk_col_fix --
+ *	Verify a WT_PAGE_COL_FIX disk page: compute the size of the page's bit
+ * string from the tree's bit count and the page's entry count, then verify
+ * the page as a simple chunk of that length.
+ */
+static int
+__verify_dsk_col_fix(
+    WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+	uint32_t chunk_len;
+
+	chunk_len = __bitstr_size(S2BT(session)->bitcnt * dsk->u.entries);
+
+	return (__verify_dsk_chunk(session, addr, dsk, chunk_len));
+}
+
+/*
+ * __verify_dsk_col_var --
+ * Walk a WT_PAGE_COL_VAR disk page and verify it.
+ *
+ * Each cell must unpack cleanly and have a cell type legal for the page
+ * type; overflow values must reference an address the block manager accepts;
+ * and two adjacent cells must not be identical (two deleted cells, or two
+ * plain values with the same bytes), since those should have been run-length
+ * encoded. addr is only used in error messages. Returns 0 on success and a
+ * non-zero error code otherwise.
+ */
+static int
+__verify_dsk_col_var(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ size_t last_size;
+ uint32_t cell_num, cell_type, i;
+ int last_deleted;
+ const uint8_t *last_data;
+ uint8_t *end;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ last_data = NULL;
+ last_size = 0;
+ last_deleted = 0;
+
+ cell_num = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+
+ /* Carefully unpack the cell. */
+ if (__wt_cell_unpack_safe(cell, unpack, end) != 0)
+ return (__err_cell_corrupted(session, cell_num, addr));
+
+ /* Check the raw and collapsed cell types. */
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->raw, dsk->type));
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->type, dsk->type));
+ cell_type = unpack->type;
+
+ /* Check if any referenced item is entirely in the file. */
+ if (cell_type == WT_CELL_VALUE_OVFL &&
+ !bm->addr_valid(bm, session, unpack->data, unpack->size))
+ return (__err_eof(session, cell_num, addr));
+
+ /*
+ * Compare the last two items and see if reconciliation missed
+ * a chance for RLE encoding. We don't have to care about data
+ * encoding or anything else, a byte comparison is enough.
+ *
+ * The match_err label is attached to the error statement that is
+ * the body of the else-branch test, so the "two deleted cells in
+ * a row" case jumps straight to the same error report.
+ */
+ if (last_deleted == 1) {
+ if (cell_type == WT_CELL_DEL)
+ goto match_err;
+ } else
+ if (cell_type == WT_CELL_VALUE &&
+ last_data != NULL &&
+ last_size == unpack->size &&
+ memcmp(last_data, unpack->data, last_size) == 0)
+match_err: WT_RET_VRFY(session,
+ "data entries %" PRIu32 " and %" PRIu32
+ " on page at %s are identical and should "
+ "have been run-length encoded",
+ cell_num - 1, cell_num, addr);
+ /*
+ * Remember this cell for the comparison against the next one. Only
+ * plain values keep a data pointer; deleted and overflow cells
+ * clear it so they are never byte-compared.
+ */
+ switch (cell_type) {
+ case WT_CELL_DEL:
+ last_deleted = 1;
+ last_data = NULL;
+ break;
+ case WT_CELL_VALUE_OVFL:
+ last_deleted = 0;
+ last_data = NULL;
+ break;
+ case WT_CELL_VALUE:
+ last_deleted = 0;
+ last_data = unpack->data;
+ last_size = unpack->size;
+ break;
+ }
+ }
+ WT_RET(__verify_dsk_memsize(session, addr, dsk, cell));
+
+ return (0);
+}
+
+/*
+ * __verify_dsk_memsize --
+ *	Confirm the cell position reached at the end of a page walk coincides
+ * with the end of the page as given by the header's memory size.
+ */
+static int
+__verify_dsk_memsize(WT_SESSION_IMPL *session,
+    const char *addr, const WT_PAGE_HEADER *dsk, WT_CELL *cell)
+{
+	size_t extra;
+
+	/*
+	 * Cells are expected to fill the page exactly. That property is what
+	 * catches a row-store leaf page whose final cell is a key with no
+	 * value cell after it, and the same test applies to every page type
+	 * that holds cells.
+	 */
+	extra = WT_PTRDIFF((uint8_t *)dsk + dsk->mem_size, cell);
+	if (extra != 0)
+		WT_RET_VRFY(session,
+		    "%s page at %s has %" WT_SIZET_FMT " unexpected bytes of data "
+		    "after the last cell",
+		    __wt_page_type_string(dsk->type), addr, extra);
+
+	return (0);
+}
+
+/*
+ * __verify_dsk_chunk --
+ *	Verify a page whose contents are a single chunk of data of the given
+ * length: the chunk must be non-empty, must fit inside the page, and must be
+ * followed only by nul bytes up to the page's memory size.
+ */
+static int
+__verify_dsk_chunk(WT_SESSION_IMPL *session,
+    const char *addr, const WT_PAGE_HEADER *dsk, uint32_t datalen)
+{
+	WT_BTREE *tree;
+	uint8_t *page_end, *payload, *tail;
+
+	tree = S2BT(session);
+	page_end = (uint8_t *)dsk + dsk->mem_size;
+
+	/*
+	 * Fixed-length column-store and overflow pages are plain chunks of
+	 * data; a chunk with no data at all is an error.
+	 */
+	if (datalen == 0)
+		WT_RET_VRFY(session,
+		    "%s page at %s has no data",
+		    __wt_page_type_string(dsk->type), addr);
+
+	/* The chunk must end at or before the end of the page. */
+	payload = WT_PAGE_HEADER_BYTE(tree, dsk);
+	if (payload + datalen > page_end)
+		WT_RET_VRFY(session,
+		    "data on page at %s extends past the end of the page",
+		    addr);
+
+	/* Everything between the chunk and the end of the page must be nul. */
+	tail = payload + datalen;
+	while (tail < page_end) {
+		if (*tail != '\0')
+			WT_RET_VRFY(session,
+			    "%s page at %s has non-zero trailing bytes",
+			    __wt_page_type_string(dsk->type), addr);
+		++tail;
+	}
+
+	return (0);
+}
+
+/*
+ * __err_cell_corrupted --
+ * Generic corrupted cell, we couldn't read it.
+ *
+ * Reports the failure for cell entry_num on the page named by addr through
+ * WT_RET_VRFY, which returns WT_ERROR from this function (and prints nothing
+ * if the session has WT_SESSION_SALVAGE_CORRUPT_OK set).
+ */
+static int
+__err_cell_corrupted(
+ WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+{
+ WT_RET_VRFY(session,
+ "item %" PRIu32 " on page at %s is a corrupted cell",
+ entry_num, addr);
+}
+
+/*
+ * __err_cell_type --
+ *	Check that a cell type may appear on a page of the given type. Returns
+ * 0 when the combination is legal; otherwise reports the generic "illegal
+ * cell and page type combination" error and returns WT_ERROR.
+ */
+static int
+__err_cell_type(WT_SESSION_IMPL *session,
+    uint32_t entry_num, const char *addr, uint8_t cell_type, uint8_t dsk_type)
+{
+	int legal;
+
+	legal = 0;
+	switch (cell_type) {
+	case WT_CELL_ADDR_DEL:
+	case WT_CELL_ADDR_INT:
+	case WT_CELL_ADDR_LEAF:
+	case WT_CELL_ADDR_LEAF_NO:
+		/* Address cells belong on internal pages only. */
+		legal = dsk_type == WT_PAGE_COL_INT ||
+		    dsk_type == WT_PAGE_ROW_INT;
+		break;
+	case WT_CELL_DEL:
+		/* Deleted cells belong on variable-length column pages only. */
+		legal = dsk_type == WT_PAGE_COL_VAR;
+		break;
+	case WT_CELL_KEY:
+	case WT_CELL_KEY_OVFL:
+	case WT_CELL_KEY_SHORT:
+		/* Plain, overflow and short keys: any row-store page. */
+		legal = dsk_type == WT_PAGE_ROW_INT ||
+		    dsk_type == WT_PAGE_ROW_LEAF;
+		break;
+	case WT_CELL_KEY_PFX:
+	case WT_CELL_KEY_SHORT_PFX:
+		/* Prefix-compressed keys: row-store leaf pages only. */
+		legal = dsk_type == WT_PAGE_ROW_LEAF;
+		break;
+	case WT_CELL_VALUE:
+	case WT_CELL_VALUE_COPY:
+	case WT_CELL_VALUE_OVFL:
+	case WT_CELL_VALUE_SHORT:
+		/* Values: variable-length column or row-store leaf pages. */
+		legal = dsk_type == WT_PAGE_COL_VAR ||
+		    dsk_type == WT_PAGE_ROW_LEAF;
+		break;
+	case WT_CELL_KEY_OVFL_RM:
+	case WT_CELL_VALUE_OVFL_RM:
+		/*
+		 * Removed overflow cells are in-memory only, it's an error to
+		 * ever see one on a disk page.
+		 */
+	default:
+		break;
+	}
+	if (legal)
+		return (0);
+
+	WT_RET_VRFY(session,
+	    "illegal cell and page type combination: cell %" PRIu32
+	    " on page at %s is a %s cell on a %s page",
+	    entry_num, addr,
+	    __wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type));
+}
+
+/*
+ * __err_eof --
+ * Generic item references non-existent file pages error.
+ *
+ * Reports the failure for off-page item entry_num on the page named by addr
+ * through WT_RET_VRFY, which returns WT_ERROR from this function (and prints
+ * nothing if the session has WT_SESSION_SALVAGE_CORRUPT_OK set).
+ */
+static int
+__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+{
+ WT_RET_VRFY(session,
+ "off-page item %" PRIu32
+ " on page at %s references non-existent file pages",
+ entry_num, addr);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
new file mode 100644
index 00000000000..ef35d215ec0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -0,0 +1,285 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_tree_walk --
+ * Move to the next/previous page in the tree.
+ *
+ * refp is both input and output. On entry *refp is the page reference the
+ * caller currently holds, or NULL to begin a walk from the root. *refp is
+ * cleared immediately and, on return, is set to the next page reference in
+ * the walk; it is left NULL when the walk has reached its end, when the tree
+ * has no root page, or when an error is returned.
+ *
+ * flags are WT_READ_XXX values: WT_READ_PREV walks backwards, and
+ * WT_READ_SKIP_INTL / WT_READ_SKIP_LEAF suppress the return of internal or
+ * leaf pages. WT_READ_CACHE, WT_READ_NO_WAIT, WT_READ_TRUNCATE and
+ * WT_READ_COMPACT select which child pages are skipped without being
+ * swapped in. The flags are also passed through to the page swap and
+ * release calls.
+ *
+ * Returns 0 on success or a non-zero error code.
+ */
+int
+__wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *couple, *ref;
+ WT_TXN_STATE *txn_state;
+ int descending, prev, skip;
+ uint32_t slot;
+
+ btree = S2BT(session);
+ descending = 0;
+
+ /*
+ * Tree walks are special: they look inside page structures that splits
+ * may want to free. Publish that the tree is active during this
+ * window.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+
+ /*
+ * !!!
+ * Fast-truncate currently only works on row-store trees.
+ */
+ if (btree->type != BTREE_ROW)
+ LF_CLR(WT_READ_TRUNCATE);
+
+ prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;
+
+ /*
+ * Pin a transaction ID, required to safely look at page index
+ * structures, if our caller has not already done so.
+ *
+ * txn_state is left non-NULL only when the ID was pinned here; that is
+ * the signal to reset snap_min before returning.
+ */
+ txn_state = WT_SESSION_TXN_STATE(session);
+ if (txn_state->snap_min == WT_TXN_NONE)
+ txn_state->snap_min = S2C(session)->txn_global.last_running;
+ else
+ txn_state = NULL;
+
+ /*
+ * There are multiple reasons and approaches to walking the in-memory
+ * tree:
+ *
+ * (1) finding pages to evict (the eviction server);
+ * (2) writing just dirty leaves or internal nodes (checkpoint);
+ * (3) discarding pages (close);
+ * (4) truncating pages in a range (fast truncate);
+ * (5) skipping pages based on outside information (compaction);
+ * (6) cursor scans (applications).
+ *
+ * Except for cursor scans and compaction, the walk is limited to the
+ * cache, no pages are read. In all cases, hazard pointers protect the
+ * walked pages from eviction.
+ *
+ * Walks use hazard-pointer coupling through the tree and that's OK
+ * (hazard pointers can't deadlock, so there's none of the usual
+ * problems found when logically locking up a btree). If the eviction
+ * thread tries to evict the active page, it fails because of our
+ * hazard pointer. If eviction tries to evict our parent, that fails
+ * because the parent has a child page that can't be discarded. We do
+ * play one game: don't couple up to our parent and then back down to a
+ * new leaf, couple to the next page to which we're descending, it
+ * saves a hazard-pointer swap for each cursor page movement.
+ *
+ * !!!
+ * NOTE: we depend on the fact it's OK to release a page we don't hold,
+ * that is, it's OK to release couple when couple is set to NULL.
+ *
+ * Take a copy of any held page and clear the return value. Remember
+ * the hazard pointer we're currently holding.
+ *
+ * We may be passed a pointer to btree->evict_page that we are clearing
+ * here. We check when discarding pages that we're not discarding that
+ * page, so this clear must be done before the page is released.
+ */
+ couple = ref = *refp;
+ *refp = NULL;
+
+ /* If no page is active, begin a walk from the start of the tree. */
+ if (ref == NULL) {
+ ref = &btree->root;
+ if (ref->page == NULL) {
+ if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+ goto done;
+ }
+ goto descend;
+ }
+
+ascend: /*
+ * If the active page was the root, we've reached the walk's end.
+ * Release any hazard-pointer we're holding.
+ */
+ if (__wt_ref_is_root(ref)) {
+ WT_ERR(__wt_page_release(session, couple, flags));
+ goto done;
+ }
+
+ /* Figure out the current slot in the WT_REF array. */
+ __wt_page_refp(session, ref, &pindex, &slot);
+
+ if (0) {
+restart: /*
+ * The page we're moving to might have split, in which case find
+ * the last position we held.
+ *
+ * If we were starting a tree walk, begin again.
+ *
+ * If we were in the process of descending, repeat the descent.
+ * If we were moving within a single level of the tree, repeat
+ * the last move.
+ *
+ * This block is only entered through the restart label; the
+ * position is recovered from couple, the last reference this
+ * function successfully coupled to.
+ */
+ ref = couple;
+ if (ref == &btree->root) {
+ ref = &btree->root;
+ if (ref->page == NULL) {
+ if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+ goto done;
+ }
+ goto descend;
+ }
+ __wt_page_refp(session, ref, &pindex, &slot);
+ if (descending)
+ goto descend;
+ }
+
+ for (;;) {
+ /*
+ * If we're at the last/first slot on the page, return this page
+ * in post-order traversal. Otherwise we move to the next/prev
+ * slot and left/right-most element in its subtree.
+ */
+ if ((prev && slot == 0) ||
+ (!prev && slot == pindex->entries - 1)) {
+ ref = ref->home->pg_intl_parent_ref;
+
+ /* Optionally skip internal pages. */
+ if (LF_ISSET(WT_READ_SKIP_INTL))
+ goto ascend;
+
+ /*
+ * We've ascended the tree and are returning an internal
+ * page. If it's the root, discard our hazard pointer,
+ * otherwise, swap our hazard pointer for the page we'll
+ * return.
+ */
+ if (__wt_ref_is_root(ref))
+ WT_ERR(__wt_page_release(
+ session, couple, flags));
+ else {
+ /*
+ * Locate the reference to our parent page then
+ * swap our child hazard pointer for the parent.
+ * We don't handle a restart return because it
+ * would require additional complexity in the
+ * restart code (ascent code somewhat like the
+ * descent code already there), and it's not a
+ * possible return: we're moving to the parent
+ * of the current child, not another child of
+ * the same parent, there's no way our parent
+ * split.
+ */
+ __wt_page_refp(session, ref, &pindex, &slot);
+ if ((ret = __wt_page_swap(
+ session, couple, ref, flags)) != 0) {
+ WT_TRET(__wt_page_release(
+ session, couple, flags));
+ WT_ERR(ret);
+ }
+ }
+
+ *refp = ref;
+ goto done;
+ }
+
+ if (prev)
+ --slot;
+ else
+ ++slot;
+
+ for (descending = 0;;) {
+ ref = pindex->index[slot];
+
+ if (LF_ISSET(WT_READ_CACHE)) {
+ /*
+ * Only look at unlocked pages in memory:
+ * fast-path some common cases.
+ */
+ if (LF_ISSET(WT_READ_NO_WAIT) &&
+ ref->state != WT_REF_MEM)
+ break;
+ } else if (LF_ISSET(WT_READ_TRUNCATE)) {
+ /*
+ * If deleting a range, try to delete the page
+ * without instantiating it.
+ */
+ WT_ERR(__wt_delete_page(session, ref, &skip));
+ if (skip)
+ break;
+ } else if (LF_ISSET(WT_READ_COMPACT)) {
+ /*
+ * Skip deleted pages, rewriting them doesn't
+ * seem useful.
+ */
+ if (ref->state == WT_REF_DELETED)
+ break;
+
+ /*
+ * If the page is in-memory, we want to look at
+ * it (it may have been modified and written,
+ * and the current location is the interesting
+ * one in terms of compaction, not the original
+ * location). If the page isn't in-memory, test
+ * if the page will help with compaction, don't
+ * read it if we don't have to.
+ */
+ if (ref->state == WT_REF_DISK) {
+ WT_ERR(__wt_compact_page_skip(
+ session, ref, &skip));
+ if (skip)
+ break;
+ }
+ } else {
+ /*
+ * If iterating a cursor, try to skip deleted
+ * pages that are visible to us.
+ */
+ if (ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, ref))
+ break;
+ }
+
+ ret = __wt_page_swap(session, couple, ref, flags);
+ if (ret == WT_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ if (ret == WT_RESTART)
+ goto restart;
+ WT_ERR(ret);
+
+ /*
+ * Entering a new page: configure for traversal of any
+ * internal page's children, else return (or optionally
+ * skip), the leaf page.
+ *
+ * The descend label is also the entry point used when a
+ * walk starts from the root and when a restart repeats
+ * a descent.
+ */
+descend: couple = ref;
+ page = ref->page;
+ if (page->type == WT_PAGE_ROW_INT ||
+ page->type == WT_PAGE_COL_INT) {
+ pindex = WT_INTL_INDEX_COPY(page);
+ slot = prev ? pindex->entries - 1 : 0;
+ descending = 1;
+ } else if (LF_ISSET(WT_READ_SKIP_LEAF))
+ goto ascend;
+ else {
+ *refp = ref;
+ goto done;
+ }
+ }
+ }
+
+done:
+err: if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+
+ WT_LEAVE_PAGE_INDEX(session);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c
new file mode 100644
index 00000000000..3a4a2a2987d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/col_modify.c
@@ -0,0 +1,223 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __col_insert_alloc(
+ WT_SESSION_IMPL *, uint64_t, u_int, WT_INSERT **, size_t *);
+
+/*
+ * __wt_col_modify --
+ * Column-store delete, insert, and update.
+ *
+ * cbt is the cursor positioned by a previous search; its ref, compare, ins,
+ * slot and skiplist stacks are read here. recno is the record to modify, and
+ * a recno of 0 means append a new record. value is the new value; it is
+ * ignored and replaced when is_remove is set. upd, if non-NULL, is an
+ * existing WT_UPDATE to install instead of allocating a new one.
+ *
+ * On success the cursor's ins_head/ins (and modify_update, when a WT_UPDATE
+ * was allocated here) reference the new structures. Returns 0 on success or
+ * a non-zero error code.
+ */
+int
+__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head, **ins_headp;
+ WT_ITEM _value;
+ WT_PAGE *page;
+ WT_UPDATE *old_upd;
+ size_t ins_size, upd_size;
+ u_int i, skipdepth;
+ int append, logged;
+
+ btree = cbt->btree;
+ ins = NULL;
+ page = cbt->ref->page;
+ append = logged = 0;
+
+ /*
+ * This code expects a remove to have a NULL value.
+ *
+ * The exception is fixed-length column-store, where a remove is
+ * converted into storing a single nul byte.
+ */
+ if (is_remove) {
+ if (btree->type == BTREE_COL_FIX) {
+ value = &_value;
+ value->data = "";
+ value->size = 1;
+ } else
+ value = NULL;
+ } else {
+ /*
+ * There's some chance the application specified a record past
+ * the last record on the page. If that's the case, and we're
+ * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
+ * append list, not the update list. In addition, a recno of
+ * 0 implies an append operation, we're allocating a new row.
+ */
+ if (recno == 0 ||
+ recno > (btree->type == BTREE_COL_VAR ?
+ __col_var_last_recno(page) : __col_fix_last_recno(page)))
+ append = 1;
+ }
+
+ /* If we don't yet have a modify structure, we'll need one. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ /*
+ * Delete, insert or update a column-store entry.
+ *
+ * If modifying a previously modified record, create a new WT_UPDATE
+ * entry and have a serialized function link it into an existing
+ * WT_INSERT entry's WT_UPDATE list.
+ *
+ * Else, allocate an insert array as necessary, build a WT_INSERT and
+ * WT_UPDATE structure pair, and call a serialized function to insert
+ * the WT_INSERT structure.
+ */
+ if (cbt->compare == 0 && cbt->ins != NULL) {
+ /*
+ * If we are restoring updates that couldn't be evicted, the
+ * key must not exist on the new page.
+ */
+ WT_ASSERT(session, upd == NULL);
+
+ /* Make sure the update can proceed. */
+ WT_ERR(__wt_txn_update_check(
+ session, old_upd = cbt->ins->upd));
+
+ /* Allocate a WT_UPDATE structure and transaction ID. */
+ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid a data copy in WT_CURSOR.update. */
+ cbt->modify_update = upd;
+
+ /*
+ * Point the new WT_UPDATE item to the next element in the list.
+ * If we get it right, the serialization function lock acts as
+ * our memory barrier to flush this write.
+ */
+ upd->next = old_upd;
+
+ /* Serialize the update. */
+ WT_ERR(__wt_update_serial(
+ session, page, &cbt->ins->upd, &upd, upd_size));
+ } else {
+ /*
+ * Allocate the append/update list reference as necessary.
+ *
+ * Appends and fixed-length pages use a single-entry array; other
+ * pages use one entry per pg_var_entries slot, indexed by the
+ * cursor's slot.
+ */
+ if (append) {
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, page->modify->mod_append, ins_headp, 1);
+ ins_headp = &page->modify->mod_append[0];
+ } else if (page->type == WT_PAGE_COL_FIX) {
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, page->modify->mod_update, ins_headp, 1);
+ ins_headp = &page->modify->mod_update[0];
+ } else {
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, page->modify->mod_update, ins_headp,
+ page->pg_var_entries);
+ ins_headp = &page->modify->mod_update[cbt->slot];
+ }
+
+ /* Allocate the WT_INSERT_HEAD structure as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
+ ins_head = *ins_headp;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
+ * update the cursor to reference it (the WT_INSERT_HEAD might
+ * be allocated, the WT_INSERT was allocated).
+ *
+ * When the caller supplied a WT_UPDATE it is used as-is: nothing
+ * is allocated, __wt_txn_modify is not called for it, and logged
+ * stays 0.
+ */
+ WT_ERR(__col_insert_alloc(
+ session, recno, skipdepth, &ins, &ins_size));
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+
+ if (upd == NULL) {
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid a data copy in WT_CURSOR.update. */
+ cbt->modify_update = upd;
+ } else
+ upd_size = sizeof(WT_UPDATE) + upd->size;
+ ins->upd = upd;
+ ins_size += upd_size;
+
+ /*
+ * If there was no insert list during the search, or there was
+ * no search because the record number has not been allocated
+ * yet, the cursor's information cannot be correct, search
+ * couldn't have initialized it.
+ *
+ * Otherwise, point the new WT_INSERT item's skiplist to the
+ * next elements in the insert list (which we will check are
+ * still valid inside the serialization function).
+ *
+ * The serial mutex acts as our memory barrier to flush these
+ * writes before inserting them into the list.
+ */
+ if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
+ for (i = 0; i < skipdepth; i++) {
+ cbt->ins_stack[i] = &ins_head->head[i];
+ ins->next[i] = cbt->next_stack[i] = NULL;
+ }
+ else
+ for (i = 0; i < skipdepth; i++)
+ ins->next[i] = cbt->next_stack[i];
+
+ /* Append or insert the WT_INSERT structure. */
+ if (append)
+ WT_ERR(__wt_col_append_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, &cbt->recno, skipdepth));
+ else
+ WT_ERR(__wt_insert_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, skipdepth));
+ }
+
+ /* If the update was successful, add it to the in-memory log. */
+ if (logged)
+ WT_ERR(__wt_txn_log_op(session, cbt));
+
+ if (0) {
+err: /*
+ * Remove the update from the current transaction, so we don't
+ * try to modify it on rollback.
+ *
+ * NOTE(review): upd is freed here whether it was allocated in
+ * this function or passed in by the caller; confirm callers that
+ * pass a non-NULL upd do not free it again after an error.
+ */
+ if (logged)
+ __wt_txn_unmodify(session);
+ __wt_free(session, ins);
+ __wt_free(session, upd);
+ }
+
+ return (ret);
+}
+
+/*
+ * __col_insert_alloc --
+ *	Column-store insert: allocate a zeroed WT_INSERT structure large enough
+ * for the requested skiplist depth, store the record number in it, and
+ * return both the structure and its allocated size to the caller.
+ */
+static int
+__col_insert_alloc(WT_SESSION_IMPL *session,
+    uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+{
+	WT_INSERT *new_ins;
+	size_t bytes;
+
+	/* The structure is followed by one skiplist pointer per level. */
+	bytes = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
+	WT_RET(__wt_calloc(session, 1, bytes, &new_ins));
+
+	/* Record the key, then hand the results back. */
+	WT_INSERT_RECNO(new_ins) = recno;
+
+	*ins_sizep = bytes;
+	*insp = new_ins;
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
new file mode 100644
index 00000000000..e4083e2282f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -0,0 +1,199 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_col_search --
+ * Search a column-store tree for a specific record-based key.
+ *
+ * recno is the record number to find. If leaf is non-NULL, only that
+ * leaf page is searched and the descent through the internal pages is
+ * skipped. Results are returned in the cursor: cbt->ref (the leaf's
+ * reference), cbt->recno, cbt->slot, cbt->compare and the cursor's
+ * insert-list fields.
+ *
+ * cbt->compare is 0 when the search lands on the page; when the record
+ * is past the end of the page it is 0, 1 or -1 depending on whether the
+ * searched-for record is equal to, smaller than or larger than the
+ * append-list entry found (and -1 when no entry is found).
+ *
+ * Returns 0 on success, or the error returned by __wt_page_swap if the
+ * descent into a child page fails.
+ */
+int
+__wt_col_search(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_COL *cip;
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+ uint32_t base, indx, limit;
+ int depth;
+
+ btree = S2BT(session);
+
+ __cursor_pos_clear(cbt);
+
+ /*
+ * In the service of eviction splits, we're only searching a single leaf
+ * page, not a full tree.
+ */
+ if (leaf != NULL) {
+ current = leaf;
+ goto leaf_only;
+ }
+
+ /*
+ * Search the internal pages of the tree. The depth counter starts at
+ * 2 and is incremented once for each level we successfully descend;
+ * restarting the search of a page does not change it.
+ */
+ current = &btree->root;
+ for (depth = 2;; ++depth) {
+restart: page = current->page;
+ /* Anything other than an internal page ends the descent. */
+ if (page->type != WT_PAGE_COL_INT)
+ break;
+
+ WT_ASSERT(session, current->key.recno == page->pg_intl_recno);
+
+ /* Start by looking at the last slot in the page's index. */
+ pindex = WT_INTL_INDEX_COPY(page);
+ base = pindex->entries;
+ descent = pindex->index[base - 1];
+
+ /* Fast path appends. */
+ if (recno >= descent->key.recno)
+ goto descend;
+
+ /*
+ * Binary search of internal pages. The last slot was handled
+ * by the fast path above, so limit starts at entries - 1; base
+ * is the first slot still under consideration. When the probe
+ * is too small, move base past it and drop the probed slot from
+ * limit before it is halved.
+ */
+ for (base = 0,
+ limit = pindex->entries - 1; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
+
+ if (recno == descent->key.recno)
+ break;
+ if (recno < descent->key.recno)
+ continue;
+ base = indx + 1;
+ --limit;
+ }
+descend: /*
+ * Reference the slot used for next step down the tree.
+ *
+ * Base is the smallest index greater than recno and may be the
+ * (last + 1) index. The slot for descent is the one before
+ * base.
+ */
+ if (recno != descent->key.recno) {
+ /*
+ * We don't have to correct for base == 0 because the
+ * only way for base to be 0 is if recno is the page's
+ * starting recno.
+ */
+ WT_ASSERT(session, base > 0);
+ descent = pindex->index[base - 1];
+ }
+
+ /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search in the current
+ * page; otherwise return on error, the swap call ensures we're
+ * holding nothing on failure.
+ */
+ switch (ret = __wt_page_swap(session, current, descent, 0)) {
+ case 0:
+ current = descent;
+ break;
+ case WT_RESTART:
+ goto restart;
+ default:
+ return (ret);
+ }
+ }
+
+ /* Track how deep the tree gets. */
+ if (depth > btree->maximum_depth)
+ btree->maximum_depth = depth;
+
+leaf_only:
+ /* Default the cursor to an exact match on the searched-for record. */
+ page = current->page;
+ cbt->ref = current;
+ cbt->recno = recno;
+ cbt->compare = 0;
+
+ /*
+ * Set the on-page slot to an impossible value larger than any possible
+ * slot (it's used to interpret the search function's return after the
+ * search returns an insert list for a page that has no entries).
+ */
+ cbt->slot = UINT32_MAX;
+
+ /*
+ * Search the leaf page. We do not check in the search path for a
+ * record greater than the maximum record in the tree; in that case,
+ * we arrive here with a record that's impossibly large for the page.
+ *
+ * In the past-the-end cases, the cursor's record number is reset from
+ * the page (first record past a fixed-length page, or the value from
+ * __col_var_last_recno for a variable-length page) before the append
+ * list is checked.
+ */
+ if (page->type == WT_PAGE_COL_FIX) {
+ if (recno >= page->pg_fix_recno + page->pg_fix_entries) {
+ cbt->recno = page->pg_fix_recno + page->pg_fix_entries;
+ goto past_end;
+ } else
+ ins_head = WT_COL_UPDATE_SINGLE(page);
+ } else
+ if ((cip = __col_var_search(page, recno)) == NULL) {
+ cbt->recno = __col_var_last_recno(page);
+ goto past_end;
+ } else {
+ cbt->slot = WT_COL_SLOT(page, cip);
+ ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ }
+
+ /*
+ * We have a match on the page, check for an update. Check the page's
+ * update list (fixed-length), or slot's update list (variable-length)
+ * for a better match. The only better match we can find is an exact
+ * match, otherwise the existing match on the page is the one we want.
+ * For that reason, don't set the cursor's WT_INSERT_HEAD/WT_INSERT pair
+ * until we know we have a useful entry.
+ */
+ if ((ins = __col_insert_search(
+ ins_head, cbt->ins_stack, cbt->next_stack, recno)) != NULL)
+ if (recno == WT_INSERT_RECNO(ins)) {
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+ }
+ return (0);
+
+past_end:
+ /*
+ * A record past the end of the page's standard information. Check the
+ * append list; by definition, any record on the append list is closer
+ * than the last record on the page, so it's a better choice for return.
+ * This is a rarely used path: we normally find exact matches, because
+ * column-store files are dense, but in this case the caller searched
+ * past the end of the table.
+ *
+ * Don't bother searching if the caller is appending a new record where
+ * we'll allocate the record number; we're not going to find a match by
+ * definition, and we figure out the position when we do the work.
+ */
+ cbt->ins_head = WT_COL_APPEND(page);
+ if (recno == UINT64_MAX)
+ cbt->ins = NULL;
+ else
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
+ /*
+ * With no append-list entry, the cursor stays on the page's record set
+ * above, which is smaller than the searched-for record. Otherwise,
+ * position the cursor on the entry found and compare against it.
+ */
+ if (cbt->ins == NULL)
+ cbt->compare = -1;
+ else {
+ cbt->recno = WT_INSERT_RECNO(cbt->ins);
+ if (recno == cbt->recno)
+ cbt->compare = 0;
+ else if (recno < cbt->recno)
+ cbt->compare = 1;
+ else
+ cbt->compare = -1;
+ }
+
+ /*
+ * Note if the record is past the maximum record in the tree, the cursor
+ * search functions need to know for fixed-length column-stores because
+ * appended records implicitly create any skipped records, and cursor
+ * search functions have to handle that case.
+ */
+ if (cbt->compare == -1)
+ F_SET(cbt, WT_CBT_MAX_RECORD);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/rec_evict.c b/src/third_party/wiredtiger/src/btree/rec_evict.c
new file mode 100644
index 00000000000..4696e78059e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_evict.c
@@ -0,0 +1,468 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __hazard_exclusive(WT_SESSION_IMPL *, WT_REF *, int);
+static void __rec_discard_tree(WT_SESSION_IMPL *, WT_REF *, int, int);
+static void __rec_excl_clear(WT_SESSION_IMPL *);
+static void __rec_page_clean_update(WT_SESSION_IMPL *, WT_REF *);
+static int __rec_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, int);
+static int __rec_review(WT_SESSION_IMPL *, WT_REF *, int, int, int *);
+
+/*
+ * __wt_rec_evict --
+ * Reconciliation plus eviction.
+ *
+ * ref references the page to evict. A non-zero exclusive means the
+ * caller already has the tree locked down, so pages are not individually
+ * locked here and are not unlocked on failure.
+ *
+ * Returns 0 on success; otherwise the error from the verbose message,
+ * from __rec_review (EBUSY when something blocks the eviction) or from
+ * __rec_page_dirty_update.
+ */
+int
+__wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_TXN_STATE *txn_state;
+ int istree;
+
+ page = ref->page;
+ istree = 0;
+
+ WT_RET(__wt_verbose(session, WT_VERB_EVICT,
+ "page %p (%s)", page, __wt_page_type_string(page->type)));
+
+ /*
+ * Pin the oldest transaction ID: eviction looks at page structures
+ * that are freed when no transaction in the system needs them.
+ *
+ * If snap_min is already set, leave it alone and clear txn_state so
+ * the value isn't reset on the way out of this function.
+ */
+ txn_state = WT_SESSION_TXN_STATE(session);
+ if (txn_state->snap_min == WT_TXN_NONE)
+ txn_state->snap_min = S2C(session)->txn_global.oldest_id;
+ else
+ txn_state = NULL;
+
+ /*
+ * Get exclusive access to the page and review the page and its subtree
+ * for conditions that would block our eviction of the page. If the
+ * check fails (for example, we find a child page that can't be merged),
+ * we're done. We have to make this check for clean pages, too: while
+ * unlikely eviction would choose an internal page with children, it's
+ * not disallowed anywhere.
+ */
+ WT_ERR(__rec_review(session, ref, exclusive, 1, &istree));
+
+ /*
+ * Update the page's modification reference, reconciliation might have
+ * changed it.
+ */
+ mod = page->modify;
+
+ /* Count evictions of internal pages during normal operation. */
+ if (!exclusive &&
+ (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_internal);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_internal);
+ }
+
+ /* Discard any subtree rooted in this page. */
+ if (istree)
+ __rec_discard_tree(session, ref, exclusive, 1);
+
+ /*
+ * Update the reference and discard the page. A page without a modify
+ * structure, or without any reconciliation result flag set, is handled
+ * as a clean eviction; anything else is handled as a dirty eviction.
+ * The root page is simply discarded in either case.
+ */
+ if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) {
+ WT_ASSERT(session, exclusive || ref->state == WT_REF_LOCKED);
+
+ if (__wt_ref_is_root(ref))
+ __wt_ref_out(session, ref);
+ else
+ __rec_page_clean_update(session, ref);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
+ } else {
+ if (__wt_ref_is_root(ref))
+ __wt_ref_out(session, ref);
+ else
+ WT_ERR(
+ __rec_page_dirty_update(session, ref, exclusive));
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_dirty);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_dirty);
+ }
+
+ /*
+ * The failure handling lives inside an if (0) block: the success path
+ * falls past it, and it is entered only by a jump to the err label
+ * (presumably from the WT_ERR calls above).
+ */
+ if (0) {
+err: /*
+ * If unable to evict this page, release exclusive reference(s)
+ * we've acquired.
+ */
+ if (!exclusive)
+ __rec_excl_clear(session);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_fail);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_fail);
+ }
+ /* Success or failure, the list of exclusively held pages is reset. */
+ session->excl_next = 0;
+
+ /* Release the transaction ID pin, but only if we set it above. */
+ if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+
+ return (ret);
+}
+
+/*
+ * __rec_page_clean_update --
+ *	Eviction of a clean page: discard the page, then publish the state the
+ *	reference is left in.
+ */
+static void
+__rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+	/* Throw away the in-memory page before republishing the reference. */
+	__wt_ref_out(session, ref);
+
+	/*
+	 * A reference without an address is a deleted page that was
+	 * re-instantiated (for example, by searching) and never written; a
+	 * reference with an address is backed by a disk page.
+	 */
+	if (ref->addr == NULL) {
+		WT_PUBLISH(ref->state, WT_REF_DELETED);
+	} else {
+		WT_PUBLISH(ref->state, WT_REF_DISK);
+	}
+}
+
+/*
+ * __rec_page_dirty_update --
+ * Update a dirty page's reference on eviction.
+ *
+ * The action depends on the reconciliation result recorded in the
+ * page's modify structure: an empty page becomes a deleted reference,
+ * a multi-block result is handed to __wt_split_evict, and a 1-for-1
+ * replacement installs the replacement address in the reference.
+ *
+ * Returns 0 on success, or the error from __wt_split_evict or from
+ * allocating the replacement address.
+ */
+static int
+__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ WT_ADDR *addr;
+ WT_PAGE *parent;
+ WT_PAGE_MODIFY *mod;
+
+ parent = ref->home;
+ mod = ref->page->modify;
+
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY: /* Page is empty */
+ /* Free the old address only if it isn't part of the parent. */
+ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+
+ /*
+ * Update the parent to reference a deleted page. The fact that
+ * reconciliation left the page "empty" means there's no older
+ * transaction in the system that might need to see an earlier
+ * version of the page. For that reason, we clear the address
+ * of the page, if we're forced to "read" into that namespace,
+ * we'll instantiate a new page instead of trying to read from
+ * the backing store.
+ *
+ * Publish: a barrier to ensure the structure fields are set
+ * before the state change makes the page available to readers.
+ */
+ __wt_ref_out(session, ref);
+ ref->addr = NULL;
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ break;
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ /* Split the page in memory. */
+ WT_RET(__wt_split_evict(session, ref, exclusive));
+ break;
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ /* Free the old address only if it isn't part of the parent. */
+ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+
+ /*
+ * Update the parent to reference the replacement page.
+ *
+ * Publish: a barrier to ensure the structure fields are set
+ * before the state change makes the page available to readers.
+ *
+ * The replacement address is copied out of the modify structure,
+ * which is then cleared so the new WT_ADDR is the only holder of
+ * the address buffer.
+ *
+ * NOTE(review): if this allocation fails, we return after the
+ * old off-page address was already released above; confirm the
+ * reference is left in a state callers can retry from.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ *addr = mod->mod_replace;
+ mod->mod_replace.addr = NULL;
+ mod->mod_replace.size = 0;
+
+ __wt_ref_out(session, ref);
+ ref->addr = addr;
+ WT_PUBLISH(ref->state, WT_REF_DISK);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_discard_tree --
+ * Discard the tree rooted at a page (that is, any pages merged into it),
+ * then the page itself.
+ *
+ * top is non-zero for the page the walk started from: that page is not
+ * discarded here, only the pages below it are. The recursive calls pass
+ * zero.
+ */
+static void
+__rec_discard_tree(
+ WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int top)
+{
+ WT_REF *child;
+
+ switch (ref->page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ /*
+ * For each entry in the page... Children in the on-disk or
+ * deleted states are skipped, every other child is discarded
+ * recursively.
+ */
+ WT_INTL_FOREACH_BEGIN(session, ref->page, child) {
+ if (child->state == WT_REF_DISK ||
+ child->state == WT_REF_DELETED)
+ continue;
+ WT_ASSERT(session,
+ exclusive || child->state == WT_REF_LOCKED);
+ __rec_discard_tree(session, child, exclusive, 0);
+ } WT_INTL_FOREACH_END;
+ /* FALLTHROUGH */
+ default:
+ /* Discard the page itself, unless it's the starting page. */
+ if (!top)
+ __wt_ref_out(session, ref);
+ break;
+ }
+}
+
+/*
+ * __rec_review --
+ * Get exclusive access to the page and review the page and its subtree
+ * for conditions that would block its eviction.
+ *
+ * exclusive is non-zero when the caller has the tree locked down, in
+ * which case pages are not locked individually. top is non-zero for the
+ * page chosen for eviction and zero for the pages reached by recursion.
+ * *istree is set to 1 if the page has an in-memory child; it is never
+ * cleared here, the caller initializes it.
+ *
+ * Returns 0 if the page can be evicted, EBUSY if something blocks the
+ * eviction, or an error from locking or writing a page.
+ */
+static int
+__rec_review(
+ WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int top, int *istree)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *child;
+ uint32_t flags;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ /*
+ * Get exclusive access to the page if our caller doesn't have the tree
+ * locked down.
+ */
+ if (!exclusive) {
+ WT_RET(__hazard_exclusive(session, ref, top));
+
+ /*
+ * Now the page is locked, remove it from the LRU eviction
+ * queue. We have to do this before freeing the page memory or
+ * otherwise touching the reference because eviction paths
+ * assume a non-NULL reference on the queue is pointing at
+ * valid memory.
+ */
+ __wt_evict_list_clear_page(session, ref);
+ }
+
+ /*
+ * Recurse through the page's subtree: this happens first because we
+ * have to write pages in depth-first order, otherwise we'll dirty
+ * pages after we've written them.
+ */
+ if (WT_PAGE_IS_INTERNAL(page))
+ WT_INTL_FOREACH_BEGIN(session, page, child) {
+ switch (child->state) {
+ case WT_REF_DISK: /* On-disk */
+ case WT_REF_DELETED: /* On-disk, deleted */
+ break;
+ case WT_REF_MEM: /* In-memory */
+ /*
+ * Tell our caller if there's a subtree so we
+ * know to do a full walk when discarding the
+ * page.
+ */
+ *istree = 1;
+ WT_RET(__rec_review(
+ session, child, exclusive, 0, istree));
+ break;
+ case WT_REF_LOCKED: /* Being evicted */
+ case WT_REF_READING: /* Being read */
+ case WT_REF_SPLIT: /* Being split */
+ return (EBUSY);
+ WT_ILLEGAL_VALUE(session);
+ }
+ } WT_INTL_FOREACH_END;
+
+ mod = page->modify;
+
+ /*
+ * If the tree was deepened, there's a requirement that newly created
+ * internal pages not be evicted until all threads are known to have
+ * exited the original page index array, because evicting an internal
+ * page discards its WT_REF array, and a thread traversing the original
+ * page index array might see a freed WT_REF. During the split we set
+ * a transaction value, once that's globally visible, we know we can
+ * evict the created page.
+ */
+ if (!exclusive && mod != NULL && WT_PAGE_IS_INTERNAL(page) &&
+ !__wt_txn_visible_all(session, mod->mod_split_txn))
+ return (EBUSY);
+
+ /*
+ * If the file is being checkpointed, we can't evict dirty pages:
+ * if we write a page and free the previous version of the page, that
+ * previous version might be referenced by an internal page that has
+ * already been written in the checkpoint, leaving the checkpoint
+ * inconsistent.
+ *
+ * Don't rely on new updates being skipped by the transaction used
+ * for transaction reads: (1) there are paths that dirty pages for
+ * artificial reasons; (2) internal pages aren't transactional; and
+ * (3) if an update was skipped during the checkpoint (leaving the page
+ * dirty), then rolled back, we could still successfully overwrite a
+ * page and corrupt the checkpoint.
+ *
+ * Further, we can't race with the checkpoint's reconciliation of
+ * an internal page as we evict a clean child from the page's subtree.
+ * This works in the usual way: eviction locks the page and then checks
+ * for existing hazard pointers, the checkpoint thread reconciling an
+ * internal page acquires hazard pointers on child pages it reads, and
+ * is blocked by the exclusive lock.
+ */
+ if (mod != NULL && btree->checkpointing &&
+ (__wt_page_is_modified(page) ||
+ F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
+ return (EBUSY);
+ }
+
+ /*
+ * Fail if any page in the top-level page's subtree won't be merged into
+ * its parent, the page that cannot be merged must be evicted first.
+ * The test is necessary but should not fire much: the eviction code is
+ * biased for leaf pages, an internal page shouldn't be selected for
+ * eviction until its children have been evicted.
+ *
+ * We have to write dirty pages to know their final state, a page marked
+ * empty may have had records added since reconciliation. Writing the
+ * page is expensive, do a cheap test first: if it doesn't seem likely a
+ * subtree page can be merged, quit.
+ */
+ if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY)))
+ return (EBUSY);
+
+ /*
+ * If the page is dirty and can possibly change state, write it so we
+ * know the final state.
+ *
+ * If we have an exclusive lock (we're discarding the tree), assert
+ * there are no updates we cannot read.
+ *
+ * Otherwise, if the top-level page we're evicting is a leaf page, set
+ * the update-restore flag, so reconciliation will write blocks it can
+ * write and create a list of skipped updates for blocks it cannot
+ * write. This is how forced eviction of huge pages works: we take a
+ * big page and reconcile it into blocks, some of which we write and
+ * discard, the rest of which we re-create as smaller in-memory pages,
+ * (restoring the updates that stopped us from writing the block), and
+ * inserting the whole mess into the page's parent.
+ *
+ * Don't set the update-restore flag for internal pages, they don't
+ * have updates that can be saved and restored.
+ *
+ * Don't set the update-restore flag for small pages. (If a small
+ * page were selected by eviction and then modified, and we configure it
+ * for update-restore, we'll end up splitting one or two pages into the
+ * parent, which is a waste of effort. If we don't set update-restore,
+ * eviction will return EBUSY, which makes more sense, the page was just
+ * modified.) In the test below, a page is big enough for
+ * update-restore when its memory footprint is more than 10 times the
+ * btree's maximum leaf page size.
+ *
+ * Don't set the update-restore flag for any page other than the
+ * top one; only the reconciled top page goes through the split path
+ * (and child pages are pages we expect to merge into the top page,
+ * they are not expected to split).
+ */
+ if (__wt_page_is_modified(page)) {
+ flags = WT_EVICTING;
+ if (exclusive)
+ LF_SET(WT_SKIP_UPDATE_ERR);
+ else if (top && !WT_PAGE_IS_INTERNAL(page) &&
+ page->memory_footprint > 10 * btree->maxleafpage)
+ LF_SET(WT_SKIP_UPDATE_RESTORE);
+ WT_RET(__wt_rec_write(session, ref, NULL, flags));
+ /* The page may only remain dirty after an update-restore write. */
+ WT_ASSERT(session,
+ !__wt_page_is_modified(page) ||
+ LF_ISSET(WT_SKIP_UPDATE_RESTORE));
+ } else {
+ /*
+ * If the page was ever modified, make sure all of the updates
+ * on the page are old enough they can be discarded from cache.
+ */
+ if (!exclusive && mod != NULL &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ return (EBUSY);
+ }
+
+ /*
+ * Repeat the test: fail if any page in the top-level page's subtree
+ * won't be merged into its parent.
+ */
+ if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY)))
+ return (EBUSY);
+
+ return (0);
+}
+
+/*
+ * __rec_excl_clear --
+ *	Give up the exclusive access tracked in the session's list, putting
+ *	each tracked reference back into the in-memory state.
+ */
+static void
+__rec_excl_clear(WT_SESSION_IMPL *session)
+{
+	WT_REF *locked;
+	uint32_t slot;
+
+	/* Walk the tracked references, stopping early at an empty slot. */
+	slot = 0;
+	while (slot < session->excl_next &&
+	    (locked = session->excl[slot]) != NULL) {
+		WT_ASSERT(session,
+		    locked->page != NULL && locked->state == WT_REF_LOCKED);
+		locked->state = WT_REF_MEM;
+		++slot;
+	}
+}
+
+/*
+ * __hazard_exclusive --
+ * Request exclusive access to a page.
+ *
+ * top is non-zero for the top-level page, which is expected to already
+ * be in the locked state; any other page is switched from the in-memory
+ * state to the locked state here.
+ *
+ * Returns 0 if we have exclusive access, EBUSY if the page could not be
+ * locked or a hazard pointer references it, or an error from growing the
+ * tracking array or writing the verbose message.
+ */
+static int
+__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
+{
+ /*
+ * Make sure there is space to track exclusive access so we can unlock
+ * to clean up.
+ */
+ WT_RET(__wt_realloc_def(session, &session->excl_allocated,
+ session->excl_next + 1, &session->excl));
+
+ /*
+ * Request exclusive access to the page. The top-level page should
+ * already be in the locked state, lock child pages in memory.
+ * If another thread already has this page, give up.
+ */
+ if (!top && !WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED))
+ return (EBUSY); /* We couldn't change the state. */
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+
+ /*
+ * Track the locked page before the hazard pointer check: if the check
+ * fails, the page is already in the list of pages to be unlocked.
+ */
+ session->excl[session->excl_next++] = ref;
+
+ /* Check for a matching hazard pointer. */
+ if (__wt_page_hazard_check(session, ref->page) == NULL)
+ return (0);
+
+ /* A hazard pointer references the page: count it and give up. */
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_hazard);
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_hazard);
+
+ WT_RET(__wt_verbose(session, WT_VERB_EVICT,
+ "page %p hazard request failed", ref->page));
+ return (EBUSY);
+}
diff --git a/src/third_party/wiredtiger/src/btree/rec_split.c b/src/third_party/wiredtiger/src/btree/rec_split.c
new file mode 100644
index 00000000000..babec2cc295
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_split.c
@@ -0,0 +1,1121 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Tuning; global variables to allow the binary to be patched, we don't yet have
+ * any real understanding of what might be useful to surface to applications.
+ */
+/* Multiplier of the maximum internal page size in the footprint test. */
+static u_int __split_deepen_max_internal_image = 100;
+/* Minimum entries needed to deepen, and minimum children created. */
+static u_int __split_deepen_min_child = 10;
+/* Divisor applied to the parent's entry count to pick the child count. */
+static u_int __split_deepen_per_child = 100;
+/* Multiplied by the per-child value to get the entry-count threshold. */
+static u_int __split_deepen_split_child = 100;
+
+/*
+ * Track allocation increments, matching the cache calculations, which add an
+ * estimate of allocation overhead to every object.
+ *
+ * WT_MEMSIZE_ADD adds len plus WT_ALLOC_OVERHEAD to the accumulator total.
+ * WT_MEMSIZE_TRANSFER performs that same addition on two accumulators, the
+ * amount one object is losing (from_decr) and the amount another is gaining
+ * (to_incr).
+ */
+#define WT_MEMSIZE_ADD(total, len) do { \
+ total += (len) + WT_ALLOC_OVERHEAD; \
+} while (0)
+#define WT_MEMSIZE_TRANSFER(from_decr, to_incr, len) do { \
+ WT_MEMSIZE_ADD(from_decr, len); \
+ WT_MEMSIZE_ADD(to_incr, len); \
+} while (0)
+
+/*
+ * __split_oldest_gen --
+ *	Return the smallest non-zero split generation found in any session, or
+ *	one more than the connection's split generation if there is none.
+ */
+static uint64_t
+__split_oldest_gen(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_SESSION_IMPL *s;
+	uint64_t gen, oldest;
+	u_int i, session_cnt;
+
+	conn = S2C(session);
+	WT_ORDERED_READ(session_cnt, conn->session_cnt);
+
+	/* Start past the connection's generation, then look for older. */
+	oldest = conn->split_gen + 1;
+	for (s = conn->sessions, i = 0; i < session_cnt; ++i, ++s) {
+		/* Read the session's generation once; zero is ignored. */
+		gen = s->split_gen;
+		if (gen != 0 && gen < oldest)
+			oldest = gen;
+	}
+
+	return (oldest);
+}
+
+/*
+ * __split_stash_add --
+ * Add a new entry into the session's split stash list.
+ *
+ * p is the memory being stashed (it must not be NULL) and len is its
+ * length, which is recorded with the entry and added to the statistics.
+ *
+ * Returns 0 on success, or the error from growing the stash list.
+ */
+static int
+__split_stash_add(WT_SESSION_IMPL *session, void *p, size_t len)
+{
+ WT_SPLIT_STASH *stash;
+
+ WT_ASSERT(session, p != NULL);
+
+ /* Grow the list as necessary. */
+ WT_RET(__wt_realloc_def(session, &session->split_stash_alloc,
+ session->split_stash_cnt + 1, &session->split_stash));
+
+ /*
+ * Take the next free slot. The connection's split generation is
+ * atomically incremented, and the entry is tagged with the value the
+ * atomic add returns.
+ */
+ stash = session->split_stash + session->split_stash_cnt++;
+ stash->split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
+ stash->p = p;
+ stash->len = len;
+
+ WT_STAT_FAST_CONN_ATOMIC_INCRV(session, rec_split_stashed_bytes, len);
+ WT_STAT_FAST_CONN_ATOMIC_INCR(session, rec_split_stashed_objects);
+
+ /* See if we can free any previous entries. */
+ if (session->split_stash_cnt > 1)
+ __wt_split_stash_discard(session);
+
+ return (0);
+}
+
+/*
+ * __wt_split_stash_discard --
+ * Discard any memory from a session's split stash that we can.
+ *
+ * Entries are examined from the start of the list and freed until one
+ * is found whose split generation is not older than the oldest active
+ * split generation.
+ */
+void
+__wt_split_stash_discard(WT_SESSION_IMPL *session)
+{
+ WT_SPLIT_STASH *stash;
+ uint64_t oldest;
+ size_t i;
+
+ /* Get the oldest split generation. */
+ oldest = __split_oldest_gen(session);
+
+ for (i = 0, stash = session->split_stash;
+ i < session->split_stash_cnt;
+ ++i, ++stash) {
+ /*
+ * Skip slots without memory (presumably entries freed by an
+ * earlier call and not yet shuffled out of the list), and stop
+ * at the first entry that might still be in use.
+ */
+ if (stash->p == NULL)
+ continue;
+ else if (stash->split_gen >= oldest)
+ break;
+ /*
+ * It's a bad thing if another thread is in this memory after
+ * we free it, make sure nothing good happens to that thread.
+ */
+ WT_STAT_FAST_CONN_ATOMIC_DECRV(
+ session, rec_split_stashed_bytes, stash->len);
+ WT_STAT_FAST_CONN_ATOMIC_DECR(
+ session, rec_split_stashed_objects);
+ __wt_overwrite_and_free_len(session, stash->p, stash->len);
+ }
+
+ /*
+ * If there are enough free slots at the beginning of the list, shuffle
+ * everything down.
+ *
+ * At this point i is the number of leading slots that were skipped or
+ * freed and stash references the first entry being kept. The list is
+ * compacted when more than 100 leading slots are free or when the
+ * whole list was consumed; in the latter case the count drops to zero
+ * and there is nothing to move.
+ */
+ if (i > 100 || i == session->split_stash_cnt)
+ if ((session->split_stash_cnt -= i) > 0)
+ memmove(session->split_stash, stash,
+ session->split_stash_cnt * sizeof(*stash));
+}
+
+/*
+ * __wt_split_stash_discard_all --
+ *	Release every buffer held in a session's split stash, then the stash
+ *	list itself.
+ */
+void
+__wt_split_stash_discard_all(
+    WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session)
+{
+	WT_SPLIT_STASH *entry;
+	size_t slot;
+
+	/*
+	 * Called from WT_CONNECTION.close to throw away whatever memory is
+	 * left, which is why there are two WT_SESSION_IMPL arguments: the
+	 * session being cleaned up is "session", while "session_safe" remains
+	 * linked to the WT_CONNECTION and is the handle that's safe to pass to
+	 * other WiredTiger functions.
+	 */
+	for (slot = 0; slot < session->split_stash_cnt; ++slot) {
+		entry = &session->split_stash[slot];
+		if (entry->p == NULL)
+			continue;
+		__wt_free(session_safe, entry->p);
+	}
+
+	/* Drop the list and reset its bookkeeping. */
+	__wt_free(session_safe, session->split_stash);
+	session->split_stash_alloc = 0;
+	session->split_stash_cnt = 0;
+}
+
+/*
+ * __split_safe_free --
+ *	Release a buffer immediately when no other thread can be using it;
+ *	otherwise add it to the session's split stash to be freed later.
+ */
+static int
+__split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s)
+{
+	/*
+	 * Something in a page was swapped out. The buffer can be freed right
+	 * away if the caller has exclusive access or, failing that, if the
+	 * oldest split generation shows no other thread in the tree.
+	 */
+	if (exclusive ||
+	    __split_oldest_gen(session) == S2C(session)->split_gen + 1) {
+		__wt_free(session, p);
+		return (0);
+	}
+
+	/* Otherwise, defer the free. */
+	return (__split_stash_add(session, p, s));
+}
+
+/*
+ * __split_should_deepen --
+ *	Return non-zero if the internal page should be split to deepen the
+ *	tree.
+ */
+static int
+__split_should_deepen(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_BTREE *btree;
+	WT_PAGE_INDEX *pindex;
+
+	/*
+	 * There are two reasons to split: the number of child pages the split
+	 * would create (an internal page with many entries is slow to search),
+	 * and the parent's memory footprint (an internal page shouldn't eat
+	 * the cache and put eviction pressure on the system).
+	 */
+	btree = S2BT(session);
+	pindex = WT_INTL_INDEX_COPY(page);
+
+	/*
+	 * Footprint test: the page is bigger than the maximum in-memory page
+	 * size. An absolute minimum number of entries is still required,
+	 * splitting doesn't help if the page holds a single huge key.
+	 */
+	if (pindex->entries >= __split_deepen_min_child &&
+	    page->memory_footprint > btree->maxmempage)
+		return (1);
+
+	/*
+	 * Entry-count test: the page's footprint is more than N times the
+	 * maximum internal page size in the backing file, and there are
+	 * enough entries to give at least N children in the newly created
+	 * intermediate layer.
+	 */
+	if (pindex->entries >=
+	    (__split_deepen_per_child * __split_deepen_split_child) &&
+	    page->memory_footprint >
+	    __split_deepen_max_internal_image * btree->maxintlpage)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * __split_ovfl_key_cleanup --
+ *	Clean up an on-page row-store overflow key when its reference is
+ *	discarded or moved by a split.
+ */
+static int
+__split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
+{
+	WT_CELL *key_cell;
+	WT_CELL_UNPACK unpack;
+	WT_IKEY *ikey;
+	uint32_t offset;
+
+	/*
+	 * The key of a reference that's being discarded (page split) or moved
+	 * to another page (page deepening) might be an on-page overflow key.
+	 * There's nothing to do unless the key is instantiated and still
+	 * references a cell in the disk image.
+	 */
+	ikey = __wt_ref_key_instantiated(ref);
+	if (ikey == NULL)
+		return (0);
+	offset = ikey->cell_offset;
+	if (offset == 0)
+		return (0);
+
+	/*
+	 * Forget the disk image before doing anything else: leaking blocks is
+	 * better than attempting this twice.
+	 */
+	ikey->cell_offset = 0;
+
+	/* Only overflow keys that haven't already been removed need work. */
+	key_cell = WT_PAGE_REF_OFFSET(page, offset);
+	__wt_cell_unpack(key_cell, &unpack);
+	if (!unpack.ovfl || unpack.raw == WT_CELL_KEY_OVFL_RM)
+		return (0);
+
+	/* Discard the key along with any backing blocks. */
+	return (__wt_ovfl_discard(session, key_cell));
+}
+
+/*
+ * __split_ref_instantiate --
+ * Instantiate key/address pairs in memory in service of a split.
+ *
+ * page is the page being split and ref is one of the WT_REF structures
+ * it references. *parent_decrp and *child_incrp are running totals, in
+ * bytes (including the per-allocation overhead estimate), of the memory
+ * the parent is losing and the child is gaining; this function only
+ * adds to them.
+ *
+ * Returns 0 on success, or an error from instantiating the key, from
+ * the overflow key cleanup or from allocating the address.
+ */
+static int
+__split_ref_instantiate(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
+{
+ WT_ADDR *addr;
+ WT_CELL_UNPACK unpack;
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ size_t size;
+ void *key;
+
+ /*
+ * Instantiate row-store keys, and column- and row-store addresses in
+ * the WT_REF structures referenced by a page that's being split (and
+ * deepening the tree). The WT_REF structures aren't moving, but the
+ * index references are moving from the page we're splitting to a set
+ * of child pages, and so we can no longer reference the block image
+ * that remains with the page being split.
+ *
+ * Track how much memory the parent is losing and the child gaining.
+ *
+ * No locking is required to update the WT_REF structure because we're
+ * the only thread splitting the parent page, and there's no way for
+ * readers to race with our updates of single pointers. The changes
+ * have to be written before the page goes away, of course, our caller
+ * owns that problem.
+ *
+ * Row-store keys, first.
+ *
+ * A key that was already instantiated is memory moving from the parent
+ * to the child, so both totals are updated. A key instantiated here is
+ * a new allocation, so only the child's total is updated.
+ */
+ if (page->type == WT_PAGE_ROW_INT) {
+ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
+ __wt_ref_key(page, ref, &key, &size);
+ WT_RET(__wt_row_ikey(session, 0, key, size, &ikey));
+ ref->key.ikey = ikey;
+ } else {
+ WT_RET(__split_ovfl_key_cleanup(session, page, ref));
+ WT_MEMSIZE_ADD(*parent_decrp,
+ sizeof(WT_IKEY) + ikey->size);
+ }
+ WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_IKEY) + ikey->size);
+ }
+
+ /*
+ * If there's no address (the page has never been written), or the
+ * address has been instantiated, there's no work to do. Otherwise,
+ * get the address from the on-page cell.
+ *
+ * An address that's already instantiated only needs its memory moved
+ * from the parent's total to the child's.
+ */
+ if ((addr = ref->addr) == NULL)
+ return (0);
+ if (__wt_off_page(page, addr))
+ WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp,
+ sizeof(WT_ADDR) + addr->size);
+ else {
+ __wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
+ WT_RET(__wt_calloc_def(session, 1, &addr));
+ /* Don't leak the new WT_ADDR if copying the address fails. */
+ if ((ret = __wt_strndup(
+ session, unpack.data, unpack.size, &addr->addr)) != 0) {
+ __wt_free(session, addr);
+ return (ret);
+ }
+ /* NOTE(review): assumes unpack.size fits in a uint8_t -- confirm. */
+ addr->size = (uint8_t)unpack.size;
+ addr->type =
+ unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
+ ref->addr = addr;
+ WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_ADDR) + addr->size);
+ }
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __split_verify_intl_key_order --
+ * Verify the key order on an internal page after a split, diagnostic only.
+ *
+ * Column-store pages are checked for strictly increasing record numbers.
+ * Row-store pages are walked with a pair of WT_ITEM structures holding
+ * the previous and current keys. Failures are reported through
+ * WT_ASSERT; nothing is returned.
+ */
+static void
+__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_ITEM *next, _next, *last, _last, *tmp;
+ WT_REF *ref;
+ uint64_t recno;
+ int cmp, first;
+
+ btree = S2BT(session);
+
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ /* Each record number must be larger than the one before it. */
+ recno = 0;
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->key.recno > recno);
+ recno = ref->key.recno;
+ } WT_INTL_FOREACH_END;
+ break;
+ case WT_PAGE_ROW_INT:
+ next = &_next;
+ WT_CLEAR(_next);
+ last = &_last;
+ WT_CLEAR(_last);
+
+ first = 1;
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __wt_ref_key(page, ref, &next->data, &next->size);
+ /*
+ * NOTE(review): the comparison below only runs when
+ * the previous key has a zero size (and it isn't the
+ * first entry); confirm that's the intended test and
+ * not a check for a non-empty previous key.
+ */
+ if (last->size == 0) {
+ if (first)
+ first = 0;
+ else {
+ WT_ASSERT(session, __wt_compare(
+ session, btree->collator, last,
+ next, &cmp) == 0);
+ WT_ASSERT(session, cmp < 0);
+ }
+ }
+ /* The current key becomes the previous key. */
+ tmp = last;
+ last = next;
+ next = tmp;
+ } WT_INTL_FOREACH_END;
+ break;
+ }
+}
+#endif
+
+/*
+ * __split_deepen --
+ * Split an internal page in-memory, deepening the tree.
+ *
+ * The parent's first and last child references stay at the current level; the
+ * references between them are distributed across newly allocated internal
+ * child pages, and a new page index referencing those children is swapped
+ * into the parent. The previous index is handed to __split_safe_free rather
+ * than being freed directly.
+ *
+ * Returns 0 on success. On failure before the new index is swapped in, the
+ * error is returned after releasing the allocated index; on failure after
+ * the swap, the result of __wt_panic is returned.
+ */
+static int
+__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
+{
+ WT_DECL_RET;
+ WT_PAGE *child;
+ WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
+ WT_REF **alloc_refp;
+ WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
+ size_t child_incr, parent_decr, parent_incr, size;
+ uint32_t children, chunk, i, j, remain, slots;
+ int panic;
+ void *p;
+
+ alloc_index = NULL;
+ parent_incr = parent_decr = 0;
+ panic = 0;
+
+ pindex = WT_INTL_INDEX_COPY(parent);
+
+ /*
+ * Create N children, unless we are dealing with a large page without
+ * many entries, in which case split into the minimum number of pages.
+ */
+ children = WT_MAX(pindex->entries / __split_deepen_per_child,
+ __split_deepen_min_child);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
+ parent, pindex->entries, children));
+
+ /*
+ * If the workload is prepending/appending to the tree, we could deepen
+ * without bound. Don't let that happen, keep the first/last pages of
+ * the tree at their current level.
+ *
+ * XXX
+ * To improve this, we could track which pages were last merged into
+ * this page by eviction, and leave those pages alone, to prevent any
+ * sustained insert into the tree from deepening a single location.
+ */
+#undef SPLIT_CORRECT_1
+#define SPLIT_CORRECT_1 1 /* First page correction */
+#undef SPLIT_CORRECT_2
+#define SPLIT_CORRECT_2 2 /* First/last page correction */
+
+ /*
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
+ * the first/last slots of the allocated WT_PAGE_INDEX to point to the
+ * first/last pages we're keeping at the current level, and the rest of
+ * the slots to point to new WT_REF objects.
+ */
+ size = sizeof(WT_PAGE_INDEX) +
+ (children + SPLIT_CORRECT_2) * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ WT_MEMSIZE_ADD(parent_incr, size);
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = children + SPLIT_CORRECT_2;
+ alloc_index->index[0] = pindex->index[0];
+ alloc_index->index[alloc_index->entries - 1] =
+ pindex->index[pindex->entries - 1];
+ for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
+ i = 0; i < children; ++alloc_refp, ++i) {
+ WT_ERR(__wt_calloc_def(session, 1, alloc_refp));
+ WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF));
+ }
+
+ /*
+ * Allocate child pages, and connect them into the new page index.
+ * Every child takes "chunk" of the parent's references, except the
+ * last child, which takes whatever remains once the first and last
+ * references have been excluded.
+ */
+ chunk = (pindex->entries - SPLIT_CORRECT_2) / children;
+ remain = (pindex->entries - SPLIT_CORRECT_2) - chunk * (children - 1);
+ for (parent_refp = pindex->index + SPLIT_CORRECT_1,
+ alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
+ i = 0; i < children; ++i) {
+ slots = i == children - 1 ? remain : chunk;
+ WT_ERR(__wt_page_alloc(
+ session, parent->type, 0, slots, 0, &child));
+
+ /*
+ * Initialize the parent page's child reference; we need a copy
+ * of the page's key.
+ */
+ ref = *alloc_refp++;
+ ref->home = parent;
+ ref->page = child;
+ ref->addr = NULL;
+ if (parent->type == WT_PAGE_ROW_INT) {
+ __wt_ref_key(parent, *parent_refp, &p, &size);
+ WT_ERR(
+ __wt_row_ikey(session, 0, p, size, &ref->key.ikey));
+ WT_MEMSIZE_ADD(parent_incr, sizeof(WT_IKEY) + size);
+ } else
+ ref->key.recno = (*parent_refp)->key.recno;
+ ref->state = WT_REF_MEM;
+
+ /* Initialize the child page. */
+ if (parent->type == WT_PAGE_COL_INT)
+ child->pg_intl_recno = (*parent_refp)->key.recno;
+ child->pg_intl_parent_ref = ref;
+
+ /* Mark it dirty. */
+ WT_ERR(__wt_page_modify_init(session, child));
+ __wt_page_only_modify_set(session, child);
+
+ /*
+ * Once the split goes live, the newly created internal pages
+ * might be evicted and their WT_REF structures freed. If those
+ * pages are evicted before threads exit the previous page index
+ * array, a thread might see a freed WT_REF. Set the eviction
+ * transaction requirement for the newly created internal pages.
+ */
+ child->modify->mod_split_txn = __wt_txn_new_id(session);
+
+ /*
+ * The newly allocated child's page index references the same
+ * structures as the parent. (We cannot move WT_REF structures,
+ * threads may be underneath us right now changing the structure
+ * state.) However, if the WT_REF structures reference on-page
+ * information, we have to fix that, because the disk image for
+ * the page that has a page index entry for the WT_REF is about
+ * to change.
+ */
+ child_incr = 0;
+ child_pindex = WT_INTL_INDEX_COPY(child);
+ for (child_refp = child_pindex->index, j = 0; j < slots; ++j) {
+ WT_ERR(__split_ref_instantiate(session,
+ parent, *parent_refp, &parent_decr, &child_incr));
+ *child_refp++ = *parent_refp++;
+
+ WT_MEMSIZE_TRANSFER(
+ parent_decr, child_incr, sizeof(WT_REF));
+ }
+ __wt_cache_page_inmem_incr(session, child, child_incr);
+ }
+ WT_ASSERT(session, alloc_refp -
+ alloc_index->index == alloc_index->entries - SPLIT_CORRECT_1);
+ WT_ASSERT(session,
+ parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1);
+
+ /*
+ * Update the parent's index; this is the update which splits the page,
+ * making the change visible to threads descending the tree. From now
+ * on, we're committed to the split. If any subsequent work fails, we
+ * have to panic because we potentially have threads of control using
+ * the new page index we just swapped in.
+ *
+ * A note on error handling: until this point, there's no problem with
+ * unwinding on error. We allocated a new page index, a new set of
+ * WT_REFs and a new set of child pages -- if an error occurred, the
+ * parent remained unchanged, although it may have an incorrect memory
+ * footprint. From now on we've modified the parent page, attention
+ * needs to be paid.
+ */
+ WT_INTL_INDEX_SET(parent, alloc_index);
+ panic = 1;
+
+#ifdef HAVE_DIAGNOSTIC
+ __split_verify_intl_key_order(session, parent);
+#endif
+
+ /*
+ * The moved reference structures now reference the wrong parent page,
+ * and we have to fix that up. The problem is revealed when a thread
+ * of control searches for a page's reference structure slot, and fails
+ * to find it because the page it's searching no longer references it.
+ * When that failure happens, the thread waits for the reference's home
+ * page to be updated, which we do here: walk the children and fix them
+ * up.
+ *
+ * We're not acquiring hazard pointers on these pages, they cannot be
+ * evicted because of the eviction transaction value set above.
+ */
+ for (parent_refp = alloc_index->index,
+ i = alloc_index->entries; i > 0; ++parent_refp, --i) {
+ parent_ref = *parent_refp;
+ WT_ASSERT(session, parent_ref->home == parent);
+ if (parent_ref->state != WT_REF_MEM)
+ continue;
+
+ /*
+ * We left the first/last children of the parent at the current
+ * level to avoid bad split patterns, they might be leaf pages;
+ * check the page type before we continue.
+ */
+ child = parent_ref->page;
+ if (!WT_PAGE_IS_INTERNAL(child))
+ continue;
+#ifdef HAVE_DIAGNOSTIC
+ __split_verify_intl_key_order(session, child);
+#endif
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ /*
+ * The page's parent reference may not be wrong, as we
+ * opened up access from the top of the tree already,
+ * pages may have been read in since then. Check and
+ * only update pages that reference the original page,
+ * they must be wrong.
+ */
+ if (child_ref->home == parent) {
+ child_ref->home = child;
+ child_ref->ref_hint = 0;
+ }
+ } WT_INTL_FOREACH_END;
+ }
+
+ /*
+ * Push out the changes: not required for correctness, but don't let
+ * threads spin on incorrect page references longer than necessary.
+ */
+ WT_FULL_BARRIER();
+ alloc_index = NULL; /* In use by the parent: the error path must not free it. */
+
+ /*
+ * We can't free the previous parent's index, there may be threads using
+ * it. Add to the session's discard list, to be freed once we know no
+ * threads can still be using it.
+ *
+ * This change requires care with error handling: we have already
+ * updated the page with a new index. Even if stashing the old value
+ * fails, we don't roll back that change, because threads may already
+ * be using the new index.
+ */
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_MEMSIZE_ADD(parent_decr, size);
+ WT_ERR(__split_safe_free(session, 0, pindex, size));
+
+ /*
+ * Adjust the parent's memory footprint. This may look odd, but we
+ * have already taken the allocation overhead into account, and an
+ * increment followed by a decrement will cancel out the normal
+ * adjustment.
+ */
+ __wt_cache_page_inmem_incr(session, parent, parent_incr);
+ __wt_cache_page_inmem_decr(session, parent, parent_decr);
+
+ if (0) {
+err: __wt_free_ref_index(session, parent, alloc_index, 1);
+
+ /*
+ * If panic is set, we saw an error after opening up the tree
+ * to descend through the parent page's new index. There is
+ * nothing we can do, the tree is inconsistent and there are
+ * threads potentially active in both versions of the tree.
+ *
+ * NOTE(review): the first and last slots of the allocated
+ * index were set to WT_REFs taken from the original index;
+ * confirm __wt_free_ref_index doesn't release those shared
+ * WT_REFs when an error occurs before the index is swapped
+ * in.
+ */
+ if (panic)
+ ret = __wt_panic(session);
+ }
+ return (ret);
+}
+
+/*
+ * __split_inmem_build --
+ * Instantiate a page in a multi-block set, when an update couldn't be
+ * written.
+ *
+ * Builds an in-memory page from the multi-block entry's saved disk image,
+ * linked into the passed-in WT_REF, then moves each skipped update chain off
+ * the original page (the original's pointer is set to NULL) and applies it
+ * to the new page through a local, stack-allocated btree cursor. Returns 0
+ * on success or the first error encountered; the scratch key buffer and the
+ * cursor are released on both paths once the loop has been entered.
+ */
+static int
+__split_inmem_build(
+ WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref, WT_MULTI *multi)
+{
+ WT_CURSOR_BTREE cbt;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_UPDATE *upd;
+ WT_UPD_SKIPPED *skip;
+ uint64_t recno;
+ uint32_t i, slot;
+
+ WT_CLEAR(cbt); /* Minimal cursor, used only by the search/modify calls. */
+ cbt.iface.session = &session->iface;
+ cbt.btree = S2BT(session);
+
+ /*
+ * We can find unresolved updates when attempting to evict a page, which
+ * can't be written. This code re-creates the in-memory page and applies
+ * the unresolved updates to that page.
+ *
+ * Clear the disk image and link the page into the passed-in WT_REF to
+ * simplify error handling: our caller will not discard the disk image
+ * when discarding the original page, and our caller will discard the
+ * allocated page on error, when discarding the allocated WT_REF.
+ */
+ WT_RET(__wt_page_inmem(
+ session, ref, multi->skip_dsk, WT_PAGE_DISK_ALLOC, &page));
+ multi->skip_dsk = NULL;
+
+ if (orig->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Re-create each modification we couldn't write. */
+ for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip)
+ switch (orig->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ /* Build a key. */
+ upd = skip->ins->upd;
+ skip->ins->upd = NULL;
+ recno = WT_INSERT_RECNO(skip->ins);
+
+ /* Search the page. */
+ WT_ERR(__wt_col_search(session, recno, ref, &cbt));
+
+ /* Apply the modification. */
+ WT_ERR(__wt_col_modify(
+ session, &cbt, recno, NULL, upd, 0));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Build a key: an entry without an insert structure is
+ * an update to an on-page row, otherwise the key comes
+ * from the insert structure itself.
+ */
+ if (skip->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, skip->rip);
+ upd = orig->pg_row_upd[slot];
+ orig->pg_row_upd[slot] = NULL;
+
+ WT_ERR(__wt_row_leaf_key(
+ session, orig, skip->rip, key, 0));
+ } else {
+ upd = skip->ins->upd;
+ skip->ins->upd = NULL;
+
+ key->data = WT_INSERT_KEY(skip->ins);
+ key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+
+ /* Search the page. */
+ WT_ERR(__wt_row_search(session, key, ref, &cbt, 1));
+
+ /* Apply the modification. */
+ WT_ERR(
+ __wt_row_modify(session, &cbt, key, NULL, upd, 0));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * We modified the page above, which will have set the first dirty
+ * transaction to the last transaction currently running. However, the
+ * updates we installed may be older than that. Take the oldest active
+ * transaction ID to make sure these updates are not skipped by a
+ * checkpoint.
+ */
+ page->modify->first_dirty_txn = S2C(session)->txn_global.oldest_id;
+
+err: __wt_scr_free(&key);
+ /* Free any resources that may have been cached in the cursor. */
+ WT_TRET(__wt_btcur_close(&cbt));
+ return (ret);
+}
+
+/*
+ * __wt_multi_to_ref --
+ *	Convert one multi-block entry into a WT_REF, allocating the WT_REF if
+ *	the caller didn't supply one.
+ *
+ * The WT_REF's home is left NULL for the caller to fill in. If incrp is not
+ * NULL, the memory attributed to the new structures is added to it on success.
+ */
+int
+__wt_multi_to_ref(WT_SESSION_IMPL *session,
+    WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp)
+{
+	WT_ADDR *addr_copy;
+	WT_IKEY *src_key;
+	WT_REF *ref;
+	size_t footprint;
+
+	footprint = 0;
+
+	/* The caller may not have allocated the underlying WT_REF yet. */
+	if (*refp == NULL) {
+		WT_RET(__wt_calloc_def(session, 1, refp));
+		WT_MEMSIZE_ADD(footprint, sizeof(WT_REF));
+	}
+	ref = *refp;
+
+	/*
+	 * The parent reference is the caller's responsibility: this function
+	 * is mostly used when splitting into a parent page, and no lock held
+	 * here tells us which parent that will be if the tree is being
+	 * deepened at the same time.
+	 */
+	ref->home = NULL;
+
+	if (multi->skip != NULL)
+		WT_RET(__split_inmem_build(session, page, ref, multi));
+	else {
+		/*
+		 * Duplicate the address rather than stealing the buffer:
+		 * taking it would force the code freeing the reference array
+		 * to know which addresses it owns, and that isn't worth the
+		 * confusion. The copy is attached to the WT_REF before the
+		 * address bytes are duplicated.
+		 */
+		WT_RET(__wt_calloc_def(session, 1, &addr_copy));
+		WT_MEMSIZE_ADD(footprint, sizeof(WT_ADDR));
+		ref->addr = addr_copy;
+		addr_copy->size = multi->addr.size;
+		addr_copy->type = multi->addr.type;
+		WT_RET(__wt_strndup(session,
+		    multi->addr.addr, addr_copy->size, &addr_copy->addr));
+		WT_MEMSIZE_ADD(footprint, addr_copy->size);
+	}
+
+	/* Row-store pages copy the instantiated key, others take the recno. */
+	if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_ROW_LEAF) {
+		src_key = multi->key.ikey;
+		WT_RET(__wt_row_ikey(session, 0,
+		    WT_IKEY_DATA(src_key), src_key->size, &ref->key.ikey));
+		WT_MEMSIZE_ADD(footprint, sizeof(WT_IKEY) + src_key->size);
+	} else
+		ref->key.recno = multi->key.recno;
+
+	/* Entries with skipped updates were built in memory above. */
+	if (multi->skip == NULL)
+		ref->state = WT_REF_DISK;
+	else
+		ref->state = WT_REF_MEM;
+
+	/* Report the allocations if the caller is tracking them. */
+	if (incrp != NULL)
+		*incrp += footprint;
+	return (0);
+}
+
+/*
+ * __split_evict_multi --
+ * Resolve a multi-page split, inserting new information into the parent.
+ *
+ * Builds one WT_REF per entry in the child's multi-block reconciliation
+ * result, locks the parent using the WT_PAGE_SPLITTING flag, swaps in a new
+ * parent index in which the child's WT_REF is replaced by the new WT_REFs,
+ * and then discards the child. The "exclusive" argument is passed through
+ * to __split_safe_free and, when non-zero, skips the check for deepening the
+ * tree.
+ *
+ * Before the new index is swapped in, any error is returned. After that
+ * point the split counts as complete and 0 is returned, unless the
+ * accumulated result is WT_PANIC.
+ */
+static int
+__split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_PAGE *parent, *child;
+ WT_PAGE_INDEX *alloc_index, *pindex;
+ WT_PAGE_MODIFY *mod;
+ WT_REF **alloc_refp, *parent_ref, ref_copy, **ref_tmp;
+ size_t parent_decr, parent_incr, size;
+ uint32_t i, j, parent_entries, result_entries, split_entries;
+ int complete, hazard, locked;
+
+ parent = NULL; /* -Wconditional-uninitialized */
+ alloc_index = NULL;
+ parent_ref = NULL;
+ ref_tmp = NULL;
+ parent_decr = parent_incr = 0;
+ complete = hazard = locked = 0;
+
+ child = ref->page;
+ mod = child->modify;
+
+ /*
+ * Convert the split page's multiblock reconciliation information into
+ * an array of page reference structures.
+ */
+ split_entries = mod->mod_multi_entries;
+ WT_RET(__wt_calloc_def(session, split_entries, &ref_tmp));
+ for (i = 0; i < split_entries; ++i)
+ WT_ERR(__wt_multi_to_ref(session,
+ child, &mod->mod_multi[i], &ref_tmp[i], &parent_incr));
+
+ /*
+ * Get a page-level lock on the parent to single-thread splits into the
+ * page because we need to single-thread sizing/growing the page index.
+ * It's OK to queue up multiple splits as the child pages split, but the
+ * actual split into the parent has to be serialized. Note we allocate
+ * memory inside of the lock and may want to invest effort in making the
+ * locked period shorter.
+ *
+ * We could race with another thread deepening our parent. To deal
+ * with that, read the parent pointer each time we try to lock it, and
+ * check that it's still correct after it is locked.
+ */
+ for (;;) {
+ parent = ref->home;
+ F_CAS_ATOMIC(parent, WT_PAGE_SPLITTING, ret);
+ if (ret == 0) {
+ if (parent == ref->home)
+ break;
+ F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
+ continue;
+ }
+ __wt_yield();
+ }
+ locked = 1;
+
+ /*
+ * We have exclusive access to split the parent, and at this point, the
+ * child prevents the parent from being evicted. However, once we
+ * update the parent's index, it will no longer refer to the child, and
+ * could conceivably be evicted. Get a hazard pointer on the parent
+ * now, so that we can safely access it after updating the index.
+ */
+ if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
+ WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
+ hazard = 1;
+ }
+
+ pindex = WT_INTL_INDEX_COPY(parent);
+ parent_entries = pindex->entries;
+ result_entries = (parent_entries - 1) + split_entries;
+
+ /*
+ * Allocate and initialize a new page index array for the parent, then
+ * copy references from the original index array, plus references from
+ * the newly created split array, into place.
+ */
+ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ WT_MEMSIZE_ADD(parent_incr, size);
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = result_entries;
+ for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i)
+ if (pindex->index[i] == ref)
+ for (j = 0; j < split_entries; ++j) {
+ ref_tmp[j]->home = parent;
+ *alloc_refp++ = ref_tmp[j];
+
+ /*
+ * Clear the split reference as it moves to the
+ * allocated page index, so it never appears on
+ * both after an error.
+ */
+ ref_tmp[j] = NULL;
+ }
+ else
+ *alloc_refp++ = pindex->index[i];
+ __wt_free(session, ref_tmp);
+
+ /*
+ * Update the parent page's index: this update makes the split visible
+ * to threads descending the tree.
+ */
+ WT_INTL_INDEX_SET(parent, alloc_index);
+ alloc_index = NULL;
+
+#ifdef HAVE_DIAGNOSTIC
+ __split_verify_intl_key_order(session, parent);
+#endif
+
+ /*
+ * Reset the page's original WT_REF field to split. Threads cursoring
+ * through the tree were blocked because that WT_REF state was set to
+ * locked. This update changes the locked state to split, unblocking
+ * those threads and causing them to re-calculate their position based
+ * on the updated parent page's index.
+ */
+ WT_PUBLISH(ref->state, WT_REF_SPLIT);
+
+ /*
+ * A note on error handling: failures before we swapped the new page
+ * index into the parent can be resolved by simply freeing allocated
+ * memory because the original page is unchanged, we can continue to
+ * use it and we have not yet modified the parent. (See below for an
+ * exception, we cannot discard pages referencing unresolved changes.)
+ * Failures after we swap the new page index into the parent are also
+ * relatively benign because the split is OK and complete and the page
+ * is reset so it will be discarded by eviction. For that reason, we
+ * mostly ignore further errors unless there's a panic.
+ */
+ complete = 1;
+
+ /*
+ * The previous parent page's key for this child page may have been an
+ * on-page overflow key. In that case, if the key hasn't been deleted,
+ * delete it now, including its backing blocks. We are exchanging the
+ * WT_REF that referenced it for the split page WT_REFs and their keys,
+ * and there's no longer any reference to it. Done after completing the
+ * split (if we failed, we'd leak the underlying blocks, but the parent
+ * page would be unaffected).
+ */
+ if (parent->type == WT_PAGE_ROW_INT)
+ WT_TRET(__split_ovfl_key_cleanup(session, parent, ref));
+
+ /*
+ * We can't free the previous page index, or the page's original WT_REF
+ * structure and instantiated key, there may be threads using them. Add
+ * them to the session discard list, to be freed once we know it's safe.
+ */
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, exclusive, pindex, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+ if (parent->type == WT_PAGE_ROW_INT &&
+ (ikey = __wt_ref_key_instantiated(ref)) != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(session, exclusive, ikey, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+ }
+ /*
+ * Take a copy of the ref in case we can free it immediately: we still
+ * need to discard the page.
+ */
+ ref_copy = *ref;
+ WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF)));
+ WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF));
+
+ /*
+ * Adjust the parent's memory footprint. This may look odd, but we
+ * have already taken the allocation overhead into account, and an
+ * increment followed by a decrement will cancel out the normal
+ * adjustment.
+ */
+ __wt_cache_page_inmem_incr(session, parent, parent_incr);
+ __wt_cache_page_inmem_decr(session, parent, parent_decr);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %s split into parent %p %" PRIu32 " -> %" PRIu32
+ " (%" PRIu32 ")",
+ child, __wt_page_type_string(child->type), parent, parent_entries,
+ result_entries, result_entries - parent_entries));
+
+ /*
+ * Simple page splits trickle up the tree, that is, as leaf pages grow
+ * large enough and are evicted, they'll split into their parent. And,
+ * as that parent grows large enough and is evicted, it will split into
+ * its parent and so on. When the page split wave reaches the root,
+ * the tree will permanently deepen as multiple root pages are written.
+ * However, this only helps if first, the pages are evicted (and
+ * we resist evicting internal pages for obvious reasons), and second,
+ * if the tree is closed and re-opened from a disk image, which may be
+ * a rare event.
+ * To avoid the case of internal pages becoming too large when they
+ * aren't being evicted, check internal pages each time a leaf page is
+ * split into them. If it's big enough, deepen the tree at that point.
+ * Do the check here because we've just grown the parent page and
+ * are holding it locked.
+ */
+ if (ret == 0 && !exclusive && __split_should_deepen(session, parent))
+ ret = __split_deepen(session, parent);
+
+err: if (locked)
+ F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
+
+ if (hazard)
+ WT_TRET(__wt_hazard_clear(session, parent));
+
+ /*
+ * Discard the child; test for split completion instead of errors, there
+ * might be a relatively innocuous error, and if we split the parent, we
+ * want to discard the child.
+ */
+ if (complete) {
+ /*
+ * Pages with unresolved changes are not marked clean during
+ * reconciliation, do it now.
+ */
+ if (__wt_page_is_modified(child)) {
+ mod->write_gen = 0;
+ __wt_cache_dirty_decr(session, child);
+ }
+ __wt_ref_out(session, &ref_copy);
+ }
+
+ /*
+ * A note on error handling: in the case of evicting a page that has
+ * unresolved changes, we just instantiated some in-memory pages that
+ * reflect those unresolved changes. The problem is those pages
+ * reference the same WT_UPDATE chains as the page we're splitting,
+ * that is, we simply copied references into the new pages. If the
+ * split fails, the original page is fine, but discarding the created
+ * page would free those update chains, and that's wrong. There isn't
+ * an easy solution, there's a lot of small memory allocations in some
+ * common code paths, and unwinding those changes will be difficult.
+ * For now, leak the memory by not discarding the instantiated pages.
+ *
+ * Both pointers below are cleared on the success path (the allocated
+ * index when it is swapped into the parent, the temporary reference
+ * array when it is freed), so this only has work to do after an early
+ * error.
+ */
+ __wt_free_ref_index(session, NULL, alloc_index, 0);
+ if (ref_tmp != NULL) {
+ for (i = 0; i < split_entries; ++i)
+ __wt_free_ref(session, child, ref_tmp[i], 0);
+ __wt_free(session, ref_tmp);
+ }
+
+ /*
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened.
+ */
+ return (ret == WT_PANIC || !complete ? ret : 0);
+}
+
+/*
+ * __split_evict_single --
+ *	Resolve a single-page split: rebuild the page in memory and swap the
+ *	rebuilt page in for the original.
+ */
+static int
+__split_evict_single(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+	WT_PAGE *orig;
+	WT_PAGE_MODIFY *orig_mod;
+	WT_REF replacement;
+
+	orig = ref->page;
+	orig_mod = orig->modify;
+
+	/* Build the replacement page, linked into a scratch WT_REF. */
+	memset(&replacement, 0, sizeof(replacement));
+	WT_RET(__split_inmem_build(
+	    session, orig, &replacement, &orig_mod->mod_multi[0]));
+
+	/*
+	 * The original page had unresolved changes, so reconciliation didn't
+	 * mark it clean: do that here, then discard it.
+	 */
+	orig_mod->write_gen = 0;
+	__wt_cache_dirty_decr(session, orig);
+	__wt_page_out(session, &orig);
+
+	/* Install the replacement page, then publish the in-memory state. */
+	ref->page = replacement.page;
+	WT_PUBLISH(ref->state, WT_REF_MEM);
+
+	return (0);
+}
+
+/*
+ * __wt_split_evict --
+ *	Resolve a page split, dispatching on how many blocks reconciliation
+ *	produced for the page.
+ */
+int
+__wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+	/*
+	 * A single entry is the forced-eviction case: an in-memory page grew
+	 * too large and we tried to evict it, but reconciliation produced
+	 * only one page (too little data to split) and some updates couldn't
+	 * be written (so the page can't be evicted either). Think of two
+	 * threads updating a small set of keys on one leaf page. We hold the
+	 * page exclusively, so rewrite it in memory instead.
+	 */
+	if (ref->page->modify->mod_multi_entries == 1)
+		return (__split_evict_single(session, ref));
+
+	/*
+	 * Anything else is a real split: the page was reconciled and turned
+	 * into a number of pages.
+	 */
+	return (__split_evict_multi(session, ref, exclusive));
+}
diff --git a/src/third_party/wiredtiger/src/btree/rec_track.c b/src/third_party/wiredtiger/src/btree/rec_track.c
new file mode 100644
index 00000000000..92282393a23
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_track.c
@@ -0,0 +1,904 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Approximate in-memory cost of an entry on one of the overflow lists: the
+ * structure itself, plus two pointers on the assumption that the average
+ * skip list depth is 2.
+ */
+#define	WT_OVFL_SIZE(s)	(sizeof(s) + 2 * sizeof(void *))
+
+/*
+ * __ovfl_track_init --
+ *	Allocate the page's zero-filled overflow tracking structure.
+ */
+static int
+__ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_PAGE_MODIFY *mod;
+
+	mod = page->modify;
+	return (__wt_calloc_def(session, 1, &mod->ovfl_track));
+}
+
+/*
+ * __ovfl_discard_verbose --
+ *	Emit a verbose message describing a discarded overflow record. The
+ *	optional tag is printed as a prefix when it is not NULL.
+ */
+static int
+__ovfl_discard_verbose(
+    WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, const char *tag)
+{
+	WT_CELL_UNPACK unpacked;
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
+	const char *prefix, *separator;
+
+	WT_RET(__wt_scr_alloc(session, 512, &buf));
+
+	__wt_cell_unpack(cell, &unpacked);
+
+	/* With no tag, both the prefix and its separator are empty. */
+	if (tag == NULL)
+		prefix = separator = "";
+	else {
+		prefix = tag;
+		separator = ": ";
+	}
+
+	WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+	    "discard: %s%s%p %s",
+	    prefix, separator, page,
+	    __wt_addr_string(session, unpacked.data, unpacked.size, buf)));
+
+err:	__wt_scr_free(&buf);
+	return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_discard_dump --
+ * Debugging information.
+ *
+ * Compiled out (#if 0). Writes a "dump" verbose message for each cell on
+ * the page's overflow discard list; does nothing if the page has no modify
+ * structure or no overflow tracking structure. Errors from the verbose call
+ * are ignored.
+ */
+static void
+__ovfl_discard_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CELL **cellp;
+ WT_OVFL_TRACK *track;
+ size_t i;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+
+ track = page->modify->ovfl_track;
+ for (i = 0, cellp = track->discard;
+ i < track->discard_entries; ++i, ++cellp)
+ (void)__ovfl_discard_verbose(session, page, *cellp, "dump");
+}
+#endif
+
+/*
+ * __ovfl_discard_wrapup --
+ *	After a page is written, discard the overflow item behind every cell
+ *	on the page's discard list, then release the list.
+ */
+static int
+__ovfl_discard_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_CELL *cell;
+	WT_OVFL_TRACK *track;
+	uint32_t slot;
+
+	track = page->modify->ovfl_track;
+
+	for (slot = 0; slot < track->discard_entries; ++slot) {
+		cell = track->discard[slot];
+
+		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+			WT_RET(__ovfl_discard_verbose(
+			    session, page, cell, "free"));
+
+		/* Discard the overflow item the cell references. */
+		WT_RET(__wt_ovfl_discard(session, cell));
+	}
+
+	/* Every entry was processed: drop the list and reset its counters. */
+	__wt_free(session, track->discard);
+	track->discard_allocated = 0;
+	track->discard_entries = 0;
+
+	return (0);
+}
+
+/*
+ * __ovfl_discard_wrapup_err --
+ *	After an error, release the page's overflow discard list without
+ *	discarding the items it references. Always returns 0.
+ */
+static int
+__ovfl_discard_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_OVFL_TRACK *ovfl;
+
+	ovfl = page->modify->ovfl_track;
+
+	/* Free the array and reset both of its counters. */
+	__wt_free(session, ovfl->discard);
+	ovfl->discard_allocated = 0;
+	ovfl->discard_entries = 0;
+
+	return (0);
+}
+
+/*
+ * __wt_ovfl_discard_add --
+ *	Append a cell to the page's list of discarded overflow records,
+ *	creating the overflow tracking structure on first use.
+ */
+int
+__wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
+{
+	WT_OVFL_TRACK *ovfl;
+
+	if (page->modify->ovfl_track == NULL)
+		WT_RET(__ovfl_track_init(session, page));
+	ovfl = page->modify->ovfl_track;
+
+	/* Make room for one more entry, then store the cell in it. */
+	WT_RET(__wt_realloc_def(session, &ovfl->discard_allocated,
+	    ovfl->discard_entries + 1, &ovfl->discard));
+	ovfl->discard[ovfl->discard_entries] = cell;
+	++ovfl->discard_entries;
+
+	if (!WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+		return (0);
+
+	WT_RET(__ovfl_discard_verbose(session, page, cell, "add"));
+	return (0);
+}
+
+/*
+ * __wt_ovfl_discard_free --
+ *	Release the page's list of discarded overflow record addresses, if the
+ *	page has one.
+ */
+void
+__wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_OVFL_TRACK *ovfl;
+
+	/* Nothing to do for pages with no overflow tracking information. */
+	if (page->modify == NULL)
+		return;
+	if ((ovfl = page->modify->ovfl_track) == NULL)
+		return;
+
+	__wt_free(session, ovfl->discard);
+	ovfl->discard_allocated = 0;
+	ovfl->discard_entries = 0;
+}
+
+/*
+ * __ovfl_reuse_verbose --
+ *	Dump information about a reuse overflow record.
+ *
+ * Writes a verbose message holding the optional tag, the page pointer, the
+ * record's address, its in-use/just-added flags and at most the first 40
+ * bytes of its value. Returns 0 on success or an error from the scratch
+ * buffer allocation or the verbose call; the scratch buffer is released on
+ * both paths.
+ */
+static int
+__ovfl_reuse_verbose(WT_SESSION_IMPL *session,
+    WT_PAGE *page, WT_OVFL_REUSE *reuse, const char *tag)
+{
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+
+	WT_RET(__wt_scr_alloc(session, 64, &tmp));
+
+	/*
+	 * The precision consumed by "%.*s" must be an int. The value size is
+	 * not declared as an int, so cast the (at most 40) result explicitly
+	 * rather than pass a differently-typed argument through varargs.
+	 */
+	WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+	    "reuse: %s%s%p %s (%s%s%s) {%.*s}",
+	    tag == NULL ? "" : tag,
+	    tag == NULL ? "" : ": ",
+	    page,
+	    __wt_addr_string(
+		session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size, tmp),
+	    F_ISSET(reuse, WT_OVFL_REUSE_INUSE) ? "inuse" : "",
+	    F_ISSET(reuse, WT_OVFL_REUSE_INUSE) &&
+	    F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? ", " : "",
+	    F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? "just-added" : "",
+	    (int)WT_MIN(reuse->value_size, 40),
+	    (char *)WT_OVFL_REUSE_VALUE(reuse)));
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_reuse_dump --
+ * Debugging information.
+ *
+ * Compiled out (#if 0). Walks the lowest level of the page's overflow reuse
+ * skiplist and writes a "dump" verbose message for every entry; does nothing
+ * if the page has no modify structure or no overflow tracking structure.
+ * Errors from the verbose call are ignored.
+ */
+static void
+__ovfl_reuse_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_REUSE **head, *reuse;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ for (reuse = head[0]; reuse != NULL; reuse = reuse->next[0])
+ (void)__ovfl_reuse_verbose(session, page, reuse, "dump");
+}
+#endif
+
+/*
+ * __ovfl_reuse_skip_search --
+ * Return the first, not in-use, matching value in the overflow reuse list.
+ *
+ * "head" is the array of skiplist head pointers, indexed by level and
+ * holding WT_SKIP_MAXDEPTH entries; "value" and "value_size" describe the
+ * bytes to match. An entry matches when its value has the same length and
+ * the same bytes. Returns NULL if the walk runs off the bottom level without
+ * a match, or if the matching entry reached on the bottom level is flagged
+ * in-use.
+ */
+static WT_OVFL_REUSE *
+__ovfl_reuse_skip_search(
+ WT_OVFL_REUSE **head, const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **e, *next;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Values are not unique, and it's possible to have long lists
+ * of identical overflow items. (We've seen it in benchmarks.)
+ * Move through a list of identical items at the current level
+ * as long as the next one is in-use, otherwise, drop down a
+ * level. When at the bottom level, return items if reusable,
+ * else NULL.
+ *
+ * Inside the matching branch below the two sizes are equal, so
+ * "len" is the same as the search value's size when it's used
+ * to test the next entry.
+ */
+ len = WT_MIN((*e)->value_size, value_size);
+ cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
+ if (cmp == 0 && (*e)->value_size == value_size) {
+ if (i == 0)
+ return (F_ISSET(*e,
+ WT_OVFL_REUSE_INUSE) ? NULL : *e);
+ if ((next = (*e)->next[i]) == NULL ||
+ !F_ISSET(next, WT_OVFL_REUSE_INUSE) ||
+ next->value_size != len || memcmp(
+ WT_OVFL_REUSE_VALUE(next), value, len) != 0) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ continue;
+ }
+
+ /*
+ * If the skiplist value is larger than the search value, or
+ * they compare equally and the skiplist value is longer than
+ * the search value, drop down a level, otherwise continue on
+ * this level.
+ */
+ if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
+
+/*
+ * __ovfl_reuse_skip_search_stack --
+ *	Walk an overflow reuse skiplist and fill in an insert/remove stack:
+ *	for every level, stack[level] is set to the address of the link (a
+ *	head slot or some entry's next pointer) at which the walk stopped,
+ *	that is, a link that is either NULL or references an entry sorting
+ *	after the search value.
+ */
+static void
+__ovfl_reuse_skip_search_stack(WT_OVFL_REUSE **head,
+    WT_OVFL_REUSE ***stack, const void *value, size_t value_size)
+{
+	WT_OVFL_REUSE **slot, *item;
+	size_t cmp_len;
+	int level, order;
+
+	/*
+	 * Begin at the top level and move right for as long as entries do
+	 * not sort after the search value; when we can't move right any
+	 * more, record the link and step down one level.
+	 */
+	level = WT_SKIP_MAXDEPTH - 1;
+	slot = &head[level];
+	while (level >= 0) {
+		item = *slot;
+		if (item != NULL) {
+			cmp_len = WT_MIN(item->value_size, value_size);
+			order = memcmp(
+			    WT_OVFL_REUSE_VALUE(item), value, cmp_len);
+
+			/*
+			 * The entry sorts before the search value, or is a
+			 * prefix of (or equal to) it: keep going at this
+			 * level.
+			 */
+			if (order < 0 ||
+			    (order == 0 && item->value_size <= value_size)) {
+				slot = &item->next[level];
+				continue;
+			}
+		}
+
+		/* Empty level or a larger entry: drop down a level. */
+		stack[level] = slot;
+		--level;
+		--slot;
+	}
+}
+
+/*
+ * __ovfl_reuse_wrapup --
+ * Resolve the page's overflow reuse list after a page is written.
+ *
+ * Entries flagged WT_OVFL_REUSE_INUSE are kept, with their in-use and
+ * just-added flags cleared for the next run. Every other entry is unlinked
+ * from all skiplist levels, its block is released through the block
+ * manager's free method and the entry is freed. The page's in-memory
+ * footprint is then adjusted by the net of entries added and removed.
+ *
+ * Returns 0, or the error from the verbose message or the block free.
+ *
+ * NOTE(review): the WT_RET calls in the second loop return after the entry
+ * has been unlinked but before __wt_free and before the footprint update;
+ * on that path the entry looks leaked. Confirm whether callers treat such
+ * a failure as fatal.
+ */
+static int
+__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_OVFL_REUSE **e, **head, *reuse;
+ size_t incr, decr;
+ int i;
+
+ bm = S2BT(session)->bm;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * Discard any overflow records that aren't in-use, freeing underlying
+ * blocks.
+ *
+ * First, walk the overflow reuse lists (except for the lowest one),
+ * fixing up skiplist links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (F_ISSET(*e, WT_OVFL_REUSE_INUSE)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ /* Not in-use: splice it out of this level only. */
+ *e = (*e)->next[i];
+ }
+
+ /*
+ * Second, discard any overflow record without an in-use flag, clear
+ * the flags for the next run.
+ *
+ * As part of the pass through the lowest level, figure out how much
+ * space we added/subtracted from the page, and update its footprint.
+ * We don't get it exactly correct because we don't know the depth of
+ * the skiplist here, but it's close enough, and figuring out the
+ * memory footprint change in the reconciliation wrapup code means
+ * fewer atomic updates and less code overall.
+ */
+ incr = decr = 0;
+ for (e = &head[0]; (reuse = *e) != NULL;) {
+ if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
+ /* Only entries added during this run grow the page. */
+ if (F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED))
+ incr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
+ reuse->addr_size + reuse->value_size;
+
+ F_CLR(reuse,
+ WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
+ e = &(*e)->next[0];
+ continue;
+ }
+ /* Unlink from the lowest level; reuse still references it. */
+ *e = (*e)->next[0];
+
+ WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED));
+ decr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
+ reuse->addr_size + reuse->value_size;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_reuse_verbose(session, page, reuse, "free"));
+ WT_RET(bm->free(
+ bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
+ __wt_free(session, reuse);
+ }
+
+ /* Apply only the net change; equal amounts need no update. */
+ if (incr > decr)
+ __wt_cache_page_inmem_incr(session, page, incr - decr);
+ if (decr > incr)
+ __wt_cache_page_inmem_decr(session, page, decr - incr);
+ return (0);
+}
+
+/*
+ * __ovfl_reuse_wrapup_err --
+ *	Resolve the page's overflow reuse list after an error occurs.
+ *
+ * Entries flagged WT_OVFL_REUSE_JUST_ADDED are unlinked from every skiplist
+ * level, their blocks are released through the block manager's free method
+ * and the entries are freed. Every other entry is kept, with its in-use
+ * flag cleared for the next run.
+ *
+ * This is best-effort cleanup: a failure while discarding one entry is
+ * recorded but does not stop the remaining entries from being discarded.
+ * Returns 0 on success, otherwise an error recorded during the cleanup.
+ */
+static int
+__ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_BM *bm;
+	WT_DECL_RET;
+	WT_OVFL_REUSE **e, **head, *reuse;
+	int i;
+
+	bm = S2BT(session)->bm;
+	head = page->modify->ovfl_track->ovfl_reuse;
+
+	/*
+	 * Discard any overflow records that were just added, freeing underlying
+	 * blocks.
+	 *
+	 * First, walk the overflow reuse lists (except for the lowest one),
+	 * fixing up skiplist links.
+	 */
+	for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+		for (e = &head[i]; *e != NULL;) {
+			if (!F_ISSET(*e, WT_OVFL_REUSE_JUST_ADDED)) {
+				e = &(*e)->next[i];
+				continue;
+			}
+			*e = (*e)->next[i];
+		}
+
+	/*
+	 * Second, discard any overflow record with a just-added flag, clear the
+	 * flags for the next run.
+	 */
+	for (e = &head[0]; (reuse = *e) != NULL;) {
+		if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
+			F_CLR(reuse, WT_OVFL_REUSE_INUSE);
+			e = &(*e)->next[0];
+			continue;
+		}
+		*e = (*e)->next[0];
+
+		/*
+		 * The entry is no longer reachable from the list, so it must
+		 * be freed here no matter what: record failures in ret rather
+		 * than returning early and leaking it.
+		 */
+		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+			WT_TRET(
+			    __ovfl_reuse_verbose(session, page, reuse, "free"));
+		WT_TRET(bm->free(
+		    bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
+		__wt_free(session, reuse);
+	}
+
+	/* Report any failure recorded above instead of discarding it. */
+	return (ret);
+}
+
+/*
+ * __wt_ovfl_reuse_search --
+ *	Look through the page's tracked overflow records for one holding the
+ *	given value that is not already in use. On a hit, the record is marked
+ *	in-use, its address and address size are stored through addrp and
+ *	addr_sizep, and 1 is returned. On a miss the outputs are NULL/0 and 0
+ *	is returned. A failure writing the verbose message is returned as an
+ *	error code.
+ */
+int
+__wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page,
+    uint8_t **addrp, size_t *addr_sizep,
+    const void *value, size_t value_size)
+{
+	WT_OVFL_REUSE *match;
+	WT_OVFL_TRACK *track;
+
+	/* Default the outputs to "nothing found". */
+	*addrp = NULL;
+	*addr_sizep = 0;
+
+	/* No tracking information means there is nothing to reuse. */
+	if ((track = page->modify->ovfl_track) == NULL)
+		return (0);
+
+	/*
+	 * The skiplist search hands back the first matching record that
+	 * does not have the in-use flag set, or NULL.
+	 */
+	match = __ovfl_reuse_skip_search(
+	    track->ovfl_reuse, value, value_size);
+	if (match == NULL)
+		return (0);
+
+	/* Claim the record and give its block address to the caller. */
+	F_SET(match, WT_OVFL_REUSE_INUSE);
+	*addrp = WT_OVFL_REUSE_ADDR(match);
+	*addr_sizep = match->addr_size;
+
+	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+		WT_RET(__ovfl_reuse_verbose(session, page, match, "reclaim"));
+	return (1);
+}
+
+/*
+ * __wt_ovfl_reuse_add --
+ * Add a new entry to the page's list of overflow records tracked for
+ * reuse.
+ *
+ * addr/addr_size is the address of the overflow block and value/value_size
+ * the bytes it holds; both are copied into the new entry. The entry is
+ * inserted flagged in-use and just-added. The page's memory footprint is
+ * not changed here; __ovfl_reuse_wrapup accounts for just-added entries.
+ *
+ * Returns 0 on success, or an error from initializing the tracking
+ * structure, the allocation or the verbose message.
+ */
+int
+__wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page,
+ const uint8_t *addr, size_t addr_size,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **head, *reuse, **stack[WT_SKIP_MAXDEPTH];
+ size_t size;
+ u_int i, skipdepth;
+ uint8_t *p;
+
+ /* Create the page's overflow tracking structure on first use. */
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate the WT_OVFL_REUSE structure, next pointers for the skip
+ * list, room for the address and value, then copy everything into
+ * place.
+ *
+ * To minimize the WT_OVFL_REUSE structure size, the address offset
+ * and size are single bytes: that's safe because the address follows
+ * the structure (which can't be more than about 100B), and address
+ * cookies are limited to 255B.
+ */
+ size = sizeof(WT_OVFL_REUSE) +
+ skipdepth * sizeof(WT_OVFL_REUSE *) + addr_size + value_size;
+ WT_RET(__wt_calloc(session, 1, size, &reuse));
+ /* p: first byte past the structure and its skipdepth next pointers. */
+ p = (uint8_t *)reuse +
+ sizeof(WT_OVFL_REUSE) + skipdepth * sizeof(WT_OVFL_REUSE *);
+ reuse->addr_offset = (uint8_t)WT_PTRDIFF(p, reuse);
+ /* NOTE(review): truncates silently if addr_size exceeds 255. */
+ reuse->addr_size = (uint8_t)addr_size;
+ memcpy(p, addr, addr_size);
+ /* The value is stored immediately after the address bytes. */
+ p += addr_size;
+ reuse->value_offset = WT_PTRDIFF32(p, reuse);
+ reuse->value_size = WT_STORE_SIZE(value_size);
+ memcpy(p, value, value_size);
+ F_SET(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
+
+ /* Insert the new entry into the skiplist. */
+ __ovfl_reuse_skip_search_stack(head, stack, value, value_size);
+ for (i = 0; i < skipdepth; ++i) {
+ reuse->next[i] = *stack[i];
+ *stack[i] = reuse;
+ }
+
+ /* The entry is already linked in if the verbose message fails. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_reuse_verbose(session, page, reuse, "add"));
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_reuse_free --
+ *	Release the memory of every entry on the page's overflow reuse list;
+ *	a page without modify or tracking information is left untouched.
+ */
+void
+__wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_OVFL_REUSE *doomed, *reuse;
+
+	if (page->modify == NULL || page->modify->ovfl_track == NULL)
+		return;
+
+	/*
+	 * Every entry is linked into the lowest skiplist level, so walking
+	 * that level visits them all. Step forward before freeing so we never
+	 * read from released memory.
+	 */
+	reuse = page->modify->ovfl_track->ovfl_reuse[0];
+	while (reuse != NULL) {
+		doomed = reuse;
+		reuse = reuse->next[0];
+		__wt_free(session, doomed);
+	}
+}
+
+/*
+ * __ovfl_txnc_verbose --
+ * Dump information about a transaction-cached overflow record.
+ *
+ * Writes one WT_VERB_OVERFLOW message holding the optional tag, the page
+ * pointer, the record's address string, its transaction ID (the current
+ * field) and at most the first 40 bytes of its value. Returns 0 or the
+ * error from the scratch-buffer allocation or the verbose call.
+ */
+static int
+__ovfl_txnc_verbose(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_OVFL_TXNC *txnc, const char *tag)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ /* Scratch buffer handed to __wt_addr_string for the address text. */
+ WT_RET(__wt_scr_alloc(session, 64, &tmp));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "txn-cache: %s%s%p %s %" PRIu64 " {%.*s}",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(
+ session, WT_OVFL_TXNC_ADDR(txnc), txnc->addr_size, tmp),
+ txnc->current,
+ WT_MIN(txnc->value_size, 40), (char *)WT_OVFL_TXNC_VALUE(txnc)));
+
+ /* Reached on success and on error: always release the scratch buffer. */
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_txnc_dump --
+ *	Debugging aid: write a verbose message for every entry on the page's
+ *	transaction-cache overflow list.
+ */
+static void
+__ovfl_txnc_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_OVFL_TXNC *txnc;
+
+	if (page->modify == NULL || page->modify->ovfl_track == NULL)
+		return;
+
+	/* Walk the lowest skiplist level; errors are deliberately ignored. */
+	txnc = page->modify->ovfl_track->ovfl_txnc[0];
+	while (txnc != NULL) {
+		(void)__ovfl_txnc_verbose(session, page, txnc, "dump");
+		txnc = txnc->next[0];
+	}
+}
+#endif
+
+/*
+ * __ovfl_txnc_skip_search --
+ *	Search the overflow transaction-cache skiplist for an entry whose
+ *	address bytes and address length both equal addr/addr_size; return
+ *	the first such entry encountered, or NULL if there is none.
+ */
+static WT_OVFL_TXNC *
+__ovfl_txnc_skip_search(WT_OVFL_TXNC **head, const void *addr, size_t addr_size)
+{
+	WT_OVFL_TXNC **slot, *item;
+	size_t cmp_len;
+	int level, order;
+
+	/*
+	 * Begin at the top level, moving right while entries sort before the
+	 * search address and stepping down a level when they don't.
+	 */
+	level = WT_SKIP_MAXDEPTH - 1;
+	slot = &head[level];
+	while (level >= 0) {
+		item = *slot;
+		if (item != NULL) {
+			cmp_len = WT_MIN(item->addr_size, addr_size);
+			order = memcmp(WT_OVFL_TXNC_ADDR(item), addr, cmp_len);
+
+			/*
+			 * An exact match is returned immediately, whatever
+			 * level it was found on.
+			 */
+			if (order == 0 && item->addr_size == addr_size)
+				return (item);
+
+			/*
+			 * The entry sorts before the search address, or is
+			 * a shorter prefix of it: keep going at this level.
+			 */
+			if (order < 0 ||
+			    (order == 0 && item->addr_size <= addr_size)) {
+				slot = &item->next[level];
+				continue;
+			}
+		}
+
+		/* Empty level or a larger entry: drop down a level. */
+		--level;
+		--slot;
+	}
+	return (NULL);
+}
+
+/*
+ * __ovfl_txnc_skip_search_stack --
+ *	Walk an overflow transaction-cache skiplist and fill in an
+ *	insert/remove stack: for every level, stack[level] is set to the
+ *	address of the link (a head slot or some entry's next pointer) at
+ *	which the walk stopped, that is, a link that is either NULL or
+ *	references an entry sorting after the search address.
+ */
+static void
+__ovfl_txnc_skip_search_stack(WT_OVFL_TXNC **head,
+    WT_OVFL_TXNC ***stack, const void *addr, size_t addr_size)
+{
+	WT_OVFL_TXNC **slot, *item;
+	size_t cmp_len;
+	int level, order;
+
+	/*
+	 * Begin at the top level and move right for as long as entries do
+	 * not sort after the search address; when we can't move right any
+	 * more, record the link and step down one level.
+	 */
+	level = WT_SKIP_MAXDEPTH - 1;
+	slot = &head[level];
+	while (level >= 0) {
+		item = *slot;
+		if (item != NULL) {
+			cmp_len = WT_MIN(item->addr_size, addr_size);
+			order = memcmp(WT_OVFL_TXNC_ADDR(item), addr, cmp_len);
+
+			/*
+			 * The entry sorts before the search address, or is
+			 * a prefix of (or equal to) it: keep going at this
+			 * level.
+			 */
+			if (order < 0 ||
+			    (order == 0 && item->addr_size <= addr_size)) {
+				slot = &item->next[level];
+				continue;
+			}
+		}
+
+		/* Empty level or a larger entry: drop down a level. */
+		stack[level] = slot;
+		--level;
+		--slot;
+	}
+}
+
+/*
+ * __ovfl_txnc_wrapup --
+ * Resolve the page's transaction-cache list.
+ *
+ * Entries for which __wt_txn_visible_all reports true on their transaction
+ * ID (the current field) are unlinked from every skiplist level and freed,
+ * and the page's in-memory footprint is reduced by their size. Returns 0,
+ * or the error from the verbose message.
+ *
+ * NOTE(review): the visibility test is evaluated separately for the upper
+ * levels and for the lowest level; this presumably relies on the answer not
+ * changing between the two passes for a given entry. The callers visible in
+ * this file hold the btree's ovfl_lock for writing around the call.
+ */
+static int
+__ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC **e, **head, *txnc;
+ size_t decr;
+ int i;
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ /*
+ * Discard any transaction-cache records with transaction IDs earlier
+ * than any in the system.
+ *
+ * First, walk the overflow transaction-cache skip lists (except for
+ * the lowest level), fixing up links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (!__wt_txn_visible_all(session, (*e)->current)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ /* No longer needed: splice it out of this level only. */
+ *e = (*e)->next[i];
+ }
+
+ /* Second, discard any no longer needed transaction-cache records. */
+ decr = 0;
+ for (e = &head[0]; (txnc = *e) != NULL;) {
+ if (!__wt_txn_visible_all(session, txnc->current)) {
+ e = &(*e)->next[0];
+ continue;
+ }
+ /* Unlink from the lowest level; txnc still references it. */
+ *e = (*e)->next[0];
+
+ decr += WT_OVFL_SIZE(WT_OVFL_TXNC) +
+ txnc->addr_size + txnc->value_size;
+
+ /*
+ * NOTE(review): returning here skips both the free of txnc and
+ * the footprint update below.
+ */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_txnc_verbose(session, page, txnc, "free"));
+ __wt_free(session, txnc);
+ }
+
+ if (decr != 0)
+ __wt_cache_page_inmem_decr(session, page, decr);
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_search --
+ *	Look up an overflow address in the page's transaction-cache list. On
+ *	a hit, point store at the cached value (the bytes are not copied) and
+ *	return 0; otherwise return WT_NOTFOUND.
+ */
+int
+__wt_ovfl_txnc_search(
+    WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store)
+{
+	WT_OVFL_TRACK *track;
+	WT_OVFL_TXNC *match;
+
+	/* Without tracking information there is nothing to search. */
+	track = page->modify->ovfl_track;
+	match = track == NULL ? NULL :
+	    __ovfl_txnc_skip_search(track->ovfl_txnc, addr, addr_size);
+	if (match == NULL)
+		return (WT_NOTFOUND);
+
+	store->data = WT_OVFL_TXNC_VALUE(match);
+	store->size = match->value_size;
+	return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_add --
+ * Add a new entry to the page's list of transaction-cached overflow
+ * records.
+ *
+ * addr/addr_size is the overflow block's address, used as the skiplist
+ * key, and value/value_size the bytes to cache; both are copied into the
+ * new entry. The entry is stamped with an ID from __wt_txn_new_id, which
+ * __ovfl_txnc_wrapup later tests to decide when it can be discarded.
+ *
+ * Returns 0 on success, or an error from initializing the tracking
+ * structure, the allocation or the verbose message.
+ */
+int
+__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
+ const uint8_t *addr, size_t addr_size,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
+ size_t size;
+ u_int i, skipdepth;
+ uint8_t *p;
+
+ /* Create the page's overflow tracking structure on first use. */
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate the WT_OVFL_TXNC structure, next pointers for the skip
+ * list, room for the address and value, then copy everything into
+ * place.
+ *
+ * To minimize the WT_OVFL_TXNC structure size, the address offset
+ * and size are single bytes: that's safe because the address follows
+ * the structure (which can't be more than about 100B), and address
+ * cookies are limited to 255B.
+ */
+ size = sizeof(WT_OVFL_TXNC) +
+ skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size;
+ WT_RET(__wt_calloc(session, 1, size, &txnc));
+ /* p: first byte past the structure and its skipdepth next pointers. */
+ p = (uint8_t *)txnc +
+ sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *);
+ txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc);
+ /* NOTE(review): truncates silently if addr_size exceeds 255. */
+ txnc->addr_size = (uint8_t)addr_size;
+ memcpy(p, addr, addr_size);
+ /* The value is stored immediately after the address bytes. */
+ p += addr_size;
+ txnc->value_offset = WT_PTRDIFF32(p, txnc);
+ txnc->value_size = WT_STORE_SIZE(value_size);
+ memcpy(p, value, value_size);
+ txnc->current = __wt_txn_new_id(session);
+
+ /* Charge the page for the entry now; wrapup only ever subtracts. */
+ __wt_cache_page_inmem_incr(session, page,
+ WT_OVFL_SIZE(WT_OVFL_TXNC) + addr_size + value_size);
+
+ /* Insert the new entry into the skiplist. */
+ __ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
+ for (i = 0; i < skipdepth; ++i) {
+ txnc->next[i] = *stack[i];
+ *stack[i] = txnc;
+ }
+
+ /* The entry is already linked in if the verbose message fails. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add"));
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_free --
+ *	Release the memory of every entry on the page's transaction-cached
+ *	overflow list; a page without modify or tracking information is left
+ *	untouched.
+ */
+void
+__wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_OVFL_TXNC *doomed, *txnc;
+
+	if (page->modify == NULL || page->modify->ovfl_track == NULL)
+		return;
+
+	/*
+	 * Every entry is linked into the lowest skiplist level, so walking
+	 * that level visits them all. Step forward before freeing so we never
+	 * read from released memory.
+	 */
+	txnc = page->modify->ovfl_track->ovfl_txnc[0];
+	while (txnc != NULL) {
+		doomed = txnc;
+		txnc = txnc->next[0];
+		__wt_free(session, doomed);
+	}
+}
+
+/*
+ * __wt_ovfl_track_wrapup --
+ *	Resolve the page's overflow tracking on reconciliation success.
+ *
+ * Runs the wrapup step for each non-empty tracking list on the page: the
+ * discard list, the reuse list and, while holding the btree's overflow lock
+ * for writing, the transaction-cache list. A page without modify or
+ * tracking information is a no-op.
+ *
+ * Returns 0 on success, otherwise the first error encountered, including a
+ * failure from the transaction-cache wrapup or from releasing the lock.
+ */
+int
+__wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_DECL_RET;
+	WT_OVFL_TRACK *track;
+
+	if (page->modify == NULL || page->modify->ovfl_track == NULL)
+		return (0);
+
+	track = page->modify->ovfl_track;
+	if (track->discard != NULL)
+		WT_RET(__ovfl_discard_wrapup(session, page));
+
+	if (track->ovfl_reuse[0] != NULL)
+		WT_RET(__ovfl_reuse_wrapup(session, page));
+
+	if (track->ovfl_txnc[0] != NULL) {
+		WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
+		/*
+		 * Don't return directly from here: the lock must be released
+		 * whether or not the wrapup succeeded.
+		 */
+		ret = __ovfl_txnc_wrapup(session, page);
+		WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
+	}
+
+	/* Propagate a wrapup or unlock failure instead of discarding it. */
+	return (ret);
+}
+
+/*
+ * __wt_ovfl_track_wrapup_err --
+ *	Resolve the page's overflow tracking on reconciliation error.
+ *
+ * Runs the error-path wrapup step for the discard and reuse lists when
+ * they are non-empty, then, while holding the btree's overflow lock for
+ * writing, the regular transaction-cache wrapup. A page without modify or
+ * tracking information is a no-op.
+ *
+ * Returns 0 on success, otherwise the first error encountered, including a
+ * failure from the transaction-cache wrapup or from releasing the lock.
+ */
+int
+__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_DECL_RET;
+	WT_OVFL_TRACK *track;
+
+	if (page->modify == NULL || page->modify->ovfl_track == NULL)
+		return (0);
+
+	track = page->modify->ovfl_track;
+	if (track->discard != NULL)
+		WT_RET(__ovfl_discard_wrapup_err(session, page));
+
+	if (track->ovfl_reuse[0] != NULL)
+		WT_RET(__ovfl_reuse_wrapup_err(session, page));
+
+	if (track->ovfl_txnc[0] != NULL) {
+		WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
+		/*
+		 * Don't return directly from here: the lock must be released
+		 * whether or not the wrapup succeeded.
+		 */
+		ret = __ovfl_txnc_wrapup(session, page);
+		WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
+	}
+
+	/* Propagate a wrapup or unlock failure instead of discarding it. */
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/rec_write.c b/src/third_party/wiredtiger/src/btree/rec_write.c
new file mode 100644
index 00000000000..1b3a9a0898f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_write.c
@@ -0,0 +1,5521 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+struct __rec_boundary; typedef struct __rec_boundary WT_BOUNDARY;
+struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY;
+struct __rec_kv; typedef struct __rec_kv WT_KV;
+
+/*
+ * Reconciliation is the process of taking an in-memory page, walking each entry
+ * in the page, building a backing disk image in a temporary buffer representing
+ * that information, and writing that buffer to disk. What could be simpler?
+ *
+ * WT_RECONCILE --
+ * Information tracking a single page reconciliation.
+ */
+typedef struct {
+ WT_REF *ref; /* Page being reconciled */
+ WT_PAGE *page;
+ uint32_t flags; /* Caller's configuration */
+
+ WT_ITEM dsk; /* Temporary disk-image buffer */
+
+ /* Track whether all changes to the page are written. */
+ uint64_t max_txn;
+ uint64_t skipped_txn;
+ uint32_t orig_write_gen;
+
+ /*
+ * If page updates are skipped because they are as yet unresolved, or
+ * the page has updates we cannot discard, the page is left "dirty":
+ * the page cannot be discarded and a subsequent reconciliation will
+ * be necessary to discard the page.
+ */
+ int leave_dirty;
+
+ /*
+ * Raw compression (don't get me started, as if normal reconciliation
+ * wasn't bad enough). If an application wants absolute control over
+ * what gets written to disk, we give it a list of byte strings and it
+ * gives us back an image that becomes a file block. Because we don't
+ * know the number of items we're storing in a block until we've done
+ * a lot of work, we turn off most compression: dictionary, copy-cell,
+ * prefix and row-store internal page suffix compression are all off.
+ */
+ int raw_compression;
+ uint32_t raw_max_slots; /* Raw compression array sizes */
+ uint32_t *raw_entries; /* Raw compression slot entries */
+ uint32_t *raw_offsets; /* Raw compression slot offsets */
+ uint64_t *raw_recnos; /* Raw compression recno count */
+ WT_ITEM raw_destination; /* Raw compression destination buffer */
+
+ /*
+ * Track if reconciliation has seen any overflow items. If a leaf page
+ * with no overflow items is written, the parent page's address cell is
+ * set to the leaf-no-overflow type. This means we can delete the leaf
+ * page without reading it because we don't have to discard any overflow
+ * items it might reference.
+ *
+ * The test is per-page reconciliation, that is, once we see an
+ * overflow item on the page, all subsequent leaf pages written for the
+ * page will not be leaf-no-overflow type, regardless of whether or not
+ * they contain overflow items. In other words, leaf-no-overflow is not
+ * guaranteed to be set on every page that doesn't contain an overflow
+ * item, only that if it is set, the page contains no overflow items.
+ *
+ * The reason is because of raw compression: there's no easy/fast way to
+ * figure out if the rows selected by raw compression included overflow
+ * items, and the optimization isn't worth another pass over the data.
+ */
+ int ovfl_items;
+
+ /*
+ * Track if reconciliation of a row-store leaf page has seen empty (zero
+ * length) values. We don't write out anything for empty values, so if
+ * there are empty values on a page, we have to make two passes over the
+ * page when it's read to figure out how many keys it has, expensive in
+ * the common case of no empty values and (entries / 2) keys. Likewise,
+ * a page with only empty values is another common data set, and keys on
+ * that page will be equal to the number of entries. In both cases, set
+ * a flag in the page's on-disk header.
+ *
+ * The test is per-page reconciliation as described above for the
+ * overflow-item test.
+ */
+ int all_empty_value, any_empty_value;
+
+ /*
+ * Reconciliation gets tricky if we have to split a page, which happens
+ * when the disk image we create exceeds the page type's maximum disk
+ * image size.
+ *
+ * First, the sizes of the page we're building. If WiredTiger is doing
+ * page layout, page_size is the same as page_size_max. We accumulate
+ * the maximum page size of raw data and when we reach that size, we
+ * split the page into multiple chunks, eventually compressing those
+ * chunks. When the application is doing page layout (raw compression
+ * is configured), page_size can continue to grow past page_size_max,
+ * and we keep accumulating raw data until the raw compression callback
+ * accepts it.
+ */
+ uint32_t page_size; /* Current page size */
+ uint32_t page_size_max; /* Maximum on-disk page size */
+
+ /*
+ * Second, the split size: if we're doing the page layout, split to a
+ * smaller-than-maximum page size when a split is required so we don't
+ * repeatedly split a packed page.
+ */
+ uint32_t split_size; /* Split page size */
+
+ /*
+ * The problem with splits is we've done a lot of work by the time we
+ * realize we're going to have to split, we don't want to start over.
+ *
+ * To keep from having to start over when we hit the maximum page size,
+ * we track the page information when we approach a split boundary.
+ * If we eventually have to split, we walk this structure and pretend
+ * we were splitting all along. After that, we continue to append to
+ * this structure, and eventually walk it to create a new internal page
+ * that references all of our split pages.
+ */
+ struct __rec_boundary {
+ /*
+ * The start field records location in the initial split buffer,
+ * that is, the first byte of the split chunk recorded before we
+ * decide to split a page; the offset between the first byte of
+ * chunk[0] and the first byte of chunk[1] is chunk[0]'s length.
+ *
+ * Once we split a page, we stop filling in the start field, as
+ * we're writing the split chunks as we find them.
+ */
+ uint8_t *start; /* Split's first byte */
+
+ /*
+ * The recno and entries fields are the starting record number
+ * of the split chunk (for column-store splits), and the number
+ * of entries in the split chunk. These fields are used both
+ * to write the split chunk, and to create a new internal page
+ * to reference the split pages.
+ */
+ uint64_t recno; /* Split's starting record */
+ uint32_t entries; /* Split's entries */
+
+ WT_ADDR addr; /* Split's written location */
+ uint32_t size; /* Split's size */
+ uint32_t cksum; /* Split's checksum */
+ void *dsk; /* Split's disk image */
+
+ /*
+ * When busy pages get large, we need to be able to evict them
+ * even when they contain unresolved updates, or updates which
+ * cannot be evicted because of running transactions. In such
+ * cases, break the page into multiple blocks, write the blocks
+ * that can be evicted, saving lists of updates for blocks that
+ * cannot be evicted, then re-instantiate the blocks that cannot
+ * be evicted as new, in-memory pages, restoring the updates on
+ * those pages.
+ */
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_next;
+ size_t skip_allocated;
+
+ /*
+ * The key for a row-store page; no column-store key is needed
+ * because the page's recno, stored in the recno field, is the
+ * column-store key.
+ */
+ WT_ITEM key; /* Promoted row-store key */
+
+ /*
+ * During wrapup, after reconciling the root page, we write a
+ * final block as part of a checkpoint. If raw compression
+ * was configured, that block may have already been compressed.
+ */
+ int already_compressed;
+ } *bnd; /* Saved boundaries */
+ uint32_t bnd_next; /* Next boundary slot */
+ uint32_t bnd_next_max; /* Maximum boundary slots used */
+ size_t bnd_entries; /* Total boundary slots */
+ size_t bnd_allocated; /* Bytes allocated */
+
+ /*
+ * We track the total number of page entries copied into split chunks
+ * so we can easily figure out how many entries in the current split
+ * chunk.
+ */
+ uint32_t total_entries; /* Total entries in splits */
+
+ /*
+ * And there's state information as to where in this process we are:
+ * (1) tracking split boundaries because we can still fit more split
+ * chunks into the maximum page size, (2) tracking the maximum page
+ * size boundary because we can't fit any more split chunks into the
+ * maximum page size, (3) not performing boundary checks because it's
+ * either not useful with the current page size configuration, or
+ * because we've already been forced to split.
+ */
+ enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */
+ SPLIT_MAX=1, /* Next: the maximum page boundary */
+ SPLIT_TRACKING_OFF=2, /* No boundary checks */
+ SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */
+ bnd_state;
+
+ /*
+ * We track current information about the current record number, the
+ * number of entries copied into the temporary buffer, where we are
+ * in the temporary buffer, and how much memory remains. Those items
+ * are packaged here rather than passing pointers to stack locations
+ * around the code.
+ */
+ uint64_t recno; /* Current record number */
+ uint32_t entries; /* Current number of entries */
+ uint8_t *first_free; /* Current first free byte */
+ size_t space_avail; /* Remaining space in this chunk */
+
+ /*
+ * While reviewing updates for each page, we store skipped updates here,
+ * and then move them to per-block areas as the blocks are defined.
+ */
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_next;
+ size_t skip_allocated;
+
+ /*
+ * We don't need to keep the 0th key around on internal pages, the
+ * search code ignores them as nothing can sort less by definition.
+ * There's some trickiness here, see the code for comments on how
+ * these fields work.
+ */
+ int cell_zero; /* Row-store internal page 0th key */
+
+ /*
+ * WT_DICTIONARY --
+ * We optionally build a dictionary of row-store values for leaf
+ * pages. Where two value cells are identical, only write the value
+ * once, the second and subsequent copies point to the original cell.
+ * The dictionary is fixed size, but organized in a skip-list to make
+ * searches faster.
+ */
+ struct __rec_dictionary {
+ uint64_t hash; /* Hash value */
+ void *cell; /* Matching cell */
+
+ u_int depth; /* Skiplist */
+ WT_DICTIONARY *next[0];
+ } **dictionary; /* Dictionary */
+ u_int dictionary_next, dictionary_slots; /* Next, max entries */
+ /* Skiplist head. */
+ WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH];
+
+ /*
+ * WT_KV--
+ * An on-page key/value item we're building.
+ */
+ struct __rec_kv {
+ WT_ITEM buf; /* Data */
+ WT_CELL cell; /* Cell and cell's length */
+ size_t cell_len;
+ size_t len; /* Total length of cell + data */
+ } k, v; /* Key/Value being built */
+
+ WT_ITEM *cur, _cur; /* Key/Value being built */
+ WT_ITEM *last, _last; /* Last key/value built */
+
+ int key_pfx_compress; /* If can prefix-compress next key */
+ int key_pfx_compress_conf; /* If prefix compression configured */
+ int key_sfx_compress; /* If can suffix-compress next key */
+ int key_sfx_compress_conf; /* If suffix compression configured */
+
+ int is_bulk_load; /* If it's a bulk load */
+
+ WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
+
+ int tested_ref_state; /* Debugging information */
+} WT_RECONCILE;
+
+static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int);
+static void __rec_cell_build_addr(
+ WT_RECONCILE *, const void *, size_t, u_int, uint64_t);
+static int __rec_cell_build_int_key(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, int *);
+static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, int *);
+static int __rec_cell_build_ovfl(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_KV *, uint8_t, uint64_t);
+static int __rec_cell_build_val(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, uint64_t);
+static int __rec_child_deleted(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *, int *);
+static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_fix_slvg(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_var(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
+ WT_SALVAGE_COOKIE *, WT_ITEM *, int, uint8_t, uint64_t);
+static int __rec_destroy_session(WT_SESSION_IMPL *);
+static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
+static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_row_leaf(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_row_leaf_insert(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *);
+static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
+static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_row_promote(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
+static int __rec_split_write(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int);
+static int __rec_write_init(WT_SESSION_IMPL *,
+ WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
+static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_write_wrapup_err(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+
+static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int);
+static int __rec_dictionary_lookup(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **);
+static void __rec_dictionary_reset(WT_RECONCILE *);
+
+/*
+ * __wt_rec_write --
+ * Reconcile an in-memory page into its on-disk format, and write it.
+ */
+int
+__wt_rec_write(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_RECONCILE *r;
+ int locked;
+
+ conn = S2C(session);
+ page = ref->page;
+ mod = page->modify;
+
+ /* We shouldn't get called with a clean page, that's an error. */
+ if (!__wt_page_is_modified(page))
+ WT_RET_MSG(session, WT_ERROR,
+ "Attempt to reconcile a clean page.");
+
+ WT_RET(__wt_verbose(session,
+ WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)));
+ WT_STAT_FAST_CONN_INCR(session, rec_pages);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages);
+ if (LF_ISSET(WT_EVICTING)) {
+ WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
+ }
+
+ /* Record the most recent transaction ID we will *not* write. */
+ mod->disk_snap_min = session->txn.snap_min;
+
+ /* Initialize the reconciliation structure for each new run. */
+ WT_RET(__rec_write_init(
+ session, ref, flags, salvage, &session->reconcile));
+ r = session->reconcile;
+
+ /*
+ * The compaction process looks at the page's modification information;
+ * if compaction is running, lock the page down.
+ *
+ * Otherwise, flip on the scanning flag: obsolete updates cannot be
+ * freed while reconciliation is in progress.
+ */
+ locked = 0;
+ if (conn->compact_in_memory_pass) {
+ locked = 1;
+ WT_PAGE_LOCK(session, page);
+ } else
+ for (;;) {
+ F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+ if (ret == 0)
+ break;
+ __wt_yield();
+ }
+
+ /* Reconcile the page. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (salvage != NULL)
+ ret = __rec_col_fix_slvg(session, r, page, salvage);
+ else
+ ret = __rec_col_fix(session, r, page);
+ break;
+ case WT_PAGE_COL_INT:
+ ret = __rec_col_int(session, r, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __rec_col_var(session, r, page, salvage);
+ break;
+ case WT_PAGE_ROW_INT:
+ ret = __rec_row_int(session, r, page);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __rec_row_leaf(session, r, page, salvage);
+ break;
+ WT_ILLEGAL_VALUE_SET(session);
+ }
+
+ /* Wrap up the page reconciliation. */
+ if (ret == 0)
+ ret = __rec_write_wrapup(session, r, page);
+ else
+ WT_TRET(__rec_write_wrapup_err(session, r, page));
+
+ /* Release the page lock if we're holding one. */
+ if (locked)
+ WT_PAGE_UNLOCK(session, page);
+ else
+ F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
+
+ /*
+ * Clean up the boundary structures: some workloads result in millions
+ * of these structures, and if associated with some random session that
+ * got roped into doing forced eviction, they won't be discarded for the
+ * life of the session.
+ */
+ __rec_bnd_cleanup(session, r, 0);
+
+ WT_RET(ret);
+
+ /*
+ * Root pages are special, splits have to be done, we can't put it off
+ * as the parent's problem any more.
+ */
+ if (__wt_ref_is_root(ref))
+ return (__rec_root_write(session, page, flags));
+
+ /*
+ * Otherwise, mark the page's parent dirty.
+ * Don't mark the tree dirty: if this reconciliation is in service of a
+ * checkpoint, it's cleared the tree's dirty flag, and we don't want to
+ * set it again as part of that walk.
+ */
+ return (__wt_page_parent_modify_set(session, ref, 1));
+}
+
+/*
+ * __rec_root_write --
+ *	Handle the write of a root page.
+ *
+ *	session: the calling session.
+ *	page: the root page that was just reconciled.
+ *	flags: reconciliation flags, passed through to the write of any newly
+ *	created root page.
+ *
+ *	Returns 0 if the root reconciled to an empty page or a single
+ *	replacement block.  If the root split into multiple blocks, a new
+ *	internal root page referencing those blocks is built and written via
+ *	__wt_rec_write, and that call's result is returned.
+ */
+static int
+__rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
+{
+	WT_DECL_RET;
+	WT_PAGE *next;
+	WT_PAGE_INDEX *pindex;
+	WT_PAGE_MODIFY *mod;
+	WT_REF fake_ref;
+	uint32_t i;
+
+	mod = page->modify;
+
+	/*
+	 * If a single root page was written (either an empty page or there was
+	 * a 1-for-1 page swap), we've written root and checkpoint, we're done.
+	 * If the root page split, write the resulting WT_REF array. We already
+	 * have an infrastructure for writing pages, create a fake root page and
+	 * write it instead of adding code to write blocks based on the list of
+	 * blocks resulting from a multiblock reconciliation.
+	 */
+	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+	case WT_PM_REC_EMPTY:				/* Page is empty */
+	case WT_PM_REC_REPLACE:				/* 1-for-1 page swap */
+		return (0);
+	case WT_PM_REC_MULTIBLOCK:			/* Multiple blocks */
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+
+	WT_RET(__wt_verbose(session, WT_VERB_SPLIT,
+	    "root page split -> %" PRIu32 " pages", mod->mod_multi_entries));
+
+	/*
+	 * Create a new root page, initialize the array of child references,
+	 * mark it dirty, then write it.
+	 */
+	switch (page->type) {
+	case WT_PAGE_COL_INT:
+		WT_RET(__wt_page_alloc(session,
+		    WT_PAGE_COL_INT, 1, mod->mod_multi_entries, 1, &next));
+		break;
+	case WT_PAGE_ROW_INT:
+		WT_RET(__wt_page_alloc(session,
+		    WT_PAGE_ROW_INT, 0, mod->mod_multi_entries, 1, &next));
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+
+	/* Turn each block written for the old root into a child reference. */
+	pindex = WT_INTL_INDEX_COPY(next);
+	for (i = 0; i < mod->mod_multi_entries; ++i) {
+		WT_ERR(__wt_multi_to_ref(session,
+		    next, &mod->mod_multi[i], &pindex->index[i], NULL));
+		pindex->index[i]->home = next;
+	}
+
+	WT_ERR(__wt_page_modify_init(session, next));
+	__wt_page_only_modify_set(session, next);
+
+	/*
+	 * We maintain a list of pages written for the root in order to free the
+	 * backing blocks the next time the root is written.
+	 *
+	 * Link the new page into that list only after the last step that can
+	 * jump to the error label: the error path discards the new page, and
+	 * linking it earlier would leave the list referencing freed memory.
+	 */
+	mod->mod_root_split = next;
+
+	/*
+	 * Fake up a reference structure, and write the next root page.
+	 */
+	__wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT);
+	return (__wt_rec_write(session, &fake_ref, NULL, flags));
+
+err:	__wt_page_out(session, &next);
+	return (ret);
+}
+
+/*
+ * __rec_raw_compression_config --
+ *	Decide whether raw compression applies to this reconciliation;
+ *	returns 1 if it does, 0 if it does not.
+ */
+static inline int
+__rec_raw_compression_config(
+    WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+	WT_BTREE *tree;
+
+	tree = S2BT(session);
+
+	/* Without a configured raw-compression callback there's nothing to do. */
+	if (tree->compressor == NULL || tree->compressor->compress_raw == NULL)
+		return (0);
+
+	/*
+	 * Raw compression is used only when every one of these holds:
+	 *
+	 * - the page is row-store or variable-length column-store, that is,
+	 *   not a fixed-length column-store page;
+	 * - dictionary compression is off (technically the raw callback could
+	 *   still be used on column-store variable-length internal pages,
+	 *   since dictionary compression only applies to column-store leaf
+	 *   pages, but that seems an unlikely use case);
+	 * - prefix compression is off;
+	 * - we are not salvaging: pages can't be allowed to split during
+	 *   salvage, and raw compression has no point if it can't manipulate
+	 *   the page size.
+	 */
+	return (page->type != WT_PAGE_COL_FIX &&
+	    tree->dictionary == 0 &&
+	    tree->prefix_compression == 0 &&
+	    salvage == NULL);
+}
+
+/*
+ * __rec_write_init --
+ * Initialize the reconciliation structure.
+ *
+ * session: the calling session.
+ * ref: reference to the page about to be reconciled.
+ * flags: reconciliation flags, stored in the structure.
+ * salvage: salvage cookie or NULL, stored in the structure.
+ * reconcilep: address of a WT_RECONCILE pointer; if the pointer is NULL a
+ * structure is allocated and stored through it, otherwise the existing
+ * structure is reused and its per-run fields are reset.
+ *
+ * Returns 0 on success, or the error from allocating the structure or
+ * initializing the dictionary.
+ */
+static int
+__rec_write_init(WT_SESSION_IMPL *session,
+ WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_RECONCILE *r;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ /* One-time setup, done only when no structure exists yet. */
+ if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
+ WT_RET(__wt_calloc_def(session, 1, &r));
+
+ *(WT_RECONCILE **)reconcilep = r;
+ session->reconcile_cleanup = __rec_destroy_session;
+
+ /* Connect pointers/buffers. */
+ r->cur = &r->_cur;
+ r->last = &r->_last;
+
+ /* Disk buffers need to be aligned for writing. */
+ F_SET(&r->dsk, WT_ITEM_ALIGNED);
+ }
+
+ /* Remember the configuration. */
+ r->ref = ref;
+ r->page = page;
+ r->flags = flags;
+
+ /* Track if the page can be marked clean. */
+ r->leave_dirty = 0;
+
+ /* Raw compression. */
+ r->raw_compression =
+ __rec_raw_compression_config(session, page, salvage);
+ r->raw_destination.flags = WT_ITEM_ALIGNED;
+
+ /* Track overflow items. */
+ r->ovfl_items = 0;
+
+ /* Track empty values. */
+ r->all_empty_value = 1;
+ r->any_empty_value = 0;
+
+ /* The list of cached, skipped updates. */
+ r->skip_next = 0;
+
+ /*
+ * Dictionary compression only writes repeated values once. We grow
+ * the dictionary as necessary, always using the largest size we've
+ * seen.
+ *
+ * Reset the dictionary.
+ *
+ * Sanity check the size: 100 slots is the smallest dictionary we use.
+ */
+ if (btree->dictionary != 0 && btree->dictionary > r->dictionary_slots)
+ WT_RET(__rec_dictionary_init(session,
+ r, btree->dictionary < 100 ? 100 : btree->dictionary));
+ __rec_dictionary_reset(r);
+
+ /*
+ * Suffix compression shortens internal page keys by discarding trailing
+ * bytes that aren't necessary for tree navigation. We don't do suffix
+ * compression if there is a custom collator because we don't know what
+ * bytes a custom collator might use. Some custom collators (for
+ * example, a collator implementing reverse ordering of strings), won't
+ * have any problem with suffix compression: if there's ever a reason to
+ * implement suffix compression for custom collators, we can add a
+ * setting to the collator, configured when the collator is added, that
+ * turns on suffix compression.
+ *
+ * The raw compression routines don't even consider suffix compression,
+ * but it doesn't hurt to confirm that.
+ */
+ r->key_sfx_compress_conf = 0;
+ if (btree->collator == NULL &&
+ btree->internal_key_truncate && !r->raw_compression)
+ r->key_sfx_compress_conf = 1;
+
+ /*
+ * Prefix compression discards repeated prefix bytes from row-store leaf
+ * page keys.
+ */
+ r->key_pfx_compress_conf = 0;
+ if (btree->prefix_compression && page->type == WT_PAGE_ROW_LEAF)
+ r->key_pfx_compress_conf = 1;
+
+ r->salvage = salvage;
+
+ /* Save the page's write generation before reading the page. */
+ WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Running transactions may update the page after we write it, so
+ * this is the highest ID we can be confident we will see.
+ */
+ r->skipped_txn = S2C(session)->txn_global.last_running;
+
+ return (0);
+}
+
+/*
+ * __rec_destroy --
+ *	Release a reconciliation structure and everything it owns, clearing
+ *	the caller's pointer to it.  A NULL structure is a no-op.
+ */
+static void
+__rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
+{
+	WT_RECONCILE **slotp, *rec;
+
+	slotp = (WT_RECONCILE **)reconcilep;
+	rec = *slotp;
+	if (rec == NULL)
+		return;
+
+	/* Detach the structure from its owner before tearing it down. */
+	*slotp = NULL;
+
+	/* The disk image buffer. */
+	__wt_buf_free(session, &rec->dsk);
+
+	/* Raw compression work areas. */
+	__wt_free(session, rec->raw_entries);
+	__wt_free(session, rec->raw_offsets);
+	__wt_free(session, rec->raw_recnos);
+	__wt_buf_free(session, &rec->raw_destination);
+
+	/* Boundary structures: discard all of them. */
+	__rec_bnd_cleanup(session, rec, 1);
+
+	/* The list of skipped updates. */
+	__wt_free(session, rec->skip);
+
+	/* Key/value build buffers and the current/last key buffers. */
+	__wt_buf_free(session, &rec->k.buf);
+	__wt_buf_free(session, &rec->v.buf);
+	__wt_buf_free(session, &rec->_cur);
+	__wt_buf_free(session, &rec->_last);
+
+	/* The dictionary. */
+	__rec_dictionary_free(session, rec);
+
+	/* Finally, the structure itself. */
+	__wt_free(session, rec);
+}
+
+/*
+ * __rec_destroy_session --
+ *	Session-level wrapper: discard the session's cached reconciliation
+ *	structure.  Always returns 0.
+ */
+static int
+__rec_destroy_session(WT_SESSION_IMPL *session)
+{
+	void *slot;
+
+	/* The destroy function takes the address of the structure pointer. */
+	slot = &session->reconcile;
+	__rec_destroy(session, slot);
+
+	return (0);
+}
+
+/*
+ * __rec_bnd_cleanup --
+ *	Clean up the boundary structure array: either release the per-run
+ *	memory of the entries that may have been used, or discard the whole
+ *	array.
+ */
+static void
+__rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
+{
+	WT_BOUNDARY *b;
+	uint32_t limit, n;
+
+	/* Nothing was ever allocated. */
+	if (r->bnd == NULL)
+		return;
+
+	/*
+	 * Normal cleanup: discard only the memory we won't reuse in the next
+	 * reconciliation.
+	 *
+	 * The boundary-next field points to the next boundary structure we
+	 * were going to use, but there's no requirement that value be
+	 * incremented before reconciliation updates the structure it points
+	 * to, that is, there's no guarantee elements of the next boundary
+	 * structure are still unchanged.  Be defensive, clean up the "next"
+	 * structure as well as the ones we know we used.
+	 */
+	if (!destroy && r->bnd_entries <= 10 * 1000) {
+		limit = r->bnd_next;
+		if (limit < r->bnd_entries)
+			++limit;
+		for (n = 0; n < limit; ++n) {
+			b = &r->bnd[n];
+			__wt_free(session, b->addr.addr);
+			__wt_free(session, b->dsk);
+			__wt_free(session, b->skip);
+		}
+		return;
+	}
+
+	/*
+	 * Destruction, or an oversized array: discard everything.
+	 *
+	 * During some big-page evictions we have seen boundary arrays that
+	 * have millions of elements.  That should not be a normal event, but
+	 * if the memory is associated with a random session, it won't be
+	 * discarded until the session is closed.  If there are more than
+	 * 10,000 boundary structure elements, destroy the boundary array and
+	 * we'll start over.
+	 */
+	for (n = 0; n < r->bnd_entries; ++n) {
+		b = &r->bnd[n];
+		__wt_free(session, b->addr.addr);
+		__wt_free(session, b->dsk);
+		__wt_free(session, b->skip);
+		__wt_buf_free(session, &b->key);
+	}
+	__wt_free(session, r->bnd);
+	r->bnd_next = 0;
+	r->bnd_entries = r->bnd_allocated = 0;
+}
+
+/*
+ * __rec_skip_update_save --
+ *	Append an insert/row pair to the reconciliation's list of skipped
+ *	WT_UPDATE lists so it can be restored later.  Returns 0, or the error
+ *	from growing the list.
+ */
+static int
+__rec_skip_update_save(
+    WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip)
+{
+	WT_DECL_RET;
+
+	/* Make room for one more entry. */
+	if ((ret = __wt_realloc_def(
+	    session, &r->skip_allocated, r->skip_next + 1, &r->skip)) != 0)
+		return (ret);
+
+	/* Fill in the new slot, then publish it by bumping the count. */
+	r->skip[r->skip_next].rip = rip;
+	r->skip[r->skip_next].ins = ins;
+	r->skip_next += 1;
+
+	return (0);
+}
+
+/*
+ * __rec_skip_update_move --
+ *	Transfer one skipped WT_UPDATE list entry from the per-page cache to
+ *	a specific block's list, clearing the source entry.  Returns 0, or the
+ *	error from growing the block's list.
+ */
+static int
+__rec_skip_update_move(
+    WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip)
+{
+	WT_DECL_RET;
+
+	/* Make room for one more entry on the block's list. */
+	if ((ret = __wt_realloc_def(
+	    session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip)) != 0)
+		return (ret);
+
+	/* Copy the entry across and count it. */
+	bnd->skip[bnd->skip_next] = *skip;
+	bnd->skip_next += 1;
+
+	/* The source entry no longer references the update list. */
+	skip->rip = NULL;
+	skip->ins = NULL;
+
+	return (0);
+}
+
+/*
+ * __rec_txn_read --
+ * Return the first visible update in a list (or NULL if none are visible),
+ * set a flag if any updates were skipped, track the maximum transaction ID on
+ * the page.
+ *
+ * session: the calling session.
+ * r: the reconciliation structure; leave_dirty, max_txn and skipped_txn may
+ * be updated, and the update list may be saved on its skip list.
+ * ins: the WT_INSERT whose update list is examined, or NULL to use the
+ * on-page row-store update list for rip.
+ * rip: the on-page row, used when ins is NULL.
+ * vpack: the unpacked on-page value cell, or NULL.
+ * updp: set to the chosen update, or NULL.
+ *
+ * Returns 0 on success; EBUSY when evicting and skipped updates cannot be
+ * saved for restoration; panics with EINVAL if WT_SKIP_UPDATE_ERR is set and
+ * an update would be skipped; otherwise an error from a called function.
+ */
+static inline int
+__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
+{
+ WT_ITEM ovfl;
+ WT_PAGE *page;
+ WT_UPDATE *upd, *upd_list, *upd_ovfl;
+ size_t notused;
+ uint64_t max_txn, min_txn, txnid;
+ int skipped;
+
+ *updp = NULL;
+
+ page = r->page;
+
+ /*
+ * If we're called with an WT_INSERT reference, use its WT_UPDATE
+ * list, else is an on-page row-store WT_UPDATE list.
+ */
+ upd_list = ins == NULL ? WT_ROW_UPDATE(page, rip) : ins->upd;
+ skipped = 0;
+
+ for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list;
+ upd != NULL; upd = upd->next) {
+ /* Aborted updates are ignored entirely. */
+ if ((txnid = upd->txnid) == WT_TXN_ABORTED)
+ continue;
+
+ /* Track the largest/smallest transaction IDs on the list. */
+ if (TXNID_LT(max_txn, txnid))
+ max_txn = txnid;
+ if (TXNID_LT(txnid, min_txn))
+ min_txn = txnid;
+ /*
+ * Lower the reconciliation's skipped-transaction ID to any
+ * smaller ID that isn't yet globally visible.
+ */
+ if (TXNID_LT(txnid, r->skipped_txn) &&
+ !__wt_txn_visible_all(session, txnid))
+ r->skipped_txn = txnid;
+
+ /*
+ * Record whether any updates were skipped on the way to finding
+ * the first visible update.
+ *
+ * If updates were skipped before the one being written, future
+ * reads without intervening modifications to the page could
+ * see a different value; if no updates were skipped, the page
+ * can safely be marked clean and does not need to be
+ * reconciled until modified again.
+ */
+ if (*updp == NULL) {
+ if (__wt_txn_visible(session, txnid))
+ *updp = upd;
+ else
+ skipped = 1;
+ }
+ }
+
+ /*
+ * Track the maximum transaction ID in the page. We store this in the
+ * page at the end of reconciliation if no updates are skipped, it's
+ * used to avoid evicting clean pages from memory with changes required
+ * to satisfy a snapshot read.
+ */
+ if (TXNID_LT(r->max_txn, max_txn))
+ r->max_txn = max_txn;
+
+ /*
+ * If all updates are globally visible and no updates were skipped, the
+ * page can be marked clean and we're done, regardless of whether we're
+ * evicting or checkpointing.
+ *
+ * The oldest transaction ID may have moved while we were scanning the
+ * page, so it is possible to skip an update but then find that by the
+ * end of the scan, all updates are stable.
+ */
+ if (__wt_txn_visible_all(session, max_txn) && !skipped)
+ return (0);
+
+ /*
+ * If some updates are not globally visible, or were skipped, the page
+ * cannot be marked clean.
+ */
+ r->leave_dirty = 1;
+
+ /* If we're not evicting, we're done, we know what we'll write. */
+ if (!F_ISSET(r, WT_EVICTING))
+ return (0);
+
+ /* In some cases, there had better not be any updates we can't write. */
+ if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
+
+ /*
+ * If evicting and we aren't able to save/restore the not-yet-visible
+ * updates, the page can't be evicted.
+ */
+ if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
+ return (EBUSY);
+
+ /*
+ * Evicting a page with not-yet-visible updates: save and restore the
+ * list of updates on a newly instantiated page.
+ *
+ * The order of the updates on the list matters so we can't move only
+ * the unresolved updates, we have to move the entire update list.
+ *
+ * Clear the returned update so our caller ignores the key/value pair
+ * in the case of an insert/append entry (everything we need is in the
+ * update list), and otherwise writes the original on-page key/value
+ * pair to which the update list applies.
+ */
+ *updp = NULL;
+
+ /*
+ * Handle the case where we don't want to write an original on-page
+ * value item to disk because it's been updated or removed.
+ *
+ * Here's the deal: an overflow value was updated or removed and its
+ * backing blocks freed. If any transaction in the system might still
+ * read the value, a copy was cached in page reconciliation tracking
+ * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction
+ * then chose the page and we're splitting it up in order to push parts
+ * of it out of memory.
+ *
+ * We could write the original on-page value item to disk... if we had
+ * a copy. The cache may not have a copy (a globally visible update
+ * would have kept a value from ever being cached), or an update that
+ * subsequently became globally visible could cause a cached value to
+ * be discarded. Either way, once there's a globally visible update, we
+ * may not have the value.
+ *
+ * Fortunately, if there's a globally visible update we don't care about
+ * the original version, so we simply ignore it, no transaction can ever
+ * try and read it. If there isn't a globally visible update, there had
+ * better be a cached value.
+ *
+ * In the latter case, we could write the value out to disk, but (1) we
+ * are planning on re-instantiating this page in memory, it isn't going
+ * to disk, and (2) the value item is eventually going to be discarded,
+ * that seems like a waste of a write. Instead, find the cached value
+ * and append it to the update list we're saving for later restoration.
+ */
+ if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
+ !__wt_txn_visible_all(session, min_txn)) {
+ WT_RET(__wt_ovfl_txnc_search(
+ page, vpack->data, vpack->size, &ovfl));
+ /*
+ * Create an update structure with an impossibly low transaction
+ * ID and append it to the update list we're about to save.
+ * Restoring that update list when this page is re-instantiated
+ * creates an update for the key/value pair visible to every
+ * running transaction in the system, ensuring the on-page value
+ * will be ignored.
+ */
+ WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, &notused));
+ upd_ovfl->txnid = WT_TXN_NONE;
+ /* Walk to the tail of the list and link the new update there. */
+ for (upd = upd_list; upd->next != NULL; upd = upd->next)
+ ;
+ upd->next = upd_ovfl;
+ }
+
+ return (__rec_skip_update_save(session, r, ins, rip));
+}
+
+/*
+ * CHILD_RELEASE --
+ * Macros to clean up during internal-page reconciliation, releasing the
+ * hazard pointer we're holding on child pages.
+ *
+ * CHILD_RELEASE does nothing unless the hazard argument is non-zero; in that
+ * case it clears the argument and releases the page, folding any release
+ * error into the enclosing function's "ret" variable via WT_TRET.
+ *
+ * CHILD_RELEASE_ERR does the same and then hands "ret" to WT_ERR, so the
+ * enclosing function needs both a "ret" variable and an error label.
+ */
+#undef CHILD_RELEASE
+#define CHILD_RELEASE(session, hazard, ref) do { \
+ if (hazard) { \
+ hazard = 0; \
+ WT_TRET( \
+ __wt_page_release(session, ref, WT_READ_NO_EVICT)); \
+ } \
+} while (0)
+#undef CHILD_RELEASE_ERR
+#define CHILD_RELEASE_ERR(session, hazard, ref) do { \
+ CHILD_RELEASE(session, hazard, ref); \
+ WT_ERR(ret); \
+} while (0)
+
+/*
+ * __rec_child_modify --
+ *	Return if the internal page's child references any modifications.
+ *
+ *	session: the calling session.
+ *	r: the reconciliation structure; the reference state tested is
+ *	recorded in r->tested_ref_state.
+ *	ref: the child reference being examined.
+ *	hazardp: set to 1 if a hazard pointer was acquired on the child that
+ *	the caller must release, otherwise 0.
+ *	statep: set to 0 if the child needs no special handling, or to one of
+ *	WT_CHILD_IGNORE, WT_CHILD_MODIFIED or WT_CHILD_PROXY.
+ *
+ *	Returns 0 on success, otherwise an error from handling a deleted
+ *	child or from acquiring the child page.
+ */
+static int
+__rec_child_modify(WT_SESSION_IMPL *session,
+    WT_RECONCILE *r, WT_REF *ref, int *hazardp, int *statep)
+{
+	WT_DECL_RET;
+	WT_PAGE_MODIFY *mod;
+
+	/* We may acquire a hazard pointer our caller must release. */
+	*hazardp = 0;
+
+#define	WT_CHILD_IGNORE		1		/* Deleted child: ignore */
+#define	WT_CHILD_MODIFIED	2		/* Modified child */
+#define	WT_CHILD_PROXY		3		/* Deleted child: proxy */
+	*statep = 0;
+
+	/*
+	 * This function is called when walking an internal page to decide how
+	 * to handle child pages referenced by the internal page, specifically
+	 * if the child page is to be merged into its parent.
+	 *
+	 * Internal pages are reconciled for two reasons: first, when evicting
+	 * an internal page, second by the checkpoint code when writing internal
+	 * pages.  During eviction, the subtree is locked down so all pages
+	 * should be in the WT_REF_DISK or WT_REF_LOCKED state. During
+	 * checkpoint, any eviction that might affect our review of an internal
+	 * page is prohibited, however, as the subtree is not reserved for our
+	 * exclusive use, there are other page states that must be considered.
+	 *
+	 * A "break" out of the switch yields and re-reads the state.
+	 */
+	for (;; __wt_yield())
+		switch (r->tested_ref_state = ref->state) {
+		case WT_REF_DISK:
+			/* On disk, not modified by definition. */
+			goto done;
+
+		case WT_REF_DELETED:
+			/*
+			 * The child is in a deleted state.
+			 *
+			 * It's possible the state could change underneath us as
+			 * the page is read in, and we can race between checking
+			 * for a deleted state and looking at the transaction ID
+			 * to see if the delete is visible to us.  Lock down the
+			 * structure.
+			 */
+			if (!WT_ATOMIC_CAS4(
+			    ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+				break;
+			ret = __rec_child_deleted(session, r, ref, statep);
+			WT_PUBLISH(ref->state, WT_REF_DELETED);
+			goto done;
+
+		case WT_REF_LOCKED:
+			/*
+			 * Locked.
+			 *
+			 * If evicting, the evicted page's subtree, including
+			 * this child, was selected for eviction by us and the
+			 * state is stable until we reset it, it's an in-memory
+			 * state.  This is the expected state for a child being
+			 * merged into a page (where the page was selected by
+			 * the eviction server for eviction).
+			 */
+			if (F_ISSET(r, WT_EVICTING))
+				goto in_memory;
+
+			/*
+			 * If called during checkpoint, the child is being
+			 * considered by the eviction server or the child is a
+			 * fast-delete page being read.  The eviction may have
+			 * started before the checkpoint and so we must wait
+			 * for the eviction to be resolved.  I suspect we could
+			 * handle fast-delete reads, but we can't distinguish
+			 * between the two and fast-delete reads aren't expected
+			 * to be common.
+			 */
+			break;
+
+		case WT_REF_MEM:
+			/*
+			 * In memory.
+			 *
+			 * If evicting, the evicted page's subtree, including
+			 * this child, was selected for eviction by us and the
+			 * state is stable until we reset it, it's an in-memory
+			 * state.  This is the expected state for a child being
+			 * merged into a page (where the page belongs to a file
+			 * being discarded from the cache during close).
+			 */
+			if (F_ISSET(r, WT_EVICTING))
+				goto in_memory;
+
+			/*
+			 * If called during checkpoint, acquire a hazard pointer
+			 * so the child isn't evicted, it's an in-memory case.
+			 *
+			 * This call cannot return split/restart, dirty page
+			 * eviction is shutout during checkpoint, all splits in
+			 * process will have completed before we walk any pages
+			 * for checkpoint.
+			 */
+			ret = __wt_page_in(session, ref,
+			    WT_READ_CACHE | WT_READ_NO_EVICT |
+			    WT_READ_NO_GEN | WT_READ_NO_WAIT);
+			if (ret == WT_NOTFOUND) {
+				/* Couldn't get the page right now: retry. */
+				ret = 0;
+				break;
+			}
+
+			/*
+			 * Any other failure means we don't hold a hazard
+			 * pointer: return without claiming one and without
+			 * looking at the child page.
+			 */
+			WT_RET(ret);
+
+			*hazardp = 1;
+			goto in_memory;
+
+		case WT_REF_READING:
+			/*
+			 * Being read, not modified by definition.
+			 *
+			 * We should never be here during eviction, a child page
+			 * in this state within an evicted page's subtree would
+			 * have caused normal eviction to fail, and exclusive
+			 * eviction shouldn't ever see pages being read.
+			 */
+			WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+			goto done;
+
+		case WT_REF_SPLIT:
+			/*
+			 * The page was split out from under us.
+			 *
+			 * We should never be here during eviction, a child page
+			 * in this state within an evicted page's subtree would
+			 * have caused eviction to fail.
+			 *
+			 * We should never be here during checkpoint, dirty page
+			 * eviction is shutout during checkpoint, all splits in
+			 * process will have completed before we walk any pages
+			 * for checkpoint.
+			 */
+			WT_ASSERT(session, ref->state != WT_REF_SPLIT);
+			/* FALLTHROUGH */
+
+		WT_ILLEGAL_VALUE(session);
+		}
+
+in_memory:
+	/*
+	 * In-memory states: the child is potentially modified if the page's
+	 * modify structure has been instantiated.  If the modify structure
+	 * exists and the page has actually been modified, set that state.
+	 * If that's not the case, we would normally use the original cell's
+	 * disk address as our reference, but, if we're forced to instantiate
+	 * a deleted child page and it's never modified, we end up here with
+	 * a page that has a modify structure, no modifications, and no disk
+	 * address.  Ignore those pages, they're not modified and there is no
+	 * reason to write the cell.
+	 */
+	mod = ref->page->modify;
+	if (mod != NULL && mod->flags != 0)
+		*statep = WT_CHILD_MODIFIED;
+	else if (ref->addr == NULL) {
+		*statep = WT_CHILD_IGNORE;
+		CHILD_RELEASE(session, *hazardp, ref);
+	}
+
+done:	WT_HAVE_DIAGNOSTIC_YIELD;
+	return (ret);
+}
+
+/*
+ * __rec_child_deleted --
+ * Handle pages with leaf pages in the WT_REF_DELETED state.
+ *
+ * session: the calling session.
+ * r: the reconciliation structure; only its flags are examined.
+ * ref: the deleted child's reference; its address and page-deleted
+ * structure may be freed here.
+ * statep: set to WT_CHILD_IGNORE if no disk address remains, else
+ * WT_CHILD_PROXY.
+ *
+ * Returns 0 on success; EBUSY if evicting and the deletion isn't visible;
+ * panics with EINVAL if WT_SKIP_UPDATE_ERR is set and the deletion isn't
+ * visible; otherwise an error from looking up or freeing the address.
+ */
+static int
+__rec_child_deleted(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep)
+{
+ WT_BM *bm;
+ WT_PAGE_DELETED *page_del;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ bm = S2BT(session)->bm;
+ page_del = ref->page_del;
+
+ /*
+ * Internal pages with child leaf pages in the WT_REF_DELETED state are
+ * a special case during reconciliation. First, if the deletion was a
+ * result of a session truncate call, the deletion may not be visible to
+ * us. In that case, we proceed as with any change that's not visible
+ * during reconciliation by setting the skipped flag and ignoring the
+ * change for the purposes of writing the internal page.
+ *
+ * In this case, there must be an associated page-deleted structure, and
+ * it holds the transaction ID we care about.
+ */
+ if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) {
+ /*
+ * In some cases, there had better not be any updates we can't
+ * write.
+ */
+ if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
+
+ /* If this page cannot be evicted, quit now. */
+ if (F_ISSET(r, WT_EVICTING))
+ return (EBUSY);
+
+ /*
+ * NOTE(review): when neither flag is set, control falls through
+ * to the code below even though the deletion is not visible to
+ * this session -- confirm that is intended.
+ */
+ }
+
+ /*
+ * The deletion is visible to us, deal with any underlying disk blocks.
+ *
+ * First, check to see if there is an address associated with this leaf:
+ * if there isn't, we're done, the underlying page is already gone. If
+ * the page still exists, check for any transactions in the system that
+ * might want to see the page's state before it's deleted.
+ *
+ * If any such transactions exist, we cannot discard the underlying leaf
+ * page to the block manager because the transaction may eventually read
+ * it. However, this write might be part of a checkpoint, and should we
+ * recover to that checkpoint, we'll need to delete the leaf page, else
+ * we'd leak it. The solution is to write a proxy cell on the internal
+ * page ensuring the leaf page is eventually discarded.
+ *
+ * If no such transactions exist, we can discard the leaf page to the
+ * block manager and no cell needs to be written at all. We do this
+ * outside of the underlying tracking routines because this action is
+ * permanent and irrevocable. (Clearing the address means we've lost
+ * track of the disk address in a permanent way. This is safe because
+ * there's no path to reading the leaf page again: if there's ever a
+ * read into this part of the name space again, the cache read function
+ * instantiates an entirely new page.)
+ */
+ if (ref->addr != NULL &&
+ (page_del == NULL ||
+ __wt_txn_visible_all(session, page_del->txnid))) {
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ WT_RET(bm->free(bm, session, addr, addr_size));
+
+ /* Only an off-page address is separately allocated memory. */
+ if (__wt_off_page(ref->home, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+ }
+
+ /*
+ * Minor memory cleanup: if a truncate call deleted this page and we
+ * were ever forced to instantiate the page in memory, we would have
+ * built a list of updates in the page reference in order to be able
+ * to abort the truncate. It's a cheap test to make that memory go
+ * away, we do it here because there's really nowhere else we do the
+ * checks. In short, if we have such a list, and the backing address
+ * blocks are gone, there can't be any transaction that can abort.
+ */
+ if (ref->addr == NULL && page_del != NULL) {
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ }
+
+ /*
+ * If there's still a disk address, then we have to write a proxy
+ * record, otherwise, we can safely ignore this child page.
+ */
+ *statep = ref->addr == NULL ? WT_CHILD_IGNORE : WT_CHILD_PROXY;
+ return (0);
+}
+
+/*
+ * __rec_incr --
+ *	Account for newly added entries: bump the entry count by v and advance
+ *	the image's free pointer / shrink its free space by size bytes.
+ */
+static inline void
+__rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
+{
+	/*
+	 * The buffer code is fragile and prone to off-by-one errors; in
+	 * diagnostic mode, verify the new bytes fit in the remaining space
+	 * and inside the disk image.
+	 */
+	WT_ASSERT(session, r->space_avail >= size);
+	WT_ASSERT(session,
+	    WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->page_size));
+
+	/* Consume the space. */
+	r->first_free += size;
+	r->space_avail -= size;
+
+	/* Count the entries. */
+	r->entries += v;
+}
+
+/*
+ * __rec_copy_incr --
+ *	Append a key/value cell and its data to the image being built, then
+ *	account for the one new entry.
+ */
+static inline void
+__rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv)
+{
+	size_t remain;
+	uint8_t *dst, *src;
+
+	/*
+	 * If there's only one chunk of data to copy (because the cell and data
+	 * are being copied from the original disk page), the cell length won't
+	 * be set, the WT_ITEM data/length will reference the data to be copied.
+	 *
+	 * WT_CELLs are typically small, 1 or 2 bytes, so copy them a byte at
+	 * a time rather than paying for a memcpy call.
+	 */
+	dst = (uint8_t *)r->first_free;
+	src = (uint8_t *)&kv->cell;
+	remain = kv->cell_len;
+	while (remain > 0) {
+		*dst++ = *src++;
+		--remain;
+	}
+
+	/* The data follows the cell and can be quite large: use memcpy. */
+	if (kv->buf.size != 0)
+		memcpy(dst, kv->buf.data, kv->buf.size);
+
+	WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size);
+	__rec_incr(session, r, 1, kv->len);
+}
+
+/*
+ * __rec_dict_replace --
+ *	Check a value against the dictionary: record where a new value is
+ *	about to be written, or turn a repeated value into a copy cell.
+ *	Returns 0, or the error from the dictionary lookup.
+ */
+static int
+__rec_dict_replace(
+    WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_KV *val)
+{
+	WT_DICTIONARY *entry;
+	uint64_t distance;
+
+	/*
+	 * We optionally create a dictionary of values and only write a unique
+	 * value once per page, using a special "copy" cell for all subsequent
+	 * copies of the value.  We have to do the cell build and resolution at
+	 * this low level because we need physical cell offsets for the page.
+	 *
+	 * Sanity check: short-data cells can be smaller than dictionary-copy
+	 * cells.  If the data is already small, don't bother doing the work.
+	 * This isn't just work avoidance: on-page cells can't grow as a result
+	 * of writing a dictionary-copy cell, the reconciliation functions do a
+	 * split-boundary test based on the size required by the value's cell;
+	 * if we grow the cell after that test we'll potentially write off the
+	 * end of the buffer's memory.
+	 */
+	if (val->buf.size <= WT_INTPACK32_MAXSIZE)
+		return (0);
+
+	WT_RET(__rec_dictionary_lookup(session, r, val, &entry));
+	if (entry == NULL)
+		return (0);
+
+	/*
+	 * No cell reference yet: this is a new dictionary entry, remember
+	 * where its value is about to be written.
+	 */
+	if (entry->cell == NULL) {
+		entry->cell = r->first_free;
+		return (0);
+	}
+
+	/*
+	 * A cell reference exists: the value matches one already written,
+	 * replace it with a copy cell and drop the data.
+	 */
+	distance = WT_PTRDIFF(r->first_free, entry->cell);
+	val->cell_len = __wt_cell_pack_copy(&val->cell, rle, distance);
+	val->len = val->cell_len;
+	val->buf.data = NULL;
+	val->buf.size = 0;
+
+	return (0);
+}
+
+/*
+ * __rec_key_state_update --
+ *	After writing a key, update the prefix/suffix compression state and,
+ *	unless the key was an overflow key, make it the new "last key".
+ */
+static inline void
+__rec_key_state_update(WT_RECONCILE *r, int ovfl_key)
+{
+	WT_ITEM *swap;
+
+	/*
+	 * If writing an overflow key onto the page, don't update the "last key"
+	 * value, and leave the state of prefix compression alone.  (If we are
+	 * currently doing prefix compression, we have a key state which will
+	 * continue to work, we're just skipping the key just created because
+	 * it's an overflow key and doesn't participate in prefix compression.
+	 * If we are not currently doing prefix compression, we can't start, an
+	 * overflow key doesn't give us any state.)
+	 *
+	 * Additionally, if we wrote an overflow key onto the page, turn off the
+	 * suffix compression of row-store internal node keys.  (When we split,
+	 * "last key" is the largest key on the previous page, and "cur key" is
+	 * the first key on the next page, which is being promoted.  In some
+	 * cases we can discard bytes from the "cur key" that are not needed to
+	 * distinguish between the "last key" and "cur key", compressing the
+	 * size of keys on internal nodes.  If we just built an overflow key,
+	 * we're not going to update the "last key", making suffix compression
+	 * impossible for the next key.  Alternatively, we could remember where
+	 * the last key was on the page, detect it's an overflow key, read it
+	 * from disk and do suffix compression, but that's too much work for an
+	 * unlikely event.)
+	 */
+	if (ovfl_key) {
+		r->key_sfx_compress = 0;
+		return;
+	}
+
+	/*
+	 * Not an overflow key: exchange the current and last key buffers so
+	 * the key just written becomes the last key, and switch prefix and
+	 * suffix compression back to their configured settings.
+	 */
+	swap = r->last;
+	r->last = r->cur;
+	r->cur = swap;
+
+	r->key_sfx_compress = r->key_sfx_compress_conf;
+	r->key_pfx_compress = r->key_pfx_compress_conf;
+}
+
+/*
+ * Macros to convert fixed-length entries to/from bytes.
+ *
+ * WT_FIX_BYTES_TO_ENTRIES: the number of (btree)->bitcnt-bit entries that fit
+ * in "bytes" bytes (bytes * 8 bits, divided by the bits per entry).
+ *
+ * WT_FIX_ENTRIES_TO_BYTES: passes the total bit count (entries * bitcnt) to
+ * WT_ALIGN with an alignment of 8.
+ * NOTE(review): there is no divide by 8 here, so if WT_ALIGN only rounds up
+ * to a multiple of 8, the result is a bit count rather than a byte count (an
+ * over-estimate of the bytes needed); confirm that is intended.
+ *
+ * Both results are cast to uint32_t.
+ */
+#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \
+ ((uint32_t)((((bytes) * 8) / (btree)->bitcnt)))
+#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \
+ ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8))
+
+/*
+ * __rec_leaf_page_max --
+ *	Compute the maximum leaf page size to use for this reconciliation
+ *	(called when salvaging, where the page must not split).
+ */
+static inline uint32_t
+__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+	WT_BTREE *btree;
+	WT_PAGE *page;
+	uint32_t size;
+
+	btree = S2BT(session);
+	page = r->page;
+
+	/*
+	 * Only fixed-length column-store pages need a computed size. A
+	 * column-store page can grow when part of its range was lost and the
+	 * missing records have to be written as deleted; with fixed-length
+	 * objects a large gap could mean writing a large number of records, so
+	 * size the page for everything taken plus everything missing.
+	 *
+	 * Variable-length column-store pages aren't usually a problem, any
+	 * number of deleted records fits in a single RLE entry. Row-store
+	 * pages can't grow at all, salvage only ever shrinks a page read from
+	 * disk. Both start from zero and pick up the default below.
+	 */
+	size = 0;
+	if (page->type == WT_PAGE_COL_FIX)
+		size = (uint32_t)WT_ALIGN(WT_FIX_ENTRIES_TO_BYTES(btree,
+		    r->salvage->take + r->salvage->missing), btree->allocsize);
+
+	/* Never go below the configured maximum leaf page size. */
+	if (size < btree->maxleafpage)
+		size = btree->maxleafpage;
+
+	/*
+	 * Paranoia: the page read from disk should already be smaller than
+	 * the size chosen so far, but make sure of it.
+	 */
+	if (size < page->dsk->mem_size)
+		size = page->dsk->mem_size;
+
+	/* Salvage is the backup plan and must not fail: double the size. */
+	return (size * 2);
+}
+
+/*
+ * __rec_split_bnd_init --
+ *	Reset one boundary structure to its empty state.
+ */
+static void
+__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
+{
+	/* Release memory owned by the boundary. */
+	__wt_free(session, bnd->addr.addr);
+	__wt_free(session, bnd->dsk);
+	__wt_free(session, bnd->skip);
+
+	/* Clear the address, now that its memory has been released. */
+	WT_CLEAR(bnd->addr);
+
+	/* Reset the remaining fields. */
+	bnd->start = NULL;
+	bnd->recno = 0;
+	bnd->entries = 0;
+	bnd->size = 0;
+	bnd->cksum = 0;
+	bnd->skip_next = 0;
+	bnd->skip_allocated = 0;
+	bnd->already_compressed = 0;
+
+	/*
+	 * The key is deliberately left alone: its memory is re-used by each
+	 * new reconciliation.
+	 */
+}
+
+/*
+ * __rec_split_bnd_grow --
+ *	Make sure the boundary array has room for the next boundary.
+ */
+static int
+__rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+	WT_BOUNDARY *ahead;
+
+	/*
+	 * Room is needed for two slots past the current count, not one: while
+	 * filling in the current boundary we also store the starting point of
+	 * the following boundary (a record number or key, for example) in the
+	 * (current + 1) slot.
+	 */
+	WT_RET(__wt_realloc_def(
+	    session, &r->bnd_allocated, r->bnd_next + 2, &r->bnd));
+	r->bnd_entries = r->bnd_allocated / sizeof(*r->bnd);
+
+	/* For the same reason, always initialize the slot one ahead. */
+	ahead = r->bnd + (r->bnd_next + 1);
+	__rec_split_bnd_init(session, ahead);
+
+	return (0);
+}
+
+/*
+ * __rec_split_init --
+ * Initialization for the reconciliation split functions.
+ *
+ * Sets the page sizes, (re)initializes the disk image buffer r->dsk and its
+ * header, chooses the split size and available space, initializes the first
+ * boundary and resets the entry counters, record number and key compression
+ * state. "recno" is stored as the starting record number; "max" is the
+ * caller's maximum page size, replaced by __rec_leaf_page_max when salvage
+ * is configured (r->salvage != NULL). Returns 0 on success; errors from
+ * the block manager's write_size, __wt_buf_init and __rec_split_bnd_grow are
+ * propagated through WT_RET.
+ */
+static int
+__rec_split_init(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint32_t max)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_PAGE_HEADER *dsk;
+ size_t corrected_page_size;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /*
+ * The maximum leaf page size governs when an in-memory leaf page splits
+ * into multiple on-disk pages; however, salvage can't be allowed to
+ * split, there's no parent page yet. If we're doing salvage, override
+ * the caller's selection of a maximum page size, choosing a page size
+ * that ensures we won't split.
+ */
+ if (r->salvage != NULL)
+ max = __rec_leaf_page_max(session, r);
+
+ /*
+ * Set the page sizes. If we're doing the page layout, the maximum page
+ * size is the same as the page size. If the application is doing page
+ * layout (raw compression is configured), we accumulate some amount of
+ * additional data because we don't know how well it will compress, and
+ * we don't want to increment our way up to the amount of data needed by
+ * the application to successfully compress to the target page size.
+ */
+ r->page_size = r->page_size_max = max;
+ if (r->raw_compression)
+ r->page_size *= 10;
+
+ /*
+ * Ensure the disk image buffer is large enough for the max object, as
+ * corrected by the underlying block manager.
+ */
+ corrected_page_size = r->page_size;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, &r->dsk, corrected_page_size));
+
+ /*
+ * Clear the disk page's header and block-manager space, set the page
+ * type (the type doesn't change, and setting it later would require
+ * additional code in a few different places).
+ */
+ dsk = r->dsk.mem;
+ memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree));
+ dsk->type = page->type;
+
+ /*
+ * If we have to split, we want to choose a smaller page size for the
+ * split pages, because otherwise we could end up splitting one large
+ * packed page over and over. We don't want to pick the minimum size
+ * either, because that penalizes an application that did a bulk load
+ * and subsequently inserted a few items into packed pages. Currently
+ * defaulted to 75%, but I have no empirical evidence that's "correct".
+ *
+ * The maximum page size may be a multiple of the split page size (for
+ * example, there's a maximum page size of 128KB, but because the table
+ * is active and we don't want to split a lot, the split size is 20KB).
+ * The maximum page size may NOT be an exact multiple of the split page
+ * size.
+ *
+ * It's lots of work to build these pages and we don't want to start
+ * over when we reach the maximum page size (it's painful to restart
+ * after creating overflow items and compacted data, for example, as
+ * those items have already been written to disk). So, the loop calls
+ * the helper functions when approaching a split boundary, and we save
+ * the information at that point. That allows us to go back and split
+ * the page at the boundary points if we eventually overflow the maximum
+ * page size.
+ *
+ * Finally, all this doesn't matter for fixed-size column-store pages,
+ * raw compression, and salvage. Fixed-size column store pages can
+ * split under (very) rare circumstances, but they're allocated at a
+ * fixed page size, never anything smaller. In raw compression, the
+ * underlying compression routine decides when we split, so it's not
+ * our problem. In salvage, as noted above, we can't split at all.
+ */
+ if (r->raw_compression || r->salvage != NULL) {
+ r->split_size = 0;
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ }
+ else if (page->type == WT_PAGE_COL_FIX) {
+ r->split_size = r->page_size_max;
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ } else {
+ r->split_size = __wt_split_page_size(btree, r->page_size_max);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ }
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /*
+ * Initialize the first boundary. With bnd_next set to 0, the grow
+ * call initializes slot 1, so slot 0 is initialized explicitly here.
+ */
+ r->bnd_next = 0;
+ WT_RET(__rec_split_bnd_grow(session, r));
+ __rec_split_bnd_init(session, &r->bnd[0]);
+ r->bnd[0].recno = recno;
+ r->bnd[0].start = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /*
+ * If the maximum page size is the same as the split page size, either
+ * because of the object type or application configuration, there isn't
+ * any need to maintain split boundaries within a larger page.
+ *
+ * No configuration for salvage here, because salvage can't split.
+ */
+ if (r->raw_compression)
+ r->bnd_state = SPLIT_TRACKING_RAW;
+ else if (max == r->split_size)
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ else
+ r->bnd_state = SPLIT_BOUNDARY;
+
+ /* Initialize the entry counters. */
+ r->entries = r->total_entries = 0;
+
+ /* Initialize the starting record number. */
+ r->recno = recno;
+
+ /* New page, compression off. */
+ r->key_pfx_compress = r->key_sfx_compress = 0;
+
+ return (0);
+}
+
+/*
+ * __rec_is_checkpoint --
+ *	Report whether writing this boundary means writing a checkpoint.
+ */
+static int
+__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
+{
+	/*
+	 * This function mostly exists as a home for this comment.
+	 *
+	 * Writing the root page of the tree without splitting always creates
+	 * a checkpoint, and the block manager has to be told so it can create
+	 * and write the extra information a checkpoint needs. Checkpoints are
+	 * completely consistent, however, so the blocks we expect to free as
+	 * part of the checkpoint must be resolved before the checkpoint is
+	 * written. The upshot: checkpoint writes are not done here. Instead,
+	 * the boundary's address information is cleared as a reminder and the
+	 * checkpoint itself is created during wrapup.
+	 */
+	if (bnd != &r->bnd[0] || !__wt_ref_is_root(r->ref))
+		return (0);
+
+	bnd->addr.addr = NULL;
+	bnd->addr.size = 0;
+	bnd->addr.type = 0;
+	return (1);
+}
+
+/*
+ * __rec_split_row_promote_cell --
+ *	Copy the key held in a disk image's first cell, for promotion.
+ */
+static int
+__rec_split_row_promote_cell(
+	WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, WT_ITEM *key)
+{
+	WT_CELL *first;
+	WT_CELL_UNPACK unpacked;
+
+	/*
+	 * Unpack the first cell on the page. It must have a zero-length prefix
+	 * and must not be a copy cell: the first cell on a page cannot refer
+	 * to an earlier cell on the page.
+	 */
+	first = WT_PAGE_HEADER_BYTE(S2BT(session), dsk);
+	__wt_cell_unpack(first, &unpacked);
+	WT_ASSERT(session,
+	    unpacked.prefix == 0 && unpacked.raw != WT_CELL_VALUE_COPY);
+
+	/* Hand back a copy of the cell's data, or the error from the copy. */
+	return (__wt_cell_data_copy(session, dsk->type, &unpacked, key));
+}
+
+/*
+ * __rec_split_row_promote --
+ * Key promotion for a row-store.
+ *
+ * Copies the key to be promoted for the chunk that starts at the current
+ * key (r->cur) into "key". Unless "type" is WT_PAGE_ROW_LEAF and suffix
+ * compression is on (r->key_sfx_compress), the entire current key is
+ * copied. Otherwise only a leading portion of the current key is copied:
+ * up to and including the first byte that differs from the comparison key
+ * (r->last, or a skipped-update key that sorts at or after it), or one
+ * byte past the shorter key's length when no byte differs. Returns 0 on
+ * success or an error code; the scratch buffer is released on all paths
+ * after it is allocated.
+ */
+static int
+__rec_split_row_promote(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, uint8_t type)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(update);
+ WT_DECL_RET;
+ WT_ITEM *max;
+ WT_UPD_SKIPPED *skip;
+ size_t cnt, len, size;
+ uint32_t i;
+ const uint8_t *pa, *pb;
+ int cmp;
+
+ /*
+ * For a column-store, the promoted key is the recno and we already have
+ * a copy. For a row-store, it's the first key on the page, a variable-
+ * length byte string, get a copy.
+ *
+ * This function is called from the split code at each split boundary,
+ * but that means we're not called before the first boundary, and we
+ * will eventually have to get the first key explicitly when splitting
+ * a page.
+ *
+ * For the current slot, take the last key we built, after doing suffix
+ * compression. The "last key we built" describes some process: before
+ * calling the split code, we must place the last key on the page before
+ * the boundary into the "last" key structure, and the first key on the
+ * page after the boundary into the "current" key structure, we're going
+ * to compare them for suffix compression.
+ *
+ * Suffix compression is a hack to shorten keys on internal pages. We
+ * only need enough bytes in the promoted key to ensure searches go to
+ * the correct page: the promoted key has to be larger than the last key
+ * on the leaf page preceding it, but we don't need any more bytes than
+ * that. In other words, we can discard any suffix bytes not required
+ * to distinguish between the key being promoted and the last key on the
+ * leaf page preceding it. This can only be done for the first level of
+ * internal pages, you cannot repeat suffix truncation as you split up
+ * the tree, it loses too much information.
+ *
+ * Note #1: if the last key on the previous page was an overflow key,
+ * we don't have the in-memory key against which to compare, and don't
+ * try to do suffix compression. The code for that case turns suffix
+ * compression off for the next key, we don't have to deal with it here.
+ */
+ if (type != WT_PAGE_ROW_LEAF || !r->key_sfx_compress)
+ return (__wt_buf_set(session, key, r->cur->data, r->cur->size));
+
+ btree = S2BT(session);
+ WT_RET(__wt_scr_alloc(session, 0, &update));
+
+ /*
+ * Note #2: if we skipped updates, an update key may be larger than the
+ * last key stored in the previous block (probable for append-centric
+ * workloads). If there are skipped updates, check for one larger than
+ * the last key and smaller than the current key.
+ *
+ * The list is walked from its end toward its start.
+ */
+ max = r->last;
+ for (i = r->skip_next; i > 0; --i) {
+ skip = &r->skip[i - 1];
+ if (skip->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, r->page, skip->rip, update, 0));
+ else {
+ update->data = WT_INSERT_KEY(skip->ins);
+ update->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+
+ /* Compare against the current key, it must be less. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->cur, &cmp));
+ if (cmp >= 0)
+ continue;
+
+ /* Compare against the last key, it must be greater. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->last, &cmp));
+ if (cmp >= 0)
+ max = update;
+
+ /*
+ * The skipped updates are in key-sort order so the entry we're
+ * looking for is either the last one or the next-to-last one
+ * in the list. Once we've compared an entry against the last
+ * key on the page, we're done.
+ */
+ break;
+ }
+
+ /*
+ * The largest key on the last block must sort before the current key,
+ * so we'll either find a larger byte value in the current key, or the
+ * current key will be a longer key, and the interesting byte is one
+ * past the length of the shorter key.
+ *
+ * "cnt" is the 1-based position of the byte being compared; "size"
+ * starts at one past the shorter key and is reduced to "cnt" at the
+ * first mismatch, with the bytes saved added to the statistic.
+ */
+ pa = max->data;
+ pb = r->cur->data;
+ len = WT_MIN(max->size, r->cur->size);
+ size = len + 1;
+ for (cnt = 1; len > 0; ++cnt, --len, ++pa, ++pb)
+ if (*pa != *pb) {
+ if (size != cnt) {
+ WT_STAT_FAST_DATA_INCRV(session,
+ rec_suffix_compression, size - cnt);
+ size = cnt;
+ }
+ break;
+ }
+ ret = __wt_buf_set(session, key, r->cur->data, size);
+
+err: __wt_scr_free(&update);
+ return (ret);
+}
+
+/*
+ * __rec_split --
+ * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper"
+ * has 3 doubled letters in a row? Sweet-tooth does, too.)
+ *
+ * Dispatches on r->bnd_state: SPLIT_BOUNDARY records a potential split
+ * point and keeps filling the buffer; SPLIT_MAX writes the tracked chunks
+ * through __rec_split_fixup and turns tracking off; SPLIT_TRACKING_OFF
+ * writes the current buffer through __rec_split_write and restarts the
+ * buffer. SPLIT_TRACKING_RAW is handled as an illegal value. The
+ * dictionary is reset on every call that gets past the salvage check.
+ * Returns 0 on success or an error code; a call during salvage goes
+ * through WT_PANIC_RET.
+ */
+static int
+__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_BOUNDARY *last, *next;
+ WT_PAGE_HEADER *dsk;
+ uint32_t len;
+
+ /*
+ * We should never split during salvage, and we're about to drop core
+ * because there's no parent page.
+ */
+ if (r->salvage != NULL)
+ WT_PANIC_RET(session, WT_PANIC,
+ "%s page too large, attempted split during salvage",
+ __wt_page_type_string(r->page->type));
+
+ /*
+ * Handle page-buffer size tracking; we have to do this work in every
+ * reconciliation loop, and I don't want to repeat the code that many
+ * times.
+ */
+ btree = S2BT(session);
+ dsk = r->dsk.mem;
+
+ /* Hitting a page boundary resets the dictionary, in all cases. */
+ __rec_dictionary_reset(r);
+
+ /*
+ * There are 3 cases we have to handle.
+ *
+ * #1
+ * About to cross a split boundary: save current boundary information
+ * and return.
+ *
+ * #2
+ * About to cross the maximum boundary: use saved boundary information
+ * to write all of the split pages.
+ *
+ * #3
+ * About to cross a split boundary, but we've either already done the
+ * split thing when we approached the maximum boundary, in which
+ * case we write the page and keep going, or we were never tracking
+ * split boundaries at all.
+ *
+ * Cases #1 and #2 are the hard ones: we're called when we're about to
+ * cross each split boundary, and we save information away so we can
+ * split if we have to. We're also called when we're about to cross
+ * the maximum page boundary: in that case, we do the actual split and
+ * clean up all the previous boundaries, then keep going.
+ */
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY: /* Case #1 */
+ /*
+ * Save the information about where we are when the split would
+ * have happened.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /* Set the number of entries for the just finished chunk. */
+ last->entries = r->entries - r->total_entries;
+ r->total_entries = r->entries;
+
+ /* Set the key for the next chunk. */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /*
+ * Set the starting buffer address and clear the entries (the
+ * latter not required, but cleaner).
+ */
+ next->start = r->first_free;
+ next->entries = 0;
+
+ /*
+ * Set the space available to another split-size chunk, if we
+ * have one. If we don't have room for another split chunk,
+ * add whatever space remains in the maximum page size, and
+ * hope it's enough.
+ *
+ * "len" is the number of bytes in the buffer so far, measured
+ * from the start of the disk image.
+ */
+ len = WT_PTRDIFF32(r->first_free, dsk);
+ if (len + r->split_size <= r->page_size)
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ else {
+ r->bnd_state = SPLIT_MAX;
+ r->space_avail = r->page_size -
+ (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ }
+ break;
+ case SPLIT_MAX: /* Case #2 */
+ /*
+ * It didn't all fit into a single page.
+ *
+ * Cycle through the saved split-point information, writing the
+ * split chunks we have tracked.
+ */
+ WT_RET(__rec_split_fixup(session, r));
+
+ /* We're done saving split chunks. */
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ break;
+ case SPLIT_TRACKING_OFF: /* Case #3 */
+ /*
+ * It didn't all fit, but either we've already noticed it and
+ * are now processing the rest of the page at the split-size
+ * boundaries, or the split size was the same as the page size,
+ * so we never bothered with saving split-point information.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /*
+ * Set the key for the next chunk (before writing the block, a
+ * key range is needed in that code).
+ */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /* Clear the entries (not required, but cleaner). */
+ next->entries = 0;
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = last->recno;
+ dsk->u.entries = r->entries;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ WT_RET(__rec_split_write(session, r, last, &r->dsk, 0));
+
+ /*
+ * Set the caller's entry count and buffer information for the
+ * next chunk. We only get here if we're not splitting or have
+ * already split, so it's split-size chunks from here on out.
+ */
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ break;
+ case SPLIT_TRACKING_RAW:
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __rec_split_raw_worker --
+ * Handle the raw compression page reconciliation bookkeeping.
+ *
+ * Walks the cells of the working disk image (r->dsk) to build the arrays
+ * of candidate split points, hands the data after the first
+ * WT_BLOCK_COMPRESS_SKIP bytes to the compressor's compress_raw method,
+ * and then does one of three things: writes the resulting block from the
+ * destination buffer (moving any unconsumed remnant to the start of the
+ * working buffer), writes the original buffer when nothing was taken and
+ * "no_more_rows" is set, or doubles the working buffer and returns so the
+ * caller can accumulate more rows. "no_more_rows" is non-zero when the
+ * caller has nothing further to add; it is passed to compress_raw and is
+ * part of the last-block test. Returns 0 on success or an error code.
+ */
+static int
+__rec_split_raw_worker(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, int no_more_rows)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *last, *next;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COMPRESSOR *compressor;
+ WT_DECL_RET;
+ WT_ITEM *dst, *write_ref;
+ WT_PAGE_HEADER *dsk, *dsk_dst;
+ WT_SESSION *wt_session;
+ size_t corrected_page_size, len, result_len;
+ uint64_t recno;
+ uint32_t entry, i, result_slots, slots;
+ int last_block;
+ uint8_t *dsk_start;
+
+ wt_session = (WT_SESSION *)session;
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ unpack = &_unpack;
+ compressor = btree->compressor;
+ dst = &r->raw_destination;
+ dsk = r->dsk.mem;
+
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next];
+ next = last + 1;
+
+ /*
+ * Build arrays of offsets and cumulative counts of cells and rows in
+ * the page: the offset is the byte offset to the possible split-point
+ * (adjusted for an initial chunk that cannot be compressed), entries
+ * is the cumulative page entries covered by the byte offset, recnos is
+ * the cumulative rows covered by the byte offset.
+ *
+ * The arrays are discarded and re-allocated, with room for 100 more
+ * slots than there are entries, whenever the entry count has reached
+ * the current allocation. The record-number array is only allocated
+ * for column-store internal and variable-length pages.
+ */
+ if (r->entries >= r->raw_max_slots) {
+ __wt_free(session, r->raw_entries);
+ __wt_free(session, r->raw_offsets);
+ __wt_free(session, r->raw_recnos);
+ r->raw_max_slots = 0;
+
+ i = r->entries + 100;
+ WT_RET(__wt_calloc_def(session, i, &r->raw_entries));
+ WT_RET(__wt_calloc_def(session, i, &r->raw_offsets));
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ WT_RET(__wt_calloc_def(session, i, &r->raw_recnos));
+ r->raw_max_slots = i;
+ }
+
+ /*
+ * We're going to walk the disk image, which requires setting the
+ * number of entries.
+ */
+ dsk->u.entries = r->entries;
+
+ /*
+ * We track the record number at each column-store split point, set an
+ * initial value.
+ */
+ recno = 0;
+ if (dsk->type == WT_PAGE_COL_VAR)
+ recno = last->recno;
+
+ entry = slots = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++entry;
+
+ /*
+ * Row-store pages can split at keys, but not at values,
+ * column-store pages can split at values.
+ *
+ * For any other page type, an address or value cell only
+ * updates the entry count of the current slot and the loop
+ * moves on to the next cell.
+ */
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_SHORT:
+ break;
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_DEL:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ if (dsk->type == WT_PAGE_COL_INT) {
+ recno = unpack->v;
+ break;
+ }
+ if (dsk->type == WT_PAGE_COL_VAR) {
+ recno += __wt_cell_rle(unpack);
+ break;
+ }
+ r->raw_entries[slots] = entry;
+ continue;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We can't compress the first 64B of the block (it must be
+ * written without compression), and a possible split point
+ * may appear in that 64B; keep it simple, ignore the first
+ * allocation size of data, anybody splitting smaller than
+ * that (as calculated before compression), is doing it wrong.
+ */
+ if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize)
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP);
+
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ r->raw_recnos[slots] = recno;
+ r->raw_entries[slots] = entry;
+ }
+
+ /*
+ * If we haven't managed to find at least one split point, we're done,
+ * don't bother calling the underlying compression function.
+ */
+ if (slots == 0) {
+ result_len = 0;
+ result_slots = 0;
+ goto no_slots;
+ }
+
+ /* The slot at array's end is the total length of the data. */
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(WT_PTRDIFF(cell, dsk) - WT_BLOCK_COMPRESS_SKIP);
+
+ /*
+ * Allocate a destination buffer. If there's a pre-size function, use
+ * it to determine the destination buffer's minimum size, otherwise the
+ * destination buffer is documented to be at least the maximum object
+ * size.
+ *
+ * The destination buffer really only needs to be large enough for the
+ * target block size, corrected for the requirements of the underlying
+ * block manager. If the target block size is 8KB, that's a multiple
+ * of 512B and so the underlying block manager is fine with it. But...
+ * we don't control what the pre_size method returns us as a required
+ * size, and we don't want to document the compress_raw method has to
+ * skip bytes in the buffer because that's confusing, so do something
+ * more complicated. First, find out how much space the compress_raw
+ * function might need, either the value returned from pre_size, or the
+ * maximum object size. Add the compress-skip bytes, and then correct
+ * that value for the underlying block manager. As a result, we have
+ * a destination buffer that's the right "object" size when calling the
+ * compress_raw method, and there are bytes in the header just for us.
+ */
+ if (compressor->pre_size == NULL)
+ result_len = r->page_size_max;
+ else
+ WT_RET(compressor->pre_size(compressor, wt_session,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ (size_t)r->raw_offsets[slots], &result_len));
+ corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, dst, corrected_page_size));
+
+ /*
+ * Copy the header bytes into the destination buffer, then call the
+ * compression function.
+ */
+ memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP);
+ ret = compressor->compress_raw(compressor, wt_session,
+ r->page_size_max, btree->split_pct,
+ WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ r->raw_offsets, slots,
+ (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ result_len, no_more_rows, &result_len, &result_slots);
+ switch (ret) {
+ case EAGAIN:
+ /*
+ * The compression function wants more rows; accumulate and
+ * retry.
+ *
+ * Reset the resulting slots count, just in case the compression
+ * function modified it before giving up.
+ */
+ result_slots = 0;
+ break;
+ case 0:
+ /*
+ * If the compression function returned zero result slots, it's
+ * giving up and we write the original data. (This is a pretty
+ * bad result: we've not done compression on a block much larger
+ * than the maximum page size, but once compression gives up,
+ * there's not much else we can do.)
+ *
+ * If the compression function returned non-zero result slots,
+ * we were successful and have a block to write.
+ */
+ if (result_slots == 0) {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ /*
+ * If there are no more rows, we can write the original
+ * data from the original buffer.
+ */
+ if (no_more_rows)
+ break;
+
+ /*
+ * Copy the original data to the destination buffer, as
+ * if the compression function simply copied it. Take
+ * all but the last row of the original data (the last
+ * row has to be set as the key for the next block).
+ */
+ result_slots = slots - 1;
+ result_len = r->raw_offsets[result_slots];
+ WT_RET(__wt_buf_grow(
+ session, dst, result_len + WT_BLOCK_COMPRESS_SKIP));
+ memcpy((uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ result_len);
+
+ /*
+ * Mark it as uncompressed so the standard compression
+ * function is called before the buffer is written.
+ */
+ last->already_compressed = 0;
+ } else {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_ok);
+
+ /*
+ * If there are more rows and the compression function
+ * consumed all of the current data, there are problems:
+ * First, with row-store objects, we're potentially
+ * skipping updates, we must have a key for the next
+ * block so we know with what block a skipped update is
+ * associated. Second, if the compression function
+ * compressed all of the data, we're not pushing it
+ * hard enough (unless we got lucky and gave it exactly
+ * the right amount to work with, which is unlikely).
+ * Handle both problems by accumulating more data any
+ * time we're not writing the last block and compression
+ * ate all of the rows.
+ */
+ if (result_slots == slots && !no_more_rows)
+ result_slots = 0;
+ else
+ last->already_compressed = 1;
+ }
+ break;
+ default:
+ return (ret);
+ }
+
+no_slots:
+ /*
+ * Check for the last block we're going to write: if no more rows and
+ * we failed to compress anything, or we compressed everything, it's
+ * the last block.
+ */
+ last_block = no_more_rows &&
+ (result_slots == 0 || result_slots == slots);
+
+ if (result_slots != 0) {
+ /*
+ * We have a block, finalize the header information.
+ */
+ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst = dst->mem;
+ dsk_dst->recno = last->recno;
+ dsk_dst->mem_size =
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst->u.entries = r->raw_entries[result_slots - 1];
+
+ /*
+ * There is likely a remnant in the working buffer that didn't
+ * get compressed; copy it down to the start of the buffer and
+ * update the starting record number, free space and so on.
+ * !!!
+ * Note use of memmove, the source and destination buffers can
+ * overlap.
+ */
+ len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk +
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP);
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len);
+
+ r->entries -= r->raw_entries[result_slots - 1];
+ r->first_free = dsk_start + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+
+ /*
+ * Set the key for the next block (before writing the block, a
+ * key range is needed in that code).
+ */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ next->recno = r->raw_recnos[result_slots];
+ break;
+ case WT_PAGE_COL_VAR:
+ next->recno = r->raw_recnos[result_slots - 1];
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ next->recno = 0;
+ if (!last_block) {
+ /*
+ * Confirm there was uncompressed data remaining
+ * in the buffer, we're about to read it for the
+ * next chunk's initial key.
+ */
+ WT_ASSERT(session, len > 0);
+ WT_RET(__rec_split_row_promote_cell(
+ session, dsk, &next->key));
+ }
+ break;
+ }
+ write_ref = dst;
+ } else if (no_more_rows) {
+ /*
+ * Compression failed and there are no more rows to accumulate,
+ * write the original buffer instead.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ dsk->recno = last->recno;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ dsk->u.entries = r->entries;
+
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+
+ write_ref = &r->dsk;
+ last->already_compressed = 0;
+ } else {
+ /*
+ * Compression failed, there are more rows to accumulate and the
+ * compression function wants to try again; increase the size of
+ * the "page" and try again after we accumulate some more rows.
+ *
+ * The current fill offset is saved before growing the buffer
+ * and the first-free pointer is rebuilt from the buffer's
+ * memory afterward.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail_temporary);
+
+ len = WT_PTRDIFF(r->first_free, r->dsk.mem);
+ corrected_page_size = r->page_size * 2;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size));
+ r->page_size *= 2;
+ r->first_free = (uint8_t *)r->dsk.mem + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ return (0);
+ }
+
+ /* We have a block, update the boundary counter. */
+ ++r->bnd_next;
+
+ /*
+ * If we are writing the whole page in our first/only attempt, it might
+ * be a checkpoint (checkpoints are only a single page, by definition).
+ * Further, checkpoints aren't written here, the wrapup functions do the
+ * write, and they do the write from the original buffer location. If
+ * it's a checkpoint and the block isn't in the right buffer, copy it.
+ *
+ * If it's not a checkpoint, write the block.
+ */
+ if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) {
+ if (write_ref == dst)
+ WT_RET(__wt_buf_set(
+ session, &r->dsk, dst->mem, dst->size));
+ } else
+ WT_RET(
+ __rec_split_write(session, r, last, write_ref, last_block));
+ return (0);
+}
+
+/*
+ * __rec_raw_decompress --
+ * Decompress a raw-compressed image.
+ *
+ * The image is read as a WT_PAGE_HEADER and its mem_size field is used as
+ * the size of the decompressed result. On success a copy of mem_size bytes
+ * of the decompressed image (made with __wt_strndup) is stored through retp.
+ * Returns the first non-zero error from scratch allocation, the compressor,
+ * the length check or the copy; the scratch buffer is released on every path
+ * once it has been allocated.
+ */
+static int
+__rec_raw_decompress(
+ WT_SESSION_IMPL *session, const void *image, size_t size, void *retp)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER const *dsk;
+ size_t result_len;
+
+ btree = S2BT(session);
+ dsk = image;
+
+ /*
+ * We skipped an update and we can't write a block, but unfortunately,
+ * the block has already been compressed. Decompress the block so we
+ * can subsequently re-instantiate it in memory.
+ *
+ * The first WT_BLOCK_COMPRESS_SKIP bytes are copied as-is; only the
+ * bytes following them are handed to the decompressor.
+ */
+ WT_RET(__wt_scr_alloc(session, dsk->mem_size, &tmp));
+ memcpy(tmp->mem, image, WT_BLOCK_COMPRESS_SKIP);
+ WT_ERR(btree->compressor->decompress(btree->compressor,
+ &session->iface,
+ (uint8_t *)image + WT_BLOCK_COMPRESS_SKIP,
+ size - WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
+ dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
+ &result_len));
+ /* The decompressed length must match what the page header promised. */
+ if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
+ WT_ERR(__wt_illegal_value(session, btree->dhandle->name));
+
+ /* Hand the caller its own copy, the scratch buffer is freed below. */
+ WT_ERR(__wt_strndup(session, tmp->data, dsk->mem_size, retp));
+ WT_ASSERT(session, __wt_verify_dsk_image(
+ session, "[raw evict split]", tmp->data, dsk->mem_size) == 0);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __rec_split_raw --
+ *	Raw compression split entry point used while a page is being built:
+ *	forwards to the raw worker with a zero final argument (the finish
+ *	path passes 1 instead).
+ */
+static inline int
+__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+	int ret;
+
+	ret = __rec_split_raw_worker(session, r, 0);
+	return (ret);
+}
+
+/*
+ * __rec_split_finish_std --
+ *	Standard (non-raw) completion of a page: settle which boundary slot
+ *	describes the data left in the working buffer, finalize the page
+ *	header and, unless the block is a checkpoint, write it as the last
+ *	block.
+ */
+static int
+__rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+	WT_BOUNDARY *slot;
+	WT_PAGE_HEADER *hdr;
+
+	/* Choose the boundary slot based on how far splitting progressed. */
+	switch (r->bnd_state) {
+	case SPLIT_TRACKING_RAW:
+		/*
+		 * Raw compression was configured, but nothing was actually
+		 * written.
+		 */
+		break;
+	case SPLIT_TRACKING_OFF:
+		/*
+		 * Either we already split or boundaries aren't tracked: the
+		 * remaining data lands in the next boundary slot, make sure
+		 * there is room for it.
+		 */
+		WT_RET(__rec_split_bnd_grow(session, r));
+		break;
+	case SPLIT_BOUNDARY:
+	case SPLIT_MAX:
+		/*
+		 * No split happened, the reconciled page fit into a maximum
+		 * page size. The first boundary slot is largely correct, so
+		 * reuse it for the full page (the entry count is updated
+		 * below).
+		 */
+		r->bnd_next = 0;
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+
+	/*
+	 * With no entries and no skipped updates the page is entirely empty;
+	 * it is merged into its parent during the parent's reconciliation. A
+	 * page with skipped updates isn't truly empty and falls through.
+	 */
+	if (r->entries == 0 && r->skip_next == 0)
+		return (0);
+
+	/* Claim the boundary slot and advance the count. */
+	slot = &r->bnd[r->bnd_next];
+	++r->bnd_next;
+	slot->entries = r->entries;
+
+	/* Complete the header information. */
+	hdr = r->dsk.mem;
+	hdr->recno = slot->recno;
+	hdr->u.entries = r->entries;
+	r->dsk.size = WT_PTRDIFF32(r->first_free, hdr);
+	hdr->mem_size = r->dsk.size;
+
+	/* A checkpoint is not written here; anything else is. */
+	if (__rec_is_checkpoint(r, slot))
+		return (0);
+	return (__rec_split_write(session, r, slot, &r->dsk, 1));
+}
+
+/*
+ * __rec_split_finish --
+ *	Reconciliation of the page is complete: flush whatever is left,
+ *	through the raw-compression worker when raw compression is on and
+ *	entries remain, otherwise through the standard finish routine.
+ */
+static int
+__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+	/* No raw compression, or nothing buffered: standard path. */
+	if (!r->raw_compression || r->entries == 0)
+		return (__rec_split_finish_std(session, r));
+
+	/* Raw compression: call the worker until every entry is consumed. */
+	do {
+		WT_RET(__rec_split_raw_worker(session, r, 1));
+	} while (r->entries != 0);
+
+	return (0);
+}
+
+/*
+ * __rec_split_fixup --
+ * Fix up after crossing the maximum page boundary.
+ *
+ * Writes each of the first r->bnd_next split chunks as its own block via
+ * a scratch buffer, then moves the unwritten remnant to the start of the
+ * working buffer and resets r->entries, r->first_free and r->space_avail
+ * to describe it. The scratch buffer is released on success and on error.
+ */
+static int
+__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BOUNDARY *bnd;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER *dsk;
+ uint32_t i, len;
+ uint8_t *dsk_start;
+
+ /*
+ * When we overflow physical limits of the page, we walk the list of
+ * split chunks we've created and write those pages out, then update
+ * the caller's information.
+ */
+ btree = S2BT(session);
+
+ /*
+ * The data isn't laid out on a page boundary or nul padded; copy it to
+ * a clean, aligned, padded buffer before writing it.
+ *
+ * Allocate a scratch buffer to hold the new disk image. Copy the
+ * WT_PAGE_HEADER header onto the scratch buffer, most of the header
+ * information remains unchanged between the pages.
+ */
+ WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp));
+ dsk = tmp->mem;
+ memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE);
+
+ /*
+ * For each split chunk we've created, update the disk image and copy
+ * it into place.
+ *
+ * A chunk's length is the distance between its start and the start of
+ * the following boundary, so the slot after the last written chunk
+ * must have its start field set.
+ */
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) {
+ /* Copy the page contents to the temporary buffer. */
+ len = WT_PTRDIFF32((bnd + 1)->start, bnd->start);
+ memcpy(dsk_start, bnd->start, len);
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = bnd->recno;
+ dsk->u.entries = bnd->entries;
+ dsk->mem_size =
+ tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len;
+ WT_ERR(__rec_split_write(session, r, bnd, tmp, 0));
+ }
+
+ /*
+ * There is probably a remnant in the working buffer that didn't get
+ * written; copy it down to the beginning of the working buffer, and
+ * update the starting record number.
+ *
+ * Confirm the remnant is no larger than the available split buffer.
+ *
+ * Fix up our caller's information.
+ *
+ * After the loop, bnd references slot r->bnd_next, that is, the first
+ * boundary that was not written; its start is where the remnant begins.
+ */
+ len = WT_PTRDIFF32(r->first_free, bnd->start);
+ if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree))
+ WT_PANIC_ERR(session, EINVAL,
+ "Reconciliation remnant too large for the split buffer");
+
+ /* Source and destination are in the same buffer, hence memmove. */
+ dsk = r->dsk.mem;
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, bnd->start, len);
+
+ r->entries -= r->total_entries;
+ r->first_free = dsk_start + len;
+ r->space_avail =
+ (r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - len;
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __rec_split_write --
+ * Write a disk block out for the split helper functions.
+ *
+ * bnd is the boundary slot describing the block, buf holds the disk image
+ * and last_block is non-zero when no block follows this one (all remaining
+ * skipped updates are then attached to bnd). Three outcomes are possible:
+ * the image is copied into bnd->dsk instead of being written (skipped
+ * updates belong to the block), a previously written block's address is
+ * re-used (size and checksum match), or the block is written and its
+ * address cookie copied into bnd->addr. The scratch key buffer is released
+ * on every path after its allocation.
+ */
+static int
+__rec_split_write(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_BOUNDARY *bnd, WT_ITEM *buf, int last_block)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_MULTI *multi;
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ WT_PAGE_MODIFY *mod;
+ WT_UPD_SKIPPED *skip;
+ size_t addr_size;
+ uint32_t bnd_slot, i, j;
+ int cmp;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
+
+ btree = S2BT(session);
+ dsk = buf->mem;
+ page = r->page;
+ mod = page->modify;
+
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Set the zero-length value flag in the page header. */
+ if (dsk->type == WT_PAGE_ROW_LEAF) {
+ F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE);
+
+ if (r->entries != 0 && r->all_empty_value)
+ F_SET(dsk, WT_PAGE_EMPTY_V_ALL);
+ if (r->entries != 0 && !r->any_empty_value)
+ F_SET(dsk, WT_PAGE_EMPTY_V_NONE);
+ }
+
+ /* Initialize the address (set the page type for the parent). */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ bnd->addr.type = WT_ADDR_LEAF_NO;
+ break;
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ bnd->addr.type = r->ovfl_items ? WT_ADDR_LEAF : WT_ADDR_LEAF_NO;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ bnd->addr.type = WT_ADDR_INT;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ bnd->size = (uint32_t)buf->size;
+ bnd->cksum = 0;
+
+ /*
+ * Check if we've skipped updates that belong to this block, and move
+ * any to the per-block structure. Quit as soon as we find a skipped
+ * update that doesn't belong to the block, they're in sorted order.
+ *
+ * This code requires a key be filled in for the next block (or the
+ * last block flag be set, if there's no next block).
+ */
+ for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) {
+ /* The last block gets all remaining skipped updates. */
+ if (last_block) {
+ WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ continue;
+ }
+
+ /*
+ * Get the skipped update's key and compare it with this block's
+ * key range. If the skipped update list belongs with the block
+ * we're about to write, move it to the per-block memory. Check
+ * only to the first update that doesn't go with the block, they
+ * must be in sorted order.
+ *
+ * The upper bound of this block's range is the starting key or
+ * record number stored in the following boundary slot.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno)
+ goto skip_check_complete;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (skip->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, page, skip->rip, key, 0));
+ else {
+ key->data = WT_INSERT_KEY(skip->ins);
+ key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &(bnd + 1)->key, &cmp));
+ if (cmp >= 0)
+ goto skip_check_complete;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ }
+
+skip_check_complete:
+ /*
+ * If there are updates that weren't moved to the block, shuffle them to
+ * the beginning of the cached list (we maintain the skipped updates in
+ * sorted order, new skipped updates must be appended to the list).
+ *
+ * On entry, i indexes the first skipped update that was not moved.
+ */
+ for (j = 0; i < r->skip_next; ++j, ++i)
+ r->skip[j] = r->skip[i];
+ r->skip_next = j;
+
+ /*
+ * If we had to skip updates in order to build this disk image, we can't
+ * actually write it. Instead, we will re-instantiate the page using the
+ * disk image and the list of updates we skipped.
+ *
+ * If the buffer is compressed (raw compression was configured), we have
+ * to decompress it so we can instantiate it later.
+ */
+ if (bnd->skip != NULL) {
+ if (bnd->already_compressed)
+ WT_ERR(__rec_raw_decompress(
+ session, buf->data, buf->size, &bnd->dsk));
+ else {
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &bnd->dsk));
+ WT_ASSERT(session, __wt_verify_dsk_image(session,
+ "[evict split]", buf->data, buf->size) == 0);
+ }
+ goto done;
+ }
+
+ /*
+ * If we wrote this block before, re-use it. Pages get written in the
+ * same block order every time, only check the appropriate slot. The
+ * expensive part of this test is the checksum, only do that work when
+ * there has been or will be a reconciliation of this page involving
+ * split pages. This test isn't perfect: we're doing a checksum if a
+ * previous reconciliation of the page split or if we will split this
+ * time, but that test won't calculate a checksum on the first block
+ * the first time the page splits.
+ */
+ bnd_slot = (uint32_t)(bnd - r->bnd);
+ if (bnd_slot > 1 ||
+ (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) && mod->mod_multi != NULL)) {
+ /*
+ * There are page header fields which need to be cleared to get
+ * consistent checksums: specifically, the write generation and
+ * the memory owned by the block manager. We are reusing the
+ * same buffer space each time, clear it before calculating the
+ * checksum.
+ */
+ dsk->write_gen = 0;
+ memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header);
+ bnd->cksum = __wt_cksum(buf->data, buf->size);
+
+ /* Compare against the block previously written in this slot. */
+ if (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) &&
+ mod->mod_multi_entries > bnd_slot) {
+ multi = &mod->mod_multi[bnd_slot];
+ if (multi->size == bnd->size &&
+ multi->cksum == bnd->cksum) {
+ multi->addr.reuse = 1;
+ bnd->addr = multi->addr;
+
+ WT_STAT_FAST_DATA_INCR(session, rec_page_match);
+ goto done;
+ }
+ }
+ }
+
+ /* Write the block and save a copy of the returned address cookie. */
+ WT_ERR(__wt_bt_write(session,
+ buf, addr, &addr_size, 0, bnd->already_compressed));
+ WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr));
+ bnd->addr.size = (uint8_t)addr_size;
+
+ /* Success and failure share the same cleanup. */
+done:
+err: __wt_scr_free(&key);
+ return (ret);
+}
+
+/*
+ * __wt_bulk_init --
+ *	Prepare a bulk cursor for loading: verify the tree allows bulk-load,
+ *	point the cursor at the tree's first leaf page, create the cursor's
+ *	reconciliation structure and start the first split chunk.
+ */
+int
+__wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+	WT_BTREE *btree;
+	WT_PAGE_INDEX *root_index;
+	WT_RECONCILE *rec;
+	uint64_t first_recno;
+
+	btree = S2BT(session);
+
+	/*
+	 * Only newly created files may be bulk-loaded, being empty is not
+	 * sufficient -- see the checkpoint code for a discussion.
+	 */
+	if (!btree->bulk_load_ok) {
+		WT_RET_MSG(session, EINVAL,
+		    "bulk-load is only possible for newly created trees");
+	}
+
+	/* Reference the empty leaf page: the root's first child. */
+	root_index = WT_INTL_INDEX_COPY(btree->root.page);
+	cbulk->ref = root_index->index[0];
+	cbulk->leaf = cbulk->ref->page;
+
+	/* Create the reconciliation state and flag it as a bulk load. */
+	WT_RET(
+	    __rec_write_init(session, cbulk->ref, 0, NULL, &cbulk->reconcile));
+	rec = cbulk->reconcile;
+	rec->is_bulk_load = 1;
+
+	/* Column stores start at record 1, row stores have no record number. */
+	switch (btree->type) {
+	case BTREE_ROW:
+		first_recno = 0;
+		break;
+	case BTREE_COL_FIX:
+	case BTREE_COL_VAR:
+		first_recno = 1;
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+
+	return (__rec_split_init(
+	    session, rec, cbulk->leaf, first_recno, btree->maxleafpage));
+}
+
+/*
+ * __wt_bulk_wrapup --
+ *	Finish a bulk load: account for any pending fixed-length entries or
+ *	pending variable-length RLE value, write the final page, mark the
+ *	parent page dirty and discard the reconciliation structure.
+ */
+int
+__wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+	WT_BTREE *btree;
+	WT_PAGE *home;
+	WT_RECONCILE *rec;
+
+	btree = S2BT(session);
+	rec = cbulk->reconcile;
+
+	/* Flush whatever the cursor is still holding back. */
+	switch (btree->type) {
+	case BTREE_ROW:
+		break;
+	case BTREE_COL_VAR:
+		if (cbulk->rle != 0) {
+			WT_RET(__wt_bulk_insert_var(session, cbulk));
+		}
+		break;
+	case BTREE_COL_FIX:
+		if (cbulk->entry != 0) {
+			__rec_incr(session, rec, cbulk->entry,
+			    __bitstr_size(
+			    (size_t)cbulk->entry * btree->bitcnt));
+		}
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+
+	/* Write the remnant page, then resolve the reconciliation. */
+	WT_RET(__rec_split_finish(session, rec));
+	WT_RET(__rec_write_wrapup(session, rec, rec->page));
+
+	/* The parent of the loaded page has to be marked dirty. */
+	home = rec->ref->home;
+	WT_RET(__wt_page_modify_init(session, home));
+	__wt_page_modify_set(session, home);
+
+	__rec_destroy(session, &cbulk->reconcile);
+
+	return (0);
+}
+
+/*
+ * __wt_bulk_insert_row --
+ * Row-store bulk insert.
+ *
+ * Builds key and value cells from the bulk cursor's current key/value,
+ * splits (or raw-compresses) the page for as long as the pair does not fit
+ * in the remaining space, then copies the pair onto the page. Returns 0 on
+ * success or the first error from cell building, splitting or dictionary
+ * replacement.
+ */
+int
+__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_KV *key, *val;
+ WT_RECONCILE *r;
+ int ovfl_key;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ /* The built cells live in the reconciliation structure itself. */
+ key = &r->k;
+ val = &r->v;
+ WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */
+ cursor->key.data, cursor->key.size, &ovfl_key));
+ WT_RET(__rec_cell_build_val(session, r, /* Build value cell */
+ cursor->value.data, cursor->value.size, (uint64_t)0));
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else {
+ WT_RET(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key is
+ * written to the new page, and (unless we're already
+ * working with an overflow key), rebuild the key
+ * without prefix compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_RET(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /*
+ * Copy the key/value pair onto the page. A zero-length value is not
+ * copied, it is only recorded in the empty-value tracking flags.
+ */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+ return (0);
+}
+
+/*
+ * __rec_col_fix_bulk_insert_split_check --
+ *	During a fixed-length column-store bulk load, split when the current
+ *	page has taken as many entries as it can hold, then recompute how
+ *	many entries the fresh page can take.
+ */
+static inline int
+__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
+{
+	WT_BTREE *btree;
+	WT_RECONCILE *rec;
+	WT_SESSION_IMPL *session;
+
+	/* Room left on the current page: nothing to do. */
+	if (cbulk->entry != cbulk->nrecs)
+		return (0);
+
+	session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+	rec = cbulk->reconcile;
+	btree = S2BT(session);
+
+	if (cbulk->entry != 0) {
+		/*
+		 * Everything didn't fit: account for the entries placed so
+		 * far, then split.
+		 *
+		 * Boundary: split or write the page.
+		 */
+		__rec_incr(session, rec, cbulk->entry,
+		    __bitstr_size((size_t)cbulk->entry * btree->bitcnt));
+		WT_RET(__rec_split(session, rec));
+	}
+
+	/* Start counting again against the space now available. */
+	cbulk->entry = 0;
+	cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, rec->space_avail);
+	return (0);
+}
+
+/*
+ * __wt_bulk_insert_fix --
+ * Fixed-length column-store bulk insert.
+ *
+ * In bitmap mode (cbulk->bitmap set) the cursor's value is copied onto the
+ * page in page-sized pieces, splitting as pages fill; the load must start
+ * on a byte boundary or EINVAL is returned. Otherwise a single entry is set
+ * from the first byte of the cursor's value.
+ */
+int
+__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_RECONCILE *r;
+ uint32_t entries, offset, page_entries, page_size;
+ const uint8_t *data;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ if (cbulk->bitmap) {
+ /* The bit offset of the next record must be a multiple of 8. */
+ if (((r->recno - 1) * btree->bitcnt) & 0x7)
+ WT_RET_MSG(session, EINVAL,
+ "Bulk bitmap load not aligned on a byte boundary");
+ /*
+ * The remaining-entry counter starts at cursor->value.size.
+ * page_entries and page_size are assigned in the loop body
+ * before the increment expression reads them.
+ */
+ for (data = cursor->value.data,
+ entries = (uint32_t)cursor->value.size;
+ entries > 0;
+ entries -= page_entries, data += page_size) {
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
+ /* Take as many entries as the current page has room for. */
+ page_entries =
+ WT_MIN(entries, cbulk->nrecs - cbulk->entry);
+ page_size = __bitstr_size(page_entries * btree->bitcnt);
+ offset = __bitstr_size(cbulk->entry * btree->bitcnt);
+ memcpy(r->first_free + offset, data, page_size);
+ cbulk->entry += page_entries;
+ r->recno += page_entries;
+ }
+ return (0);
+ }
+
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
+ /* Single-entry case: store the first byte of the cursor's value. */
+ __bit_setv(r->first_free,
+ cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]);
+ ++cbulk->entry;
+ ++r->recno;
+
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_var --
+ *	Variable-length column-store bulk insert: write the bulk cursor's
+ *	previously seen value, together with its repeat count, onto the page.
+ */
+int
+__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+	WT_BTREE *btree;
+	WT_KV *cell;
+	WT_RECONCILE *rec;
+
+	btree = S2BT(session);
+	rec = cbulk->reconcile;
+	cell = &rec->v;
+
+	/*
+	 * We are emitting a duplicate count, so the value to store is the
+	 * last buffer the bulk cursor saw, not the cursor's current value.
+	 */
+	WT_RET(__rec_cell_build_val(
+	    session, rec, cbulk->last.data, cbulk->last.size, cbulk->rle));
+
+	/* Boundary: split or write the page until the cell fits. */
+	for (;;) {
+		if (cell->len <= rec->space_avail)
+			break;
+		WT_RET(rec->raw_compression ?
+		    __rec_split_raw(session, rec) : __rec_split(session, rec));
+	}
+
+	/* Place the value on the page, via the dictionary if configured. */
+	if (btree->dictionary) {
+		WT_RET(__rec_dict_replace(session, rec, cbulk->rle, cell));
+	}
+	__rec_copy_incr(session, rec, cell);
+
+	/* Advance the starting record number in case we split. */
+	rec->recno += cbulk->rle;
+
+	return (0);
+}
+
+/*
+ * __rec_vtype --
+ *	Map an address structure's type to the matching address cell type;
+ *	anything other than internal or leaf maps to the no-overflow leaf
+ *	cell type.
+ */
+static inline u_int
+__rec_vtype(WT_ADDR *addr)
+{
+	switch (addr->type) {
+	case WT_ADDR_INT:
+		return (WT_CELL_ADDR_INT);
+	case WT_ADDR_LEAF:
+		return (WT_CELL_ADDR_LEAF);
+	default:
+		return (WT_CELL_ADDR_LEAF_NO);
+	}
+}
+
+/*
+ * __rec_col_int --
+ * Reconcile a column-store internal page.
+ *
+ * Walks the page's child references, writing one address cell per child
+ * that is kept (children reconciled as empty are dropped, children
+ * reconciled into multiple blocks are merged in via __rec_col_merge), and
+ * finishes by writing the remnant page. On error, the child reference is
+ * released through CHILD_RELEASE before returning.
+ */
+static int
+__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_DECL_RET;
+ WT_KV *val;
+ WT_PAGE *child;
+ WT_REF *ref;
+ int hazard, state;
+
+ btree = S2BT(session);
+ child = NULL;
+ hazard = 0;
+
+ val = &r->v;
+ vpack = &_vpack;
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_intl_recno, btree->maxintlpage));
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /* Update the starting record number in case we split. */
+ r->recno = ref->key.recno;
+
+ /*
+ * Modified child.
+ * The page may be emptied or internally created during a split.
+ * Deleted/split pages are merged into the parent and discarded.
+ */
+ WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ addr = NULL;
+ child = ref->page;
+ if (state != 0) {
+ /*
+ * Currently the only non-zero returned state possible
+ * for a column-store page is child-modified (all other
+ * states are part of the fast-truncate support, which
+ * is row-store only).
+ */
+ WT_ASSERT(session, state == WT_CHILD_MODIFIED);
+
+ switch (F_ISSET(child->modify, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Column-store pages are almost never empty, as
+ * discarding a page would remove a chunk of the
+ * name space. The exceptions are pages created
+ * when the tree is created, and never filled.
+ */
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ WT_ERR(__rec_col_merge(session, r, child));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+ /*
+ * Build the value cell. The child page address is in one of 3
+ * places: if the page was replaced, the page's modify structure
+ * references it and we set the address just above in the
+ * switch statement. Else, the WT_REF->addr reference points to
+ * an on-page cell or an off-page WT_ADDR structure: if it's an
+ * on-page cell we copy it from the page, else we build a new
+ * cell.
+ */
+ if (addr == NULL && __wt_off_page(page, ref->addr))
+ addr = ref->addr;
+ if (addr == NULL) {
+ /* On-page cell: reference the existing cell bytes as-is. */
+ __wt_cell_unpack(ref->addr, vpack);
+ val->buf.data = ref->addr;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+ } else
+ __rec_cell_build_addr(r, addr->addr, addr->size,
+ __rec_vtype(addr), ref->key.recno);
+ CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_ERR(__rec_split_raw(session, r));
+ else
+ WT_ERR(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ __rec_copy_incr(session, r, val);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+
+err: CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_col_merge --
+ *	Merge a split child into the page being built: emit one address cell
+ *	for every block recorded in the child's multi-block array.
+ */
+static int
+__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+	WT_ADDR *blk_addr;
+	WT_KV *cell;
+	WT_MULTI *blk, *first;
+	WT_PAGE_MODIFY *mod;
+	uint32_t slot;
+
+	mod = page->modify;
+	cell = &r->v;
+	first = mod->mod_multi;
+
+	/* Walk the split array, one block at a time. */
+	for (slot = 0; slot < mod->mod_multi_entries; ++slot) {
+		blk = first + slot;
+
+		/* Track the starting record number in case we split. */
+		r->recno = blk->key.recno;
+
+		/* Build the address cell for this block. */
+		blk_addr = &blk->addr;
+		__rec_cell_build_addr(r, blk_addr->addr,
+		    blk_addr->size, __rec_vtype(blk_addr), r->recno);
+
+		/* Boundary: split or write the page until the cell fits. */
+		for (;;) {
+			if (cell->len <= r->space_avail)
+				break;
+			WT_RET(r->raw_compression ?
+			    __rec_split_raw(session, r) :
+			    __rec_split(session, r));
+		}
+
+		/* Copy the cell onto the page. */
+		__rec_copy_incr(session, r, cell);
+	}
+	return (0);
+}
+
+/*
+ * __rec_col_fix --
+ * Reconcile a fixed-width, column-store leaf page.
+ *
+ * Applies visible updates to the page's own bit field, copies that bit
+ * field into the reconciliation buffer, then walks the append list adding
+ * entries (zero-filling any gaps in the record numbers) and splitting when
+ * a page fills. Finishes by writing the remnant page.
+ */
+static int
+__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_INSERT *ins;
+ WT_UPDATE *upd;
+ uint64_t recno;
+ uint32_t entry, nrecs;
+
+ btree = S2BT(session);
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_fix_recno, btree->maxleafpage));
+
+ /*
+ * Update any changes to the original on-page data items. Note this
+ * writes into the in-memory page's bit field, not the new image.
+ */
+ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd != NULL)
+ __bit_setv_recno(page, WT_INSERT_RECNO(ins),
+ btree->bitcnt, ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ }
+
+ /* Copy the updated, disk-image bytes into place. */
+ memcpy(r->first_free, page->pg_fix_bitf,
+ __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt));
+
+ /*
+ * Calculate the number of entries per page remainder: entry is the
+ * next slot to fill, nrecs the number of slots still free.
+ */
+ entry = page->pg_fix_entries;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(
+ btree, r->space_avail) - page->pg_fix_entries;
+ r->recno += entry;
+
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ /* Loop until this update has been placed, splitting as needed. */
+ for (;;) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ for (recno = WT_INSERT_RECNO(ins);
+ nrecs > 0 && r->recno < recno;
+ --nrecs, ++entry, ++r->recno)
+ __bit_setv(
+ r->first_free, entry, btree->bitcnt, 0);
+
+ if (nrecs > 0) {
+ __bit_setv(r->first_free, entry, btree->bitcnt,
+ ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ --nrecs;
+ ++entry;
+ ++r->recno;
+ break;
+ }
+
+ /*
+ * If everything didn't fit, update the counters and
+ * split.
+ *
+ * Boundary: split or write the page.
+ */
+ __rec_incr(session, r, entry,
+ __bitstr_size((size_t)entry * btree->bitcnt));
+ WT_RET(__rec_split(session, r));
+
+ /* Calculate the number of entries per page. */
+ entry = 0;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+ }
+ }
+
+ /* Update the counters. */
+ __rec_incr(
+ session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt));
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+}
+
+/*
+ * __rec_col_fix_slvg --
+ *	Reconcile a fixed-width, column-store leaf page created during
+ *	salvage: emit zeroed entries for missing records, then the selected
+ *	range of entries from the original page. Salvage cannot split, so
+ *	anything left over is a fatal error.
+ */
+static int
+__rec_col_fix_slvg(WT_SESSION_IMPL *session,
+    WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+	WT_BTREE *btree;
+	uint64_t src_slot, remaining;
+	uint32_t dst_slot, room;
+
+	btree = S2BT(session);
+
+	/*
+	 * !!!
+	 * Overlapping key ranges in fixed-length column-store files are
+	 * vanishingly unlikely and probably impossible. An entire key range
+	 * can go missing (a corrupted, lost page), but since pages can't
+	 * split, pages with overlapping key ranges shouldn't exist. Salvage
+	 * checks for it anyway and we clean up after it here: it costs
+	 * little, and future column-store formats or operations might allow
+	 * fixed-length ranges to overlap during salvage, which would
+	 * otherwise mean retrofitting this code.
+	 */
+	WT_RET(__rec_split_init(
+	    session, r, page, page->pg_fix_recno, btree->maxleafpage));
+
+	/* We may not be taking all of the entries on the original page. */
+	if (salvage->take == 0)
+		remaining = page->pg_fix_entries;
+	else
+		remaining = salvage->take;
+	src_slot = salvage->skip;
+
+	/* Calculate the number of entries per page. */
+	dst_slot = 0;
+	room = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+
+	/* Missing records become zero-valued entries. */
+	while (room > 0 && salvage->missing > 0) {
+		__bit_setv(r->first_free, dst_slot, btree->bitcnt, 0);
+		--room;
+		--salvage->missing;
+		++dst_slot;
+	}
+
+	/* Then copy the entries being taken from the original page. */
+	while (room > 0 && remaining > 0) {
+		__bit_setv(r->first_free, dst_slot, btree->bitcnt,
+		    __bit_getv(page->pg_fix_bitf,
+		    (uint32_t)src_slot, btree->bitcnt));
+		--room;
+		--remaining;
+		++src_slot;
+		++dst_slot;
+	}
+
+	r->recno += dst_slot;
+	__rec_incr(session, r, dst_slot,
+	    __bitstr_size((size_t)dst_slot * btree->bitcnt));
+
+	/*
+	 * We can't split during salvage -- if everything didn't fit, it's
+	 * all gone wrong.
+	 */
+	if (salvage->missing != 0 || remaining != 0) {
+		WT_PANIC_RET(session, WT_PANIC,
+		    "%s page too large, attempted split during salvage",
+		    __wt_page_type_string(page->type));
+	}
+
+	/* Write the page. */
+	return (__rec_split_finish(session, r));
+}
+
+/*
+ * __rec_col_var_helper --
+ * Create a column-store variable length record cell and write it onto a
+ * page.
+ *
+ * salvage may be NULL; when set, its skip/take/done fields are used to
+ * trim rle before anything is written. deleted selects a deleted cell (value
+ * is not read), a non-zero overflow_type selects an overflow cell built
+ * from value, otherwise a regular value cell is built from value. rle is
+ * the repeat count stored in the cell and added to r->recno afterwards.
+ */
+static int
+__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_SALVAGE_COOKIE *salvage,
+ WT_ITEM *value, int deleted, uint8_t overflow_type, uint64_t rle)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+
+ btree = S2BT(session);
+
+ val = &r->v;
+
+ /*
+ * Occasionally, salvage needs to discard records from the beginning or
+ * end of the page, and because the items may be part of a RLE cell, do
+ * the adjustments here. It's not a mistake we don't bother telling
+ * our caller we've handled all the records from the page we care about,
+ * and can quit processing the page: salvage is a rare operation and I
+ * don't want to complicate our caller's loop.
+ */
+ if (salvage != NULL) {
+ if (salvage->done)
+ return (0);
+ /* Drop leading records still covered by the skip count. */
+ if (salvage->skip != 0) {
+ if (rle <= salvage->skip) {
+ salvage->skip -= rle;
+ return (0);
+ }
+ rle -= salvage->skip;
+ salvage->skip = 0;
+ }
+ /* Cap the records at the take count; reaching zero ends salvage. */
+ if (salvage->take != 0) {
+ if (rle <= salvage->take)
+ salvage->take -= rle;
+ else {
+ rle = salvage->take;
+ salvage->take = 0;
+ }
+ if (salvage->take == 0)
+ salvage->done = 1;
+ }
+ }
+
+ if (deleted) {
+ val->cell_len = __wt_cell_pack_del(&val->cell, rle);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ val->len = val->cell_len;
+ } else if (overflow_type) {
+ val->cell_len = __wt_cell_pack_ovfl(
+ &val->cell, overflow_type, rle, value->size);
+ val->buf.data = value->data;
+ val->buf.size = value->size;
+ val->len = val->cell_len + value->size;
+ } else
+ WT_RET(__rec_cell_build_val(
+ session, r, value->data, value->size, rle));
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /*
+ * Copy the value onto the page. Dictionary replacement is only tried
+ * for regular values, not for deleted or overflow cells.
+ */
+ if (!deleted && !overflow_type && btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, rle, val));
+ __rec_copy_incr(session, r, val);
+
+ /* Update the starting record number in case we split. */
+ r->recno += rle;
+
+ return (0);
+}
+
+/*
+ * __rec_col_var --
+ * Reconcile a variable-width column-store leaf page.
+ */
+static int
+__rec_col_var(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_COL *cip;
+ WT_DECL_ITEM(orig);
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_ITEM *last;
+ WT_UPDATE *upd;
+ uint64_t n, nrepeat, repeat_count, rle, src_recno;
+ uint32_t i, size;
+ int deleted, last_deleted, orig_deleted, update_no_copy;
+ const void *data;
+
+ btree = S2BT(session);
+ last = r->last;
+ vpack = &_vpack;
+
+ WT_RET(__wt_scr_alloc(session, 0, &orig));
+ data = NULL;
+ size = 0;
+ upd = NULL;
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_var_recno, btree->maxleafpage));
+
+ /*
+ * The salvage code may be calling us to reconcile a page where there
+ * were missing records in the column-store name space. If taking the
+ * first record from the page, it might be a deleted record, so we
+ * have to give the RLE code a chance to figure that out. Else, if
+ * not taking the first record from the page, write a single element
+ * representing the missing records onto a new page. (Don't pass the
+ * salvage cookie to our helper function in this case, we're handling
+ * one of the salvage cookie fields on our own, and we don't need the
+ * helper function's assistance.)
+ */
+ rle = 0;
+ last_deleted = 0;
+ if (salvage != NULL && salvage->missing != 0) {
+ if (salvage->skip == 0) {
+ rle = salvage->missing;
+ last_deleted = 1;
+
+ /*
+ * Correct the number of records we're going to "take",
+ * pretending the missing records were on the page.
+ */
+ salvage->take += salvage->missing;
+ } else
+ WT_ERR(__rec_col_var_helper(
+ session, r, NULL, NULL, 1, 0, salvage->missing));
+ }
+
+ /*
+ * We track two data items through this loop: the previous (last) item
+ * and the current item: if the last item is the same as the current
+ * item, we increment the RLE count for the last item; if the last item
+ * is different from the current item, we write the last item onto the
+ * page, and replace it with the current item. The r->recno counter
+ * tracks records written to the page, and is incremented by the helper
+ * function immediately after writing records to the page. The record
+ * number of our source record, that is, the current item, is maintained
+ * in src_recno.
+ */
+ src_recno = r->recno + rle;
+
+ /* For each entry in the in-memory page... */
+ WT_COL_FOREACH(page, cip, i) {
+ ovfl_state = OVFL_IGNORE;
+ if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+ nrepeat = 1;
+ ins = NULL;
+ orig_deleted = 1;
+ } else {
+ __wt_cell_unpack(cell, vpack);
+ nrepeat = __wt_cell_rle(vpack);
+ ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
+
+ /*
+ * If the original value is "deleted", there's no value
+ * to compare, we're done.
+ */
+ orig_deleted = vpack->type == WT_CELL_DEL ? 1 : 0;
+ if (orig_deleted)
+ goto record_loop;
+
+ /*
+ * Overflow items are tricky: we don't know until we're
+ * finished processing the set of values if we need the
+ * overflow value or not. If we don't use the overflow
+ * item at all, we have to discard it from the backing
+ * file, otherwise we'll leak blocks on the checkpoint.
+ * That's safe because if the backing overflow value is
+ * still needed by any running transaction, we'll cache
+ * a copy in the reconciliation tracking structures.
+ *
+ * Regardless, we avoid copying in overflow records: if
+ * there's a WT_INSERT entry that modifies a reference
+ * counted overflow record, we may have to write copies
+ * of the overflow record, and in that case we'll do the
+ * comparisons, but we don't read overflow items just to
+ * see if they match records on either side.
+ */
+ if (vpack->ovfl) {
+ ovfl_state = OVFL_UNUSED;
+ goto record_loop;
+ }
+
+ /*
+ * If data is Huffman encoded, we have to decode it in
+ * order to compare it with the last item we saw, which
+ * may have been an update string. This guarantees we
+ * find every single pair of objects we can RLE encode,
+ * including applications updating an existing record
+ * where the new value happens (?) to match a Huffman-
+ * encoded value in a previous or next record.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_COL_VAR, vpack, orig));
+ }
+
+record_loop: /*
+ * Generate on-page entries: loop repeat records, looking for
+ * WT_INSERT entries matching the record number. The WT_INSERT
+ * lists are in sorted order, so only need check the next one.
+ */
+ for (n = 0;
+ n < nrepeat; n += repeat_count, src_recno += repeat_count) {
+ upd = NULL;
+ if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
+ WT_ERR(__rec_txn_read(
+ session, r, ins, NULL, vpack, &upd));
+ ins = WT_SKIP_NEXT(ins);
+ }
+ if (upd != NULL) {
+ update_no_copy = 1; /* No data copy */
+ repeat_count = 1; /* Single record */
+
+ deleted = WT_UPDATE_DELETED_ISSET(upd);
+ if (!deleted) {
+ data = WT_UPDATE_DATA(upd);
+ size = upd->size;
+ }
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ update_no_copy = 1; /* No data copy */
+ repeat_count = 1; /* Single record */
+
+ deleted = 0;
+
+ /*
+ * If doing update save and restore, there's an
+ * update that's not globally visible, and the
+ * underlying value is a removed overflow value,
+ * we end up here.
+ *
+ * When the update save/restore code noticed the
+ * removed overflow value, it appended a copy of
+ * the cached, original overflow value to the
+ * update list being saved (ensuring the on-page
+ * item will never be accessed after the page is
+ * re-instantiated), then returned a NULL update
+ * to us.
+ *
+ * Assert the case: if we remove an underlying
+ * overflow object, checkpoint reconciliation
+ * should never see it again, there should be a
+ * visible update in the way.
+ *
+ * Write a placeholder.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+
+ data = "@";
+ size = 1;
+ } else {
+ update_no_copy = 0; /* Maybe data copy */
+
+ /*
+ * The repeat count is the number of records up
+ * to the next WT_INSERT record, or up to the
+ * end of the entry if we have no more WT_INSERT
+ * records.
+ */
+ if (ins == NULL)
+ repeat_count = nrepeat - n;
+ else
+ repeat_count =
+ WT_INSERT_RECNO(ins) - src_recno;
+
+ deleted = orig_deleted;
+ if (deleted)
+ goto compare;
+
+ /*
+ * If we are handling overflow items, use the
+ * overflow item itself exactly once, after
+ * which we have to copy it into a buffer and
+ * from then on use a complete copy because we
+ * are re-creating a new overflow record each
+ * time.
+ */
+ switch (ovfl_state) {
+ case OVFL_UNUSED:
+ /*
+ * An as-yet-unused overflow item.
+ *
+ * We're going to copy the on-page cell,
+ * write out any record we're tracking.
+ */
+ if (rle != 0) {
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last,
+ last_deleted, 0, rle));
+ rle = 0;
+ }
+
+ last->data = vpack->data;
+ last->size = vpack->size;
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, 0,
+ WT_CELL_VALUE_OVFL, repeat_count));
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+
+ ovfl_state = OVFL_USED;
+ continue;
+ case OVFL_USED:
+ /*
+ * Original is an overflow item; we used
+ * it for a key and now we need another
+ * copy; read it into memory.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(session,
+ WT_PAGE_COL_VAR, vpack, orig));
+
+ ovfl_state = OVFL_IGNORE;
+ /* FALLTHROUGH */
+ case OVFL_IGNORE:
+ /*
+ * Original is an overflow item and we
+ * were forced to copy it into memory,
+ * or the original wasn't an overflow
+ * item; use the data copied into orig.
+ */
+ data = orig->data;
+ size = (uint32_t)orig->size;
+ break;
+ }
+ }
+
+compare: /*
+ * If we have a record against which to compare, and
+ * the records compare equal, increment the rle counter
+ * and continue. If the records don't compare equal,
+ * output the last record and swap the last and current
+ * buffers: do NOT update the starting record number,
+ * we've been doing that all along.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ rle += repeat_count;
+ continue;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state.
+ *
+ * Reset RLE counter and turn on comparisons.
+ */
+ if (!deleted) {
+ /*
+ * We can't simply assign the data values into
+ * the last buffer because they may have come
+ * from a copy built from an encoded/overflow
+ * cell and creating the next record is going
+ * to overwrite that memory. Check, because
+ * encoded/overflow cells aren't that common
+ * and we'd like to avoid the copy. If data
+ * was taken from the current unpack structure
+ * (which points into the page), or was taken
+ * from an update structure, we can just use
+ * the pointers, they're not moving.
+ */
+ if (data == vpack->data || update_no_copy) {
+ last->data = data;
+ last->size = size;
+ } else
+ WT_ERR(__wt_buf_set(
+ session, last, data, size));
+ }
+ last_deleted = deleted;
+ rle = repeat_count;
+ }
+
+ /*
+ * If we had a reference to an overflow record we never used,
+ * discard the underlying blocks, they're no longer useful.
+ *
+ * One complication: we must cache a copy before discarding the
+ * on-disk version if there's a transaction in the system that
+ * might read the original value.
+ */
+ if (ovfl_state == OVFL_UNUSED &&
+ vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(__wt_ovfl_cache(session, page, upd, vpack));
+ }
+
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
+ WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ if (src_recno < n)
+ deleted = 1;
+ else {
+ deleted = WT_UPDATE_DELETED_ISSET(upd);
+ if (!deleted) {
+ data = WT_UPDATE_DATA(upd);
+ size = upd->size;
+ }
+ }
+
+ /*
+ * Handle RLE accounting and comparisons -- see comment
+ * above, this code fragment does the same thing.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ ++rle;
+ continue;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state. We always assign the
+ * data values to the buffer because they can only be
+ * the data from a WT_UPDATE structure.
+ *
+ * Reset RLE counter and turn on comparisons.
+ */
+ if (!deleted) {
+ last->data = data;
+ last->size = size;
+ }
+ last_deleted = deleted;
+ rle = 1;
+ }
+ }
+
+ /* If we were tracking a record, write it. */
+ if (rle != 0)
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, last_deleted, 0, rle));
+
+ /* Write the remnant page. */
+ ret = __rec_split_finish(session, r);
+
+err: __wt_scr_free(&orig);
+ return (ret);
+}
+
+/*
+ * __rec_row_int --
+ * Reconcile a row-store internal page.
+ *
+ * For every child reference on the page, a key cell and an address
+ * (value) cell are built and appended to the page image being assembled
+ * in the reconciliation structure, splitting whenever the pair doesn't
+ * fit in the space remaining. Children reported as ignorable or empty
+ * are dropped, children that themselves split are merged in through
+ * __rec_row_merge.
+ *
+ * Returns the result of __rec_split_finish on success, otherwise the
+ * first error returned by a helper (after releasing the child).
+ */
+static int
+__rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_KV *key, *val;
+ WT_PAGE *child;
+ WT_REF *ref;
+ size_t size;
+ u_int vtype;
+ int hazard, key_onpage_ovfl, ovfl_key, state;
+ const void *p;
+
+ btree = S2BT(session);
+ child = NULL;
+ hazard = 0;
+
+ /* The key and value cells are staged in the reconciliation structure. */
+ key = &r->k;
+ kpack = &_kpack;
+ WT_CLEAR(*kpack); /* -Wuninitialized */
+ val = &r->v;
+ vpack = &_vpack;
+ WT_CLEAR(*vpack); /* -Wuninitialized */
+
+ WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxintlpage));
+
+ /*
+ * Ideally, we'd never store the 0th key on row-store internal pages
+ * because it's never used during tree search and there's no reason
+ * to waste the space. The problem is how we do splits: when we split,
+ * we've potentially picked out several "split points" in the buffer
+ * which is overflowing the maximum page size, and when the overflow
+ * happens, we go back and physically split the buffer, at those split
+ * points, into new pages. It would be both difficult and expensive
+ * to re-process the 0th key at each split point to be an empty key,
+ * so we don't do that. However, we are reconciling an internal page
+ * for whatever reason, and the 0th key is known to be useless. We
+ * truncate the key to a single byte, instead of removing it entirely,
+ * it simplifies various things in other parts of the code (we don't
+ * have to special case transforming the page from its disk image to
+ * its in-memory version, for example).
+ */
+ r->cell_zero = 1;
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /*
+ * There are different paths if the key is an overflow item vs.
+ * a straight-forward on-page value. If an overflow item, we
+ * would have instantiated it, and we can use that fact to set
+ * things up.
+ *
+ * Note the cell reference and unpacked key cell are available
+ * only in the case of an instantiated, off-page key.
+ */
+ ikey = __wt_ref_key_instantiated(ref);
+ if (ikey == NULL || ikey->cell_offset == 0) {
+ cell = NULL;
+ key_onpage_ovfl = 0;
+ } else {
+ cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ __wt_cell_unpack(cell, kpack);
+ key_onpage_ovfl =
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ }
+
+ /*
+ * Classify the child. The hazard flag is filled in by the call
+ * and handed to CHILD_RELEASE_ERR/CHILD_RELEASE on every path out
+ * of this iteration (presumably it records a hazard pointer taken
+ * on the child -- confirm against __rec_child_modify).
+ */
+ WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ addr = ref->addr;
+ child = ref->page;
+ vtype = 0; /* 0: address cell type not chosen yet */
+
+ /* Deleted child we don't have to write. */
+ if (state == WT_CHILD_IGNORE) {
+ /*
+ * Overflow keys referencing discarded pages are no
+ * longer useful, schedule them for discard. Don't
+ * worry about instantiation, internal page keys are
+ * always instantiated. Don't worry about reuse,
+ * reusing this key in this reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ }
+
+ /* Deleted child requiring a proxy cell. */
+ if (state == WT_CHILD_PROXY)
+ vtype = WT_CELL_ADDR_DEL;
+
+ /*
+ * Modified child. Empty pages are merged into the parent and
+ * discarded.
+ */
+ if (state == WT_CHILD_MODIFIED)
+ switch (F_ISSET(child->modify, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Overflow keys referencing empty pages are no
+ * longer useful, schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ /*
+ * Overflow keys referencing split pages are no
+ * longer useful (the split page's key is the
+ * interesting key); schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+
+ WT_ERR(__rec_row_merge(session, r, child));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ /*
+ * If the page is replaced, the page's modify
+ * structure has the page's address.
+ */
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Build the value cell, the child page's address. Addr points
+ * to an on-page cell or an off-page WT_ADDR structure. The
+ * cell type has been set in the case of page deletion requiring
+ * a proxy cell, otherwise use the information from the addr or
+ * original cell.
+ */
+ if (__wt_off_page(page, addr)) {
+ p = addr->addr;
+ size = addr->size;
+ if (vtype == 0)
+ vtype = __rec_vtype(addr);
+ } else {
+ __wt_cell_unpack(ref->addr, vpack);
+ p = vpack->data;
+ size = vpack->size;
+ if (vtype == 0)
+ vtype = vpack->raw;
+ }
+ __rec_cell_build_addr(r, p, size, vtype, 0);
+ /* The child itself isn't referenced again in this iteration. */
+ CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /*
+ * Build key cell.
+ * Truncate any 0th key, internal pages don't need 0th keys.
+ */
+ if (key_onpage_ovfl) {
+ /* Reuse the existing on-page overflow key cell as-is. */
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = 1;
+ } else {
+ __wt_ref_key(page, ref, &p, &size);
+ WT_ERR(__rec_cell_build_int_key(
+ session, r, p, r->cell_zero ? 1 : size, &ovfl_key));
+ }
+ r->cell_zero = 0;
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_ERR(__rec_split_raw(session, r));
+ continue;
+ }
+
+ /*
+ * In one path above, we copied address blocks from the
+ * page rather than building the actual key. In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (key_onpage_ovfl) {
+ WT_ERR(__wt_buf_set(session,
+ r->cur, WT_IKEY_DATA(ikey), ikey->size));
+ key_onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+ }
+
+ /* Copy the key and value onto the page. */
+ __rec_copy_incr(session, r, key);
+ __rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+
+ /* Error exit: release the child we were working on, if held. */
+err: CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_row_merge --
+ *	Fold a split child page into the internal page being built: one
+ *	key/address pair is written for each entry in the child's array of
+ *	replacement blocks.
+ */
+static int
+__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+	WT_ADDR *blk_addr;
+	WT_KV *kcell, *vcell;
+	WT_MULTI *entry;
+	WT_PAGE_MODIFY *pmod;
+	size_t key_size;
+	uint32_t slot;
+	int is_ovfl_key;
+
+	pmod = page->modify;
+	kcell = &r->k;
+	vcell = &r->v;
+
+	/* Visit every replacement block recorded for the child. */
+	for (slot = 0; slot < pmod->mod_multi_entries; ++slot) {
+		entry = &pmod->mod_multi[slot];
+
+		/*
+		 * Key cell.  While r->cell_zero is set only a single byte of
+		 * the key is passed on; the flag is cleared as soon as one
+		 * key has been built.
+		 */
+		if (r->cell_zero)
+			key_size = 1;
+		else
+			key_size = entry->key.ikey->size;
+		WT_RET(__rec_cell_build_int_key(session, r,
+		    WT_IKEY_DATA(entry->key.ikey), key_size, &is_ovfl_key));
+		r->cell_zero = 0;
+
+		/* Value cell: the address of the replacement block. */
+		blk_addr = &entry->addr;
+		__rec_cell_build_addr(r, blk_addr->addr,
+		    blk_addr->size, __rec_vtype(blk_addr), 0);
+
+		/* Boundary: keep splitting until the pair fits. */
+		while (kcell->len + vcell->len > r->space_avail) {
+			if (r->raw_compression)
+				WT_RET(__rec_split_raw(session, r));
+			else
+				WT_RET(__rec_split(session, r));
+		}
+
+		/* Append the key, then the value, to the page image. */
+		__rec_copy_incr(session, r, kcell);
+		__rec_copy_incr(session, r, vcell);
+
+		/* Record the key just written for compression tracking. */
+		__rec_key_state_update(r, is_ovfl_key);
+	}
+
+	return (0);
+}
+
+/*
+ * __rec_row_leaf --
+ * Reconcile a row-store leaf page.
+ *
+ * Writes, in key order, the insert list preceding the first on-page
+ * key, then each on-page key/value pair (using the update chosen by
+ * __rec_txn_read when there is one) followed by the insert list hanging
+ * off that key. Pairs whose chosen update is a delete are not written.
+ *
+ * salvage may be NULL; when it isn't, salvage->skip is the number of
+ * leading on-page records to skip.
+ *
+ * Returns the result of __rec_split_finish on success, otherwise the
+ * first error from a helper. The scratch buffers are released on every
+ * path that leaves the function after they have been requested.
+ */
+static int
+__rec_row_leaf(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell, *val_cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_DECL_ITEM(tmpkey);
+ WT_DECL_ITEM(tmpval);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_INSERT *ins;
+ WT_KV *key, *val;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ size_t size;
+ uint64_t slvg_skip;
+ uint32_t i;
+ int dictionary, onpage_ovfl, ovfl_key;
+ const void *p;
+ void *copy;
+
+ btree = S2BT(session);
+ slvg_skip = salvage == NULL ? 0 : salvage->skip;
+
+ key = &r->k;
+ val = &r->v;
+
+ WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxleafpage));
+
+ /*
+ * Write any K/V pairs inserted into the page before the first from-disk
+ * key on the page.
+ */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL)
+ WT_RET(__rec_row_leaf_insert(session, r, ins));
+
+ /*
+ * Temporary buffers in which to instantiate any uninstantiated keys
+ * or value items we need.
+ *
+ * The second allocation must go through the error label: returning
+ * directly on failure would leave the first scratch buffer held.
+ * NOTE(review): this relies on __wt_scr_free accepting an item that
+ * was never allocated (tmpval is still NULL in that case) -- confirm.
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &tmpkey));
+ WT_ERR(__wt_scr_alloc(session, 0, &tmpval));
+
+ /* For each entry in the page... */
+ WT_ROW_FOREACH(page, rip, i) {
+ /*
+ * The salvage code, on some rare occasions, wants to reconcile
+ * a page but skip some leading records on the page. Because
+ * the row-store leaf reconciliation function copies keys from
+ * the original disk page, this is non-trivial -- just changing
+ * the in-memory pointers isn't sufficient, we have to change
+ * the WT_CELL structures on the disk page, too. It's ugly, but
+ * we pass in a value that tells us how many records to skip in
+ * this case.
+ */
+ if (slvg_skip != 0) {
+ --slvg_skip;
+ continue;
+ }
+
+ /*
+ * Figure out the key: set any cell reference (and unpack it),
+ * set any instantiated key reference.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, NULL, NULL);
+ if (cell == NULL)
+ kpack = NULL;
+ else {
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ }
+
+ /* Unpack the on-page value cell, and look for an update. */
+ if ((val_cell =
+ __wt_row_leaf_value_cell(page, rip, NULL)) == NULL)
+ vpack = NULL;
+ else {
+ vpack = &_vpack;
+ __wt_cell_unpack(val_cell, vpack);
+ }
+ WT_ERR(__rec_txn_read(session, r, NULL, rip, vpack, &upd));
+
+ /*
+ * Build value cell. The dictionary flag records whether the
+ * value cell was freshly built (and so may be replaced by a
+ * dictionary reference when it's copied onto the page).
+ */
+ dictionary = 0;
+ if (upd == NULL) {
+ /*
+ * When the page was read into memory, there may not
+ * have been a value item.
+ *
+ * If there was a value item, check if it's a dictionary
+ * cell (a copy of another item on the page). If it's a
+ * copy, we have to create a new value item as the old
+ * item might have been discarded from the page.
+ */
+ if (vpack == NULL) {
+ val->buf.data = NULL;
+ val->cell_len = val->len = val->buf.size = 0;
+ } else if (vpack->raw == WT_CELL_VALUE_COPY) {
+ /* If the item is Huffman encoded, decode it. */
+ if (btree->huffman_value == NULL) {
+ p = vpack->data;
+ size = vpack->size;
+ } else {
+ WT_ERR(__wt_huffman_decode(session,
+ btree->huffman_value,
+ vpack->data, vpack->size,
+ tmpval));
+ p = tmpval->data;
+ size = tmpval->size;
+ }
+ WT_ERR(__rec_cell_build_val(
+ session, r, p, size, (uint64_t)0));
+ dictionary = 1;
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ /*
+ * If doing update save and restore in service
+ * of eviction, there's an update that's not
+ * globally visible, and the underlying value
+ * is a removed overflow value, we end up here.
+ *
+ * When the update save/restore code noticed the
+ * removed overflow value, it appended a copy of
+ * the cached, original overflow value to the
+ * update list being saved (ensuring any on-page
+ * item will never be accessed after the page is
+ * re-instantiated), then returned a NULL update
+ * to us.
+ *
+ * Assert the case.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+
+ /*
+ * If the key is also a removed overflow item,
+ * don't write anything at all.
+ *
+ * We don't have to write anything because the
+ * code re-instantiating the page gets the key
+ * to match the saved list of updates from the
+ * original page. By not putting the key on
+ * the page, we'll move the key/value set from
+ * a row-store leaf page slot to an insert list,
+ * but that shouldn't matter.
+ *
+ * The reason we bother with the test is because
+ * overflows are expensive to write. It's hard
+ * to imagine a real workload where this test is
+ * worth the effort, but it's a simple test.
+ */
+ if (kpack != NULL &&
+ kpack->raw == WT_CELL_KEY_OVFL_RM)
+ goto leaf_insert;
+
+ /*
+ * The on-page value will never be accessed,
+ * write a placeholder record.
+ */
+ WT_ERR(__rec_cell_build_val(
+ session, r, "@", 1, (uint64_t)0));
+ } else {
+ /* Reuse the original on-page value cell as-is. */
+ val->buf.data = val_cell;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+
+ /* Track if page has overflow items. */
+ if (vpack->ovfl)
+ r->ovfl_items = 1;
+ }
+ } else {
+ /*
+ * If the original value was an overflow and we've not
+ * already done so, discard it. One complication: we
+ * must cache a copy before discarding the on-disk
+ * version if there's a transaction in the system that
+ * might read the original value.
+ */
+ if (vpack != NULL &&
+ vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(
+ __wt_ovfl_cache(session, page, rip, vpack));
+
+ /* If this key/value pair was deleted, we're done. */
+ if (WT_UPDATE_DELETED_ISSET(upd)) {
+ /*
+ * Overflow keys referencing discarded values
+ * are no longer useful, discard the backing
+ * blocks. Don't worry about reuse, reusing
+ * keys from a row-store page reconciliation
+ * seems unlikely enough to ignore.
+ */
+ if (kpack != NULL && kpack->ovfl &&
+ kpack->raw != WT_CELL_KEY_OVFL_RM) {
+ /*
+ * Keys are part of the name-space, we
+ * can't remove them from the in-memory
+ * tree; if an overflow key was deleted
+ * without being instantiated (for
+ * example, cursor-based truncation), do
+ * it now.
+ */
+ if (ikey == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session,
+ page, rip, tmpkey, 1));
+
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ }
+
+ /*
+ * We aren't actually creating the key so we
+ * can't use bytes from this key to provide
+ * prefix information for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Proceed with appended key/value pairs. */
+ goto leaf_insert;
+ }
+
+ /*
+ * If no value, nothing needs to be copied. Otherwise,
+ * build the value's WT_CELL chunk from the most recent
+ * update value.
+ */
+ if (upd->size == 0) {
+ val->buf.data = NULL;
+ val->cell_len = val->len = val->buf.size = 0;
+ } else {
+ WT_ERR(__rec_cell_build_val(session, r,
+ WT_UPDATE_DATA(upd), upd->size,
+ (uint64_t)0));
+ dictionary = 1;
+ }
+ }
+
+ /*
+ * Build key cell.
+ *
+ * If the key is an overflow key that hasn't been removed, use
+ * the original backing blocks.
+ */
+ onpage_ovfl = kpack != NULL &&
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ if (onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = 1;
+
+ /*
+ * We aren't creating a key so we can't use this key as
+ * a prefix for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+ } else {
+ /*
+ * Get the key from the page or an instantiated key, or
+ * inline building the key from a previous key (it's a
+ * fast path for simple, prefix-compressed keys), or by
+ * building the key from scratch.
+ */
+ if (__wt_row_leaf_key_info(page, copy,
+ NULL, &cell, &tmpkey->data, &tmpkey->size))
+ goto build;
+
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ if (btree->huffman_key == NULL &&
+ kpack->type == WT_CELL_KEY &&
+ tmpkey->size >= kpack->prefix) {
+ /*
+ * The previous clause checked for a prefix of
+ * zero, which means the temporary buffer must
+ * have a non-zero size, and it references a
+ * valid key.
+ */
+ WT_ASSERT(session, tmpkey->size != 0);
+
+ /*
+ * Grow the buffer as necessary, ensuring data
+ * has been copied into local buffer space,
+ * then append the suffix to the prefix already
+ * in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy
+ * data we don't need, truncate the item's data
+ * length to the prefix bytes.
+ */
+ tmpkey->size = kpack->prefix;
+ WT_ERR(__wt_buf_grow(session,
+ tmpkey, tmpkey->size + kpack->size));
+ memcpy((uint8_t *)tmpkey->mem + tmpkey->size,
+ kpack->data, kpack->size);
+ tmpkey->size += kpack->size;
+ } else
+ WT_ERR(__wt_row_leaf_key_copy(
+ session, page, rip, tmpkey));
+build:
+ WT_ERR(__rec_cell_build_leaf_key(session, r,
+ tmpkey->data, tmpkey->size, &ovfl_key));
+ }
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_ERR(__rec_split_raw(session, r));
+ continue;
+ }
+
+ /*
+ * In one path above, we copied address blocks from the
+ * page rather than building the actual key. In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (onpage_ovfl) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, kpack, r->cur));
+ onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key written
+ * to the new page, and (unless we're already working
+ * with an overflow key), rebuild the key without prefix
+ * compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_ERR(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (dictionary && btree->dictionary)
+ WT_ERR(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+leaf_insert: /* Write any K/V pairs inserted into the page after this key. */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL)
+ WT_ERR(__rec_row_leaf_insert(session, r, ins));
+ }
+
+ /* Write the remnant page. */
+ ret = __rec_split_finish(session, r);
+
+ /* Common exit: hand back both scratch buffers. */
+err: __wt_scr_free(&tmpkey);
+ __wt_scr_free(&tmpval);
+ return (ret);
+}
+
+/*
+ * __rec_row_leaf_insert --
+ *	Write the key/value pairs found on an insert list, starting at ins
+ *	and following the list to its end.  Entries for which no update is
+ *	returned, or whose update is a delete, produce no output.
+ */
+static int
+__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
+{
+	WT_BTREE *tree;
+	WT_KV *kcell, *vcell;
+	WT_UPDATE *cur_upd;
+	int is_ovfl_key;
+
+	tree = S2BT(session);
+	kcell = &r->k;
+	vcell = &r->v;
+
+	for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
+		/* Pick the update to write for this entry, if any. */
+		WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &cur_upd));
+		if (cur_upd == NULL)
+			continue;
+		if (WT_UPDATE_DELETED_ISSET(cur_upd))
+			continue;
+
+		/* Value cell: a zero-length update gets no cell at all. */
+		if (cur_upd->size != 0)
+			WT_RET(__rec_cell_build_val(session, r,
+			    WT_UPDATE_DATA(cur_upd),
+			    cur_upd->size, (uint64_t)0));
+		else
+			vcell->len = 0;
+
+		/* Key cell, taken from the insert entry itself. */
+		WT_RET(__rec_cell_build_leaf_key(session, r,
+		    WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &is_ovfl_key));
+
+		/* Boundary: keep splitting until the pair fits. */
+		while (kcell->len + vcell->len > r->space_avail) {
+			if (!r->raw_compression) {
+				WT_RET(__rec_split(session, r));
+
+				/*
+				 * The first key on the new page can't be
+				 * prefix compressed: switch compression off
+				 * and, unless the key is an overflow key,
+				 * build it again in full.
+				 */
+				if (r->key_pfx_compress_conf) {
+					r->key_pfx_compress = 0;
+					if (!is_ovfl_key)
+						WT_RET(
+						    __rec_cell_build_leaf_key(
+						    session, r,
+						    NULL, 0, &is_ovfl_key));
+				}
+			} else
+				WT_RET(__rec_split_raw(session, r));
+		}
+
+		/* Append the key, then the value when there is one. */
+		__rec_copy_incr(session, r, kcell);
+		if (vcell->len != 0) {
+			r->all_empty_value = 0;
+			if (tree->dictionary)
+				WT_RET(
+				    __rec_dict_replace(session, r, 0, vcell));
+			__rec_copy_incr(session, r, vcell);
+		} else
+			r->any_empty_value = 1;
+
+		/* Record the key just written for compression tracking. */
+		__rec_key_state_update(r, is_ovfl_key);
+	}
+
+	return (0);
+}
+
+/*
+ * __rec_split_discard --
+ *	Release what a previous multi-block reconciliation of the page left
+ *	behind: per-block keys, saved-update lists and disk images, block
+ *	addresses that are not being reused, and any chained root-split page.
+ */
+static int
+__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_BM *blkmgr;
+	WT_MULTI *entry;
+	WT_PAGE_MODIFY *pmod;
+	uint32_t slot;
+
+	blkmgr = S2BT(session)->bm;
+	pmod = page->modify;
+
+	/*
+	 * The page split the last time it was reconciled and is now being
+	 * reconciled again: walk the old array of replacement blocks and give
+	 * back whatever isn't carried over into this reconciliation.
+	 */
+	for (slot = 0; slot < pmod->mod_multi_entries; ++slot) {
+		entry = &pmod->mod_multi[slot];
+
+		/* Row-store entries: free the instantiated key. */
+		if (page->type == WT_PAGE_ROW_INT ||
+		    page->type == WT_PAGE_ROW_LEAF) {
+			__wt_free(session, entry->key.ikey);
+		}
+
+		/* Entry kept in memory: free its update list and image. */
+		if (entry->skip != NULL) {
+			__wt_free(session, entry->skip);
+			__wt_free(session, entry->skip_dsk);
+			continue;
+		}
+
+		/* Address marked for reuse: just drop our reference. */
+		if (entry->addr.reuse) {
+			entry->addr.addr = NULL;
+			continue;
+		}
+
+		/* Otherwise free the blocks, then the address itself. */
+		WT_RET(blkmgr->free(blkmgr,
+		    session, entry->addr.addr, entry->addr.size));
+		__wt_free(session, entry->addr.addr);
+	}
+	__wt_free(session, pmod->mod_multi);
+	pmod->mod_multi_entries = 0;
+
+	/*
+	 * Root splits complicate what would otherwise be a single-page job:
+	 * an internal page may have a further page chained from it, which
+	 * must be discarded too, along with the overflow items written for it.
+	 */
+	if ((page->type == WT_PAGE_COL_INT ||
+	    page->type == WT_PAGE_ROW_INT) && pmod->mod_root_split != NULL) {
+		WT_RET(__rec_split_discard(session, pmod->mod_root_split));
+		WT_RET(__wt_ovfl_track_wrapup(session, pmod->mod_root_split));
+		__wt_page_out(session, &pmod->mod_root_split);
+	}
+
+	return (0);
+}
+
+/*
+ * __rec_write_wrapup --
+ * Finish the reconciliation.
+ *
+ * Works in three steps. First, whatever the page's previous reconciliation
+ * left behind (selected by the WT_PM_REC_MASK bits in page->modify) is
+ * discarded and those bits are cleared. Second, the new result is recorded
+ * based on r->bnd_next: 0 boundaries marks the page empty, 1 boundary is a
+ * 1-for-1 replacement (or a single saved/restored block), anything larger
+ * is a split into multiple blocks. Third, the page and tree dirty state is
+ * updated depending on whether updates were skipped (r->leave_dirty).
+ *
+ * Returns 0 on success; an error from any helper is passed back to the
+ * caller.
+ */
+static int
+__rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *bnd;
+ WT_BTREE *btree;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *ref;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ mod = page->modify;
+ ref = r->ref;
+
+ /*
+ * This page may have previously been reconciled, and that information
+ * is now about to be replaced. Make sure it's discarded at some point,
+ * and clear the underlying modification information, we're creating a
+ * new reality.
+ */
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case 0: /*
+ * The page has never been reconciled before, free the original
+ * address blocks (if any). The "if any" is for empty trees
+ * created when a new tree is opened or previously deleted pages
+ * instantiated in memory.
+ *
+ * The exception is root pages are never tracked or free'd, they
+ * are checkpoints, and must be explicitly dropped.
+ */
+ if (__wt_ref_is_root(ref))
+ break;
+ if (ref->addr != NULL) {
+ /*
+ * Free the page and clear the address (so we don't free
+ * it twice).
+ */
+ WT_RET(__wt_ref_info(
+ session, ref, &addr, &addr_size, NULL));
+ WT_RET(bm->free(bm, session, addr, addr_size));
+ if (__wt_off_page(ref->home, ref->addr)) {
+ __wt_free(
+ session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+ }
+ break;
+ case WT_PM_REC_EMPTY: /* Page deleted */
+ break;
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ /*
+ * Discard the multiple replacement blocks.
+ */
+ WT_RET(__rec_split_discard(session, page));
+ break;
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ /*
+ * Discard the replacement leaf page's blocks.
+ *
+ * The exception is root pages are never tracked or free'd, they
+ * are checkpoints, and must be explicitly dropped.
+ */
+ if (!__wt_ref_is_root(ref))
+ WT_RET(bm->free(bm, session,
+ mod->mod_replace.addr, mod->mod_replace.size));
+
+ /* Discard the replacement page's address. */
+ __wt_free(session, mod->mod_replace.addr);
+ mod->mod_replace.size = 0;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ F_CLR(mod, WT_PM_REC_MASK);
+
+ /*
+ * Wrap up overflow tracking. If we are about to create a checkpoint,
+ * the system must be entirely consistent at that point (the underlying
+ * block manager is presumably going to do some action to resolve the
+ * list of allocated/free/whatever blocks that are associated with the
+ * checkpoint).
+ */
+ WT_RET(__wt_ovfl_track_wrapup(session, page));
+
+ switch (r->bnd_next) {
+ case 0: /* Page delete */
+ WT_RET(__wt_verbose(
+ session, WT_VERB_RECONCILE, "page %p empty", page));
+ WT_STAT_FAST_DATA_INCR(session, rec_page_delete);
+
+ /* If this is the root page, we need to create a sync point. */
+ ref = r->ref; /* Same value that was assigned on entry. */
+ if (__wt_ref_is_root(ref))
+ WT_RET(
+ bm->checkpoint(bm, session, NULL, btree->ckpt, 0));
+
+ /*
+ * If the page was empty, we want to discard it from the tree
+ * by discarding the parent's key when evicting the parent.
+ * Mark the page as deleted, then return success, leaving the
+ * page in memory. If the page is subsequently modified, that
+ * is OK, we'll just reconcile it again.
+ */
+ F_SET(mod, WT_PM_REC_EMPTY);
+ break;
+ case 1: /* 1-for-1 page swap */
+ /*
+ * Because WiredTiger's pages grow without splitting, we're
+ * replacing a single page with another single page most of
+ * the time.
+ */
+ bnd = &r->bnd[0];
+
+ /*
+ * If we're saving/restoring changes for this page, there's
+ * nothing to write. Allocate, then initialize the array of
+ * replacement blocks.
+ *
+ * Note the result is recorded as a single-entry multiblock
+ * (WT_PM_REC_MULTIBLOCK), not as a replacement, and ownership
+ * of the skip list and disk image moves from the boundary to
+ * the WT_MULTI entry (the boundary's pointers are cleared).
+ */
+ if (bnd->skip != NULL) {
+ WT_RET(__wt_calloc_def(
+ session, r->bnd_next, &mod->mod_multi));
+ multi = mod->mod_multi;
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ mod->mod_multi_entries = 1;
+
+ F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ break;
+ }
+
+ /*
+ * If this is a root page, then we don't have an address and we
+ * have to create a sync point. The address was cleared when
+ * we were about to write the buffer so we know what to do here.
+ *
+ * Otherwise the boundary's address is handed over to the page's
+ * modify structure and cleared in the boundary.
+ */
+ if (bnd->addr.addr == NULL)
+ WT_RET(__wt_bt_write(session,
+ &r->dsk, NULL, NULL, 1, bnd->already_compressed));
+ else {
+ mod->mod_replace = bnd->addr;
+ bnd->addr.addr = NULL;
+ }
+
+ F_SET(mod, WT_PM_REC_REPLACE);
+ break;
+ default: /* Page split */
+ WT_RET(__wt_verbose(session, WT_VERB_RECONCILE,
+ "page %p reconciled into %" PRIu32 " pages",
+ page, r->bnd_next));
+
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ WT_STAT_FAST_DATA_INCR(
+ session, rec_multiblock_internal);
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ WT_STAT_FAST_DATA_INCR(session, rec_multiblock_leaf);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * Display the actual split keys. A scratch buffer is only
+ * needed (and only allocated) for row-store pages, where the
+ * key bytes are made printable first; column-store pages print
+ * the starting record number directly.
+ */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT)) {
+ WT_DECL_ITEM(tkey);
+ WT_DECL_RET;
+ uint32_t i;
+
+ if (page->type == WT_PAGE_ROW_INT ||
+ page->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__wt_scr_alloc(session, 0, &tkey));
+ for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__wt_buf_set_printable(
+ session, tkey,
+ bnd->key.data, bnd->key.size));
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_SPLIT,
+ "split: starting key "
+ "%.*s",
+ (int)tkey->size,
+ (const char *)tkey->data));
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_SPLIT,
+ "split: starting recno %" PRIu64,
+ bnd->recno));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+err: __wt_scr_free(&tkey); /* Reached on the success path too. */
+ WT_RET(ret);
+ }
+ if (r->bnd_next > r->bnd_next_max) {
+ r->bnd_next_max = r->bnd_next;
+ WT_STAT_FAST_DATA_SET(
+ session, rec_multiblock_max, r->bnd_next_max);
+ }
+
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__rec_split_row(session, r, page));
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_RET(__rec_split_col(session, r, page));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ break;
+ }
+
+ /*
+ * If updates were skipped, the tree isn't clean. The checkpoint call
+ * cleared the tree's modified value before calling the eviction thread,
+ * so we must explicitly reset the tree's modified flag. We insert a
+ * barrier after the change for clarity (the requirement is the value
+ * be set before a subsequent checkpoint reads it, and because the
+ * current checkpoint is waiting on this reconciliation to complete,
+ * there's no risk of that happening).
+ *
+ * Otherwise, if no updates were skipped, we have a new maximum
+ * transaction written for the page (used to decide if a clean page can
+ * be evicted). The page only might be clean; if the write generation
+ * is unchanged since reconciliation started, clear it and update cache
+ * dirty statistics, if the write generation changed, then the page has
+ * been written since we started reconciliation, it cannot be
+ * discarded.
+ */
+ if (r->leave_dirty) {
+ mod->first_dirty_txn = r->skipped_txn;
+
+ btree->modified = 1;
+ WT_FULL_BARRIER();
+ } else {
+ mod->rec_max_txn = r->max_txn;
+
+ if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0))
+ __wt_cache_dirty_decr(session, page);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_write_wrapup_err --
+ *	Clean up after a failed reconciliation: release the blocks written by
+ *	this attempt so they are not leaked. Returns the first error seen during
+ *	cleanup, or 0.
+ */
+static int
+__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+	WT_BM *block_mgr;
+	WT_BOUNDARY *boundary;
+	WT_DECL_RET;
+	WT_PAGE_MODIFY *modify;
+	uint32_t slot;
+
+	modify = page->modify;
+	block_mgr = S2BT(session)->bm;
+
+	/*
+	 * If the page still carries multiblock information from an earlier
+	 * reconciliation, reset every entry's address-reused flag. Left set,
+	 * a later reconciliation could believe a backing block is being reused
+	 * when it actually wants to free it.
+	 */
+	if (F_ISSET(modify, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK)
+		for (slot = 0; slot < modify->mod_multi_entries; ++slot)
+			modify->mod_multi[slot].addr.reuse = 0;
+
+	/*
+	 * Blocks written by the failed attempt are not referenced by the tree;
+	 * discarding them is about avoiding block leaks, not correctness.
+	 *
+	 * Blocks marked for reuse still belong to a previous reconciliation:
+	 * forget the reference, but don't free the block.
+	 */
+	WT_TRET(__wt_ovfl_track_wrapup_err(session, page));
+	for (slot = 0; slot < r->bnd_next; ++slot) {
+		boundary = &r->bnd[slot];
+
+		/* Nothing was written for this boundary. */
+		if (boundary->addr.addr == NULL)
+			continue;
+
+		if (boundary->addr.reuse) {
+			boundary->addr.addr = NULL;
+			continue;
+		}
+
+		WT_TRET(block_mgr->free(block_mgr,
+		    session, boundary->addr.addr, boundary->addr.size));
+		__wt_free(session, boundary->addr.addr);
+	}
+
+	return (ret);
+}
+
+/*
+ * __rec_split_row --
+ *	Turn the boundaries of a reconciled row-store page into the page's
+ *	array of replacement blocks.
+ */
+static int
+__rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+	WT_BOUNDARY *src;
+	WT_MULTI *dst;
+	WT_PAGE_MODIFY *modify;
+	WT_REF *ref;
+	size_t key_size;
+	uint32_t slot;
+	void *key_data;
+
+	modify = page->modify;
+	ref = r->ref;
+
+	/*
+	 * The first boundary's key is never set during reconciliation: take it
+	 * from the reference to the original page, or use a single nul byte
+	 * for the root page.
+	 */
+	if (!__wt_ref_is_root(ref)) {
+		__wt_ref_key(ref->home, ref, &key_data, &key_size);
+		WT_RET(__wt_buf_set(
+		    session, &r->bnd[0].key, key_data, key_size));
+	} else {
+		WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1));
+	}
+
+	/* One zero-filled replacement entry per boundary. */
+	WT_RET(__wt_calloc_def(session, r->bnd_next, &modify->mod_multi));
+
+	for (slot = 0; slot < r->bnd_next; ++slot) {
+		src = &r->bnd[slot];
+		dst = &modify->mod_multi[slot];
+
+		WT_RET(__wt_row_ikey(session, 0,
+		    src->key.data, src->key.size, &dst->key.ikey));
+
+		if (src->skip != NULL) {
+			/*
+			 * Saved updates: hand the skip list and the disk image
+			 * over to the replacement entry.
+			 */
+			dst->skip = src->skip;
+			dst->skip_entries = src->skip_next;
+			src->skip = NULL;
+			dst->skip_dsk = src->dsk;
+			src->dsk = NULL;
+			continue;
+		}
+
+		/* Written block: hand the address over. */
+		dst->addr = src->addr;
+		dst->addr.reuse = 0;
+		dst->size = src->size;
+		dst->cksum = src->cksum;
+		src->addr.addr = NULL;
+	}
+	modify->mod_multi_entries = r->bnd_next;
+
+	return (0);
+}
+
+/*
+ * __rec_split_col --
+ *	Turn the boundaries of a reconciled column-store page into the page's
+ *	array of replacement blocks.
+ */
+static int
+__rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+	WT_BOUNDARY *src;
+	WT_MULTI *dst;
+	WT_PAGE_MODIFY *modify;
+	uint32_t slot;
+
+	modify = page->modify;
+
+	/* One zero-filled replacement entry per boundary. */
+	WT_RET(__wt_calloc_def(session, r->bnd_next, &modify->mod_multi));
+
+	for (slot = 0; slot < r->bnd_next; ++slot) {
+		src = &r->bnd[slot];
+		dst = &modify->mod_multi[slot];
+
+		dst->key.recno = src->recno;
+
+		if (src->skip != NULL) {
+			/*
+			 * Saved updates: hand the skip list and the disk image
+			 * over to the replacement entry.
+			 */
+			dst->skip = src->skip;
+			dst->skip_entries = src->skip_next;
+			src->skip = NULL;
+			dst->skip_dsk = src->dsk;
+			src->dsk = NULL;
+			continue;
+		}
+
+		/* Written block: hand the address over. */
+		dst->addr = src->addr;
+		dst->addr.reuse = 0;
+		dst->size = src->size;
+		dst->cksum = src->cksum;
+		src->addr.addr = NULL;
+	}
+	modify->mod_multi_entries = r->bnd_next;
+
+	return (0);
+}
+
+/*
+ * __rec_cell_build_int_key --
+ *	Build, in r->k, the cell and byte string for a key destined for a
+ *	row-store internal page. *is_ovflp reports whether an overflow item was
+ *	created for the key.
+ */
+static int
+__rec_cell_build_int_key(WT_SESSION_IMPL *session,
+    WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp)
+{
+	WT_BTREE *btree;
+	WT_KV *kv;
+
+	btree = S2BT(session);
+	kv = &r->k;
+	*is_ovflp = 0;
+
+	/* Keep a copy in the "current" buffer as well as in the key buffer. */
+	WT_RET(__wt_buf_set(session, r->cur, data, size));
+	WT_RET(__wt_buf_set(session, &kv->buf, data, size));
+
+	/* The common case: the key fits, pack it and we're done. */
+	if (size <= btree->maxintlitem) {
+		kv->cell_len =
+		    __wt_cell_pack_int_key(&kv->cell, kv->buf.size);
+		kv->len = kv->cell_len + kv->buf.size;
+		return (0);
+	}
+
+	/* The key is too large for the page, store it as an overflow item. */
+	WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_internal);
+
+	*is_ovflp = 1;
+	return (__rec_cell_build_ovfl(
+	    session, r, kv, WT_CELL_KEY_OVFL, (uint64_t)0));
+}
+
+/*
+ * __rec_cell_build_leaf_key --
+ * Process a key and return a WT_CELL structure and byte string to be
+ * stored on a row-store leaf page.
+ *
+ * The result is built in r->k. When data is non-NULL the full key is also
+ * saved in r->cur; when data is NULL the key saved in r->cur by an earlier
+ * call is used instead and no prefix compression is attempted. *is_ovflp
+ * is cleared on entry and set to 1 when an overflow item is created for
+ * the key. Returns 0 on success, otherwise an error from a helper.
+ */
+static int
+__rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp)
+{
+ WT_BTREE *btree;
+ WT_KV *key;
+ size_t pfx_max;
+ uint8_t pfx;
+ const uint8_t *a, *b;
+
+ *is_ovflp = 0;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+
+ pfx = 0;
+ if (data == NULL)
+ /*
+ * When data is NULL, our caller has a prefix compressed key
+ * they can't use (probably because they just crossed a split
+ * point). Use the full key saved when last called, instead.
+ */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, r->cur->data, r->cur->size));
+ else {
+ /*
+ * Save a copy of the key for later reference: we use the full
+ * key for prefix-compression comparisons, and if we are, for
+ * any reason, unable to use the compressed key we generate.
+ */
+ WT_RET(__wt_buf_set(session, r->cur, data, size));
+
+ /*
+ * Do prefix compression on the key. We know by definition the
+ * previous key sorts before the current key, which means the
+ * keys must differ and we just need to compare up to the
+ * shorter of the two keys.
+ */
+ if (r->key_pfx_compress) {
+ /*
+ * The prefix length is held in a uint8_t, so we can't
+ * compress out more than UINT8_MAX (255) bytes, limit
+ * the comparison to that.
+ */
+ pfx_max = UINT8_MAX;
+ if (size < pfx_max)
+ pfx_max = size;
+ if (r->last->size < pfx_max)
+ pfx_max = r->last->size;
+ for (a = data, b = r->last->data; pfx < pfx_max; ++pfx)
+ if (*a++ != *b++)
+ break;
+
+ /*
+ * Prefix compression may cost us CPU and memory when
+ * the page is re-loaded, don't do it unless there's
+ * reasonable gain.
+ */
+ if (pfx < btree->prefix_compression_min)
+ pfx = 0;
+ else
+ WT_STAT_FAST_DATA_INCRV(
+ session, rec_prefix_compression, pfx);
+ }
+
+ /* Copy the non-prefix bytes into the key buffer. */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, (uint8_t *)data + pfx, size - pfx));
+ }
+
+ /* Optionally compress the key using the Huffman engine. */
+ if (btree->huffman_key != NULL)
+ WT_RET(__wt_huffman_encode(session, btree->huffman_key,
+ key->buf.data, (uint32_t)key->buf.size, &key->buf));
+
+ /* Create an overflow object if the data won't fit. */
+ if (key->buf.size > btree->maxleafitem) {
+ /*
+ * Overflow objects aren't prefix compressed -- rebuild any
+ * object that was prefix compressed.
+ *
+ * The rebuild is a recursive call with data == NULL: it reloads
+ * the full key from r->cur and leaves the prefix count at 0, so
+ * the recursion is at most one level deep.
+ */
+ if (pfx == 0) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_leaf);
+
+ *is_ovflp = 1;
+ return (__rec_cell_build_ovfl(
+ session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
+ }
+ return (
+ __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
+ }
+
+ key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size);
+ key->len = key->cell_len + key->buf.size;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_addr --
+ *	Build, in r->v, the cell describing an address reference to be stored
+ *	on the page.
+ */
+static void
+__rec_cell_build_addr(WT_RECONCILE *r,
+    const void *addr, size_t size, u_int cell_type, uint64_t recno)
+{
+	WT_KV *addr_kv;
+
+	/*
+	 * The address size isn't checked: an address can't be moved to an
+	 * overflow page, because if the address doesn't fit, neither would the
+	 * overflow page's own address. Btree configuration has to prevent the
+	 * problem by rejecting internal page sizes that are too small for the
+	 * largest address cookie the block manager might return.
+	 */
+	addr_kv = &r->v;
+
+	/*
+	 * There's no need to copy the address bytes, reference the caller's
+	 * memory from the buffer's data/length fields instead.
+	 */
+	addr_kv->buf.size = size;
+	addr_kv->buf.data = addr;
+
+	addr_kv->cell_len = __wt_cell_pack_addr(
+	    &addr_kv->cell, cell_type, recno, addr_kv->buf.size);
+	addr_kv->len = addr_kv->cell_len + addr_kv->buf.size;
+}
+
+/*
+ * __rec_cell_build_val --
+ *	Build, in r->v, the cell and byte string for a data item to be stored
+ *	on the page.
+ */
+static int
+__rec_cell_build_val(WT_SESSION_IMPL *session,
+    WT_RECONCILE *r, const void *data, size_t size, uint64_t rle)
+{
+	WT_BTREE *btree;
+	WT_KV *kv;
+
+	btree = S2BT(session);
+	kv = &r->v;
+
+	/*
+	 * There's no need to copy the value bytes, reference the caller's
+	 * memory from the buffer's data/length fields instead.
+	 */
+	kv->buf.data = data;
+	kv->buf.size = size;
+
+	/* Zero-length values skip both encoding and the overflow check. */
+	if (size != 0 && btree->huffman_value != NULL)
+		WT_RET(__wt_huffman_encode(session, btree->huffman_value,
+		    kv->buf.data, (uint32_t)kv->buf.size, &kv->buf));
+
+	/* Values too large for the page are stored as overflow items. */
+	if (size != 0 && kv->buf.size > btree->maxleafitem) {
+		WT_STAT_FAST_DATA_INCR(session, rec_overflow_value);
+
+		return (__rec_cell_build_ovfl(
+		    session, r, kv, WT_CELL_VALUE_OVFL, rle));
+	}
+
+	kv->cell_len = __wt_cell_pack_data(&kv->cell, rle, kv->buf.size);
+	kv->len = kv->cell_len + kv->buf.size;
+
+	return (0);
+}
+
+/*
+ * __rec_cell_build_ovfl --
+ *	Write an item as an overflow record (or find a matching record already
+ *	written), then replace the caller's K/V with the record's address
+ *	cookie and build the overflow cell.
+ */
+static int
+__rec_cell_build_ovfl(WT_SESSION_IMPL *session,
+    WT_RECONCILE *r, WT_KV *kv, uint8_t type, uint64_t rle)
+{
+	WT_BM *block_mgr;
+	WT_BTREE *btree;
+	WT_DECL_ITEM(scratch);
+	WT_DECL_RET;
+	WT_PAGE *page;
+	WT_PAGE_HEADER *hdr;
+	size_t len;
+	uint8_t *cookie, cookie_buf[WT_BTREE_MAX_ADDR_COOKIE];
+
+	page = r->page;
+	btree = S2BT(session);
+	block_mgr = btree->bm;
+
+	/* Remember that this page references overflow items. */
+	r->ovfl_items = 1;
+
+	/*
+	 * If a matching overflow record was already written, reuse its address;
+	 * if not, write a new record.
+	 */
+	if (!__wt_ovfl_reuse_search(
+	    session, page, &cookie, &len, kv->buf.data, kv->buf.size)) {
+		/* Get a scratch buffer sized for the block manager's write. */
+		len = kv->buf.size;
+		WT_RET(block_mgr->write_size(block_mgr, session, &len));
+		WT_RET(__wt_scr_alloc(session, len, &scratch));
+
+		/* Lay out a zeroed page header followed by the item's bytes. */
+		hdr = scratch->mem;
+		memset(hdr, 0, WT_PAGE_HEADER_SIZE);
+		hdr->type = WT_PAGE_OVFL;
+		hdr->u.datalen = (uint32_t)kv->buf.size;
+		memcpy(WT_PAGE_HEADER_BYTE(btree, hdr),
+		    kv->buf.data, kv->buf.size);
+		hdr->mem_size = scratch->size =
+		    WT_PAGE_HEADER_BYTE_SIZE(btree) + (uint32_t)kv->buf.size;
+
+		/* Write it out, collecting the address cookie. */
+		cookie = cookie_buf;
+		WT_ERR(__wt_bt_write(session, scratch, cookie, &len, 0, 0));
+
+		/*
+		 * Track the new record so it can be reused later (unless this
+		 * is a bulk load, which by definition never reuses a record).
+		 */
+		if (!r->is_bulk_load)
+			WT_ERR(__wt_ovfl_reuse_add(session, page,
+			    cookie, len, kv->buf.data, kv->buf.size));
+	}
+
+	/* The caller's K/V now holds the overflow record's address. */
+	WT_ERR(__wt_buf_set(session, &kv->buf, cookie, len));
+
+	/* Pack the overflow cell. */
+	kv->cell_len = __wt_cell_pack_ovfl(&kv->cell, type, rle, kv->buf.size);
+	kv->len = kv->cell_len + kv->buf.size;
+
+err:	__wt_scr_free(&scratch);
+	return (ret);
+}
+
+/*
+ * The dictionary --
+ *	Everything from here to the end of the file supports dictionaries.
+ *
+ * Generic skiplist functions are hard to write without either splitting one
+ * memory allocation into two or replacing a simple comparison with a function
+ * call. Skiplists are simple enough that we include them in-place instead. If
+ * you need generic skip-list functions to modify, this set is a reasonable
+ * starting point.
+ *
+ * __rec_dictionary_skip_search --
+ *	Look up a hash value in a dictionary skiplist, returning a matching
+ *	entry or NULL.
+ */
+static WT_DICTIONARY *
+__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash)
+{
+	WT_DICTIONARY **slot;
+	int level;
+
+	/*
+	 * Begin at the top level; at each level move forward as far as we can,
+	 * then step down a level.
+	 */
+	level = WT_SKIP_MAXDEPTH - 1;
+	slot = &head[level];
+	while (level >= 0) {
+		if (*slot != NULL) {
+			/*
+			 * Any exact match will do, regardless of the level at
+			 * which it was found.
+			 */
+			if ((*slot)->hash == hash)
+				return (*slot);
+
+			/* Smaller hash: keep going at this level. */
+			if ((*slot)->hash < hash) {
+				slot = &(*slot)->next[level];
+				continue;
+			}
+		}
+
+		/* Empty level or larger hash: drop down a level. */
+		--level;
+		--slot;
+	}
+	return (NULL);
+}
+
+/*
+ * __rec_dictionary_skip_search_stack --
+ *	Walk a dictionary skiplist for a hash value, filling in the per-level
+ *	stack of link pointers used for insert/remove.
+ */
+static void
+__rec_dictionary_skip_search_stack(
+    WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash)
+{
+	WT_DICTIONARY **slot;
+	int level;
+
+	/*
+	 * Begin at the top level; at each level move forward as far as we can,
+	 * then record the position and step down a level.
+	 */
+	level = WT_SKIP_MAXDEPTH - 1;
+	slot = &head[level];
+	while (level >= 0) {
+		if (*slot != NULL && (*slot)->hash <= hash) {
+			/* Keep going at this level. */
+			slot = &(*slot)->next[level];
+			continue;
+		}
+
+		/* Remember where we stopped, then drop down a level. */
+		stack[level] = slot;
+		--level;
+		--slot;
+	}
+}
+
+/*
+ * __rec_dictionary_skip_insert --
+ *	Link an entry into the dictionary skip-list at every level up to the
+ *	entry's depth.
+ */
+static void
+__rec_dictionary_skip_insert(
+    WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash)
+{
+	WT_DICTIONARY **links[WT_SKIP_MAXDEPTH];
+	u_int level;
+
+	/* Find the insert position at each level. */
+	__rec_dictionary_skip_search_stack(head, links, hash);
+
+	/* Splice the entry in, level by level, starting at the bottom. */
+	level = 0;
+	while (level < e->depth) {
+		e->next[level] = *links[level];
+		*links[level] = e;
+		++level;
+	}
+}
+
+/*
+ * __rec_dictionary_init --
+ *	Discard any existing dictionary, then allocate one with the requested
+ *	number of slots, choosing a skiplist depth for each entry.
+ */
+static int
+__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots)
+{
+	u_int entry_depth, slot;
+
+	/* Start from scratch. */
+	__rec_dictionary_free(session, r);
+
+	/* The array of entry pointers. */
+	r->dictionary_slots = slots;
+	WT_RET(__wt_calloc(session,
+	    r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary));
+
+	/* Each entry is allocated with room for its own forward pointers. */
+	slot = 0;
+	while (slot < r->dictionary_slots) {
+		entry_depth = __wt_skip_choose_depth(session);
+		WT_RET(__wt_calloc(session, 1,
+		    sizeof(WT_DICTIONARY) +
+		    entry_depth * sizeof(WT_DICTIONARY *),
+		    &r->dictionary[slot]));
+		r->dictionary[slot]->depth = entry_depth;
+		++slot;
+	}
+	return (0);
+}
+
+/*
+ * __rec_dictionary_free --
+ *	Release the dictionary's entries and the array that holds them.
+ */
+static void
+__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+	u_int slot;
+
+	if (r->dictionary != NULL) {
+		/*
+		 * dictionary_slots isn't corrected if allocation fails part
+		 * way through; that's fine, each slot holds either NULL or
+		 * memory that needs to be free'd.
+		 */
+		slot = 0;
+		while (slot < r->dictionary_slots) {
+			__wt_free(session, r->dictionary[slot]);
+			++slot;
+		}
+		__wt_free(session, r->dictionary);
+	}
+}
+
+/*
+ * __rec_dictionary_reset --
+ *	Empty the dictionary; done when reconciliation restarts and whenever a
+ *	page boundary (a potential split) is crossed.
+ */
+static void
+__rec_dictionary_reset(WT_RECONCILE *r)
+{
+	/* Nothing to do if no dictionary is configured. */
+	if (r->dictionary_slots == 0)
+		return;
+
+	memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
+	r->dictionary_next = 0;
+}
+
+/*
+ * __rec_dictionary_lookup --
+ *	Search the dictionary for a value matching one already on this page.
+ *	On return *dpp references the matching entry, a newly claimed entry, or
+ *	is NULL if there was no match and the dictionary is full.
+ */
+static int
+__rec_dictionary_lookup(
+    WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val, WT_DICTIONARY **dpp)
+{
+	WT_DICTIONARY *entry, *fresh;
+	uint64_t hash;
+	int match;
+
+	*dpp = NULL;
+
+	/*
+	 * Hash the value and walk every entry with that hash, returning the
+	 * first one whose cell matches.
+	 */
+	hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
+	entry = __rec_dictionary_skip_search(r->dictionary_head, hash);
+	while (entry != NULL && entry->hash == hash) {
+		WT_RET(__wt_cell_pack_data_match(
+		    entry->cell, &val->cell, val->buf.data, &match));
+		if (match) {
+			WT_STAT_FAST_DATA_INCR(session, rec_dictionary);
+			*dpp = entry;
+			return (0);
+		}
+		entry = entry->next[0];
+	}
+
+	/*
+	 * There is no value replacement in the dictionary: once the slots run
+	 * out we stop adding entries (existing ones remain usable). There's no
+	 * obvious reason a recently seen leaf page value is more likely to
+	 * recur than any other value; if a working set shows otherwise, it
+	 * shouldn't be hard to maintain a pointer to the next slot to re-use.
+	 */
+	if (r->dictionary_next >= r->dictionary_slots)
+		return (0);
+
+	/*
+	 * Claim the next slot and record the hash. The cell location is filled
+	 * in when the value is written into the page's disk image buffer, as
+	 * that's when its position on the page is known.
+	 */
+	fresh = r->dictionary[r->dictionary_next++];
+	fresh->hash = hash;
+	fresh->cell = NULL;		/* Not necessary, just cautious. */
+	__rec_dictionary_skip_insert(r->dictionary_head, fresh, hash);
+	*dpp = fresh;
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c
new file mode 100644
index 00000000000..308bc1f0dc5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_key.c
@@ -0,0 +1,500 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __inmem_row_leaf_slots(uint8_t *, uint32_t, uint32_t, uint32_t);
+
+/*
+ * __wt_row_leaf_keys --
+ *	Instantiate the interesting keys for random search of a page.
+ *
+ *	Returns 0 on success, otherwise an error from a helper. The page's
+ *	WT_PAGE_BUILD_KEYS flag is only set on success.
+ */
+int
+__wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_BTREE *btree;
+	WT_DECL_ITEM(key);
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	WT_ROW *rip;
+	uint32_t gap, i;
+
+	btree = S2BT(session);
+
+	if (page->pg_row_entries == 0) {	/* Just checking... */
+		F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
+		return (0);
+	}
+
+	/*
+	 * Row-store leaf pages are written as one big prefix-compressed chunk,
+	 * that is, only the first key on the page is not prefix-compressed, and
+	 * to instantiate the last key on the page, you have to take the first
+	 * key on the page and roll it forward to the end of the page. We don't
+	 * want to do that on every page access, of course, so we instantiate a
+	 * set of keys, essentially creating prefix chunks on the page, where we
+	 * can roll forward from the closest, previous, instantiated key. The
+	 * complication is that not all keys on a page are equal: we're doing a
+	 * binary search on the page, which means there are keys we look at a
+	 * lot (every time we search the page), and keys we never look at unless
+	 * they are actually being searched for. This function figures out the
+	 * "interesting" keys on a page, and then we sequentially walk that list
+	 * instantiating those keys.
+	 *
+	 * Allocate a bit array and figure out the set of "interesting" keys,
+	 * marking up the array.
+	 *
+	 * Once the first scratch buffer is held, every failure must go through
+	 * the error label so the buffer is released.
+	 */
+	WT_RET(__wt_scr_alloc(session, 0, &key));
+	WT_ERR(__wt_scr_alloc(session,
+	    (uint32_t)__bitstr_size(page->pg_row_entries), &tmp));
+
+	/*
+	 * The bit array is only ever written with bit-set operations, clear
+	 * it first so stale scratch-buffer contents aren't read as marked
+	 * slots.
+	 */
+	memset(tmp->mem, 0, __bitstr_size(page->pg_row_entries));
+
+	if ((gap = btree->key_gap) == 0)
+		gap = 1;
+	__inmem_row_leaf_slots(tmp->mem, 0, page->pg_row_entries, gap);
+
+	/* Instantiate the keys. */
+	for (rip = page->pg_row_d, i = 0; i < page->pg_row_entries; ++rip, ++i)
+		if (__bit_test(tmp->mem, i))
+			WT_ERR(__wt_row_leaf_key_work(
+			    session, page, rip, key, 1));
+
+	F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
+
+err:	__wt_scr_free(&key);
+	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __inmem_row_leaf_slots --
+ * Figure out the interesting slots of a page for random search, up to
+ * the specified depth.
+ *
+ * list is the bit array being marked up, base is the first slot of the
+ * range being considered, entries is the number of slots in that range
+ * and gap is the smallest range worth subdividing: the recursion stops
+ * as soon as a range holds fewer than gap entries. For each range, the
+ * bit for the middle slot (base + entries / 2) is set, then the ranges
+ * to the left and to the right of that slot are processed the same way.
+ */
+static void
+__inmem_row_leaf_slots(
+ uint8_t *list, uint32_t base, uint32_t entries, uint32_t gap)
+{
+ uint32_t indx, limit;
+
+ if (entries < gap)
+ return;
+
+ /*
+ * !!!
+ * Don't clean this code up -- it deliberately looks like the binary
+ * search code.
+ *
+ * !!!
+ * There's got to be a function that would give me this information, but
+ * I don't see any performance reason we can't just do this recursively.
+ */
+ limit = entries;
+ indx = base + (limit >> 1);
+ __bit_set(list, indx);
+
+ __inmem_row_leaf_slots(list, base, limit >> 1, gap); /* Left half. */
+
+ base = indx + 1;
+ --limit;
+ __inmem_row_leaf_slots(list, base, limit >> 1, gap); /* Right half. */
+}
+
+/*
+ * __wt_row_leaf_key_copy --
+ *	Return a row-store leaf-page key in memory owned by the caller's
+ *	buffer.
+ */
+int
+__wt_row_leaf_key_copy(
+    WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key)
+{
+	WT_RET(__wt_row_leaf_key(session, page, rip, key, 0));
+
+	/* If the bytes already live in the buffer's memory, we're done. */
+	if (WT_DATA_IN_ITEM(key))
+		return (0);
+
+	/* The buffer only references the key, take a copy of it. */
+	return (__wt_buf_set(session, key, key->data, key->size));
+}
+
+/*
+ * __wt_row_leaf_key_work --
+ * Return a reference to, a row-store leaf-page key, optionally instantiate
+ * the key into the in-memory page.
+ */
+int
+__wt_row_leaf_key_work(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate)
+{
+ enum { FORWARD, BACKWARD } direction;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_ROW *rip, *jump_rip;
+ size_t size;
+ u_int last_prefix;
+ int jump_slot_offset, slot_offset;
+ void *copy;
+ const void *p;
+
+ /*
+ * !!!
+ * It is unusual to call this function: most code should be calling the
+ * front-end, __wt_row_leaf_key, be careful if you're calling this code
+ * directly.
+ */
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ rip = rip_arg;
+
+ jump_rip = NULL;
+ jump_slot_offset = 0;
+ last_prefix = 0;
+
+ p = NULL; /* -Werror=maybe-uninitialized */
+ size = 0; /* -Werror=maybe-uninitialized */
+
+ direction = BACKWARD;
+ for (slot_offset = 0;;) {
+ if (0) {
+switch_and_jump: /* Switching to a forward roll. */
+ WT_ASSERT(session, direction == BACKWARD);
+ direction = FORWARD;
+
+ /* Skip list of keys with compatible prefixes. */
+ rip = jump_rip;
+ slot_offset = jump_slot_offset;
+ }
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * Figure out what the key looks like.
+ */
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, &p, &size);
+
+ /* 1: the test for a directly referenced on-page key. */
+ if (cell == NULL) {
+ keyb->data = p;
+ keyb->size = size;
+
+ /*
+ * If this is the key we originally wanted, we don't
+ * care if we're rolling forward or backward, or if
+ * it's an overflow key or not, it's what we wanted.
+ * This shouldn't normally happen, the fast-path code
+ * that front-ends this function will have figured it
+ * out before we were called.
+ *
+ * The key doesn't need to be instantiated, skip past
+ * that test.
+ */
+ if (slot_offset == 0)
+ goto done;
+
+ /*
+ * This key is not an overflow key by definition and
+ * isn't compressed in any way, we can use it to roll
+ * forward.
+ * If rolling backward, switch directions.
+ * If rolling forward: there's a bug somewhere,
+ * we should have hit this key when rolling backward.
+ */
+ goto switch_and_jump;
+ }
+
+ /* 2: the test for an instantiated off-page key. */
+ if (ikey != NULL) {
+ /*
+ * If this is the key we originally wanted, we don't
+ * care if we're rolling forward or backward, or if
+ * it's an overflow key or not, it's what we wanted.
+ * Take a copy and wrap up.
+ *
+ * The key doesn't need to be instantiated, skip past
+ * that test.
+ */
+ if (slot_offset == 0) {
+ keyb->data = p;
+ keyb->size = size;
+ goto done;
+ }
+
+ /*
+ * If we wanted a different key and this key is an
+ * overflow key:
+ * If we're rolling backward, this key is useless
+ * to us because it doesn't have a valid prefix: keep
+ * rolling backward.
+ * If we're rolling forward, there's no work to be
+ * done because prefixes skip overflow keys: keep
+ * rolling forward.
+ */
+ if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL)
+ goto next;
+
+ /*
+ * If we wanted a different key and this key is not an
+ * overflow key, it has a valid prefix, we can use it.
+ * If rolling backward, take a copy of the key and
+ * switch directions, we can roll forward from this key.
+ * If rolling forward, replace the key we've been
+ * building with this key, it's what we would have built
+ * anyway.
+ * In short: if it's not an overflow key, take a copy
+ * and roll forward.
+ */
+ keyb->data = p;
+ keyb->size = size;
+ direction = FORWARD;
+ goto next;
+ }
+
+ /*
+ * It must be an on-page cell, unpack it.
+ */
+ __wt_cell_unpack(cell, unpack);
+
+ /* 3: the test for an on-page reference to an overflow key. */
+ if (unpack->type == WT_CELL_KEY_OVFL) {
+ /*
+ * If this is the key we wanted from the start, we don't
+ * care if it's an overflow key, get a copy and wrap up.
+ *
+ * Avoid racing with reconciliation deleting overflow
+ * keys. Deleted overflow keys must be instantiated
+ * first, acquire the overflow lock and check. Read
+ * the key if we still need to do so, but holding the
+ * overflow lock. Note we are not using the version of
+ * the cell-data-ref calls that acquire the overflow
+ * lock and do a look-aside into the tracking cache:
+ * this is an overflow key, not a value, meaning it's
+ * instantiated before being deleted, not copied into
+ * the tracking cache.
+ */
+ if (slot_offset == 0) {
+ WT_ERR(
+ __wt_readlock(session, btree->ovfl_lock));
+ copy = WT_ROW_KEY_COPY(rip);
+ if (!__wt_row_leaf_key_info(page, copy,
+ NULL, &cell, &keyb->data, &keyb->size)) {
+ __wt_cell_unpack(cell, unpack);
+ ret = __wt_dsk_cell_data_ref(session,
+ WT_PAGE_ROW_LEAF, unpack, keyb);
+ }
+ WT_TRET(
+ __wt_readunlock(session, btree->ovfl_lock));
+ WT_ERR(ret);
+ break;
+ }
+
+ /*
+ * If we wanted a different key:
+ * If we're rolling backward, this key is useless
+ * to us because it doesn't have a valid prefix: keep
+ * rolling backward.
+ * If we're rolling forward, there's no work to be
+ * done because prefixes skip overflow keys: keep
+ * rolling forward.
+ */
+ goto next;
+ }
+
+ /*
+ * 4: the test for an on-page reference to a key that isn't
+ * prefix compressed.
+ */
+ if (unpack->prefix == 0) {
+ /*
+ * The only reason to be here is a Huffman encoded key,
+ * a non-encoded key with no prefix compression should
+ * have been directly referenced, and we should not have
+ * needed to unpack its cell.
+ */
+ WT_ASSERT(session, btree->huffman_key != NULL);
+
+ /*
+ * If this is the key we originally wanted, we don't
+ * care if we're rolling forward or backward, it's
+ * what we want. Take a copy and wrap up.
+ *
+ * If we wanted a different key, this key has a valid
+ * prefix, we can use it.
+ * If rolling backward, take a copy of the key and
+ * switch directions, we can roll forward from this key.
+ * If rolling forward there's a bug, we should have
+ * found this key while rolling backwards and switched
+ * directions then.
+ *
+ * The key doesn't need to be instantiated, skip past
+ * that test.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, unpack, keyb));
+ if (slot_offset == 0)
+ goto done;
+ goto switch_and_jump;
+ }
+
+ /*
+ * 5: an on-page reference to a key that's prefix compressed.
+ * If rolling backward, keep looking for something we can
+ * use.
+ * If rolling forward, build the full key and keep rolling
+ * forward.
+ */
+ if (direction == BACKWARD) {
+ /*
+ * If there's a set of keys with identical prefixes, we
+ * don't want to instantiate each one, the prefixes are
+ * all the same.
+ *
+ * As we roll backward through the page, track the last
+ * time the prefix decreased in size, so we can start
+ * with that key during our roll-forward. For a page
+ * populated with a single key prefix, we'll be able to
+ * instantiate the key we want as soon as we find a key
+ * without a prefix.
+ */
+ if (slot_offset == 0)
+ last_prefix = unpack->prefix;
+ if (slot_offset == 0 || last_prefix > unpack->prefix) {
+ jump_rip = rip;
+ jump_slot_offset = slot_offset;
+ last_prefix = unpack->prefix;
+ }
+ }
+ if (direction == FORWARD) {
+ /*
+ * Get a reference to the current key's bytes. Usually
+ * we want bytes from the page, fast-path that case.
+ */
+ if (btree->huffman_key == NULL) {
+ p = unpack->data;
+ size = unpack->size;
+ } else {
+ if (tmp == NULL)
+ WT_ERR(
+ __wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, unpack, tmp));
+ p = tmp->data;
+ size = tmp->size;
+ }
+
+ /*
+ * Grow the buffer as necessary as well as ensure data
+ * has been copied into local buffer space, then append
+ * the suffix to the prefix already in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy data we
+ * don't need, truncate the item's data length to the
+ * prefix bytes.
+ */
+ keyb->size = unpack->prefix;
+ WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size));
+ memcpy((uint8_t *)keyb->data + keyb->size, p, size);
+ keyb->size += size;
+
+ if (slot_offset == 0)
+ break;
+ }
+
+next: switch (direction) {
+ case BACKWARD:
+ --rip;
+ ++slot_offset;
+ break;
+ case FORWARD:
+ ++rip;
+ --slot_offset;
+ break;
+ }
+ }
+
+ /*
+ * Optionally instantiate the key: there's a cost to figuring out a key
+ * value in a leaf page with prefix-compressed or Huffman encoded keys,
+ * amortize the cost by instantiating a copy of the calculated key in
+ * allocated memory. We don't instantiate keys when pages are first
+ * brought into memory because it's wasted effort if the page is only
+ * read by a cursor in sorted order. If, instead, the page is read by a
+ * cursor in reverse order, we immediately instantiate periodic keys for
+ * the page (otherwise the reverse walk would be insanely slow). If,
+ * instead, the page is randomly searched, we instantiate keys as they
+ * are accessed (meaning, for example, as long as the binary search only
+ * touches one-half of the page, the only keys we instantiate will be in
+ * that half of the page).
+ */
+ if (instantiate) {
+ copy = WT_ROW_KEY_COPY(rip_arg);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, NULL, NULL);
+ if (ikey == NULL) {
+ WT_ERR(__wt_row_ikey(session,
+ WT_PAGE_DISK_OFFSET(page, cell),
+ keyb->data, keyb->size, &ikey));
+
+ /*
+ * Serialize the swap of the key into place: on success,
+ * update the page's memory footprint, on failure, free
+ * the allocated memory.
+ */
+ if (WT_ATOMIC_CAS8(WT_ROW_KEY_COPY(rip), copy, ikey))
+ __wt_cache_page_inmem_incr(session,
+ page, sizeof(WT_IKEY) + ikey->size);
+ else
+ __wt_free(session, ikey);
+ }
+ }
+
+done:
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_row_ikey_incr --
+ *	Build a WT_IKEY copy of a key and charge its size to the page's
+ * in-memory footprint.
+ */
+int
+__wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page,
+    uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
+{
+	int ret;
+
+	/* Only account for the memory once the key has been instantiated. */
+	if ((ret = __wt_row_ikey(session, cell_offset, key, size, ikeyp)) != 0)
+		return (ret);
+
+	__wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size);
+	return (0);
+}
+
+/*
+ * __wt_row_ikey --
+ *	Allocate a WT_IKEY structure and copy the caller's key into it.
+ */
+int
+__wt_row_ikey(WT_SESSION_IMPL *session,
+    uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
+{
+	WT_IKEY *new_ikey;
+
+	/* A single allocation has room for the structure and the key bytes. */
+	WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &new_ikey));
+
+	new_ikey->cell_offset = cell_offset;
+	new_ikey->size = WT_STORE_SIZE(size);
+	memcpy(WT_IKEY_DATA(new_ikey), key, size);
+
+	/* The caller hands in the address of its WT_IKEY pointer as a void *. */
+	*(WT_IKEY **)ikeyp = new_ikey;
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
new file mode 100644
index 00000000000..e0036d14cbb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -0,0 +1,346 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_modify_alloc --
+ *	Allocate a modification structure and try to install it on the page.
+ */
+int
+__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_PAGE_MODIFY *mod;
+
+	WT_RET(__wt_calloc_def(session, 1, &mod));
+
+	/*
+	 * Choose the page's lock by bumping a connection-wide counter. The
+	 * counter update isn't synchronized, the swap below is what keeps
+	 * things from racing too badly.
+	 */
+	conn = S2C(session);
+	mod->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS(conn);
+
+	/*
+	 * More than one thread may decide the page needs a modify structure.
+	 * If another thread installed one first, throw ours away: the work
+	 * is already done.
+	 */
+	if (!WT_ATOMIC_CAS8(page->modify, NULL, mod)) {
+		__wt_free(session, mod);
+		return (0);
+	}
+
+	/* Our structure is in use, add it to the page's memory footprint. */
+	__wt_cache_page_inmem_incr(session, page, sizeof(*mod));
+	return (0);
+}
+
+/*
+ * __wt_row_modify --
+ * Row-store insert, update and delete.
+ *
+ * The cursor must already be positioned: when cbt->compare is 0 the
+ * operation updates the matched on-page slot (or the matched WT_INSERT if
+ * cbt->ins is set), otherwise a new WT_INSERT is added to the insert list
+ * chosen from cbt->slot (or the smallest-key list if WT_CBT_SEARCH_SMALLEST
+ * is set).
+ *
+ * key: the key to insert; only read on the insert path.
+ * value: the new value; ignored (treated as NULL) when is_remove is set.
+ * upd: if NULL, a WT_UPDATE is allocated from value and registered with
+ * the transaction; if non-NULL, the caller's update list is installed
+ * as-is and nothing is logged.
+ * is_remove: non-zero for a remove.
+ *
+ * Returns 0 on success. On error after the initial setup, the err path
+ * undoes the transaction registration (if one was made), frees the
+ * WT_INSERT and WT_UPDATE and clears cbt->ins.
+ */
+int
+__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+{
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head, **ins_headp;
+ WT_PAGE *page;
+ WT_UPDATE *old_upd, **upd_entry;
+ size_t ins_size, upd_size;
+ uint32_t ins_slot;
+ u_int i, skipdepth;
+ int logged;
+
+ ins = NULL;
+ page = cbt->ref->page;
+ logged = 0;
+
+ /* This code expects a remove to have a NULL value. */
+ if (is_remove)
+ value = NULL;
+
+ /* If we don't yet have a modify structure, we'll need one. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ /*
+ * Modify: allocate an update array as necessary, build a WT_UPDATE
+ * structure, and call a serialized function to insert the WT_UPDATE
+ * structure.
+ *
+ * Insert: allocate an insert array as necessary, build a WT_INSERT
+ * and WT_UPDATE structure pair, and call a serialized function to
+ * insert the WT_INSERT structure.
+ */
+ if (cbt->compare == 0) {
+ if (cbt->ins == NULL) {
+ /* Allocate an update array as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page,
+ page->pg_row_upd, upd_entry, page->pg_row_entries);
+
+ /* Set the WT_UPDATE array reference. */
+ upd_entry = &page->pg_row_upd[cbt->slot];
+ } else
+ upd_entry = &cbt->ins->upd;
+
+ if (upd == NULL) {
+ /* Make sure the update can proceed. */
+ WT_ERR(__wt_txn_update_check(
+ session, old_upd = *upd_entry));
+
+ /* Allocate a WT_UPDATE structure and transaction ID. */
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid WT_CURSOR.update data copy. */
+ cbt->modify_update = upd;
+ } else {
+ upd_size = sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ /*
+ * We are restoring updates that couldn't be evicted,
+ * there should only be one update list per key.
+ */
+ WT_ASSERT(session, *upd_entry == NULL);
+ /*
+ * Set the "old" entry to the second update in the list
+ * so that the serialization function succeeds in
+ * swapping the first update into place.
+ */
+ old_upd = *upd_entry = upd->next;
+ }
+
+ /*
+ * Point the new WT_UPDATE item to the next element in the list.
+ * If we get it right, the serialization function lock acts as
+ * our memory barrier to flush this write.
+ */
+ upd->next = old_upd;
+
+ /* Serialize the update. */
+ WT_ERR(__wt_update_serial(
+ session, page, upd_entry, &upd, upd_size));
+ } else {
+ /*
+ * Allocate the insert array as necessary.
+ *
+ * We allocate an additional insert array slot for insert keys
+ * sorting less than any key on the page. The test to select
+ * that slot is baroque: if the search returned the first page
+ * slot, we didn't end up processing an insert list, and the
+ * comparison value indicates the search key was smaller than
+ * the returned slot, then we're using the smallest-key insert
+ * slot. That's hard, so we set a flag.
+ */
+ WT_PAGE_ALLOC_AND_SWAP(session, page,
+ page->pg_row_ins, ins_headp, page->pg_row_entries + 1);
+
+ ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ?
+ page->pg_row_entries: cbt->slot;
+ ins_headp = &page->pg_row_ins[ins_slot];
+
+ /* Allocate the WT_INSERT_HEAD structure as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
+ ins_head = *ins_headp;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
+ * update the cursor to reference it (the WT_INSERT_HEAD might
+ * be allocated, the WT_INSERT was allocated).
+ */
+ WT_ERR(__wt_row_insert_alloc(
+ session, key, skipdepth, &ins, &ins_size));
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+
+ if (upd == NULL) {
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid WT_CURSOR.update data copy. */
+ cbt->modify_update = upd;
+ } else
+ upd_size = sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ ins->upd = upd;
+ ins_size += upd_size;
+
+ /*
+ * If there was no insert list during the search, the cursor's
+ * information cannot be correct, search couldn't have
+ * initialized it.
+ *
+ * Otherwise, point the new WT_INSERT item's skiplist to the
+ * next elements in the insert list (which we will check are
+ * still valid inside the serialization function).
+ *
+ * The serial mutex acts as our memory barrier to flush these
+ * writes before inserting them into the list.
+ */
+ if (WT_SKIP_FIRST(ins_head) == NULL)
+ for (i = 0; i < skipdepth; i++) {
+ cbt->ins_stack[i] = &ins_head->head[i];
+ ins->next[i] = cbt->next_stack[i] = NULL;
+ }
+ else
+ for (i = 0; i < skipdepth; i++)
+ ins->next[i] = cbt->next_stack[i];
+
+ /* Insert the WT_INSERT structure. */
+ WT_ERR(__wt_insert_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, skipdepth));
+ }
+
+ if (logged)
+ WT_ERR(__wt_txn_log_op(session, cbt));
+
+ /*
+ * Error-only cleanup: the block below is skipped on the normal path and
+ * is only entered through the err label.
+ *
+ * NOTE(review): upd is freed here even when it was supplied by the
+ * caller; confirm that is what callers of the restore path expect.
+ */
+ if (0) {
+err: /*
+ * Remove the update from the current transaction, so we don't
+ * try to modify it on rollback.
+ */
+ if (logged)
+ __wt_txn_unmodify(session);
+ __wt_free(session, ins);
+ cbt->ins = NULL;
+ __wt_free(session, upd);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_row_insert_alloc --
+ *	Row-store insert: build a WT_INSERT structure holding a copy of the
+ * key.
+ */
+int
+__wt_row_insert_alloc(WT_SESSION_IMPL *session,
+    WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+{
+	WT_INSERT *new_ins;
+	size_t key_offset, total_size;
+
+	/*
+	 * One allocation covers the WT_INSERT structure, a skiplist next
+	 * pointer for each level, and the key bytes.
+	 */
+	key_offset = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
+	total_size = key_offset + key->size;
+	WT_RET(__wt_calloc(session, 1, total_size, &new_ins));
+
+	/* Record the key's offset and length, then copy the key into place. */
+	new_ins->u.key.offset = WT_STORE_SIZE(key_offset);
+	WT_INSERT_KEY_SIZE(new_ins) = WT_STORE_SIZE(key->size);
+	memcpy(WT_INSERT_KEY(new_ins), key->data, key->size);
+
+	/* Reporting the allocation size is optional. */
+	if (ins_sizep != NULL)
+		*ins_sizep = total_size;
+	*insp = new_ins;
+	return (0);
+}
+
+/*
+ * __wt_update_alloc --
+ *	Build a WT_UPDATE structure from a value; a NULL value creates a
+ * deleted update.
+ */
+int
+__wt_update_alloc(
+    WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
+{
+	WT_UPDATE *new_upd;
+	size_t data_size;
+
+	/* A delete carries no data. */
+	data_size = 0;
+	if (value != NULL)
+		data_size = value->size;
+
+	/* One allocation holds the structure and any value bytes. */
+	WT_RET(__wt_calloc(
+	    session, 1, sizeof(WT_UPDATE) + data_size, &new_upd));
+
+	if (value != NULL) {
+		new_upd->size = WT_STORE_SIZE(data_size);
+		memcpy(WT_UPDATE_DATA(new_upd), value->data, data_size);
+	} else {
+		WT_UPDATE_DELETED_SET(new_upd);
+	}
+
+	/* Report the number of bytes allocated along with the update. */
+	*sizep = sizeof(WT_UPDATE) + data_size;
+	*updp = new_upd;
+	return (0);
+}
+
+/*
+ * __wt_update_obsolete_check --
+ *	Detach the obsolete tail of an update list, returning it to the
+ * caller (or NULL if nothing was detached).
+ */
+WT_UPDATE *
+__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+	WT_UPDATE *discard, *keep;
+
+	/*
+	 * This routine only cuts obsolete updates off the end of the chain.
+	 * It is called from inside a serialization function, so freeing the
+	 * memory is left to the caller.
+	 *
+	 * Find the first update of the trailing run of globally visible
+	 * updates: a later update that is neither globally visible nor
+	 * aborted restarts the search, aborted updates are skipped over.
+	 */
+	keep = NULL;
+	while (upd != NULL) {
+		if (__wt_txn_visible_all(session, upd->txnid)) {
+			if (keep == NULL)
+				keep = upd;
+		} else if (upd->txnid != WT_TXN_ABORTED)
+			keep = NULL;
+		upd = upd->next;
+	}
+
+	/* Without such an update, or with nothing after it, we're done. */
+	if (keep == NULL)
+		return (NULL);
+	if ((discard = keep->next) == NULL)
+		return (NULL);
+
+	/*
+	 * The update we found must stay, other threads of control end their
+	 * walk on it; only the updates after it can go. Terminate the list
+	 * there and hand back what was cut off, if we won the swap.
+	 */
+	return (WT_ATOMIC_CAS8(keep->next, discard, NULL) ? discard : NULL);
+}
+
+/*
+ * __wt_update_obsolete_free --
+ *	Free a list of obsolete updates and shrink the page's memory
+ * footprint to match.
+ */
+void
+__wt_update_obsolete_free(
+    WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd)
+{
+	WT_UPDATE *following;
+	size_t freed;
+
+	freed = 0;
+	while (upd != NULL) {
+		/* Read everything we need before the update is freed. */
+		following = upd->next;
+
+		/* Deleted items have a dummy size: don't count it. */
+		freed += sizeof(WT_UPDATE);
+		if (!WT_UPDATE_DELETED_ISSET(upd))
+			freed += upd->size;
+
+		__wt_free(session, upd);
+		upd = following;
+	}
+
+	if (freed != 0)
+		__wt_cache_page_inmem_decr(session, page, freed);
+}
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
new file mode 100644
index 00000000000..b190aaaded5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -0,0 +1,553 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_search_insert_append --
+ *	Fast-path check for an append to a row-store insert list: if the
+ * search key sorts at or after the list's last item, build the skiplist
+ * stack for that position and set *donep.
+ */
+static inline int
+__wt_search_insert_append(WT_SESSION_IMPL *session,
+    WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, int *donep)
+{
+	WT_COLLATOR *collator;
+	WT_INSERT *last;
+	WT_INSERT_HEAD *inshead;
+	WT_ITEM last_key;
+	int cmp, level;
+
+	collator = S2BT(session)->collator;
+	*donep = 0;
+
+	/* With nothing on the list, there's nothing to append after. */
+	inshead = cbt->ins_head;
+	if ((last = WT_SKIP_LAST(inshead)) == NULL)
+		return (0);
+
+	last_key.data = WT_INSERT_KEY(last);
+	last_key.size = WT_INSERT_KEY_SIZE(last);
+	WT_RET(__wt_compare(session, collator, srch_key, &last_key, &cmp));
+
+	/* The search key sorts before the last item: not an append. */
+	if (cmp < 0)
+		return (0);
+
+	/*
+	 * !!!
+	 * We may race with another appending thread.
+	 *
+	 * To catch that case, rely on the atomic pointer read above and set
+	 * the next stack to NULL here. If we have raced with another thread,
+	 * one of the next pointers will not be NULL by the time they are
+	 * checked against the next stack inside the serialized insert
+	 * function.
+	 *
+	 * Upper levels link from that level's tail item, or from the list
+	 * head if the level is empty; level 0 always links from the last
+	 * item, and is filled in last.
+	 */
+	for (level = WT_SKIP_MAXDEPTH - 1; level > 0; level--) {
+		if (inshead->tail[level] == NULL)
+			cbt->ins_stack[level] = &inshead->head[level];
+		else
+			cbt->ins_stack[level] =
+			    &inshead->tail[level]->next[level];
+		cbt->next_stack[level] = NULL;
+	}
+	cbt->ins_stack[0] = &last->next[0];
+	cbt->next_stack[0] = NULL;
+
+	cbt->compare = -cmp;
+	cbt->ins = last;
+	*donep = 1;
+	return (0);
+}
+
+/*
+ * __wt_search_insert --
+ * Search a row-store insert list, creating a skiplist stack as we go.
+ *
+ * The list searched is cbt->ins_head. For every skiplist level the
+ * search records the pointer slot the walk stopped at (cbt->ins_stack)
+ * and the item that slot referenced (cbt->next_stack, NULL at the end
+ * of a level).
+ *
+ * On return cbt->compare is the negated result of the last key
+ * comparison (0 on an exact match) and cbt->ins is the item the search
+ * ended on, or the last item compared if the search ran off the end of
+ * the list. Returns 0, or the error from the comparison function.
+ */
+int
+__wt_search_insert(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_INSERT *ins, **insp, *last_ins;
+ WT_INSERT_HEAD *inshead;
+ WT_ITEM key;
+ size_t match, skiphigh, skiplow;
+ int cmp, i;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ inshead = cbt->ins_head;
+ cmp = 0; /* -Wuninitialized */
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ match = skiphigh = skiplow = 0;
+ ins = last_ins = NULL;
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) {
+ if ((ins = *insp) == NULL) {
+ /*
+ * End of this level: record the slot, then step down. The
+ * head and next arrays are indexed by level, so decrementing
+ * the slot pointer moves to the next-lower level of the same
+ * item (or of the list head). After level 0 the decremented
+ * pointer is never used, the loop ends.
+ */
+ cbt->next_stack[i] = NULL;
+ cbt->ins_stack[i--] = insp--;
+ continue;
+ }
+
+ /*
+ * Comparisons may be repeated as we drop down skiplist levels;
+ * don't repeat comparisons, they might be expensive.
+ */
+ if (ins != last_ins) {
+ last_ins = ins;
+ key.data = WT_INSERT_KEY(ins);
+ key.size = WT_INSERT_KEY_SIZE(ins);
+ /*
+ * Start the comparison after the bytes already known to
+ * match both the low and high bounds of the search.
+ */
+ match = WT_MIN(skiplow, skiphigh);
+ WT_RET(__wt_compare_skip(
+ session, collator, srch_key, &key, &cmp, &match));
+ }
+
+ if (cmp > 0) { /* Keep going at this level */
+ insp = &ins->next[i];
+ skiplow = match;
+ } else if (cmp < 0) { /* Drop down a level */
+ cbt->next_stack[i] = ins;
+ cbt->ins_stack[i--] = insp--;
+ skiphigh = match;
+ } else
+ /*
+ * Exact match: fill in the remaining levels from the
+ * matched item's own next pointers, which also ends
+ * the outer loop.
+ */
+ for (; i >= 0; i--) {
+ cbt->next_stack[i] = ins->next[i];
+ cbt->ins_stack[i] = &ins->next[i];
+ }
+ }
+
+ /*
+ * For every insert element we review, we're getting closer to a better
+ * choice; update the compare field to its new value. If we went past
+ * the last item in the list, return the last one: that is used to
+ * decide whether we are positioned in a skiplist.
+ */
+ cbt->compare = -cmp;
+ cbt->ins = (ins != NULL) ? ins : last_ins;
+ return (0);
+}
+
+/*
+ * __wt_row_search --
+ *	Search a row-store tree for a specific key.
+ *
+ * srch_key: the key to find.
+ * leaf: if non-NULL, only this leaf page is searched and no pages are
+ *	acquired or released by this function; if NULL, the search descends
+ *	from the tree's root.
+ * cbt: the cursor to position. On success cbt->ref is the leaf page,
+ *	cbt->slot the WT_ROW slot and cbt->compare the relation of the
+ *	returned position to the search key (0 on an exact match); if an
+ *	insert list was searched, cbt->ins_head, cbt->ins and the skiplist
+ *	stacks are set as well.
+ * insert: non-zero if the search is for an insert, which enables the
+ *	append fast-paths.
+ *
+ * Returns 0 on success. On error, a page acquired during a descent from
+ * the root is released before returning.
+ */
+int
+__wt_row_search(WT_SESSION_IMPL *session,
+    WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert)
+{
+	WT_BTREE *btree;
+	WT_COLLATOR *collator;
+	WT_DECL_RET;
+	WT_ITEM *item;
+	WT_PAGE *page;
+	WT_PAGE_INDEX *pindex;
+	WT_REF *current, *descent;
+	WT_ROW *rip;
+	size_t match, skiphigh, skiplow;
+	uint32_t base, indx, limit;
+	int append_check, cmp, depth, descend_right, done;
+
+	btree = S2BT(session);
+	collator = btree->collator;
+	item = &cbt->search_key;
+
+	__cursor_pos_clear(cbt);
+
+	/*
+	 * The row-store search routine uses a different comparison API.
+	 * The assumption is we're comparing more than a few keys with
+	 * matching prefixes, and it's a win to avoid the memory fetches
+	 * by skipping over those prefixes. That's done by tracking the
+	 * length of the prefix match for the lowest and highest keys we
+	 * compare as we descend the tree.
+	 */
+	skiphigh = skiplow = 0;
+
+	/*
+	 * If a cursor repeatedly appends to the tree, compare the search key
+	 * against the last key on each internal page during insert before
+	 * doing the full binary search.
+	 *
+	 * Track if the descent is to the right-side of the tree, used to set
+	 * the cursor's append history.
+	 */
+	append_check = insert && cbt->append_tree;
+	descend_right = 1;
+
+	/*
+	 * In the service of eviction splits, we're only searching a single leaf
+	 * page, not a full tree.
+	 */
+	if (leaf != NULL) {
+		current = leaf;
+		goto leaf_only;
+	}
+
+	/* Search the internal pages of the tree. */
+	cmp = -1;
+	current = &btree->root;
+	for (depth = 2;; ++depth) {
+restart:	page = current->page;
+		if (page->type != WT_PAGE_ROW_INT)
+			break;
+
+		pindex = WT_INTL_INDEX_COPY(page);
+
+		/*
+		 * Fast-path internal pages with one child, a common case for
+		 * the root page in new trees.
+		 */
+		if (pindex->entries == 1) {
+			descent = pindex->index[0];
+			goto descend;
+		}
+
+		/* Fast-path appends. */
+		if (append_check) {
+			descent = pindex->index[pindex->entries - 1];
+			__wt_ref_key(page, descent, &item->data, &item->size);
+			WT_ERR(__wt_compare(
+			    session, collator, srch_key, item, &cmp));
+			if (cmp >= 0)
+				goto descend;
+
+			/* A failed append check turns off append checks. */
+			append_check = 0;
+		}
+
+		/*
+		 * Binary search of the internal page. There are two versions
+		 * (a default loop and an application-specified collation loop),
+		 * because moving the collation test and error handling inside
+		 * the loop costs about 5%.
+		 *
+		 * The 0th key on an internal page is a problem for a couple of
+		 * reasons. First, we have to force the 0th key to sort less
+		 * than any application key, so internal pages don't have to be
+		 * updated if the application stores a new, "smallest" key in
+		 * the tree. Second, reconciliation is aware of this and will
+		 * store a byte of garbage in the 0th key, so the comparison of
+		 * an application key and a 0th key is meaningless (but doing
+		 * the comparison could still incorrectly modify our tracking
+		 * of the leading bytes in each key that we can skip during the
+		 * comparison). For these reasons, skip the 0th key.
+		 */
+		base = 1;
+		limit = pindex->entries - 1;
+		if (collator == NULL)
+			for (; limit != 0; limit >>= 1) {
+				indx = base + (limit >> 1);
+				descent = pindex->index[indx];
+				__wt_ref_key(
+				    page, descent, &item->data, &item->size);
+
+				match = WT_MIN(skiplow, skiphigh);
+				cmp = __wt_lex_compare_skip(
+				    srch_key, item, &match);
+				if (cmp > 0) {
+					skiplow = match;
+					base = indx + 1;
+					--limit;
+				} else if (cmp < 0)
+					skiphigh = match;
+				else
+					goto descend;
+			}
+		else
+			for (; limit != 0; limit >>= 1) {
+				indx = base + (limit >> 1);
+				descent = pindex->index[indx];
+				__wt_ref_key(
+				    page, descent, &item->data, &item->size);
+
+				WT_ERR(__wt_compare(
+				    session, collator, srch_key, item, &cmp));
+				if (cmp > 0) {
+					base = indx + 1;
+					--limit;
+				} else if (cmp == 0)
+					goto descend;
+			}
+
+		/*
+		 * Set the slot to descend the tree: descent is already set if
+		 * there was an exact match on the page, otherwise, base is
+		 * the smallest index greater than key, possibly (last + 1).
+		 */
+		descent = pindex->index[base - 1];
+
+		/*
+		 * If we end up somewhere other than the last slot, it's not a
+		 * right-side descent. We descend into slot (base - 1) and the
+		 * last slot is (entries - 1), so the descent is into the last
+		 * slot exactly when base equals the number of entries.
+		 */
+		if (pindex->entries != base)
+			descend_right = 0;
+
+descend:	/*
+		 * Swap the current page for the child page. If the page splits
+		 * while we're retrieving it, restart the search in the current
+		 * page; otherwise return on error, the swap call ensures we're
+		 * holding nothing on failure.
+		 */
+		switch (ret = __wt_page_swap(session, current, descent, 0)) {
+		case 0:
+			current = descent;
+			break;
+		case WT_RESTART:
+			skiphigh = skiplow = 0;
+			goto restart;
+		default:
+			return (ret);
+		}
+	}
+
+	/* Track how deep the tree gets. */
+	if (depth > btree->maximum_depth)
+		btree->maximum_depth = depth;
+
+leaf_only:
+	page = current->page;
+	cbt->ref = current;
+
+	/*
+	 * In the case of a right-side tree descent during an insert, do a fast
+	 * check for an append to the page, try to catch cursors appending data
+	 * into the tree.
+	 *
+	 * It's tempting to make this test more rigorous: if a cursor inserts
+	 * randomly into a two-level tree (a root referencing a single child
+	 * that's empty except for an insert list), the right-side descent flag
+	 * will be set and this comparison wasted. The problem resolves itself
+	 * as the tree grows larger: either we're no longer doing right-side
+	 * descent, or we'll avoid additional comparisons in internal pages,
+	 * making up for the wasted comparison here. Similarly, the cursor's
+	 * history is set any time it's an insert and a right-side descent,
+	 * both to avoid a complicated/expensive test, and, in the case of
+	 * multiple threads appending to the tree, we want to mark them all as
+	 * appending, even if this test doesn't work.
+	 */
+	if (insert && descend_right) {
+		cbt->append_tree = 1;
+
+		if (page->pg_row_entries == 0) {
+			cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
+
+			F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+			cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+		} else {
+			cbt->slot = WT_ROW_SLOT(page,
+			    page->pg_row_d + (page->pg_row_entries - 1));
+
+			cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+		}
+
+		WT_ERR(
+		    __wt_search_insert_append(session, cbt, srch_key, &done));
+		if (done)
+			return (0);
+
+		/*
+		 * Don't leave the insert list head set, code external to the
+		 * search uses it.
+		 */
+		cbt->ins_head = NULL;
+	}
+
+	/*
+	 * Binary search of the leaf page. There are two versions (a default
+	 * loop and an application-specified collation loop), because moving
+	 * the collation test and error handling inside the loop costs about 5%.
+	 */
+	base = 0;
+	limit = page->pg_row_entries;
+	if (collator == NULL)
+		for (; limit != 0; limit >>= 1) {
+			indx = base + (limit >> 1);
+			rip = page->pg_row_d + indx;
+			WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1));
+
+			match = WT_MIN(skiplow, skiphigh);
+			cmp = __wt_lex_compare_skip(srch_key, item, &match);
+			if (cmp > 0) {
+				skiplow = match;
+				base = indx + 1;
+				--limit;
+			} else if (cmp < 0)
+				skiphigh = match;
+			else
+				goto leaf_match;
+		}
+	else
+		for (; limit != 0; limit >>= 1) {
+			indx = base + (limit >> 1);
+			rip = page->pg_row_d + indx;
+			WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1));
+
+			WT_ERR(__wt_compare(
+			    session, collator, srch_key, item, &cmp));
+			if (cmp > 0) {
+				base = indx + 1;
+				--limit;
+			} else if (cmp == 0)
+				goto leaf_match;
+		}
+
+	/*
+	 * The best case is finding an exact match in the leaf page's WT_ROW
+	 * array, probable for any read-mostly workload. Check that case and
+	 * get out fast.
+	 */
+	if (0) {
+leaf_match:	cbt->compare = 0;
+		cbt->slot = WT_ROW_SLOT(page, rip);
+		return (0);
+	}
+
+	/*
+	 * We didn't find an exact match in the WT_ROW array.
+	 *
+	 * Base is the smallest index greater than key and may be the 0th index
+	 * or the (last + 1) index. Set the slot to be the largest index less
+	 * than the key if that's possible (if base is the 0th index it means
+	 * the application is inserting a key before any key found on the page).
+	 *
+	 * It's still possible there is an exact match, but it's on an insert
+	 * list. Figure out which insert chain to search and then set up the
+	 * return information assuming we'll find nothing in the insert list
+	 * (we'll correct as needed inside the search routine, depending on
+	 * what we find).
+	 *
+	 * If inserting a key smaller than any key found in the WT_ROW array,
+	 * use the extra slot of the insert array, otherwise the insert array
+	 * maps one-to-one to the WT_ROW array.
+	 */
+	if (base == 0) {
+		cbt->compare = 1;
+		cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
+
+		F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+		cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+	} else {
+		cbt->compare = -1;
+		cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1));
+
+		cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+	}
+
+	/* If there's no insert list, we're done. */
+	if (WT_SKIP_FIRST(cbt->ins_head) == NULL)
+		return (0);
+
+	/*
+	 * Test for an append first when inserting onto an insert list, try to
+	 * catch cursors repeatedly inserting at a single point.
+	 */
+	if (insert) {
+		WT_ERR(
+		    __wt_search_insert_append(session, cbt, srch_key, &done));
+		if (done)
+			return (0);
+	}
+	WT_ERR(__wt_search_insert(session, cbt, srch_key));
+
+	return (0);
+
+err:	/*
+	 * Release the current page if the search started at the root: it was
+	 * acquired by this function during the descent. If the search was
+	 * restricted to a single leaf page, this function acquired nothing
+	 * and the page belongs to the caller.
+	 */
+	if (leaf == NULL)
+		WT_TRET(__wt_page_release(session, current, 0));
+	return (ret);
+}
+
+/*
+ * __wt_row_random --
+ *	Return a random key from a row-store tree.
+ *
+ * The tree is descended by picking a random child at each internal
+ * page. On success the cursor references the leaf page reached: either
+ * cbt->slot selects an on-page key (also copied into cbt->search_key),
+ * or cbt->ins references the middle item of the page's smallest-key
+ * insert list.
+ *
+ * Returns 0 on success, WT_NOTFOUND if the leaf page has neither on-page
+ * keys nor a populated smallest-key insert list, or another error.
+ */
+int
+__wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_INSERT *p, *t;
+	WT_PAGE *page;
+	WT_PAGE_INDEX *pindex;
+	WT_REF *current, *descent;
+
+	btree = S2BT(session);
+
+	__cursor_pos_clear(cbt);
+
+restart:
+	/* Walk the internal pages of the tree. */
+	current = &btree->root;
+	for (;;) {
+		page = current->page;
+		if (page->type != WT_PAGE_ROW_INT)
+			break;
+
+		pindex = WT_INTL_INDEX_COPY(page);
+		descent = pindex->index[
+		    __wt_random(session->rnd) % pindex->entries];
+
+		/*
+		 * Swap the parent page for the child page; return on error,
+		 * the swap function ensures we're holding nothing on failure.
+		 */
+		if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
+			current = descent;
+			continue;
+		}
+		/*
+		 * Restart is returned if we find a page that's been split; the
+		 * held page isn't discarded when restart is returned, discard
+		 * it and restart the search from the top of the tree.
+		 */
+		if (ret == WT_RESTART &&
+		    (ret = __wt_page_release(session, current, 0)) == 0)
+			goto restart;
+		return (ret);
+	}
+
+	if (page->pg_row_entries != 0) {
+		/*
+		 * The use case for this call is finding a place to split the
+		 * tree. Cheat (it's not like this is "random", anyway), and
+		 * make things easier by returning the first key on the page.
+		 * If the caller is attempting to split a newly created tree,
+		 * or a tree with just one big page, that's not going to work,
+		 * check for that.
+		 */
+		cbt->ref = current;
+		cbt->compare = 0;
+		pindex = WT_INTL_INDEX_COPY(btree->root.page);
+		cbt->slot = pindex->entries < 2 ?
+		    __wt_random(session->rnd) % page->pg_row_entries : 0;
+
+		return (__wt_row_leaf_key(session,
+		    page, page->pg_row_d + cbt->slot, &cbt->search_key, 0));
+	}
+
+	/*
+	 * If the tree is new (and not empty), it might have a large insert
+	 * list, pick the key in the middle of that insert list.
+	 */
+	F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+	if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+		WT_ERR(WT_NOTFOUND);
+
+	/*
+	 * The insert list head can exist without any items on the list (other
+	 * code in this file tests WT_SKIP_FIRST against NULL even when it has
+	 * a list head); there is no key to return in that case, and walking
+	 * the list from a NULL item isn't safe.
+	 */
+	if ((t = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
+		WT_ERR(WT_NOTFOUND);
+
+	/* Advance one pointer two items for each single step of the other. */
+	for (p = t;;) {
+		if ((p = WT_SKIP_NEXT(p)) == NULL)
+			break;
+		if ((p = WT_SKIP_NEXT(p)) == NULL)
+			break;
+		t = WT_SKIP_NEXT(t);
+	}
+	cbt->ref = current;
+	cbt->compare = 0;
+	cbt->ins = t;
+
+	return (0);
+
+err:	WT_TRET(__wt_page_release(session, current, 0));
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config.c b/src/third_party/wiredtiger/src/config/config.c
new file mode 100644
index 00000000000..c792cb4fcf2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config.c
@@ -0,0 +1,745 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __config_err --
+ * Error message and return for config string parse failures.
+ */
+static int
+__config_err(WT_CONFIG *conf, const char *msg, int err)
+{
+ WT_RET_MSG(conf->session, err,
+ "Error parsing '%.*s' at byte %u: %s",
+ (int)(conf->end - conf->orig), conf->orig,
+ (u_int)(conf->cur - conf->orig), msg);
+}
+
+/*
+ * __wt_config_initn --
+ * Initialize a config handle, used to iterate through a config string of
+ * specified length.
+ */
+int
+__wt_config_initn(
+ WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len)
+{
+ conf->session = session;
+ conf->orig = conf->cur = str;
+ conf->end = str + len;
+ conf->depth = 0;
+ conf->top = -1;
+ conf->go = NULL;
+
+ return (0);
+}
+
+/*
+ * __wt_config_init --
+ * Initialize a config handle, used to iterate through a NUL-terminated
+ * config string.
+ */
+int
+__wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str)
+{
+ size_t len;
+
+ len = (str == NULL) ? 0 : strlen(str);
+
+ return (__wt_config_initn(session, conf, str, len));
+}
+
+/*
+ * __wt_config_subinit --
+ * Initialize a config handle, used to iterate through a config string
+ * extracted from another config string (used for parsing nested
+ * structures).
+ */
+int
+__wt_config_subinit(
+ WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item)
+{
+ return (__wt_config_initn(session, conf, item->str, item->len));
+}
+
+ /*
+ * PUSH --
+ *     Start a new item of type t at offset i from the current parse
+ *     position. The macro expects "conf" (the parser) and "out" (the item
+ *     being filled in) to be in scope at the point of use. If no top level
+ *     has been recorded yet (conf->top == -1), the current depth becomes the
+ *     top level. The item is only started when the parser is at the top
+ *     level; if the item already has a non-zero length the enclosing
+ *     function returns the result of __config_err with EINVAL.
+ */
+ #define PUSH(i, t) do { \
+ if (conf->top == -1) \
+ conf->top = conf->depth; \
+ if (conf->depth == conf->top) { \
+ if (out->len > 0) \
+ return (__config_err(conf, \
+ "New value starts without a separator", \
+ EINVAL)); \
+ out->type = (t); \
+ out->str = (conf->cur + (i)); \
+ } \
+ } while (0)
+
+ /*
+ * CAP --
+ *     Finish the item started by PUSH: when the parser is at the top level,
+ *     set the item's length so that its last byte is the one at offset i
+ *     from the current parse position. Like PUSH, the macro expects "conf"
+ *     and "out" to be in scope at the point of use.
+ */
+ #define CAP(i) do { \
+ if (conf->depth == conf->top) \
+ out->len = (size_t)((conf->cur + (i) + 1) - out->str); \
+ } while (0)
+
+ /*
+ * CONFIG_ACTION --
+ *     Actions stored in the per-state byte lookup tables below (gostruct,
+ *     gobare, gostring, goutf8_continue, goesc) and dispatched on by the
+ *     switch statement in __config_next.
+ */
+ typedef enum {
+ A_LOOP, A_BAD, A_DOWN, A_UP, A_VALUE, A_NEXT, A_QDOWN, A_QUP,
+ A_ESC, A_UNESC, A_BARE, A_NUMBARE, A_UNBARE, A_UTF8_2,
+ A_UTF8_3, A_UTF8_4, A_UTF_CONTINUE
+ } CONFIG_ACTION;
+
+/*
+ * static void *gostruct[] = {
+ * [0 ... 255] = &&l_bad,
+ * ['\t'] = &&l_loop, [' '] = &&l_loop,
+ * ['\r'] = &&l_loop, ['\n'] = &&l_loop,
+ * ['"'] = &&l_qup,
+ * [':'] = &&l_value, ['='] = &&l_value,
+ * [','] = &&l_next,
+ * // tracking [] and {} individually would allow fuller
+ * // validation but is really messy
+ * ['('] = &&l_up, [')'] = &&l_down,
+ * ['['] = &&l_up, [']'] = &&l_down,
+ * ['{'] = &&l_up, ['}'] = &&l_down,
+ * // bare identifiers
+ * ['-'] = &&l_numbare,
+ * ['0' ... '9'] = &&l_numbare,
+ * ['_'] = &&l_bare,
+ * ['A' ... 'Z'] = &&l_bare, ['a' ... 'z'] = &&l_bare,
+ * ['/'] = &&l_bare,
+ * };
+ */
+static const int8_t gostruct[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_LOOP, A_LOOP, A_BAD, A_BAD, A_LOOP, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_LOOP, A_BAD, A_QUP,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UP, A_DOWN, A_BAD, A_BAD,
+ A_NEXT, A_NUMBARE, A_BARE, A_BARE, A_NUMBARE, A_NUMBARE,
+ A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE,
+ A_NUMBARE, A_NUMBARE, A_NUMBARE, A_VALUE, A_BAD, A_BAD,
+ A_VALUE, A_BAD, A_BAD, A_BAD, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_UP, A_BAD,
+ A_DOWN, A_BAD, A_BARE, A_BAD, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_UP, A_BAD,
+ A_DOWN, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *gobare[] =
+ * {
+ * [0 ... 31] = &&l_bad,
+ * // could be more pedantic/validation-checking
+ * [32 ... 126] = &&l_loop,
+ * ['\t'] = &&l_unbare, [' '] = &&l_unbare,
+ * ['\r'] = &&l_unbare, ['\n'] = &&l_unbare,
+ * [':'] = &&l_unbare, ['='] = &&l_unbare,
+ * [','] = &&l_unbare,
+ * [')'] = &&l_unbare, [']'] = &&l_unbare, ['}'] = &&l_unbare,
+ * [127 ... 255] = &&l_bad
+ * };
+ */
+static const int8_t gobare[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_UNBARE, A_UNBARE, A_BAD, A_BAD, A_UNBARE, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNBARE,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_UNBARE, A_LOOP, A_LOOP, A_UNBARE, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_UNBARE, A_LOOP, A_LOOP, A_UNBARE, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_UNBARE,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_UNBARE, A_LOOP, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *gostring[] =
+ * {
+ * [0 ... 31] = &&l_bad, [127] = &&l_bad,
+ * [32 ... 126] = &&l_loop,
+ * ['\\'] = &&l_esc, ['"'] = &&l_qdown,
+ * [128 ... 191] = &&l_bad,
+ * [192 ... 223] = &&l_utf8_2,
+ * [224 ... 239] = &&l_utf8_3,
+ * [240 ... 247] = &&l_utf8_4,
+ * [248 ... 255] = &&l_bad
+ * };
+ */
+static const int8_t gostring[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_LOOP, A_LOOP, A_QDOWN,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_ESC, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3,
+ A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3,
+ A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_4,
+ A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4,
+ A_UTF8_4, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *goutf8_continue[] =
+ * {
+ * [0 ... 127] = &&l_bad,
+ * [128 ... 191] = &&l_utf_continue,
+ * [192 ... 255] = &&l_bad
+ * };
+ */
+static const int8_t goutf8_continue[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *goesc[] =
+ * {
+ * [0 ... 255] = &&l_bad,
+ * ['"'] = &&l_unesc, ['\\'] = &&l_unesc,
+ * ['/'] = &&l_unesc, ['b'] = &&l_unesc,
+ * ['f'] = &&l_unesc, ['n'] = &&l_unesc,
+ * ['r'] = &&l_unesc, ['t'] = &&l_unesc, ['u'] = &&l_unesc
+ * };
+ */
+static const int8_t goesc[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD,
+ A_BAD, A_BAD, A_UNESC, A_BAD, A_UNESC, A_UNESC, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * __config_next --
+ * Get the next config item in the string without processing the value.
+ */
+static int
+__config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG_ITEM *out = key;
+ int utf8_remain = 0;
+ static const WT_CONFIG_ITEM true_value = {
+ "", 0, 1, WT_CONFIG_ITEM_BOOL
+ };
+
+ key->len = 0;
+ /* Keys with no value default to true. */
+ *value = true_value;
+
+ if (conf->go == NULL)
+ conf->go = gostruct;
+
+ while (conf->cur < conf->end) {
+ switch (conf->go[(int)*conf->cur]) {
+ case A_LOOP:
+ break;
+
+ case A_BAD:
+ return (__config_err(
+ conf, "Unexpected character", EINVAL));
+
+ case A_DOWN:
+ --conf->depth;
+ CAP(0);
+ break;
+
+ case A_UP:
+ if (conf->top == -1)
+ conf->top = 1;
+ PUSH(0, WT_CONFIG_ITEM_STRUCT);
+ ++conf->depth;
+ break;
+
+ case A_VALUE:
+ if (conf->depth == conf->top) {
+ /*
+ * Special case: ':' is permitted in unquoted
+ * values.
+ */
+ if (out == value && *conf->cur != ':')
+ return (__config_err(conf,
+ "Value already complete", EINVAL));
+ out = value;
+ }
+ break;
+
+ case A_NEXT:
+ /*
+ * If we're at the top level and we have a complete
+ * key (and optional value), we're done.
+ */
+ if (conf->depth == conf->top && key->len > 0) {
+ ++conf->cur;
+ return (0);
+ } else
+ break;
+
+ case A_QDOWN:
+ CAP(-1);
+ conf->go = gostruct;
+ break;
+
+ case A_QUP:
+ PUSH(1, WT_CONFIG_ITEM_STRING);
+ conf->go = gostring;
+ break;
+
+ case A_ESC:
+ conf->go = goesc;
+ break;
+
+ case A_UNESC:
+ conf->go = gostring;
+ break;
+
+ case A_BARE:
+ PUSH(0, WT_CONFIG_ITEM_ID);
+ conf->go = gobare;
+ break;
+
+ case A_NUMBARE:
+ PUSH(0, WT_CONFIG_ITEM_NUM);
+ conf->go = gobare;
+ break;
+
+ case A_UNBARE:
+ CAP(-1);
+ conf->go = gostruct;
+ continue;
+
+ case A_UTF8_2:
+ conf->go = goutf8_continue;
+ utf8_remain = 1;
+ break;
+
+ case A_UTF8_3:
+ conf->go = goutf8_continue;
+ utf8_remain = 2;
+ break;
+
+ case A_UTF8_4:
+ conf->go = goutf8_continue;
+ utf8_remain = 3;
+ break;
+
+ case A_UTF_CONTINUE:
+ if (!--utf8_remain)
+ conf->go = gostring;
+ break;
+ }
+
+ conf->cur++;
+ }
+
+ /* Might have a trailing key/value without a closing brace */
+ if (conf->go == gobare) {
+ CAP(-1);
+ conf->go = gostruct;
+ }
+
+ /* Did we find something? */
+ if (conf->depth <= conf->top && key->len > 0)
+ return (0);
+
+ /* We're either at the end of the string or we failed to parse. */
+ if (conf->depth == 0)
+ return (WT_NOTFOUND);
+
+ return (__config_err(conf,
+ "Closing brackets missing from config string", EINVAL));
+}
+
+/*
+ * Arithmetic shift of a negative number is undefined by ISO/IEC 9899, and the
+ * WiredTiger API supports negative numbers. Check it's not a negative number,
+ * and then cast the shift out of paranoia.
+ */
+#define WT_SHIFT_INT64(v, s) do { \
+ if ((v) < 0) \
+ goto range; \
+ (v) = (int64_t)(((uint64_t)(v)) << (s)); \
+} while (0)
+
+/*
+ * __config_process_value --
+ * Deal with special config values like true / false.
+ */
+static int
+__config_process_value(WT_CONFIG *conf, WT_CONFIG_ITEM *value)
+{
+ char *endptr;
+
+ /* Empty values are okay: we can't do anything interesting with them. */
+ if (value->len == 0)
+ return (0);
+
+ if (value->type == WT_CONFIG_ITEM_ID) {
+ if (strncasecmp(value->str, "true", value->len) == 0) {
+ value->type = WT_CONFIG_ITEM_BOOL;
+ value->val = 1;
+ } else if (strncasecmp(value->str, "false", value->len) == 0) {
+ value->type = WT_CONFIG_ITEM_BOOL;
+ value->val = 0;
+ }
+ } else if (value->type == WT_CONFIG_ITEM_NUM) {
+ errno = 0;
+ value->val = strtoll(value->str, &endptr, 10);
+
+ /* Check any leftover characters. */
+ while (endptr < value->str + value->len)
+ switch (*endptr++) {
+ case 'b':
+ case 'B':
+ /* Byte: no change. */
+ break;
+ case 'k':
+ case 'K':
+ WT_SHIFT_INT64(value->val, 10);
+ break;
+ case 'm':
+ case 'M':
+ WT_SHIFT_INT64(value->val, 20);
+ break;
+ case 'g':
+ case 'G':
+ WT_SHIFT_INT64(value->val, 30);
+ break;
+ case 't':
+ case 'T':
+ WT_SHIFT_INT64(value->val, 40);
+ break;
+ case 'p':
+ case 'P':
+ WT_SHIFT_INT64(value->val, 50);
+ break;
+ default:
+ /*
+ * We didn't get a well-formed number. That
+ * might be okay, the required type will be
+ * checked by __wt_config_check.
+ */
+ value->type = WT_CONFIG_ITEM_ID;
+ break;
+ }
+
+ /*
+ * If we parsed the whole string but the number is out of range,
+ * report an error. Don't report an error for strings that
+ * aren't well-formed integers: if an integer is expected, that
+ * will be caught by __wt_config_check.
+ */
+ if (value->type == WT_CONFIG_ITEM_NUM && errno == ERANGE)
+ goto range;
+ }
+
+ return (0);
+
+range:
+ return (__config_err(conf, "Number out of range", ERANGE));
+}
+
+/*
+ * __wt_config_next --
+ * Get the next config item in the string and process the value.
+ */
+int
+__wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_RET(__config_next(conf, key, value));
+ return (__config_process_value(conf, value));
+}
+
+/*
+ * __config_getraw --
+ * Given a config parser, find the final value for a given key.
+ */
+static int
+__config_getraw(
+ WT_CONFIG *cparser, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, int top)
+{
+ WT_CONFIG sparser;
+ WT_CONFIG_ITEM k, v, subk;
+ WT_DECL_RET;
+ int found;
+
+ found = 0;
+ while ((ret = __config_next(cparser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+ continue;
+ if (k.len == key->len &&
+ strncasecmp(key->str, k.str, k.len) == 0) {
+ *value = v;
+ found = 1;
+ } else if (k.len < key->len && key->str[k.len] == '.' &&
+ strncasecmp(key->str, k.str, k.len) == 0) {
+ subk.str = key->str + k.len + 1;
+ subk.len = (key->len - k.len) - 1;
+ WT_RET(__wt_config_initn(
+ cparser->session, &sparser, v.str, v.len));
+ if ((ret =
+ __config_getraw(&sparser, &subk, value, 0)) == 0)
+ found = 1;
+ WT_RET_NOTFOUND_OK(ret);
+ }
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (!found)
+ return (WT_NOTFOUND);
+ return (top ? __config_process_value(cparser, value) : 0);
+}
+
+/*
+ * __wt_config_get --
+ * Given a NULL-terminated list of configuration strings, find
+ * the final value for a given key.
+ */
+int
+__wt_config_get(WT_SESSION_IMPL *session,
+ const char **cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+ WT_DECL_RET;
+ int found;
+
+ for (found = 0; *cfg != NULL; cfg++) {
+ WT_RET(__wt_config_init(session, &cparser, *cfg));
+ if ((ret = __config_getraw(&cparser, key, value, 1)) == 0)
+ found = 1;
+ else if (ret != WT_NOTFOUND)
+ return (ret);
+ }
+
+ return (found ? 0 : WT_NOTFOUND);
+}
+
+/*
+ * __wt_config_gets --
+ * Given a NULL-terminated list of configuration strings, find the final
+ * value for a given string key.
+ */
+int
+__wt_config_gets(WT_SESSION_IMPL *session,
+ const char **cfg, const char *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG_ITEM key_item =
+ { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };
+
+ return (__wt_config_get(session, cfg, &key_item, value));
+}
+
+/*
+ * __wt_config_getone --
+ * Get the value for a given key from a single config string.
+ */
+int
+__wt_config_getone(WT_SESSION_IMPL *session,
+ const char *config, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+
+ WT_RET(__wt_config_init(session, &cparser, config));
+ return (__config_getraw(&cparser, key, value, 1));
+}
+
+/*
+ * __wt_config_getones --
+ * Get the value for a given string key from a single config string.
+ */
+int
+__wt_config_getones(WT_SESSION_IMPL *session,
+ const char *config, const char *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM key_item =
+ { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };
+
+ WT_RET(__wt_config_init(session, &cparser, config));
+ return (__config_getraw(&cparser, &key_item, value, 1));
+}
+
+/*
+ * __wt_config_gets_def --
+ * Performance hack: skip parsing config strings by hard-coding defaults.
+ *
+ * It's expensive to repeatedly parse configuration strings, so don't do
+ * it unless it's necessary in performance paths like cursor creation.
+ * Assume the second configuration string is the application's
+ * configuration string, and if it's not set (which is true most of the
+ * time), then use the supplied default value. This makes it faster to
+ * open cursors when checking for obscure open configuration strings like
+ * "next_random".
+ */
+int
+__wt_config_gets_def(WT_SESSION_IMPL *session,
+ const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value)
+{
+ static const WT_CONFIG_ITEM false_value = {
+ "", 0, 0, WT_CONFIG_ITEM_NUM
+ };
+
+ *value = false_value;
+ value->val = def;
+ if (cfg == NULL || cfg[0] == NULL || cfg[1] == NULL)
+ return (0);
+ else if (cfg[2] == NULL)
+ WT_RET_NOTFOUND_OK(
+ __wt_config_getones(session, cfg[1], key, value));
+ return (__wt_config_gets(session, cfg, key, value));
+}
+
+/*
+ * __wt_config_subgetraw --
+ * Get the value for a given key from a config string in a WT_CONFIG_ITEM.
+ * This is useful for dealing with nested structs in config strings.
+ */
+int
+__wt_config_subgetraw(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+
+ WT_RET(__wt_config_initn(session, &cparser, cfg->str, cfg->len));
+ return (__config_getraw(&cparser, key, value, 1));
+}
+
+/*
+ * __wt_config_subgets --
+ * Get the value for a given key from a config string in a WT_CONFIG_ITEM.
+ * This is useful for dealing with nested structs in config strings.
+ */
+int
+__wt_config_subgets(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG_ITEM key_item =
+ { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };
+
+ return (__wt_config_subgetraw(session, cfg, &key_item, value));
+}
diff --git a/src/third_party/wiredtiger/src/config/config_api.c b/src/third_party/wiredtiger/src/config/config_api.c
new file mode 100644
index 00000000000..42f4c117b81
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_api.c
@@ -0,0 +1,105 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __config_parser_close --
+ * WT_CONFIG_PARSER->close method.
+ */
+static int
+__config_parser_close(WT_CONFIG_PARSER *wt_config_parser)
+{
+ WT_CONFIG_PARSER_IMPL *config_parser;
+
+ config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;
+
+ if (config_parser == NULL)
+ return (EINVAL);
+
+ __wt_free(config_parser->session, config_parser);
+ return (0);
+}
+
+/*
+ * __config_parser_get --
+ * WT_CONFIG_PARSER->search method.
+ */
+static int
+__config_parser_get(WT_CONFIG_PARSER *wt_config_parser,
+ const char *key, WT_CONFIG_ITEM *cval)
+{
+ WT_CONFIG_PARSER_IMPL *config_parser;
+
+ config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;
+
+ if (config_parser == NULL)
+ return (EINVAL);
+
+ return (__wt_config_subgets(config_parser->session,
+ &config_parser->config_item, key, cval));
+}
+
+/*
+ * __config_parser_next --
+ * WT_CONFIG_PARSER->next method.
+ */
+static int
+__config_parser_next(WT_CONFIG_PARSER *wt_config_parser,
+ WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *cval)
+{
+ WT_CONFIG_PARSER_IMPL *config_parser;
+
+ config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;
+
+ if (config_parser == NULL)
+ return (EINVAL);
+
+ return (__wt_config_next(&config_parser->config, key, cval));
+}
+
+/*
+ * wiredtiger_config_parser_open --
+ * Create a configuration parser.
+ */
+int
+wiredtiger_config_parser_open(WT_SESSION *wt_session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
+{
+ static const WT_CONFIG_PARSER stds = {
+ __config_parser_close,
+ __config_parser_next,
+ __config_parser_get
+ };
+ WT_CONFIG_ITEM config_item =
+ { config, len, 0, WT_CONFIG_ITEM_STRING };
+ WT_CONFIG_PARSER_IMPL *config_parser;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ *config_parserp = NULL;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ WT_RET(__wt_calloc_def(session, 1, &config_parser));
+ config_parser->iface = stds;
+ config_parser->session = session;
+
+ /*
+ * Setup a WT_CONFIG_ITEM to be used for get calls and a WT_CONFIG
+ * structure for iterations through the configuration string.
+ */
+ memcpy(&config_parser->config_item, &config_item, sizeof(config_item));
+ WT_ERR(__wt_config_initn(
+ session, &config_parser->config, config, len));
+
+ if (ret == 0)
+ *config_parserp = (WT_CONFIG_PARSER *)config_parser;
+ else
+err: __wt_free(session, config_parser);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_check.c b/src/third_party/wiredtiger/src/config/config_check.c
new file mode 100644
index 00000000000..310e54c3349
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_check.c
@@ -0,0 +1,370 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int config_check(
+ WT_SESSION_IMPL *, const WT_CONFIG_CHECK *, const char *, size_t); /* forward declaration: called before its definition in this file */
+
+/*
+ * __conn_foc_add --
+ *	Append a chunk of memory to the connection's free-on-close list.
+ */
+static int
+__conn_foc_add(WT_SESSION_IMPL *session, const void *p)
+{
+	WT_CONNECTION_IMPL *conn = S2C(session);
+
+	/*
+	 * No locking is done here, the caller is expected to be holding any
+	 * locks we need. Make room for one more slot before storing into it.
+	 */
+	WT_RET(__wt_realloc_def(
+	    session, &conn->foc_size, conn->foc_cnt + 1, &conn->foc));
+
+	conn->foc[conn->foc_cnt] = (void *)p;
+	++conn->foc_cnt;
+	return (0);
+}
+
+/*
+ * __wt_conn_foc_discard --
+ *	Free everything on the connection's free-on-close list.
+ */
+void
+__wt_conn_foc_discard(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	size_t slot;
+
+	conn = S2C(session);
+
+	/* Release each listed chunk in turn, then the list that held them. */
+	slot = 0;
+	while (slot < conn->foc_cnt) {
+		__wt_free(session, conn->foc[slot]);
+		++slot;
+	}
+	__wt_free(session, conn->foc);
+}
+
+/*
+ * __wt_configure_method --
+ *	WT_CONNECTION.configure_method.
+ *
+ *	Add a configuration option to the named method: config is the option
+ * name with an optional "=default" suffix, type is one of "boolean", "int",
+ * "list" or "string", and check is an optional check string. Returns EINVAL
+ * if config or type is missing or the type is unsupported, and WT_NOTFOUND
+ * if no method of that name is configured.
+ */
+int
+__wt_configure_method(WT_SESSION_IMPL *session,
+    const char *method, const char *uri,
+    const char *config, const char *type, const char *check)
+{
+	const WT_CONFIG_CHECK *cp;
+	WT_CONFIG_CHECK *checks, *newcheck;
+	const WT_CONFIG_ENTRY **epp;
+	WT_CONFIG_ENTRY *entry;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	size_t cnt;
+	char *newcheck_name, *p;
+
+	/*
+	 * !!!
+	 * We ignore the specified uri, that is, all new configuration options
+	 * will be valid for all data sources. That shouldn't be too bad
+	 * as the worst that can happen is an application might specify some
+	 * configuration option and not get an error -- the option should be
+	 * ignored by the underlying implementation since it's unexpected, so
+	 * there shouldn't be any real problems. Eventually I expect we will
+	 * get the whole data-source thing sorted, at which time there may be
+	 * configuration arrays for each data source, and that's when the uri
+	 * will matter.
+	 */
+	WT_UNUSED(uri);
+
+	conn = S2C(session);
+	checks = newcheck = NULL;
+	entry = NULL;
+	newcheck_name = NULL;
+
+	/* Argument checking; we only support a limited number of types. */
+	if (config == NULL)
+		WT_RET_MSG(session, EINVAL, "no configuration specified");
+	if (type == NULL)
+		WT_RET_MSG(session, EINVAL, "no configuration type specified");
+	if (strcmp(type, "boolean") != 0 && strcmp(type, "int") != 0 &&
+	    strcmp(type, "list") != 0 && strcmp(type, "string") != 0)
+		WT_RET_MSG(session, EINVAL,
+		    "type must be one of \"boolean\", \"int\", \"list\" or "
+		    "\"string\"");
+
+	/* Find a match for the method name. */
+	for (epp = conn->config_entries; (*epp)->method != NULL; ++epp)
+		if (strcmp((*epp)->method, method) == 0)
+			break;
+	if ((*epp)->method == NULL)
+		WT_RET_MSG(session,
+		    WT_NOTFOUND, "no method matching %s found", method);
+
+	/*
+	 * Technically possible for threads to race, lock the connection while
+	 * adding the new configuration information. We're holding the lock
+	 * for an extended period of time, but configuration changes should be
+	 * rare and only happen during startup.
+	 */
+	__wt_spin_lock(session, &conn->api_lock);
+
+	/*
+	 * Allocate new configuration entry and fill it in.
+	 *
+	 * The new base value is the previous base value, a separator and the
+	 * new configuration string.
+	 */
+	WT_ERR(__wt_calloc_def(session, 1, &entry));
+	entry->method = (*epp)->method;
+	WT_ERR(__wt_calloc_def(session,
+	    strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p));
+	(void)strcpy(p, (*epp)->base);
+	(void)strcat(p, ",");
+	(void)strcat(p, config);
+	entry->base = p;
+
+	/*
+	 * There may be a default value in the config argument passed in (for
+	 * example, "kvs_parallelism=64"). The default value isn't part of the
+	 * name, build a new one.
+	 */
+	WT_ERR(__wt_strdup(session, config, &newcheck_name));
+	if ((p = strchr(newcheck_name, '=')) != NULL)
+		*p = '\0';
+
+	/*
+	 * The new configuration name may replace an existing check with new
+	 * information, in that case skip the old version.
+	 *
+	 * Size the new array for every existing check plus the new check and
+	 * the terminating empty entry.
+	 */
+	cnt = 0;
+	if ((*epp)->checks != NULL)
+		for (cp = (*epp)->checks; cp->name != NULL; ++cp)
+			++cnt;
+	WT_ERR(__wt_calloc_def(session, cnt + 2, &checks));
+	cnt = 0;
+	if ((*epp)->checks != NULL)
+		for (cp = (*epp)->checks; cp->name != NULL; ++cp)
+			if (strcmp(newcheck_name, cp->name) != 0)
+				checks[cnt++] = *cp;
+	newcheck = &checks[cnt];
+	newcheck->name = newcheck_name;
+	WT_ERR(__wt_strdup(session, type, &newcheck->type));
+	if (check != NULL)
+		WT_ERR(__wt_strdup(session, check, &newcheck->checks));
+	entry->checks = checks;
+
+	/*
+	 * Confirm the configuration string passes the new set of
+	 * checks.
+	 */
+	WT_ERR(config_check(session, entry->checks, config, 0));
+
+	/*
+	 * The next time this configuration is updated, we don't want to figure
+	 * out which of these pieces of memory were allocated and will need to
+	 * be free'd on close (this isn't a heavily used API and it's too much
+	 * work); add them all to the free-on-close list now. We don't check
+	 * for errors deliberately, we'd have to figure out which elements have
+	 * already been added to the free-on-close array and which have not in
+	 * order to avoid freeing chunks of memory twice. Again, this isn't a
+	 * commonly used API and it shouldn't ever happen, just leak it.
+	 */
+	(void)__conn_foc_add(session, entry->base);
+	(void)__conn_foc_add(session, entry);
+	(void)__conn_foc_add(session, checks);
+	(void)__conn_foc_add(session, newcheck->type);
+	(void)__conn_foc_add(session, newcheck->checks);
+	(void)__conn_foc_add(session, newcheck_name);
+
+	/*
+	 * Instead of using locks to protect configuration information, assume
+	 * we can atomically update a pointer to a chunk of memory, and because
+	 * a pointer is never partially written, readers will correctly see the
+	 * original or new versions of the memory. Readers might be using the
+	 * old version as it's being updated, though, which means we cannot free
+	 * the old chunk of memory until all possible readers have finished.
+	 * Currently, that's on connection close: in other words, we can use
+	 * this because it's small amounts of memory, and we really, really do
+	 * not want to acquire locks every time we access configuration strings,
+	 * since that's done on every API call.
+	 */
+	WT_PUBLISH(*epp, entry);
+
+	if (0) {
+err:		if (entry != NULL) {
+			__wt_free(session, entry->base);
+			__wt_free(session, entry);
+		}
+		/*
+		 * The new check is a slot inside the checks array: release the
+		 * strings it owns before releasing the array, otherwise we'd
+		 * be reading the slot out of freed memory.
+		 */
+		if (newcheck != NULL) {
+			__wt_free(session, newcheck->type);
+			__wt_free(session, newcheck->checks);
+		}
+		__wt_free(session, checks);
+		__wt_free(session, newcheck_name);
+	}
+
+	__wt_spin_unlock(session, &conn->api_lock);
+	return (ret);
+}
+
+/*
+ * __wt_config_check --
+ *	Validate an application-supplied configuration string against the
+ * check array of a configuration entry.
+ */
+int
+__wt_config_check(WT_SESSION_IMPL *session,
+    const WT_CONFIG_ENTRY *entry, const char *config, size_t config_len)
+{
+	/*
+	 * Callers don't test for a missing configuration string or a missing
+	 * check array themselves, make both of those cases a fast success.
+	 */
+	if (config == NULL || entry->checks == NULL)
+		return (0);
+
+	return (config_check(session, entry->checks, config, config_len));
+}
+
+/*
+ * config_check --
+ * Check the keys in an application-supplied config string match what is
+ * specified in an array of check strings.
+ *
+ * Every key must name an entry in the checks array (the array ends at the
+ * entry with a NULL name) and its value must be acceptable for that entry's
+ * type. If the entry has a check string, the value is also tested against
+ * the "min", "max" and "choices" keywords found there. Returns 0 when the
+ * whole string has been walked, EINVAL (with a message) for the first key
+ * that fails, or the error from the underlying configuration parser.
+ */
+static int
+config_check(WT_SESSION_IMPL *session,
+ const WT_CONFIG_CHECK *checks, const char *config, size_t config_len)
+{
+ WT_CONFIG parser, cparser, sparser;
+ WT_CONFIG_ITEM k, v, ck, cv, dummy;
+ WT_DECL_RET;
+ int badtype, found, i;
+
+ /*
+ * The config_len parameter is optional, and allows passing in strings
+ * that are not nul-terminated.
+ */
+ if (config_len == 0)
+ WT_RET(__wt_config_init(session, &parser, config));
+ else
+ WT_RET(__wt_config_initn(session, &parser, config, config_len));
+ while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+ WT_RET_MSG(session, EINVAL,
+ "Invalid configuration key found: '%.*s'",
+ (int)k.len, k.str);
+
+ /* Search for a matching entry. */
+ for (i = 0; checks[i].name != NULL; i++)
+ if (WT_STRING_MATCH(checks[i].name, k.str, k.len))
+ break;
+ if (checks[i].name == NULL)
+ WT_RET_MSG(session, EINVAL,
+ "unknown configuration key: '%.*s'",
+ (int)k.len, k.str);
+
+ if (strcmp(checks[i].type, "boolean") == 0) {
+ badtype = (v.type != WT_CONFIG_ITEM_BOOL &&
+ (v.type != WT_CONFIG_ITEM_NUM ||
+ (v.val != 0 && v.val != 1)));
+ } else if (strcmp(checks[i].type, "category") == 0) {
+ /*
+ * Deal with categories of the form: XXX=(XXX=blah).
+ *
+ * The nested value is checked recursively against the
+ * entry's subconfigs array; only an EINVAL return from
+ * that check is treated as a bad type here.
+ */
+ ret = config_check(session,
+ checks[i].subconfigs,
+ k.str + strlen(checks[i].name) + 1, v.len);
+ if (ret != EINVAL)
+ badtype = 0;
+ else
+ badtype = 1;
+ } else if (strcmp(checks[i].type, "format") == 0) {
+ badtype = 0;
+ } else if (strcmp(checks[i].type, "int") == 0) {
+ badtype = (v.type != WT_CONFIG_ITEM_NUM);
+ } else if (strcmp(checks[i].type, "list") == 0) {
+ badtype = (v.len > 0 &&
+ v.type != WT_CONFIG_ITEM_STRUCT);
+ } else if (strcmp(checks[i].type, "string") == 0) {
+ badtype = 0;
+ } else
+ WT_RET_MSG(session, EINVAL,
+ "unknown configuration type: '%s'",
+ checks[i].type);
+
+ if (badtype)
+ WT_RET_MSG(session, EINVAL,
+ "Invalid value for key '%.*s': expected a %s",
+ (int)k.len, k.str, checks[i].type);
+
+ if (checks[i].checks == NULL)
+ continue;
+
+ /* Setup an iterator for the check string. */
+ WT_RET(__wt_config_init(session, &cparser, checks[i].checks));
+ while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
+ if (WT_STRING_MATCH("min", ck.str, ck.len)) {
+ if (v.val < cv.val)
+ WT_RET_MSG(session, EINVAL,
+ "Value too small for key '%.*s' "
+ "the minimum is %.*s",
+ (int)k.len, k.str,
+ (int)cv.len, cv.str);
+ } else if (WT_STRING_MATCH("max", ck.str, ck.len)) {
+ if (v.val > cv.val)
+ WT_RET_MSG(session, EINVAL,
+ "Value too large for key '%.*s' "
+ "the maximum is %.*s",
+ (int)k.len, k.str,
+ (int)cv.len, cv.str);
+ } else if (WT_STRING_MATCH("choices", ck.str, ck.len)) {
+ if (v.len == 0)
+ WT_RET_MSG(session, EINVAL,
+ "Key '%.*s' requires a value",
+ (int)k.len, k.str);
+ if (v.type == WT_CONFIG_ITEM_STRUCT) {
+ /*
+ * Handle the 'verbose' case of a list
+ * containing restricted choices.
+ *
+ * Each list element is looked up in the
+ * choices, stopping at the first one
+ * not found. Note v is overwritten with
+ * the element being tested, so the
+ * error message below reports the
+ * element rather than the whole list.
+ */
+ WT_RET(__wt_config_subinit(session,
+ &sparser, &v));
+ found = 1;
+ while (found &&
+ (ret = __wt_config_next(&sparser,
+ &v, &dummy)) == 0) {
+ ret = __wt_config_subgetraw(
+ session, &cv, &v, &dummy);
+ found = (ret == 0);
+ }
+ } else {
+ ret = __wt_config_subgetraw(session,
+ &cv, &v, &dummy);
+ found = (ret == 0);
+ }
+
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ if (!found)
+ WT_RET_MSG(session, EINVAL,
+ "Value '%.*s' not a "
+ "permitted choice for key '%.*s'",
+ (int)v.len, v.str,
+ (int)k.len, k.str);
+ } else
+ WT_RET_MSG(session, EINVAL,
+ "unexpected configuration description "
+ "keyword %.*s", (int)ck.len, ck.str);
+ }
+ }
+
+ if (ret == WT_NOTFOUND) /* running off the end of the string is success */
+ ret = 0;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_collapse.c b/src/third_party/wiredtiger/src/config/config_collapse.c
new file mode 100644
index 00000000000..3e4c539cbe9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_collapse.c
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_collapse --
+ *	Collapse a set of configuration strings into newly allocated memory.
+ *
+ * This function takes a NULL-terminated list of configuration strings (where
+ * the first one contains all the defaults and the values are in order from
+ * least to most preferred, that is, the default values are least preferred),
+ * and collapses them into newly allocated memory. The algorithm is to walk
+ * the first of the configuration strings, and for each entry, search all of
+ * the configuration strings for a final value, keeping the last value found.
+ *
+ * The result is returned in config_ret; the return value is 0 on success and
+ * an error code otherwise, including EINVAL for an invalid key.
+ *
+ * Notes:
+ *	Any key not appearing in the first configuration string is discarded
+ *	from the final result, because we'll never search for it.
+ *
+ *	Nested structures aren't parsed. For example, imagine a configuration
+ *	string contains "key=(k2=v2,k3=v3)", and a subsequent string has
+ *	"key=(k4=v4)", the result will be "key=(k4=v4)", as we search for and
+ *	use the final value of "key", regardless of field overlap or missing
+ *	fields in the nested value.
+ */
+int
+__wt_config_collapse(
+    WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+	WT_CONFIG cparser;
+	WT_CONFIG_ITEM k, v;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+
+	WT_RET(__wt_scr_alloc(session, 0, &tmp));
+
+	WT_ERR(__wt_config_init(session, &cparser, cfg[0]));
+	while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
+		/*
+		 * Keys are a pointer/length pair into the configuration
+		 * string, not nul-terminated strings: bound what's printed by
+		 * the key's length.
+		 */
+		if (k.type != WT_CONFIG_ITEM_STRING &&
+		    k.type != WT_CONFIG_ITEM_ID)
+			WT_ERR_MSG(session, EINVAL,
+			    "Invalid configuration key found: '%.*s'\n",
+			    (int)k.len, k.str);
+		WT_ERR(__wt_config_get(session, cfg, &k, &v));
+		/* Include the quotes around string keys/values. */
+		if (k.type == WT_CONFIG_ITEM_STRING) {
+			--k.str;
+			k.len += 2;
+		}
+		if (v.type == WT_CONFIG_ITEM_STRING) {
+			--v.str;
+			v.len += 2;
+		}
+		WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s=%.*s,",
+		    (int)k.len, k.str, (int)v.len, v.str));
+	}
+	if (ret != WT_NOTFOUND)
+		goto err;
+
+	/*
+	 * If the caller passes us no valid configuration strings, we get here
+	 * with no bytes to copy -- that's OK, the underlying string copy can
+	 * handle empty strings.
+	 *
+	 * Strip any trailing comma.
+	 */
+	if (tmp->size != 0)
+		--tmp->size;
+	ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * We need a character that can't appear in a key as a separator.
+ *
+ * SEP is the separator in string form, used when formatting the names built
+ * for nested structures; SEPC is the same separator in character form, used
+ * when scanning keys for it.
+ */
+#undef SEP /* separator key, character */
+#define SEP "["
+#undef SEPC
+#define SEPC '['
+
+/*
+ * Individual configuration entries, including a generation number used to make
+ * the qsort stable.
+ *
+ * The key and value are separately allocated copies, and the generation is
+ * the slot the entry was originally inserted into.
+ */
+typedef struct {
+ char *k, *v; /* key, value */
+ size_t gen; /* generation */
+} WT_CONFIG_MERGE_ENTRY;
+
+/*
+ * The array of configuration entries.
+ *
+ * The array is grown through __wt_realloc_def, which is handed the address
+ * of entries_allocated; entries_next is the number of slots filled in.
+ */
+typedef struct {
+ size_t entries_allocated; /* allocated */
+ size_t entries_next; /* next slot */
+
+ WT_CONFIG_MERGE_ENTRY *entries; /* array of entries */
+} WT_CONFIG_MERGE;
+
+/*
+ * __config_merge_scan --
+ *	Walk a configuration string, inserting entries into the merged array.
+ *
+ *	The key argument is the name of the enclosing nested structure (NULL
+ * at the top level); it's prepended, with a separator, to every key found in
+ * value. Nested structures with field names are scanned recursively instead
+ * of being inserted as a single entry.
+ */
+static int
+__config_merge_scan(WT_SESSION_IMPL *session,
+    const char *key, const char *value, WT_CONFIG_MERGE *cp)
+{
+	WT_CONFIG cparser;
+	WT_CONFIG_ITEM k, v;
+	WT_CONFIG_MERGE_ENTRY *ep;
+	WT_DECL_ITEM(kb);
+	WT_DECL_ITEM(vb);
+	WT_DECL_RET;
+	size_t len;
+
+	WT_ERR(__wt_scr_alloc(session, 0, &kb));
+	WT_ERR(__wt_scr_alloc(session, 0, &vb));
+
+	WT_ERR(__wt_config_init(session, &cparser, value));
+	while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
+		/*
+		 * Keys are a pointer/length pair into the configuration
+		 * string, not nul-terminated strings: bound what's printed by
+		 * the key's length.
+		 */
+		if (k.type != WT_CONFIG_ITEM_STRING &&
+		    k.type != WT_CONFIG_ITEM_ID)
+			WT_ERR_MSG(session, EINVAL,
+			    "Invalid configuration key found: '%.*s'\n",
+			    (int)k.len, k.str);
+
+		/* Include the quotes around string keys/values. */
+		if (k.type == WT_CONFIG_ITEM_STRING) {
+			--k.str;
+			k.len += 2;
+		}
+		if (v.type == WT_CONFIG_ITEM_STRING) {
+			--v.str;
+			v.len += 2;
+		}
+
+		/*
+		 * !!!
+		 * We're using a magic separator character (SEPC) to separate
+		 * the names we create for nested structures. That's not
+		 * completely safe as it's possible for the separator to appear
+		 * as a literal character in a key name. In a few cases,
+		 * applications can create their own key namespace (for example,
+		 * shared library extension names), and therefore it's possible
+		 * for an application to confuse us. Error if we ever see a key
+		 * with a magic character.
+		 */
+		for (len = 0; len < k.len; ++len)
+			if (k.str[len] == SEPC)
+				WT_ERR_MSG(session, EINVAL,
+				    "key %.*s contains a '%c' separator "
+				    "character",
+				    (int)k.len, (char *)k.str, SEPC);
+
+		/* Build the key/value strings. */
+		WT_ERR(__wt_buf_fmt(session,
+		    kb, "%s%s%.*s",
+		    key == NULL ? "" : key,
+		    key == NULL ? "" : SEP,
+		    (int)k.len, k.str));
+		WT_ERR(__wt_buf_fmt(session,
+		    vb, "%.*s", (int)v.len, v.str));
+
+		/*
+		 * If the value is a structure, recursively parse it.
+		 *
+		 * !!!
+		 * Don't merge unless the structure has field names. WiredTiger
+		 * stores checkpoint LSNs in the metadata file using nested
+		 * structures without field names: "checkpoint_lsn=(1,0)", not
+		 * "checkpoint_lsn=(file=1,offset=0)". The value type is still
+		 * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the
+		 * value.
+		 */
+		if (v.type == WT_CONFIG_ITEM_STRUCT &&
+		    strchr(vb->data, '=') != NULL) {
+			WT_ERR(__config_merge_scan(
+			    session, kb->data, vb->data, cp));
+			continue;
+		}
+
+		/*
+		 * Insert the value into the array.
+		 *
+		 * The slot isn't counted until both copies succeed, and our
+		 * caller only frees counted slots: if the value copy fails,
+		 * release the key copy here or it would be leaked.
+		 */
+		WT_ERR(__wt_realloc_def(session,
+		    &cp->entries_allocated,
+		    cp->entries_next + 1, &cp->entries));
+		ep = &cp->entries[cp->entries_next];
+		WT_ERR(__wt_strndup(session, kb->data, kb->size, &ep->k));
+		if ((ret = __wt_strndup(
+		    session, vb->data, vb->size, &ep->v)) != 0) {
+			__wt_free(session, ep->k);
+			goto err;
+		}
+		ep->gen = cp->entries_next;
+		++cp->entries_next;
+	}
+	WT_ERR_NOTFOUND_OK(ret);
+
+err:	__wt_scr_free(&kb);
+	__wt_scr_free(&vb);
+	return (ret);
+}
+
+/*
+ * __strip_comma --
+ *	Drop a single trailing comma from the buffer, if it has one.
+ */
+static void
+__strip_comma(WT_ITEM *buf)
+{
+	const char *text;
+
+	/* An empty buffer has no last byte to look at. */
+	if (buf->size == 0)
+		return;
+
+	text = (const char *)buf->data;
+	if (text[buf->size - 1] == ',')
+		--buf->size;
+}
+
+/*
+ * __config_merge_format_next --
+ * Walk the array, building entries.
+ *
+ * Entries are appended to the build buffer as "key=value," starting at
+ * array slot *enp, which is advanced in place. When plen is non-zero, only
+ * entries whose keys begin with the first plen bytes of prefix are handled
+ * and those bytes are left out of the output; the walk stops at the first
+ * entry that doesn't match. Keys containing a separator open a nested
+ * "name=(...)" group that is filled in by a recursive call.
+ */
+static int
+__config_merge_format_next(WT_SESSION_IMPL *session, const char *prefix,
+ size_t plen, size_t *enp, WT_CONFIG_MERGE *cp, WT_ITEM *build)
+{
+ WT_CONFIG_MERGE_ENTRY *ep;
+ size_t len1, len2, next;
+ char *p;
+
+ for (; *enp < cp->entries_next; ++*enp) {
+ ep = &cp->entries[*enp];
+ len1 = strlen(ep->k);
+
+ /*
+ * The entries are in sorted order, take the last entry for any
+ * key.
+ */
+ if (*enp < (cp->entries_next - 1)) {
+ len2 = strlen((ep + 1)->k);
+
+ /* Choose the last of identical keys. */
+ if (len1 == len2 &&
+ memcmp(ep->k, (ep + 1)->k, len1) == 0)
+ continue;
+
+ /*
+ * The test is complicated by matching empty entries
+ * "foo=" against nested structures "foo[bar=" (the
+ * separator character follows the name), where the
+ * latter is a replacement for the former.
+ */
+ if (len2 > len1 &&
+ (ep + 1)->k[len1] == SEPC &&
+ memcmp(ep->k, (ep + 1)->k, len1) == 0)
+ continue;
+ }
+
+ /*
+ * If we're skipping a prefix and this entry doesn't match it,
+ * back off one entry and pop up a level.
+ *
+ * The caller's loop increments the slot again after we return,
+ * which is why it's decremented here.
+ */
+ if (plen != 0 &&
+ (plen > len1 || memcmp(ep->k, prefix, plen) != 0)) {
+ --*enp;
+ break;
+ }
+
+ /*
+ * If the entry introduces a new level, recurse through that
+ * new level.
+ *
+ * The recursive call is given this entry's key as the prefix,
+ * with a length that includes the separator.
+ */
+ if ((p = strchr(ep->k + plen, SEPC)) != NULL) {
+ next = WT_PTRDIFF(p, ep->k);
+ WT_RET(__wt_buf_catfmt(session,
+ build, "%.*s=(", (int)(next - plen), ep->k + plen));
+ WT_RET(__config_merge_format_next(
+ session, ep->k, next + 1, enp, cp, build));
+ __strip_comma(build);
+ WT_RET(__wt_buf_catfmt(session, build, "),"));
+ continue;
+ }
+
+ /* Append the entry to the buffer. */
+ WT_RET(__wt_buf_catfmt(
+ session, build, "%s=%s,", ep->k + plen, ep->v));
+ }
+
+ return (0);
+}
+
+/*
+ * __config_merge_format --
+ *	Format the sorted array of entries into a single string, returned in
+ * allocated memory.
+ */
+static int
+__config_merge_format(
+    WT_SESSION_IMPL *session, WT_CONFIG_MERGE *cp, const char **config_ret)
+{
+	WT_DECL_ITEM(build);
+	WT_DECL_RET;
+	size_t slot;
+
+	WT_RET(__wt_scr_alloc(session, 4 * 1024, &build));
+
+	/* Start at the first slot, at the top level (no prefix). */
+	slot = 0;
+	ret = __config_merge_format_next(session, "", 0, &slot, cp, build);
+	if (ret == 0) {
+		/* Every entry is followed by a comma, lose the last one. */
+		__strip_comma(build);
+
+		ret = __wt_strndup(
+		    session, build->data, build->size, config_ret);
+	}
+
+	/* The scratch buffer is released on success and failure alike. */
+	__wt_scr_free(&build);
+	return (ret);
+}
+
+/*
+ * __config_merge_cmp --
+ *	Qsort comparison function for the config merge array: order by key,
+ * then by generation.
+ */
+static int
+__config_merge_cmp(const void *a, const void *b)
+{
+	WT_CONFIG_MERGE_ENTRY *left, *right;
+	int order;
+
+	left = (WT_CONFIG_MERGE_ENTRY *)a;
+	right = (WT_CONFIG_MERGE_ENTRY *)b;
+
+	/* Different keys sort by name. */
+	order = strcmp(left->k, right->k);
+	if (order != 0)
+		return (order);
+
+	/* Identical keys: the larger generation sorts later. */
+	if (left->gen > right->gen)
+		return (1);
+	return (-1);
+}
+
+/*
+ * __wt_config_merge --
+ *	Merge a set of configuration strings into newly allocated memory.
+ *
+ * The argument is a NULL-terminated list of configuration strings, ordered
+ * from least to most preferred. Every key/value pair in every string is
+ * entered into a table, the table is sorted by key name and by the order in
+ * which the pairs were found, and the final configuration string is built
+ * from the sorted table.
+ *
+ * Note:
+ *	Nested structures are parsed and merged. For example, if configuration
+ *	strings "key=(k1=v1,k2=v2)" and "key=(k1=v2)" appear, the result will
+ *	be "key=(k1=v2,k2=v2)" because the nested values are merged.
+ */
+int
+__wt_config_merge(
+    WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+	WT_CONFIG_MERGE merge;
+	WT_DECL_RET;
+	size_t i;
+
+	/* Begin with an empty table sized for a reasonable number of entries. */
+	WT_CLEAR(merge);
+	WT_RET(__wt_realloc_def(
+	    session, &merge.entries_allocated, 100, &merge.entries));
+
+	/* Enter the pairs from each configuration string into the table. */
+	while (*cfg != NULL) {
+		WT_ERR(__config_merge_scan(session, NULL, *cfg, &merge));
+		++cfg;
+	}
+
+	/* Order the table by key, and by generation where keys are equal. */
+	qsort(merge.entries, merge.entries_next,
+	    sizeof(WT_CONFIG_MERGE_ENTRY), __config_merge_cmp);
+
+	/* Turn the sorted table into the result string. */
+	ret = __config_merge_format(session, &merge, config_ret);
+
+	/* Success or failure, the table and the copies it holds are discarded. */
+err:	for (i = 0; i < merge.entries_next; ++i) {
+		__wt_free(session, merge.entries[i].k);
+		__wt_free(session, merge.entries[i].v);
+	}
+	__wt_free(session, merge.entries);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_concat.c b/src/third_party/wiredtiger/src/config/config_concat.c
new file mode 100644
index 00000000000..99475ef6f47
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_concat.c
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_concat --
+ *	Given a NULL-terminated list of configuration strings, concatenate them
+ *	into newly allocated memory. Nothing special is assumed about any of
+ *	the config strings, they are simply combined in order.
+ *
+ *	This code deals with the case where some of the config strings are
+ *	wrapped in brackets but others aren't: the resulting string does not
+ *	have brackets.
+ *
+ *	The result is returned in config_ret; the return value is 0 on success
+ *	and an error code otherwise, including EINVAL for an invalid key.
+ */
+int
+__wt_config_concat(
+    WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+	WT_CONFIG cparser;
+	WT_CONFIG_ITEM k, v;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	const char **cp;
+
+	WT_RET(__wt_scr_alloc(session, 0, &tmp));
+
+	for (cp = cfg; *cp != NULL; ++cp) {
+		WT_ERR(__wt_config_init(session, &cparser, *cp));
+		while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
+			/*
+			 * Keys are a pointer/length pair into the configuration
+			 * string, not nul-terminated strings: bound what's
+			 * printed by the key's length.
+			 */
+			if (k.type != WT_CONFIG_ITEM_STRING &&
+			    k.type != WT_CONFIG_ITEM_ID)
+				WT_ERR_MSG(session, EINVAL,
+				    "Invalid configuration key found: "
+				    "'%.*s'\n",
+				    (int)k.len, k.str);
+			/* Include the quotes around string keys/values. */
+			if (k.type == WT_CONFIG_ITEM_STRING) {
+				--k.str;
+				k.len += 2;
+			}
+			if (v.type == WT_CONFIG_ITEM_STRING) {
+				--v.str;
+				v.len += 2;
+			}
+			/* Keys without a value are written without an "=". */
+			WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s%s%.*s,",
+			    (int)k.len, k.str,
+			    (v.len > 0) ? "=" : "",
+			    (int)v.len, v.str));
+		}
+		if (ret != WT_NOTFOUND)
+			goto err;
+	}
+
+	/*
+	 * If the caller passes us no valid configuration strings, we get here
+	 * with no bytes to copy -- that's OK, the underlying string copy can
+	 * handle empty strings.
+	 *
+	 * Strip any trailing comma.
+	 */
+	if (tmp->size != 0)
+		--tmp->size;
+	ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
new file mode 100644
index 00000000000..0cd2d32df57
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -0,0 +1,744 @@
+/* DO NOT EDIT: automatically built by dist/config.py. */
+
+#include "wt_internal.h"
+
+static const WT_CONFIG_CHECK confchk_colgroup_meta[] = {
+ { "app_metadata", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "source", "string", NULL, NULL },
+ { "type", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_async_new_op[] = {
+ { "append", "boolean", NULL, NULL },
+ { "overwrite", "boolean", NULL, NULL },
+ { "raw", "boolean", NULL, NULL },
+ { "timeout", "int", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_close[] = {
+ { "leak_memory", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_load_extension[] = {
+ { "config", "string", NULL, NULL },
+ { "entry", "string", NULL, NULL },
+ { "terminate", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_open_session[] = {
+ { "isolation", "string",
+ "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_async_subconfigs[] = {
+ { "enabled", "boolean", NULL, NULL },
+ { "ops_max", "int", "min=10,max=4096", NULL },
+ { "threads", "int", "min=1,max=20", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_checkpoint_subconfigs[] = {
+ { "log_size", "int", "min=0,max=2GB", NULL },
+ { "name", "string", NULL, NULL },
+ { "wait", "int", "min=0,max=100000", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_eviction_subconfigs[] = {
+ { "threads_max", "int", "min=1,max=20", NULL },
+ { "threads_min", "int", "min=1,max=20", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_lsm_manager_subconfigs[] = {
+ { "merge", "boolean", NULL, NULL },
+ { "worker_thread_max", "int", "min=3,max=20", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_shared_cache_subconfigs[] = {
+ { "chunk", "int", "min=1MB,max=10TB", NULL },
+ { "name", "string", NULL, NULL },
+ { "reserve", "int", NULL, NULL },
+ { "size", "int", "min=1MB,max=10TB", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_statistics_log_subconfigs[] = {
+ { "on_close", "boolean", NULL, NULL },
+ { "path", "string", NULL, NULL },
+ { "sources", "list", NULL, NULL },
+ { "timestamp", "string", NULL, NULL },
+ { "wait", "int", "min=0,max=100000", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_reconfigure[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_file_meta[] = {
+ { "allocation_size", "int", "min=512B,max=128MB", NULL },
+ { "app_metadata", "string", NULL, NULL },
+ { "block_allocation", "string",
+ "choices=[\"first\",\"best\"]",
+ NULL },
+ { "block_compressor", "string", NULL, NULL },
+ { "cache_resident", "boolean", NULL, NULL },
+ { "checkpoint", "string", NULL, NULL },
+ { "checkpoint_lsn", "string", NULL, NULL },
+ { "checksum", "string",
+ "choices=[\"on\",\"off\",\"uncompressed\"]",
+ NULL },
+ { "collator", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "dictionary", "int", "min=0", NULL },
+ { "format", "string", "choices=[\"btree\"]", NULL },
+ { "huffman_key", "string", NULL, NULL },
+ { "huffman_value", "string", NULL, NULL },
+ { "id", "string", NULL, NULL },
+ { "internal_item_max", "int", "min=0", NULL },
+ { "internal_key_truncate", "boolean", NULL, NULL },
+ { "internal_page_max", "int", "min=512B,max=512MB", NULL },
+ { "key_format", "format", NULL, NULL },
+ { "key_gap", "int", "min=0", NULL },
+ { "leaf_item_max", "int", "min=0", NULL },
+ { "leaf_page_max", "int", "min=512B,max=512MB", NULL },
+ { "memory_page_max", "int", "min=512B,max=10TB", NULL },
+ { "os_cache_dirty_max", "int", "min=0", NULL },
+ { "os_cache_max", "int", "min=0", NULL },
+ { "prefix_compression", "boolean", NULL, NULL },
+ { "prefix_compression_min", "int", "min=0", NULL },
+ { "split_pct", "int", "min=25,max=100", NULL },
+ { "value_format", "format", NULL, NULL },
+ { "version", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_index_meta[] = {
+ { "app_metadata", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "key_format", "format", NULL, NULL },
+ { "source", "string", NULL, NULL },
+ { "type", "string", NULL, NULL },
+ { "value_format", "format", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_begin_transaction[] = {
+ { "isolation", "string",
+ "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]",
+ NULL },
+ { "name", "string", NULL, NULL },
+ { "priority", "int", "min=-100,max=100", NULL },
+ { "sync", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_checkpoint[] = {
+ { "drop", "list", NULL, NULL },
+ { "force", "boolean", NULL, NULL },
+ { "name", "string", NULL, NULL },
+ { "target", "list", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_compact[] = {
+ { "timeout", "int", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_lsm_subconfigs[] = {
+ { "auto_throttle", "boolean", NULL, NULL },
+ { "bloom", "boolean", NULL, NULL },
+ { "bloom_bit_count", "int", "min=2,max=1000", NULL },
+ { "bloom_config", "string", NULL, NULL },
+ { "bloom_hash_count", "int", "min=2,max=100", NULL },
+ { "bloom_oldest", "boolean", NULL, NULL },
+ { "chunk_max", "int", "min=100MB,max=10TB", NULL },
+ { "chunk_size", "int", "min=512K,max=500MB", NULL },
+ { "merge_max", "int", "min=2,max=100", NULL },
+ { "merge_min", "int", "max=100", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_create[] = {
+ { "allocation_size", "int", "min=512B,max=128MB", NULL },
+ { "app_metadata", "string", NULL, NULL },
+ { "block_allocation", "string",
+ "choices=[\"first\",\"best\"]",
+ NULL },
+ { "block_compressor", "string", NULL, NULL },
+ { "cache_resident", "boolean", NULL, NULL },
+ { "checksum", "string",
+ "choices=[\"on\",\"off\",\"uncompressed\"]",
+ NULL },
+ { "colgroups", "list", NULL, NULL },
+ { "collator", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "dictionary", "int", "min=0", NULL },
+ { "exclusive", "boolean", NULL, NULL },
+ { "format", "string", "choices=[\"btree\"]", NULL },
+ { "huffman_key", "string", NULL, NULL },
+ { "huffman_value", "string", NULL, NULL },
+ { "internal_item_max", "int", "min=0", NULL },
+ { "internal_key_truncate", "boolean", NULL, NULL },
+ { "internal_page_max", "int", "min=512B,max=512MB", NULL },
+ { "key_format", "format", NULL, NULL },
+ { "key_gap", "int", "min=0", NULL },
+ { "leaf_item_max", "int", "min=0", NULL },
+ { "leaf_page_max", "int", "min=512B,max=512MB", NULL },
+ { "lsm", "category", NULL, confchk_lsm_subconfigs },
+ { "memory_page_max", "int", "min=512B,max=10TB", NULL },
+ { "os_cache_dirty_max", "int", "min=0", NULL },
+ { "os_cache_max", "int", "min=0", NULL },
+ { "prefix_compression", "boolean", NULL, NULL },
+ { "prefix_compression_min", "int", "min=0", NULL },
+ { "source", "string", NULL, NULL },
+ { "split_pct", "int", "min=25,max=100", NULL },
+ { "type", "string", NULL, NULL },
+ { "value_format", "format", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_drop[] = {
+ { "force", "boolean", NULL, NULL },
+ { "remove_files", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_open_cursor[] = {
+ { "append", "boolean", NULL, NULL },
+ { "bulk", "string", NULL, NULL },
+ { "checkpoint", "string", NULL, NULL },
+ { "dump", "string",
+ "choices=[\"hex\",\"json\",\"print\"]",
+ NULL },
+ { "next_random", "boolean", NULL, NULL },
+ { "overwrite", "boolean", NULL, NULL },
+ { "raw", "boolean", NULL, NULL },
+ { "readonly", "boolean", NULL, NULL },
+ { "skip_sort_check", "boolean", NULL, NULL },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"clear\"]",
+ NULL },
+ { "target", "list", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_reconfigure[] = {
+ { "isolation", "string",
+ "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_salvage[] = {
+ { "force", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_verify[] = {
+ { "dump_address", "boolean", NULL, NULL },
+ { "dump_blocks", "boolean", NULL, NULL },
+ { "dump_offsets", "list", NULL, NULL },
+ { "dump_pages", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_table_meta[] = {
+ { "app_metadata", "string", NULL, NULL },
+ { "colgroups", "list", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "key_format", "format", NULL, NULL },
+ { "value_format", "format", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_log_subconfigs[] = {
+ { "archive", "boolean", NULL, NULL },
+ { "enabled", "boolean", NULL, NULL },
+ { "file_max", "int", "min=100KB,max=2GB", NULL },
+ { "path", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_transaction_sync_subconfigs[] = {
+ { "enabled", "boolean", NULL, NULL },
+ { "method", "string",
+ "choices=[\"dsync\",\"fsync\",\"none\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "config_base", "boolean", NULL, NULL },
+ { "create", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "exclusive", "boolean", NULL, NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "use_environment_priv", "boolean", NULL, NULL },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "config_base", "boolean", NULL, NULL },
+ { "create", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "exclusive", "boolean", NULL, NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "use_environment_priv", "boolean", NULL, NULL },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { "version", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { "version", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_ENTRY config_entries[] = {
+ { "colgroup.meta",
+ "app_metadata=,columns=,source=,type=file",
+ confchk_colgroup_meta
+ },
+ { "connection.add_collator",
+ "",
+ NULL
+ },
+ { "connection.add_compressor",
+ "",
+ NULL
+ },
+ { "connection.add_data_source",
+ "",
+ NULL
+ },
+ { "connection.add_extractor",
+ "",
+ NULL
+ },
+ { "connection.async_new_op",
+ "append=0,overwrite=,raw=0,timeout=1200",
+ confchk_connection_async_new_op
+ },
+ { "connection.close",
+ "leak_memory=0",
+ confchk_connection_close
+ },
+ { "connection.load_extension",
+ "config=,entry=wiredtiger_extension_init,"
+ "terminate=wiredtiger_extension_terminate",
+ confchk_connection_load_extension
+ },
+ { "connection.open_session",
+ "isolation=read-committed",
+ confchk_connection_open_session
+ },
+ { "connection.reconfigure",
+ "async=(enabled=0,ops_max=1024,threads=2),cache_size=100MB,"
+ "checkpoint=(log_size=0,name=\"WiredTigerCheckpoint\",wait=0),"
+ "error_prefix=,eviction=(threads_max=1,threads_min=1),"
+ "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95,"
+ "lsm_manager=(merge=,worker_thread_max=4),lsm_merge=,"
+ "shared_cache=(chunk=10MB,name=,reserve=0,size=500MB),"
+ "statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=",
+ confchk_connection_reconfigure
+ },
+ { "cursor.close",
+ "",
+ NULL
+ },
+ { "file.meta",
+ "allocation_size=4KB,app_metadata=,block_allocation=best,"
+ "block_compressor=,cache_resident=0,checkpoint=,checkpoint_lsn=,"
+ "checksum=uncompressed,collator=,columns=,dictionary=0,"
+ "format=btree,huffman_key=,huffman_value=,id=,internal_item_max=0"
+ ",internal_key_truncate=,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,leaf_item_max=0,leaf_page_max=32KB,"
+ "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,"
+ "prefix_compression=0,prefix_compression_min=4,split_pct=75,"
+ "value_format=u,version=(major=0,minor=0)",
+ confchk_file_meta
+ },
+ { "index.meta",
+ "app_metadata=,columns=,key_format=u,source=,type=file,"
+ "value_format=u",
+ confchk_index_meta
+ },
+ { "session.begin_transaction",
+ "isolation=,name=,priority=0,sync=",
+ confchk_session_begin_transaction
+ },
+ { "session.checkpoint",
+ "drop=,force=0,name=,target=",
+ confchk_session_checkpoint
+ },
+ { "session.close",
+ "",
+ NULL
+ },
+ { "session.commit_transaction",
+ "",
+ NULL
+ },
+ { "session.compact",
+ "timeout=1200",
+ confchk_session_compact
+ },
+ { "session.create",
+ "allocation_size=4KB,app_metadata=,block_allocation=best,"
+ "block_compressor=,cache_resident=0,checksum=uncompressed,"
+ "colgroups=,collator=,columns=,dictionary=0,exclusive=0,"
+ "format=btree,huffman_key=,huffman_value=,internal_item_max=0,"
+ "internal_key_truncate=,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,leaf_item_max=0,leaf_page_max=32KB,"
+ "lsm=(auto_throttle=,bloom=,bloom_bit_count=16,bloom_config=,"
+ "bloom_hash_count=8,bloom_oldest=0,chunk_max=5GB,chunk_size=10MB,"
+ "merge_max=15,merge_min=0),memory_page_max=5MB,"
+ "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,"
+ "prefix_compression_min=4,source=,split_pct=75,type=file,"
+ "value_format=u",
+ confchk_session_create
+ },
+ { "session.drop",
+ "force=0,remove_files=",
+ confchk_session_drop
+ },
+ { "session.log_printf",
+ "",
+ NULL
+ },
+ { "session.open_cursor",
+ "append=0,bulk=0,checkpoint=,dump=,next_random=0,overwrite=,raw=0"
+ ",readonly=0,skip_sort_check=0,statistics=,target=",
+ confchk_session_open_cursor
+ },
+ { "session.reconfigure",
+ "isolation=read-committed",
+ confchk_session_reconfigure
+ },
+ { "session.rename",
+ "",
+ NULL
+ },
+ { "session.rollback_transaction",
+ "",
+ NULL
+ },
+ { "session.salvage",
+ "force=0",
+ confchk_session_salvage
+ },
+ { "session.truncate",
+ "",
+ NULL
+ },
+ { "session.upgrade",
+ "",
+ NULL
+ },
+ { "session.verify",
+ "dump_address=0,dump_blocks=0,dump_offsets=,dump_pages=0",
+ confchk_session_verify
+ },
+ { "table.meta",
+ "app_metadata=,colgroups=,columns=,key_format=u,value_format=u",
+ confchk_table_meta
+ },
+ { "wiredtiger_open",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "config_base=,create=0,direct_io=,error_prefix=,"
+ "eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80,"
+ "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=,"
+ "file_extend=,hazard_max=1000,log=(archive=,enabled=0,"
+ "file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),use_environment_priv=0,verbose=",
+ confchk_wiredtiger_open
+ },
+ { "wiredtiger_open_all",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "config_base=,create=0,direct_io=,error_prefix=,"
+ "eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80,"
+ "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=,"
+ "file_extend=,hazard_max=1000,log=(archive=,enabled=0,"
+ "file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),use_environment_priv=0,verbose=,version=(major=0,"
+ "minor=0)",
+ confchk_wiredtiger_open_all
+ },
+ { "wiredtiger_open_basecfg",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "direct_io=,error_prefix=,eviction=(threads_max=1,threads_min=1),"
+ "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95,"
+ "extensions=,file_extend=,hazard_max=1000,log=(archive=,enabled=0"
+ ",file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),verbose=,version=(major=0,minor=0)",
+ confchk_wiredtiger_open_basecfg
+ },
+ { "wiredtiger_open_usercfg",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "direct_io=,error_prefix=,eviction=(threads_max=1,threads_min=1),"
+ "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95,"
+ "extensions=,file_extend=,hazard_max=1000,log=(archive=,enabled=0"
+ ",file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),verbose=",
+ confchk_wiredtiger_open_usercfg
+ },
+ { NULL, NULL, NULL }
+};
+
+/*
+ * __wt_conn_config_init --
+ *	Give the connection an array of pointers to the elements of the static
+ *	config_entries table.
+ */
+int
+__wt_conn_config_init(WT_SESSION_IMPL *session)
+{
+	const WT_CONFIG_ENTRY *entry, **slot;
+
+	/*
+	 * Allocate one pointer per table element, the terminating element
+	 * included, and hang the array off the connection.
+	 */
+	WT_RET(__wt_calloc_def(session,
+	    sizeof(config_entries) / sizeof(config_entries[0]), &slot));
+	S2C(session)->config_entries = slot;
+
+	/*
+	 * Point each slot at its table element, stopping when the element
+	 * with a NULL method is reached: that element is never referenced.
+	 * The first element is stored without being tested.
+	 */
+	entry = config_entries;
+	do {
+		*slot++ = entry++;
+	} while (entry->method != NULL);
+	return (0);
+}
+
+/*
+ * __wt_conn_config_discard --
+ *	Free the connection's array of configuration entry pointers.
+ */
+void
+__wt_conn_config_discard(WT_SESSION_IMPL *session)
+{
+	__wt_free(session, S2C(session)->config_entries);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_ext.c b/src/third_party/wiredtiger/src/config/config_ext.c
new file mode 100644
index 00000000000..26b3799d61c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_ext.c
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_config_parser_open --
+ *	WT_EXTENSION_API->config_parser_open implementation: a pass-through to
+ *	wiredtiger_config_parser_open.
+ */
+int
+__wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, WT_SESSION *wt_session,
+    const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
+{
+	WT_DECL_RET;
+
+	/* The extension handle isn't used to open a parser. */
+	WT_UNUSED(wt_ext);
+
+	ret = wiredtiger_config_parser_open(
+	    wt_session, config, len, config_parserp);
+	return (ret);
+}
+
+/*
+ * __wt_ext_config_get --
+ *	External API version of the configuration lookup: search a
+ *	NULL-terminated list of configuration strings for a string key and
+ *	return what __wt_config_gets finds in cval.
+ */
+int
+__wt_ext_config_get(WT_EXTENSION_API *wt_api,
+    WT_SESSION *wt_session, WT_CONFIG_ARG *cfg_arg, const char *key,
+    WT_CONFIG_ITEM *cval)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_SESSION_IMPL *session;
+	const char **cfg;
+
+	conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+
+	/* Callers without a session use the connection's default session. */
+	session = (WT_SESSION_IMPL *)wt_session;
+	if (session == NULL)
+		session = conn->default_session;
+
+	/* Without a configuration list there is nothing to search. */
+	cfg = (const char **)cfg_arg;
+	if (cfg != NULL)
+		return (__wt_config_gets(session, cfg, key, cval));
+	return (WT_NOTFOUND);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_upgrade.c b/src/third_party/wiredtiger/src/config/config_upgrade.c
new file mode 100644
index 00000000000..24297df839b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_upgrade.c
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_upgrade --
+ *	Upgrade a configuration string by appending the replacement version.
+ *
+ *	The buffer is extended in place.  Returns 0 when the string was
+ *	upgraded or needed no upgrade, otherwise the error from the lookup
+ *	or from growing the buffer.
+ */
+int
+__wt_config_upgrade(WT_SESSION_IMPL *session, WT_ITEM *buf)
+{
+	WT_CONFIG_ITEM v;
+	WT_DECL_RET;
+	const char *config;
+
+	config = buf->data;
+
+	/*
+	 * wiredtiger_open:
+	 *	lsm_merge=boolean -> lsm_manager=(merge=boolean)
+	 *
+	 * Only append the replacement when the lookup succeeded: on any other
+	 * return the value isn't known to have been filled in.  A missing key
+	 * means there is nothing to upgrade; other errors are returned.
+	 */
+	ret = __wt_config_getones(session, config, "lsm_merge", &v);
+	if (ret == 0)
+		return (__wt_buf_catfmt(session, buf,
+		    ",lsm_manager=(merge=%s)", v.val ? "true" : "false"));
+
+	return (ret == WT_NOTFOUND ? 0 : ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c
new file mode 100644
index 00000000000..1ad136eae12
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/api_strerror.c
@@ -0,0 +1,43 @@
+/* DO NOT EDIT: automatically built by dist/api_err.py. */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_strerror --
+ *	Map an error value to a descriptive string.
+ */
+const char *
+wiredtiger_strerror(int error)
+{
+	static char errbuf[64];
+	char *p;
+
+	if (error == 0)
+		return ("Successful return: 0");
+
+	/* Values with a fixed message. */
+	switch (error) {
+	case WT_DUPLICATE_KEY:
+		return ("WT_DUPLICATE_KEY: attempt to insert an existing key");
+	case WT_ERROR:
+		return ("WT_ERROR: non-specific WiredTiger error");
+	case WT_NOTFOUND:
+		return ("WT_NOTFOUND: item not found");
+	case WT_PANIC:
+		return ("WT_PANIC: WiredTiger library panic");
+	case WT_RESTART:
+		return ("WT_RESTART: restart the operation (internal)");
+	case WT_ROLLBACK:
+		return ("WT_ROLLBACK: conflict between concurrent operations");
+	default:
+		break;
+	}
+
+	/* Any other positive value is handed to the C library. */
+	if (error > 0 && (p = strerror(error)) != NULL)
+		return (p);
+
+	/*
+	 * !!!
+	 * Not thread-safe, but this is never supposed to happen.
+	 */
+	(void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error);
+	return (errbuf);
+}
diff --git a/src/third_party/wiredtiger/src/conn/api_version.c b/src/third_party/wiredtiger/src/conn/api_version.c
new file mode 100644
index 00000000000..1355220c585
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/api_version.c
@@ -0,0 +1,24 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_version --
+ * Return library version information.
+ *
+ * Each of majorp, minorp and patchp may be NULL; every non-NULL pointer
+ * receives the matching WIREDTIGER_VERSION_* value, stored in major,
+ * minor, patch order. The return value is WIREDTIGER_VERSION_STRING.
+ */
+const char *
+wiredtiger_version(int *majorp, int *minorp, int *patchp)
+{
+ if (majorp != NULL)
+ *majorp = WIREDTIGER_VERSION_MAJOR;
+ if (minorp != NULL)
+ *minorp = WIREDTIGER_VERSION_MINOR;
+ if (patchp != NULL)
+ *patchp = WIREDTIGER_VERSION_PATCH;
+ return (WIREDTIGER_VERSION_STRING);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
new file mode 100644
index 00000000000..c7562ab94c3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -0,0 +1,1573 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __conn_statistics_config(WT_SESSION_IMPL *, const char *[]);
+
+/*
+ * ext_collate --
+ *     Call the collation function (external API version).
+ *
+ *     A NULL session argument selects the connection's default session;
+ *     the comparison result is stored in cmpp by __wt_compare.
+ */
+static int
+ext_collate(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+    WT_COLLATOR *collator, WT_ITEM *first, WT_ITEM *second, int *cmpp)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_SESSION_IMPL *session;
+
+    conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+
+    /* Fall back to the default session when the caller supplies none. */
+    session = (WT_SESSION_IMPL *)wt_session;
+    if (session == NULL)
+        session = conn->default_session;
+
+    return (__wt_compare(session, collator, first, second, cmpp));
+}
+
+/*
+ * ext_collator_config --
+ *     Given a configuration, configure the collator (external API version).
+ *
+ *     A NULL configuration returns success without touching the output
+ *     arguments; otherwise the work is done by __wt_collator_config.
+ */
+static int
+ext_collator_config(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+    WT_CONFIG_ARG *cfg_arg, WT_COLLATOR **collatorp, int *ownp)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_SESSION_IMPL *session;
+    const char **cfg;
+
+    conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+
+    /* Fall back to the default session when the caller supplies none. */
+    session = (WT_SESSION_IMPL *)wt_session;
+    if (session == NULL)
+        session = conn->default_session;
+
+    /* The default is a standard lexicographic comparison. */
+    cfg = (const char **)cfg_arg;
+    if (cfg == NULL)
+        return (0);
+
+    return (__wt_collator_config(session, cfg, collatorp, ownp));
+}
+
+/*
+ * __wt_collator_config --
+ *     Given a configuration, configure the collator.
+ *
+ *     On return *collatorp is NULL when no collator is configured, the
+ *     registered collator when one is named, or the collator produced by
+ *     the registered collator's customize method; *ownp is set to 1 only in
+ *     the last case.
+ */
+int
+__wt_collator_config(WT_SESSION_IMPL *session, const char **cfg,
+    WT_COLLATOR **collatorp, int *ownp)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_CONFIG_ITEM cval;
+    WT_DECL_RET;
+    WT_NAMED_COLLATOR *ncoll;
+
+    *collatorp = NULL;
+    *ownp = 0;
+
+    conn = S2C(session);
+
+    /* A missing "collator" key is not an error. */
+    ret = __wt_config_gets(session, cfg, "collator", &cval);
+    if (ret == WT_NOTFOUND)
+        return (0);
+    if (ret != 0)
+        return (ret);
+
+    /* An empty value leaves the outputs at their defaults. */
+    if (cval.len == 0)
+        return (0);
+
+    /* Look the name up in the connection's list of collators. */
+    TAILQ_FOREACH(ncoll, &conn->collqh, q)
+        if (WT_STRING_MATCH(ncoll->name, cval.str, cval.len))
+            break;
+    if (ncoll == NULL)
+        WT_RET_MSG(session, EINVAL,
+            "unknown collator '%.*s'", (int)cval.len, cval.str);
+
+    /* Give the collator a chance to specialize itself for this handle. */
+    if (ncoll->collator->customize != NULL) {
+        WT_RET(__wt_config_gets(session,
+            session->dhandle->cfg, "app_metadata", &cval));
+        WT_RET(ncoll->collator->customize(
+            ncoll->collator, &session->iface,
+            session->dhandle->name, &cval, collatorp));
+    }
+
+    /* A customized collator is reported as owned by the caller. */
+    if (*collatorp != NULL)
+        *ownp = 1;
+    else
+        *collatorp = ncoll->collator;
+
+    return (0);
+}
+
+/*
+ * __conn_get_extension_api --
+ *     WT_CONNECTION.get_extension_api method.
+ *
+ *     Fills in the connection's embedded extension API structure on every
+ *     call and returns its address.
+ */
+static WT_EXTENSION_API *
+__conn_get_extension_api(WT_CONNECTION *wt_conn)
+{
+    WT_EXTENSION_API *api;
+
+    api = &((WT_CONNECTION_IMPL *)wt_conn)->extension_api;
+
+    api->conn = wt_conn;
+    api->err_printf = __wt_ext_err_printf;
+    api->msg_printf = __wt_ext_msg_printf;
+    api->strerror = wiredtiger_strerror;
+    api->scr_alloc = __wt_ext_scr_alloc;
+    api->scr_free = __wt_ext_scr_free;
+    api->collator_config = ext_collator_config;
+    api->collate = ext_collate;
+    api->config_parser_open = __wt_ext_config_parser_open;
+    api->config_get = __wt_ext_config_get;
+    api->metadata_insert = __wt_ext_metadata_insert;
+    api->metadata_remove = __wt_ext_metadata_remove;
+    api->metadata_search = __wt_ext_metadata_search;
+    api->metadata_update = __wt_ext_metadata_update;
+    api->struct_pack = __wt_ext_struct_pack;
+    api->struct_size = __wt_ext_struct_size;
+    api->struct_unpack = __wt_ext_struct_unpack;
+    api->transaction_id = __wt_ext_transaction_id;
+    api->transaction_isolation_level =
+        __wt_ext_transaction_isolation_level;
+    api->transaction_notify = __wt_ext_transaction_notify;
+    api->transaction_oldest = __wt_ext_transaction_oldest;
+    api->transaction_visible = __wt_ext_transaction_visible;
+    api->version = wiredtiger_version;
+
+    return (api);
+}
+
+#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
+ extern int snappy_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
+#endif
+#ifdef HAVE_BUILTIN_EXTENSION_ZLIB
+ extern int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
+#endif
+
+/*
+ * __conn_load_default_extensions --
+ * Load extensions that are enabled via --with-builtins
+ *
+ * Each built-in's init function is called, with a NULL configuration, only
+ * when its HAVE_BUILTIN_EXTENSION_* macro is defined. Returns 0 when no
+ * call fails (WT_RET presumably returns the first non-zero result).
+ */
+static int
+__conn_load_default_extensions(WT_CONNECTION_IMPL *conn)
+{
+ /* Keeps conn referenced when neither built-in is compiled in. */
+ WT_UNUSED(conn);
+#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
+ WT_RET(snappy_extension_init(&conn->iface, NULL));
+#endif
+#ifdef HAVE_BUILTIN_EXTENSION_ZLIB
+ WT_RET(zlib_extension_init(&conn->iface, NULL));
+#endif
+ return (0);
+}
+
+/*
+ * __conn_load_extension --
+ * WT_CONNECTION->load_extension method.
+ *
+ * Opens the shared library named by path (a NULL path is passed to
+ * __wt_dlopen when path is "local"; presumably that selects the running
+ * program -- confirm), resolves the symbols named by the "entry" and
+ * "terminate" configuration keys, calls the entry function and, on
+ * success, appends the library handle to the connection's list.
+ */
+static int
+__conn_load_extension(
+ WT_CONNECTION *wt_conn, const char *path, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_DLH *dlh;
+ WT_SESSION_IMPL *session;
+ int (*load)(WT_CONNECTION *, WT_CONFIG_ARG *);
+ int is_local;
+ const char *init_name, *terminate_name;
+
+ /* Initialize everything the error path inspects or frees. */
+ dlh = NULL;
+ init_name = terminate_name = NULL;
+ is_local = (strcmp(path, "local") == 0);
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ /* cfg is not declared locally; presumably the macro declares it. */
+ CONNECTION_API_CALL(conn, session, load_extension, config, cfg);
+
+ /*
+ * This assumes the underlying shared libraries are reference counted,
+ * that is, that re-opening a shared library simply increments a ref
+ * count, and closing it simply decrements the ref count, and the last
+ * close discards the reference entirely -- in other words, we do not
+ * check to see if we've already opened this shared library.
+ */
+ WT_ERR(__wt_dlopen(session, is_local ? NULL : path, &dlh));
+
+ /*
+ * Find the load function, remember the unload function for when we
+ * close.
+ */
+ WT_ERR(__wt_config_gets(session, cfg, "entry", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &init_name));
+ WT_ERR(__wt_dlsym(session, dlh, init_name, 1, &load));
+
+ WT_ERR(__wt_config_gets(session, cfg, "terminate", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &terminate_name));
+ WT_ERR(__wt_dlsym(session, dlh, terminate_name, 0, &dlh->terminate));
+
+ /* Call the load function last, it simplifies error handling. */
+ WT_ERR(load(wt_conn, (WT_CONFIG_ARG *)cfg));
+
+ /* Link onto the environment's list of open libraries. */
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->dlhqh, dlh, q);
+ __wt_spin_unlock(session, &conn->api_lock);
+ /* The list now references the handle: don't close it below. */
+ dlh = NULL;
+
+err: if (dlh != NULL)
+ WT_TRET(__wt_dlclose(session, dlh));
+ /* The symbol-name copies are only needed during lookup. */
+ __wt_free(session, init_name);
+ __wt_free(session, terminate_name);
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_load_extensions --
+ *     Load the list of application-configured extensions.
+ *
+ *     Built-in extensions are loaded first; then every entry of the
+ *     "extensions" configuration list is passed to the connection's
+ *     load_extension method, the entry's key as the path and its value, if
+ *     any, as the extension's configuration.
+ */
+static int
+__conn_load_extensions(WT_SESSION_IMPL *session, const char *cfg[])
+{
+    WT_CONFIG subconfig;
+    WT_CONFIG_ITEM cval, skey, sval;
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_ITEM(exconfig);
+    WT_DECL_ITEM(expath);
+    WT_DECL_RET;
+    const char *extcfg;
+
+    conn = S2C(session);
+
+    WT_ERR(__conn_load_default_extensions(conn));
+
+    WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
+    WT_ERR(__wt_config_subinit(session, &subconfig, &cval));
+    while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) {
+        /* The scratch buffers are allocated on first use and reused. */
+        if (expath == NULL)
+            WT_ERR(__wt_scr_alloc(session, 0, &expath));
+        WT_ERR(__wt_buf_fmt(
+            session, expath, "%.*s", (int)skey.len, skey.str));
+
+        /* Only pass a configuration when the entry carries a value. */
+        extcfg = NULL;
+        if (sval.len > 0) {
+            if (exconfig == NULL)
+                WT_ERR(__wt_scr_alloc(session, 0, &exconfig));
+            WT_ERR(__wt_buf_fmt(session,
+                exconfig, "%.*s", (int)sval.len, sval.str));
+            extcfg = exconfig->data;
+        }
+
+        WT_ERR(conn->iface.load_extension(
+            &conn->iface, expath->data, extcfg));
+    }
+    WT_ERR_NOTFOUND_OK(ret);
+
+err:    __wt_scr_free(&expath);
+    __wt_scr_free(&exconfig);
+
+    return (ret);
+}
+
+/*
+ * __conn_add_collator --
+ * WT_CONNECTION->add_collator method.
+ *
+ * Allocates a list entry holding a copy of name and the collator pointer
+ * and appends it to the connection's collator list under the API lock.
+ */
+static int
+__conn_add_collator(WT_CONNECTION *wt_conn,
+ const char *name, WT_COLLATOR *collator, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_COLLATOR *ncoll;
+ WT_SESSION_IMPL *session;
+
+ /* The error path frees ncoll, it must start out NULL. */
+ ncoll = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ /* cfg is not declared locally; presumably the macro declares it. */
+ CONNECTION_API_CALL(conn, session, add_collator, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_calloc_def(session, 1, &ncoll));
+ WT_ERR(__wt_strdup(session, name, &ncoll->name));
+ ncoll->collator = collator;
+
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->collqh, ncoll, q);
+ /* The list now references the entry: don't free it below. */
+ ncoll = NULL;
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: if (ncoll != NULL) {
+ __wt_free(session, ncoll->name);
+ __wt_free(session, ncoll);
+ }
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_conn_remove_collator --
+ *     Remove collator added by WT_CONNECTION->add_collator, only used
+ *     internally.
+ *
+ *     Every entry on the connection's collator list is terminated (when it
+ *     has a terminate method), unlinked and freed; the function's result
+ *     carries any terminate failure recorded by WT_TRET.
+ */
+int
+__wt_conn_remove_collator(WT_SESSION_IMPL *session)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    WT_NAMED_COLLATOR *entry;
+
+    conn = S2C(session);
+
+    /* Drain the list from the head until it is empty. */
+    for (;;) {
+        entry = TAILQ_FIRST(&conn->collqh);
+        if (entry == NULL)
+            break;
+
+        /* Call any termination method. */
+        if (entry->collator->terminate != NULL)
+            WT_TRET(entry->collator->terminate(
+                entry->collator, (WT_SESSION *)session));
+
+        /* Remove from the connection's list, free memory. */
+        TAILQ_REMOVE(&conn->collqh, entry, q);
+        __wt_free(session, entry->name);
+        __wt_free(session, entry);
+    }
+
+    return (ret);
+}
+
+/*
+ * __conn_add_compressor --
+ *     WT_CONNECTION->add_compressor method.
+ *
+ *     Allocates a list entry holding a copy of name and the compressor
+ *     pointer and appends it to the connection's compressor list under the
+ *     API lock.
+ */
+static int
+__conn_add_compressor(WT_CONNECTION *wt_conn,
+    const char *name, WT_COMPRESSOR *compressor, const char *config)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    WT_NAMED_COMPRESSOR *ncomp;
+    WT_SESSION_IMPL *session;
+
+    /*
+     * Both name and compressor are used below, so they are not marked
+     * unused (this matches the add_collator and add_data_source methods).
+     */
+    ncomp = NULL;    /* The error path frees ncomp. */
+
+    conn = (WT_CONNECTION_IMPL *)wt_conn;
+    CONNECTION_API_CALL(conn, session, add_compressor, config, cfg);
+    WT_UNUSED(cfg);
+
+    WT_ERR(__wt_calloc_def(session, 1, &ncomp));
+    WT_ERR(__wt_strdup(session, name, &ncomp->name));
+    ncomp->compressor = compressor;
+
+    __wt_spin_lock(session, &conn->api_lock);
+    TAILQ_INSERT_TAIL(&conn->compqh, ncomp, q);
+    /* The list now references the entry: don't free it below. */
+    ncomp = NULL;
+    __wt_spin_unlock(session, &conn->api_lock);
+
+err:    if (ncomp != NULL) {
+        __wt_free(session, ncomp->name);
+        __wt_free(session, ncomp);
+    }
+
+    API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_conn_remove_compressor --
+ *     Remove compressor added by WT_CONNECTION->add_compressor, only used
+ *     internally.
+ *
+ *     Every entry on the connection's compressor list is terminated (when
+ *     it has a terminate method), unlinked and freed; the function's result
+ *     carries any terminate failure recorded by WT_TRET.
+ */
+int
+__wt_conn_remove_compressor(WT_SESSION_IMPL *session)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    WT_NAMED_COMPRESSOR *entry;
+
+    conn = S2C(session);
+
+    /* Drain the list from the head until it is empty. */
+    for (;;) {
+        entry = TAILQ_FIRST(&conn->compqh);
+        if (entry == NULL)
+            break;
+
+        /* Call any termination method. */
+        if (entry->compressor->terminate != NULL)
+            WT_TRET(entry->compressor->terminate(
+                entry->compressor, (WT_SESSION *)session));
+
+        /* Remove from the connection's list, free memory. */
+        TAILQ_REMOVE(&conn->compqh, entry, q);
+        __wt_free(session, entry->name);
+        __wt_free(session, entry);
+    }
+
+    return (ret);
+}
+
+/*
+ * __conn_add_data_source --
+ * WT_CONNECTION->add_data_source method.
+ *
+ * Allocates a list entry holding a copy of prefix and the data source
+ * pointer and appends it to the connection's data-source list under the
+ * API lock.
+ */
+static int
+__conn_add_data_source(WT_CONNECTION *wt_conn,
+ const char *prefix, WT_DATA_SOURCE *dsrc, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_DATA_SOURCE *ndsrc;
+ WT_SESSION_IMPL *session;
+
+ /* The error path frees ndsrc, it must start out NULL. */
+ ndsrc = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ /* cfg is not declared locally; presumably the macro declares it. */
+ CONNECTION_API_CALL(conn, session, add_data_source, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_calloc_def(session, 1, &ndsrc));
+ WT_ERR(__wt_strdup(session, prefix, &ndsrc->prefix));
+ ndsrc->dsrc = dsrc;
+
+ /* Link onto the environment's list of data sources. */
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->dsrcqh, ndsrc, q);
+ /* The list now references the entry: don't free it below. */
+ ndsrc = NULL;
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: if (ndsrc != NULL) {
+ __wt_free(session, ndsrc->prefix);
+ __wt_free(session, ndsrc);
+ }
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_conn_remove_data_source --
+ *     Remove data source added by WT_CONNECTION->add_data_source.
+ *
+ *     Every entry on the connection's data-source list is terminated (when
+ *     it has a terminate method), unlinked and freed; the function's result
+ *     carries any terminate failure recorded by WT_TRET.
+ */
+int
+__wt_conn_remove_data_source(WT_SESSION_IMPL *session)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    WT_NAMED_DATA_SOURCE *entry;
+
+    conn = S2C(session);
+
+    /* Drain the list from the head until it is empty. */
+    for (;;) {
+        entry = TAILQ_FIRST(&conn->dsrcqh);
+        if (entry == NULL)
+            break;
+
+        /* Call any termination method. */
+        if (entry->dsrc->terminate != NULL)
+            WT_TRET(entry->dsrc->terminate(
+                entry->dsrc, (WT_SESSION *)session));
+
+        /* Remove from the connection's list, free memory. */
+        TAILQ_REMOVE(&conn->dsrcqh, entry, q);
+        __wt_free(session, entry->prefix);
+        __wt_free(session, entry);
+    }
+
+    return (ret);
+}
+
+/*
+ * __conn_add_extractor --
+ * WT_CONNECTION->add_extractor method.
+ *
+ * Nothing is registered: name and extractor are ignored and ret is set to
+ * ENOTSUP before the API entry macro runs.
+ */
+static int
+__conn_add_extractor(WT_CONNECTION *wt_conn,
+ const char *name, WT_EXTRACTOR *extractor, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(name);
+ WT_UNUSED(extractor);
+ /* NOTE(review): assumes the API macro leaves ret alone on success. */
+ ret = ENOTSUP;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_extractor, config, cfg);
+ WT_UNUSED(cfg);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_async_flush --
+ * WT_CONNECTION.async_flush method.
+ *
+ * Thin API wrapper: enters the API without a configuration string and
+ * calls __wt_async_flush with the session the entry macro provides.
+ */
+static int
+__conn_async_flush(WT_CONNECTION *wt_conn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL_NOCONF(conn, session, async_flush);
+ WT_ERR(__wt_async_flush(session));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_async_new_op --
+ * WT_CONNECTION.async_new_op method.
+ *
+ * Obtains an operation handle from __wt_async_new_op and returns its
+ * public interface through asyncopp; asyncopp is only written on success.
+ */
+static int
+__conn_async_new_op(WT_CONNECTION *wt_conn, const char *uri, const char *config,
+ WT_ASYNC_CALLBACK *callback, WT_ASYNC_OP **asyncopp)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ /* cfg is not declared locally; presumably the macro declares it. */
+ CONNECTION_API_CALL(conn, session, async_new_op, config, cfg);
+ WT_ERR(__wt_async_new_op(session, uri, config, cfg, callback, &op));
+
+ /* Hand back the public handle embedded in the implementation. */
+ *asyncopp = &op->iface;
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_get_home --
+ *     WT_CONNECTION.get_home method.
+ *
+ *     Returns the home string stored in the connection; no copy is made.
+ */
+static const char *
+__conn_get_home(WT_CONNECTION *wt_conn)
+{
+    WT_CONNECTION_IMPL *conn;
+
+    conn = (WT_CONNECTION_IMPL *)wt_conn;
+    return (conn->home);
+}
+
+/*
+ * __conn_configure_method --
+ * WT_CONNECTION.configure_method method.
+ *
+ * Thin API wrapper: all arguments are passed unchanged to
+ * __wt_configure_method and its result is returned through the API exit
+ * macro.
+ */
+static int
+__conn_configure_method(WT_CONNECTION *wt_conn, const char *method,
+ const char *uri, const char *config, const char *type, const char *check)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ /* config is an argument to the callee here, not an API configuration. */
+ CONNECTION_API_CALL_NOCONF(conn, session, configure_method);
+
+ ret = __wt_configure_method(session, method, uri, config, type, check);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_is_new --
+ *     WT_CONNECTION->is_new method.
+ *
+ *     Returns the connection's is_new value.
+ */
+static int
+__conn_is_new(WT_CONNECTION *wt_conn)
+{
+    WT_CONNECTION_IMPL *conn;
+
+    conn = (WT_CONNECTION_IMPL *)wt_conn;
+    return (conn->is_new);
+}
+
+/*
+ * __conn_close --
+ *     WT_CONNECTION->close method.
+ *
+ *     Rolls back running transactions in active external sessions, closes
+ *     those sessions (calling any registered handle_close callback first)
+ *     and then closes the connection.  Failures are recorded with WT_TRET
+ *     and the remaining steps still run.
+ */
+static int
+__conn_close(WT_CONNECTION *wt_conn, const char *config)
+{
+    WT_CONFIG_ITEM cval;
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    WT_SESSION *wt_session;
+    WT_SESSION_IMPL *s, *session;
+    uint32_t i;
+
+    conn = (WT_CONNECTION_IMPL *)wt_conn;
+
+    CONNECTION_API_CALL(conn, session, close, config, cfg);
+
+    /*
+     * Only look at the value when the lookup succeeded: on failure cval
+     * is not known to have been filled in.  (ret is zero before the
+     * lookup, so a non-zero ret here means the lookup itself failed.)
+     */
+    WT_TRET(__wt_config_gets(session, cfg, "leak_memory", &cval));
+    if (ret == 0 && cval.val != 0)
+        F_SET(conn, WT_CONN_LEAK_MEMORY);
+
+err:    /*
+     * Rollback all running transactions.
+     * We do this as a separate pass because an active transaction in one
+     * session could cause trouble when closing a file, even if that
+     * session never referenced that file.
+     */
+    for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i)
+        if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL) &&
+            F_ISSET(&s->txn, TXN_RUNNING)) {
+            wt_session = &s->iface;
+            WT_TRET(wt_session->rollback_transaction(
+                wt_session, NULL));
+        }
+
+    /* Close open, external sessions. */
+    for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i)
+        if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL)) {
+            wt_session = &s->iface;
+            /*
+             * Notify the user that we are closing the session
+             * handle via the registered close callback.
+             */
+            if (s->event_handler->handle_close != NULL)
+                WT_TRET(s->event_handler->handle_close(
+                    s->event_handler, wt_session, NULL));
+            WT_TRET(wt_session->close(wt_session, config));
+        }
+
+    WT_TRET(__wt_connection_close(conn));
+
+    /* We no longer have a session, don't try to update it. */
+    session = NULL;
+
+    API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_reconfigure --
+ * WT_CONNECTION->reconfigure method.
+ *
+ * Holding the connection's reconfigure lock, runs each subsystem's
+ * configuration function against a two-entry stack (the connection's
+ * current configuration, then the caller's string) and, if they all
+ * succeed, replaces the connection's configuration with the merged
+ * result.
+ */
+static int
+__conn_reconfigure(WT_CONNECTION *wt_conn, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ /* Third slot stays NULL so the stack is always terminated. */
+ const char *p, *config_cfg[] = { NULL, NULL, NULL };
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+
+ CONNECTION_API_CALL(conn, session, reconfigure, config, cfg);
+ WT_UNUSED(cfg);
+
+ /* Serialize reconfiguration. */
+ __wt_spin_lock(session, &conn->reconfig_lock);
+
+ /*
+ * The configuration argument has been checked for validity, replace the
+ * previous connection configuration.
+ *
+ * DO NOT merge the configuration before the reconfigure calls. Some
+ * of the underlying reconfiguration functions do explicit checks with
+ * the second element of the configuration array, knowing the defaults
+ * are in slot #1 and the application's modifications are in slot #2.
+ */
+ config_cfg[0] = conn->cfg;
+ config_cfg[1] = config;
+
+ WT_ERR(__conn_statistics_config(session, config_cfg));
+ WT_ERR(__wt_async_reconfig(session, config_cfg));
+ WT_ERR(__wt_cache_config(session, config_cfg));
+ WT_ERR(__wt_cache_pool_config(session, config_cfg));
+ WT_ERR(__wt_checkpoint_server_create(session, config_cfg));
+ WT_ERR(__wt_lsm_manager_reconfig(session, config_cfg));
+ WT_ERR(__wt_statlog_create(session, config_cfg));
+ WT_ERR(__wt_verbose_config(session, config_cfg));
+
+ /* Merge only after every subsystem accepted the new settings. */
+ WT_ERR(__wt_config_merge(session, config_cfg, &p));
+ __wt_free(session, conn->cfg);
+ conn->cfg = p;
+
+ /*
+ * NOTE(review): the API entry macro precedes the lock; if it can jump
+ * to err, the unlock below runs without the lock held -- confirm.
+ */
+err: __wt_spin_unlock(session, &conn->reconfig_lock);
+
+ API_END_RET(session, ret);
+}
+
+/*
+ * __conn_open_session --
+ * WT_CONNECTION->open_session method.
+ *
+ * Stores NULL in wt_sessionp up front; on success it is replaced with the
+ * public interface of the session created by __wt_open_session.
+ */
+static int
+__conn_open_session(WT_CONNECTION *wt_conn,
+ WT_EVENT_HANDLER *event_handler, const char *config,
+ WT_SESSION **wt_sessionp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session, *session_ret;
+
+ /* Callers see NULL unless the open succeeds. */
+ *wt_sessionp = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ session_ret = NULL;
+
+ /* session is the API call's own session, session_ret the new one. */
+ CONNECTION_API_CALL(conn, session, open_session, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_open_session(conn, event_handler, config, &session_ret));
+
+ *wt_sessionp = &session_ret->iface;
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_config_append --
+ *     Append an entry to a config stack.
+ *
+ *     The entry is stored in the first NULL slot of cfg.  Nothing else is
+ *     written, so the caller must provide a following slot that is already
+ *     NULL for the stack to remain terminated.
+ */
+static void
+__conn_config_append(const char *cfg[], const char *config)
+{
+    const char **slot;
+
+    /* Walk to the terminating NULL entry. */
+    for (slot = cfg; *slot != NULL; ++slot)
+        ;
+    *slot = config;
+}
+
+/*
+ * __conn_config_check_version --
+ *     Check if a configuration version isn't compatible.
+ *
+ *     Returns 0 when the string has no "version.major" entry or when its
+ *     version is not newer than this library, ENOTSUP when it is newer,
+ *     and any other lookup error unchanged.
+ */
+static int
+__conn_config_check_version(WT_SESSION_IMPL *session, const char *config)
+{
+    WT_CONFIG_ITEM vmajor, vminor;
+    WT_DECL_RET;
+
+    /*
+     * Version numbers aren't included in all configuration strings, but
+     * we check all of them just in case. Ignore configurations without
+     * a version.
+     *
+     * Any failure other than "not found" is returned rather than
+     * ignored, so vmajor is never compared without having been set.
+     */
+    ret = __wt_config_getones(session, config, "version.major", &vmajor);
+    if (ret == WT_NOTFOUND)
+        return (0);
+    WT_RET(ret);
+    WT_RET(__wt_config_getones(session, config, "version.minor", &vminor));
+
+    if (vmajor.val > WIREDTIGER_VERSION_MAJOR ||
+        (vmajor.val == WIREDTIGER_VERSION_MAJOR &&
+        vminor.val > WIREDTIGER_VERSION_MINOR))
+        WT_RET_MSG(session, ENOTSUP,
+            "WiredTiger configuration is from an incompatible release "
+            "of the WiredTiger engine");
+
+    return (0);
+}
+
+/*
+ * __conn_config_file --
+ *     Read WiredTiger config files from the home directory.
+ *
+ *     A missing or empty file is not an error.  Otherwise the file is read
+ *     into cbuf, collapsed into a single comma-separated string, checked
+ *     for version and validity (against the user or base configuration
+ *     according to is_user) and appended to the cfg stack.  The stack entry
+ *     points into cbuf, which the caller must keep alive.
+ */
+static int
+__conn_config_file(WT_SESSION_IMPL *session,
+    const char *filename, int is_user, const char **cfg, WT_ITEM *cbuf)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    WT_FH *fh;
+    size_t len;
+    wt_off_t size;
+    int exist, quoted;
+    char *p, *t;
+
+    conn = S2C(session);
+    fh = NULL;
+
+    /* Configuration files are always optional. */
+    WT_RET(__wt_exist(session, filename, &exist));
+    if (!exist)
+        return (0);
+
+    /*
+     * The base configuration should not exist if we are creating this
+     * database.
+     */
+    if (!is_user && conn->is_new)
+        WT_RET_MSG(session, EINVAL,
+            "%s exists before database creation", filename);
+
+    /* Open the configuration file. */
+    WT_RET(__wt_open(session, filename, 0, 0, 0, &fh));
+    WT_ERR(__wt_filesize(session, fh, &size));
+    if (size == 0)
+        goto err;
+
+    /*
+     * Sanity test: a 100KB configuration file would be insane. (There's
+     * no practical reason to limit the file size, but I can either limit
+     * the file size to something rational, or add code to test if the
+     * wt_off_t size is larger than a uint32_t, which is more complicated
+     * and a waste of time.)
+     */
+    if (size > 100 * 1024)
+        WT_ERR_MSG(
+            session, EFBIG, "Configuration file too big: %s", filename);
+    len = (size_t)size;
+
+    /*
+     * Copy the configuration file into memory, with a little slop, I'm not
+     * interested in debugging off-by-ones.
+     *
+     * The beginning of a file is the same as if we run into an unquoted
+     * newline character, simplify the parsing loop by pretending that's
+     * what we're doing.
+     */
+    WT_ERR(__wt_buf_init(session, cbuf, len + 10));
+    WT_ERR(__wt_read(
+        session, fh, (wt_off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
+    ((uint8_t *)cbuf->mem)[0] = '\n';
+    cbuf->size = len + 1;
+
+    /*
+     * Collapse the file's lines into a single string: newline characters
+     * are replaced with commas unless the newline is quoted or backslash
+     * escaped. Comment lines (an unescaped newline where the next non-
+     * white-space character is a hash), are discarded.
+     *
+     * The string is rewritten in place: p is the read position, t the
+     * write position, and t never passes p.
+     *
+     * NOTE(review): len counts the file's bytes but the scan starts at
+     * the injected leading newline, so the final byte of a file that does
+     * not end in a newline appears never to be scanned -- confirm.
+     */
+    for (quoted = 0, p = t = cbuf->mem; len > 0;) {
+        /*
+         * Backslash pairs pass through untouched, unless immediately
+         * preceding a newline, in which case both the backslash and
+         * the newline are discarded. Backslash characters escape
+         * quoted characters, too, that is, a backslash followed by a
+         * quote doesn't start or end a quoted string.
+         */
+        if (*p == '\\' && len > 1) {
+            if (p[1] != '\n') {
+                *t++ = p[0];
+                *t++ = p[1];
+            }
+            p += 2;
+            len -= 2;
+            continue;
+        }
+
+        /*
+         * If we're in a quoted string, or starting a quoted string,
+         * take all characters, including white-space and newlines.
+         */
+        if (quoted || *p == '"') {
+            if (*p == '"')
+                quoted = !quoted;
+            *t++ = *p++;
+            --len;
+            continue;
+        }
+
+        /* Everything else gets taken, except for newline characters. */
+        if (*p != '\n') {
+            *t++ = *p++;
+            --len;
+            continue;
+        }
+
+        /*
+         * Replace any newline characters with commas (and strings of
+         * commas are safe).
+         *
+         * After any newline, skip to a non-white-space character; if
+         * the next character is a hash mark, skip to the next newline.
+         *
+         * The byte is converted to unsigned char before the isspace
+         * call: passing a negative char value is undefined behavior.
+         */
+        for (;;) {
+            for (*t++ = ',';
+                --len > 0 && isspace((unsigned char)*++p);)
+                ;
+            if (len == 0)
+                break;
+            if (*p != '#')
+                break;
+            while (--len > 0 && *++p != '\n')
+                ;
+            if (len == 0)
+                break;
+        }
+    }
+    *t = '\0';
+    cbuf->size = WT_PTRDIFF(t, cbuf->data);
+
+    /* Check any version. */
+    WT_ERR(__conn_config_check_version(session, cbuf->data));
+
+    /* Upgrade the configuration string. */
+    WT_ERR(__wt_config_upgrade(session, cbuf));
+
+    /* Check the configuration information. */
+    WT_ERR(__wt_config_check(session, is_user ?
+        WT_CONFIG_REF(session, wiredtiger_open_usercfg) :
+        WT_CONFIG_REF(session, wiredtiger_open_basecfg), cbuf->data, 0));
+
+    /* Append it to the stack. */
+    __conn_config_append(cfg, cbuf->data);
+
+err:    if (fh != NULL)
+        WT_TRET(__wt_close(session, fh));
+    return (ret);
+}
+
+/*
+ * __conn_config_env --
+ *     Read configuration from an environment variable, if set.
+ *
+ *     An unset or empty WIREDTIGER_CONFIG is not an error.  Otherwise the
+ *     value is copied into cbuf, and that copy is version-checked,
+ *     upgraded, validated and appended to the cfg stack.  The stack entry
+ *     points into cbuf, which the caller must keep alive.
+ */
+static int
+__conn_config_env(WT_SESSION_IMPL *session, const char *cfg[], WT_ITEM *cbuf)
+{
+    WT_CONFIG_ITEM cval;
+    const char *env_config;
+    size_t len;
+
+    if ((env_config = getenv("WIREDTIGER_CONFIG")) == NULL)
+        return (0);
+    len = strlen(env_config);
+    if (len == 0)
+        return (0);
+
+    /* Copy the string, including its terminating nul byte. */
+    WT_RET(__wt_buf_set(session, cbuf, env_config, len + 1));
+
+    /*
+     * Security stuff:
+     *
+     * If the "use_environment_priv" configuration string is set, use the
+     * environment variable if the process has appropriate privileges.
+     */
+    WT_RET(__wt_config_gets(session, cfg, "use_environment_priv", &cval));
+    if (cval.val == 0 && __wt_has_priv())
+        WT_RET_MSG(session, WT_ERROR, "%s",
+            "WIREDTIGER_CONFIG environment variable set but process "
+            "lacks privileges to use that environment variable");
+
+    /*
+     * From here on work with the copy in cbuf, not the environment's
+     * string: the upgrade below is applied to cbuf, so checking or
+     * stacking env_config would use the configuration as it was before
+     * the upgrade and leave the stack pointing at environment memory.
+     */
+
+    /* Check any version. */
+    WT_RET(__conn_config_check_version(session, cbuf->data));
+
+    /* Upgrade the configuration string. */
+    WT_RET(__wt_config_upgrade(session, cbuf));
+
+    /* Check the configuration information. */
+    WT_RET(__wt_config_check(session,
+        WT_CONFIG_REF(session, wiredtiger_open), cbuf->data, 0));
+
+    /* Append it to the stack. */
+    __conn_config_append(cfg, cbuf->data);
+
+    return (0);
+}
+
+/*
+ * __conn_home --
+ *     Set the database home directory.
+ *
+ *     The home is, in order of preference, the caller's argument, a
+ *     non-empty WIREDTIGER_HOME environment variable, or ".".  A copy of
+ *     the chosen string is stored in the connection.
+ */
+static int
+__conn_home(WT_SESSION_IMPL *session, const char *home, const char *cfg[])
+{
+    WT_CONFIG_ITEM cval;
+
+    /* If the application specifies a home directory, use it. */
+    if (home == NULL) {
+        home = getenv("WIREDTIGER_HOME");
+
+        /* If there's no WIREDTIGER_HOME environment variable, use ".". */
+        if (home == NULL || strlen(home) == 0)
+            home = ".";
+        else {
+            /*
+             * Security stuff:
+             *
+             * Unless the "use_environment_priv" configuration
+             * string is set, fail if the process is running with
+             * special privileges.
+             */
+            WT_RET(__wt_config_gets(
+                session, cfg, "use_environment_priv", &cval));
+            if (cval.val == 0 && __wt_has_priv())
+                WT_RET_MSG(session, WT_ERROR, "%s",
+                    "WIREDTIGER_HOME environment variable set but process "
+                    "lacks privileges to use that environment variable");
+        }
+    }
+
+    return (__wt_strdup(session, home, &S2C(session)->home));
+}
+
+/*
+ * __conn_single --
+ * Confirm that no other thread of control is using this database.
+ *
+ * Runs with the process-wide spinlock held. Fails with EBUSY when another
+ * connection in this process has the same home or when a byte lock on the
+ * lock file or the WiredTiger file cannot be acquired, and with EEXIST
+ * when the database already exists and "exclusive" is configured. On
+ * success conn->is_new records whether the WiredTiger file was empty.
+ */
+static int
+__conn_single(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn, *t;
+ WT_DECL_RET;
+ WT_FH *fh;
+ size_t len;
+ wt_off_t size;
+ char buf[256];
+
+ conn = S2C(session);
+ fh = NULL;
+
+ __wt_spin_lock(session, &__wt_process.spinlock);
+
+ /*
+ * We first check for other threads of control holding a lock on this
+ * database, because the byte-level locking functions are based on the
+ * POSIX 1003.1 fcntl APIs, which require all locks associated with a
+ * file for a given process are removed when any file descriptor for
+ * the file is closed by that process. In other words, we can't open a
+ * file handle on the lock file until we are certain that closing that
+ * handle won't discard the owning thread's lock. Applications hopefully
+ * won't open a database in multiple threads, but we don't want to have
+ * it fail the first time, but succeed the second.
+ */
+ TAILQ_FOREACH(t, &__wt_process.connqh, q)
+ if (t->home != NULL &&
+ t != conn && strcmp(t->home, conn->home) == 0) {
+ ret = EBUSY;
+ break;
+ }
+ /* ret is only used as a "found a match" flag by the loop above. */
+ if (ret != 0)
+ WT_ERR_MSG(session, EBUSY,
+ "WiredTiger database is already being managed by another "
+ "thread in this process");
+
+ /*
+ * !!!
+ * Be careful changing this code.
+ *
+ * We locked the WiredTiger file before release 2.3.2; a separate lock
+ * file was added after 2.3.1 because hot backup has to copy the
+ * WiredTiger file and system utilities on Windows can't copy locked
+ * files.
+ *
+ * For this reason, we don't use the lock file's existence to decide if
+ * we're creating the database or not, use the WiredTiger file instead,
+ * it has existed in every version of WiredTiger.
+ *
+ * Additionally, avoid an upgrade race: a 2.3.1 release process might
+ * have the WiredTiger file locked, and we're going to create the lock
+ * file and lock it instead. For this reason, first acquire a lock on
+ * the lock file and then a lock on the WiredTiger file, then release
+ * the latter so hot backups can proceed. (If someone were to run a
+ * current release and subsequently a historic release, we could still
+ * fail because the historic release will ignore our lock file and will
+ * then successfully lock the WiredTiger file, but I can't think of any
+ * way to fix that.)
+ *
+ * Open the WiredTiger lock file, creating it if it doesn't exist. (I'm
+ * not removing the lock file if we create it and subsequently fail, it
+ * isn't simple to detect that case, and there's no risk other than a
+ * useless file being left in the directory.)
+ */
+ WT_ERR(__wt_open(session, WT_SINGLETHREAD, 1, 0, 0, &conn->lock_fh));
+
+ /*
+ * Lock a byte of the file: if we don't get the lock, some other process
+ * is holding it, we're done. The file may be zero-length, and that's
+ * OK, the underlying call supports locking past the end-of-file.
+ */
+ if (__wt_bytelock(conn->lock_fh, (wt_off_t)0, 1) != 0)
+ WT_ERR_MSG(session, EBUSY,
+ "WiredTiger database is already being managed by another "
+ "process");
+
+ /*
+ * If the size of the lock file is 0, we created it (or we won a locking
+ * race with the thread that created it, it doesn't matter).
+ *
+ * Write something into the file, zero-length files make me nervous.
+ */
+ WT_ERR(__wt_filesize(session, conn->lock_fh, &size));
+ if (size == 0) {
+#define WT_SINGLETHREAD_STRING "WiredTiger lock file\n"
+ WT_ERR(__wt_write(session, conn->lock_fh, (wt_off_t)0,
+ strlen(WT_SINGLETHREAD_STRING), WT_SINGLETHREAD_STRING));
+ }
+
+ /* We own the lock file, optionally create the WiredTiger file. */
+ WT_ERR(__wt_config_gets(session, cfg, "create", &cval));
+ WT_ERR(__wt_open(session,
+ WT_WIREDTIGER, cval.val == 0 ? 0 : 1, 0, 0, &fh));
+
+ /*
+ * Lock the WiredTiger file (for backward compatibility reasons as
+ * described above). Immediately release the lock, it's just a test.
+ */
+ if (__wt_bytelock(fh, (wt_off_t)0, 1) != 0) {
+ WT_ERR_MSG(session, EBUSY,
+ "WiredTiger database is already being managed by another "
+ "process");
+ }
+ WT_ERR(__wt_bytelock(fh, (wt_off_t)0, 0));
+
+ /*
+ * If the size of the file is zero, we created it, fill it in. If the
+ * size of the file is non-zero, fail if configured for exclusivity.
+ */
+ WT_ERR(__wt_filesize(session, fh, &size));
+ if (size == 0) {
+ /* The new file holds the file's name and the version string. */
+ len = (size_t)snprintf(buf, sizeof(buf),
+ "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING);
+ WT_ERR(__wt_write(session, fh, (wt_off_t)0, len, buf));
+
+ conn->is_new = 1;
+ } else {
+ WT_ERR(__wt_config_gets(session, cfg, "exclusive", &cval));
+ if (cval.val != 0)
+ WT_ERR_MSG(session, EEXIST,
+ "WiredTiger database already exists and exclusive "
+ "option configured");
+
+ conn->is_new = 0;
+ }
+
+err: /*
+ * We ignore the connection's lock file handle on error, it will be
+ * closed when the connection structure is destroyed.
+ */
+ if (fh != NULL)
+ WT_TRET(__wt_close(session, fh));
+
+ __wt_spin_unlock(session, &__wt_process.spinlock);
+ return (ret);
+}
+
+/*
+ * __conn_statistics_config --
+ *     Set statistics configuration.
+ *
+ *     Translates the "statistics" configuration list into connection
+ *     statistics flags.  At most one of "none", "fast" and "all" may be
+ *     set; "clear" may accompany any of them.  The connection's flags are
+ *     only replaced once the whole list has been accepted.
+ */
+static int
+__conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+    WT_CONFIG_ITEM cval, sval;
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    uint32_t flags;
+    int set;
+
+    conn = S2C(session);
+    flags = 0;
+    set = 0;
+
+    WT_RET(__wt_config_gets(session, cfg, "statistics", &cval));
+
+    /* A keyword missing from the list is fine, other errors are not. */
+    ret = __wt_config_subgets(session, &cval, "none", &sval);
+    if (ret == 0 && sval.val != 0) {
+        LF_SET(WT_CONN_STAT_NONE);
+        ++set;
+    }
+    WT_RET_NOTFOUND_OK(ret);
+
+    ret = __wt_config_subgets(session, &cval, "fast", &sval);
+    if (ret == 0 && sval.val != 0) {
+        LF_SET(WT_CONN_STAT_FAST);
+        ++set;
+    }
+    WT_RET_NOTFOUND_OK(ret);
+
+    /* "all" turns on the fast statistics as well. */
+    ret = __wt_config_subgets(session, &cval, "all", &sval);
+    if (ret == 0 && sval.val != 0) {
+        LF_SET(WT_CONN_STAT_ALL | WT_CONN_STAT_FAST);
+        ++set;
+    }
+    WT_RET_NOTFOUND_OK(ret);
+
+    /* "clear" is a modifier and doesn't count toward the limit. */
+    ret = __wt_config_subgets(session, &cval, "clear", &sval);
+    if (ret == 0 && sval.val != 0)
+        LF_SET(WT_CONN_STAT_CLEAR);
+    WT_RET_NOTFOUND_OK(ret);
+
+    if (set > 1)
+        WT_RET_MSG(session, EINVAL,
+            "only one statistics configuration value may be specified");
+
+    /* Configuring statistics clears any existing values. */
+    conn->stat_flags = flags;
+
+    return (0);
+}
+
+/*
+ * Simple structure for name and flag configuration searches. The tables
+ * built from it in this file end with an entry whose name is NULL, which is
+ * what the lookup loops test for.
+ */
+typedef struct {
+ const char *name; /* Configuration sub-key to search for */
+ uint32_t flag; /* Flag value associated with that sub-key */
+} WT_NAME_FLAG;
+
+/*
+ * __wt_verbose_config --
+ *	Translate the "verbose" configuration list into the connection's
+ *	verbose flag word.  In builds without HAVE_VERBOSE, enabling any
+ *	verbose category is rejected with EINVAL.
+ */
+int
+__wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	static const WT_NAME_FLAG verbtypes[] = {
+		{ "api",		WT_VERB_API },
+		{ "block",		WT_VERB_BLOCK },
+		{ "checkpoint",		WT_VERB_CHECKPOINT },
+		{ "compact",		WT_VERB_COMPACT },
+		{ "evict",		WT_VERB_EVICT },
+		{ "evictserver",	WT_VERB_EVICTSERVER },
+		{ "fileops",		WT_VERB_FILEOPS },
+		{ "log",		WT_VERB_LOG },
+		{ "lsm",		WT_VERB_LSM },
+		{ "metadata",		WT_VERB_METADATA },
+		{ "mutex",		WT_VERB_MUTEX },
+		{ "overflow",		WT_VERB_OVERFLOW },
+		{ "read",		WT_VERB_READ },
+		{ "reconcile",		WT_VERB_RECONCILE },
+		{ "recovery",		WT_VERB_RECOVERY },
+		{ "salvage",		WT_VERB_SALVAGE },
+		{ "shared_cache",	WT_VERB_SHARED_CACHE },
+		{ "split",		WT_VERB_SPLIT },
+		{ "temporary",		WT_VERB_TEMPORARY },
+		{ "transaction",	WT_VERB_TRANSACTION },
+		{ "verify",		WT_VERB_VERIFY },
+		{ "version",		WT_VERB_VERSION },
+		{ "write",		WT_VERB_WRITE },
+		{ NULL, 0 }
+	};
+	WT_CONFIG_ITEM cval, sval;
+	WT_DECL_RET;
+	const WT_NAME_FLAG *vt;
+	uint32_t flags;
+
+	WT_RET(__wt_config_gets(session, cfg, "verbose", &cval));
+
+	/* Walk the NULL-terminated table, collecting each enabled category. */
+	flags = 0;
+	vt = verbtypes;
+	while (vt->name != NULL) {
+		ret = __wt_config_subgets(session, &cval, vt->name, &sval);
+		if (ret == 0 && sval.val != 0) {
+#ifdef HAVE_VERBOSE
+			LF_SET(vt->flag);
+#else
+			WT_RET_MSG(session, EINVAL,
+			    "Verbose option specified when WiredTiger built "
+			    "without verbose support. Add --enable-verbose to "
+			    "configure command and rebuild to include support "
+			    "for verbose messages");
+#endif
+		}
+		/* A category that isn't listed is not an error. */
+		WT_RET_NOTFOUND_OK(ret);
+		++vt;
+	}
+
+	S2C(session)->verbose = flags;
+	return (0);
+}
+
+/*
+ * __conn_write_config --
+ *	Save the configuration used to create a database.
+ *
+ *	For every configuration string in cfg[1] and later, each key of the
+ *	wiredtiger_open base configuration that the string sets is written to
+ *	the named file as a "key=value" line, after a fixed header and a
+ *	version line.  Nothing is written (and 0 is returned) when cfg[1] is
+ *	NULL.  On failure the file is removed and a non-zero error code is
+ *	returned.
+ */
+static int
+__conn_write_config(
+    WT_SESSION_IMPL *session, const char *filename, const char *cfg[])
+{
+	FILE *fp;
+	WT_CONFIG parser;
+	WT_CONFIG_ITEM k, v;
+	WT_DECL_RET;
+	char *path;
+
+	/*
+	 * We were passed an array of configuration strings where slot 0 is
+	 * all possible values and the second and subsequent slots are changes
+	 * specified by the application during open (using the wiredtiger_open
+	 * configuration string, an environment variable, or user-configuration
+	 * file). The base configuration file contains all changes to default
+	 * settings made at create, and we include the user-configuration file
+	 * in that list, even though we don't expect it to change. Of course,
+	 * an application could leave that file as it is right now and not
+	 * remove a configuration we need, but applications can also guarantee
+	 * all database users specify consistent environment variables and
+	 * wiredtiger_open configuration arguments, and if we protect against
+	 * those problems, might as well include the application's configuration
+	 * file as well.
+	 *
+	 * If there is no configuration, don't bother creating an empty file.
+	 */
+	if (cfg[1] == NULL)
+		return (0);
+
+	/* The path is only needed for the open, release it either way. */
+	WT_RET(__wt_filename(session, filename, &path));
+	if ((fp = fopen(path, "w")) == NULL)
+		ret = __wt_errno();
+	__wt_free(session, path);
+	if (fp == NULL)
+		return (ret);
+
+	fprintf(fp, "%s\n\n",
+	    "# Do not modify this file.\n"
+	    "#\n"
+	    "# WiredTiger created this file when the database was created,\n"
+	    "# to store persistent database settings. Instead of changing\n"
+	    "# these settings, set a WIREDTIGER_CONFIG environment variable\n"
+	    "# or create a WiredTiger.config file to override them.");
+
+	fprintf(fp, "version=(major=%d,minor=%d)\n\n",
+	    WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);
+
+	/*
+	 * We want the list of defaults that have been changed, that is, if the
+	 * application didn't somehow configure a setting, we don't write out a
+	 * default value, so future releases may silently migrate to new default
+	 * values.
+	 */
+	while (*++cfg != NULL) {
+		WT_ERR(__wt_config_init(session,
+		    &parser, WT_CONFIG_BASE(session, wiredtiger_open_basecfg)));
+		while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
+			if ((ret =
+			    __wt_config_getone(session, *cfg, &k, &v)) == 0) {
+				/* Fix quoting for non-trivial settings. */
+				if (v.type == WT_CONFIG_ITEM_STRING) {
+					--v.str;
+					v.len += 2;
+				}
+				fprintf(fp, "%.*s=%.*s\n",
+				    (int)k.len, k.str, (int)v.len, v.str);
+			}
+			/* A key this string doesn't set is simply skipped. */
+			WT_ERR_NOTFOUND_OK(ret);
+		}
+		/* Running out of base-configuration keys ends the inner loop. */
+		WT_ERR_NOTFOUND_OK(ret);
+	}
+
+err:	/*
+	 * fclose reports failure by returning EOF, which is not an error code
+	 * callers can interpret; translate it through errno the same way the
+	 * fopen failure above is handled.
+	 */
+	if (fclose(fp) != 0)
+		WT_TRET(__wt_errno());
+
+	/* Don't leave a damaged file in place. */
+	if (ret != 0)
+		(void)__wt_remove(session, filename);
+
+	return (ret);
+}
+
+/*
+ * wiredtiger_open --
+ * Main library entry point: open a new connection to a WiredTiger
+ * database.
+ *
+ * The home and config arguments are passed to the home-directory and
+ * configuration helpers called below, and event_handler is installed
+ * on the connection's dummy session. *wt_connp is set to NULL on
+ * entry and is only pointed at the new connection after every step
+ * has succeeded. Once the connection structure has been allocated,
+ * any failure closes it with __wt_connection_close before the error
+ * is returned.
+ */
+int
+wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
+ const char *config, WT_CONNECTION **wt_connp)
+{
+ /* Method table copied into each new connection's interface. */
+ static const WT_CONNECTION stdc = {
+ __conn_async_flush,
+ __conn_async_new_op,
+ __conn_close,
+ __conn_reconfigure,
+ __conn_get_home,
+ __conn_configure_method,
+ __conn_is_new,
+ __conn_open_session,
+ __conn_load_extension,
+ __conn_add_data_source,
+ __conn_add_collator,
+ __conn_add_compressor,
+ __conn_add_extractor,
+ __conn_get_extension_api
+ };
+ /* Sub-keys searched for in the direct_io and file_extend settings. */
+ static const WT_NAME_FLAG file_types[] = {
+ { "checkpoint", WT_FILE_TYPE_CHECKPOINT },
+ { "data", WT_FILE_TYPE_DATA },
+ { "log", WT_FILE_TYPE_LOG },
+ { NULL, 0 }
+ };
+
+ WT_CONFIG_ITEM cval, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_ITEM i1, i2, i3;
+ const WT_NAME_FLAG *ft;
+ WT_SESSION_IMPL *session;
+
+ /* Leave space for optional additional configuration. */
+ const char *cfg[] = { NULL, NULL, NULL, NULL, NULL, NULL };
+
+ *wt_connp = NULL;
+
+ conn = NULL;
+ session = NULL;
+
+ /*
+ * We could use scratch buffers, but I'd rather the default session
+ * not tie down chunks of memory past the open call.
+ */
+ WT_CLEAR(i1);
+ WT_CLEAR(i2);
+ WT_CLEAR(i3);
+
+ /* Failures here return directly: there is nothing to clean up yet. */
+ WT_RET(__wt_library_init());
+
+ WT_RET(__wt_calloc_def(NULL, 1, &conn));
+ conn->iface = stdc;
+
+ /*
+ * Immediately link the structure into the connection structure list:
+ * the only thing ever looked at on that list is the database name,
+ * and a NULL value is fine.
+ */
+ __wt_spin_lock(NULL, &__wt_process.spinlock);
+ TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q);
+ __wt_spin_unlock(NULL, &__wt_process.spinlock);
+
+ /*
+ * Use the dummy session embedded in the connection until
+ * __wt_connection_open provides the real default session.
+ */
+ session = conn->default_session = &conn->dummy_session;
+ session->iface.connection = &conn->iface;
+ session->name = "wiredtiger_open";
+ __wt_random_init(session->rnd);
+ __wt_event_handler_set(session, event_handler);
+
+ /* Remaining basic initialization of the connection structure. */
+ WT_ERR(__wt_connection_init(conn));
+
+ /* Check/set the application-specified configuration string. */
+ WT_ERR(__wt_config_check(session,
+ WT_CONFIG_REF(session, wiredtiger_open), config, 0));
+ cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open);
+ cfg[1] = config;
+
+ /* Configure error messages so we get them right early. */
+ WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
+ if (cval.len != 0)
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &conn->error_prefix));
+
+ /* Get the database home. */
+ WT_ERR(__conn_home(session, home, cfg));
+
+ /* Make sure no other thread of control already owns this database. */
+ WT_ERR(__conn_single(session, cfg));
+
+ /*
+ * Build the configuration stack, in the following order (where later
+ * entries override earlier entries):
+ *
+ * 1. all possible wiredtiger_open configurations
+ * 2. base configuration file, created with the database (optional)
+ * 3. the config passed in by the application.
+ * 4. user configuration file (optional)
+ * 5. environment variable settings (optional)
+ *
+ * Clear the entries we added to the stack, we're going to build it in
+ * order.
+ */
+ cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open_all);
+ cfg[1] = NULL;
+ WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, &i1));
+ __conn_config_append(cfg, config);
+ WT_ERR(__conn_config_file(session, WT_USERCONFIG, 1, cfg, &i2));
+ WT_ERR(__conn_config_env(session, cfg, &i3));
+
+ /*
+ * Configuration ...
+ *
+ * We can't open sessions yet, so any configurations that cause
+ * sessions to be opened must be handled inside __wt_connection_open.
+ *
+ * The error message configuration might have changed (if set in a
+ * configuration file, and not in the application's configuration
+ * string), get it again. Do it first, make error messages correct.
+ */
+ WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
+ if (cval.len != 0) {
+ /* Discard the prefix saved from the first, partial lookup. */
+ __wt_free(session, conn->error_prefix);
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &conn->error_prefix));
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval));
+ conn->hazard_max = (uint32_t)cval.val;
+
+ /* The session array also holds the library's internal sessions. */
+ WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
+ conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;
+
+ WT_ERR(__wt_config_gets(session, cfg, "checkpoint_sync", &cval));
+ if (cval.val)
+ F_SET(conn, WT_CONN_CKPT_SYNC);
+
+ /* A configured value of -1 selects the default alignment. */
+ WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval));
+ if (cval.val == -1)
+ conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT;
+ else
+ conn->buffer_alignment = (size_t)cval.val;
+#ifndef HAVE_POSIX_MEMALIGN
+ if (conn->buffer_alignment != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "buffer_alignment requires posix_memalign");
+#endif
+
+ /*
+ * Enable direct I/O for each listed file type; a file type that
+ * isn't in the list (WT_NOTFOUND) is not an error.
+ */
+ WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
+ for (ft = file_types; ft->name != NULL; ft++) {
+ ret = __wt_config_subgets(session, &cval, ft->name, &sval);
+ if (ret == 0) {
+ if (sval.val)
+ FLD_SET(conn->direct_io, ft->flag);
+ } else if (ret != WT_NOTFOUND)
+ goto err;
+ }
+
+ /*
+ * Only the data and log file types have an extension length to
+ * record; a value found for any other file type is ignored.
+ */
+ WT_ERR(__wt_config_gets(session, cfg, "file_extend", &cval));
+ for (ft = file_types; ft->name != NULL; ft++) {
+ ret = __wt_config_subgets(session, &cval, ft->name, &sval);
+ if (ret == 0) {
+ switch (ft->flag) {
+ case WT_FILE_TYPE_DATA:
+ conn->data_extend_len = sval.val;
+ break;
+ case WT_FILE_TYPE_LOG:
+ conn->log_extend_len = sval.val;
+ break;
+ }
+ } else if (ret != WT_NOTFOUND)
+ goto err;
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
+ conn->mmap = cval.val == 0 ? 0 : 1;
+
+ WT_ERR(__conn_statistics_config(session, cfg));
+ WT_ERR(__wt_lsm_manager_config(session, cfg));
+ WT_ERR(__wt_verbose_config(session, cfg));
+
+ /* Now that we know if verbose is configured, output the version. */
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING));
+
+ /*
+ * Open the connection, then reset the local session as the real one
+ * was allocated in __wt_connection_open.
+ */
+ WT_ERR(__wt_connection_open(conn, cfg));
+ session = conn->default_session;
+
+ /*
+ * Check on the turtle and metadata files, creating them if necessary
+ * (which avoids application threads racing to create the metadata file
+ * later). Once the metadata file exists, get a reference to it in
+ * the connection's session.
+ */
+ WT_ERR(__wt_turtle_init(session));
+ WT_ERR(__wt_metadata_open(session));
+
+ /*
+ * Load the extensions after initialization completes; extensions expect
+ * everything else to be in place, and the extensions call back into the
+ * library.
+ */
+ WT_ERR(__conn_load_extensions(session, cfg));
+
+ /*
+ * We've completed configuration, write the base configuration file if
+ * we're creating the database.
+ */
+ if (conn->is_new) {
+ WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval));
+ if (cval.val)
+ WT_ERR(
+ __conn_write_config(session, WT_BASECONFIG, cfg));
+ }
+
+ /*
+ * Start the worker threads last.
+ */
+ WT_ERR(__wt_connection_workers(session, cfg));
+
+ /* Merge the final configuration for later reconfiguration. */
+ WT_ERR(__wt_config_merge(session, cfg, &conn->cfg));
+
+ /* The handle returned to the caller is the connection's first field. */
+ WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
+ *wt_connp = &conn->iface;
+
+ /* The success path falls through into the shared cleanup below. */
+err: /* Discard the configuration strings. */
+ __wt_buf_free(session, &i1);
+ __wt_buf_free(session, &i2);
+ __wt_buf_free(session, &i3);
+
+ if (ret != 0 && conn != NULL)
+ WT_TRET(__wt_connection_close(conn));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c
new file mode 100644
index 00000000000..079bd05ff1e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_cache.c
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_config --
+ *	Load the cache's run-time settings (size or shared-cache reservation,
+ *	eviction targets and eviction thread counts) from the configuration.
+ */
+int
+__wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CACHE *cache;
+	WT_CONFIG_ITEM item;
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+	cache = conn->cache;
+
+	if (F_ISSET(conn, WT_CONN_CACHE_POOL)) {
+		/*
+		 * Shared cache: record the reserved size, falling back to the
+		 * chunk size when no reservation is configured.
+		 */
+		WT_RET(__wt_config_gets(
+		    session, cfg, "shared_cache.reserve", &item));
+		if (item.val == 0)
+			WT_RET(__wt_config_gets(
+			    session, cfg, "shared_cache.chunk", &item));
+		cache->cp_reserved = (uint64_t)item.val;
+	} else {
+		/* Private cache: the size comes straight from cache_size. */
+		WT_RET(__wt_config_gets(session, cfg, "cache_size", &item));
+		conn->cache_size = (uint64_t)item.val;
+	}
+
+	WT_RET(__wt_config_gets(session, cfg, "eviction_target", &item));
+	cache->eviction_target = (u_int)item.val;
+
+	WT_RET(__wt_config_gets(session, cfg, "eviction_trigger", &item));
+	cache->eviction_trigger = (u_int)item.val;
+
+	WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_target", &item));
+	cache->eviction_dirty_target = (u_int)item.val;
+
+	/*
+	 * The configured thread counts include the main eviction thread; the
+	 * connection tracks only the additional workers, so store one less
+	 * than each configured value.
+	 */
+	WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &item));
+	WT_ASSERT(session, item.val > 0);
+	conn->evict_workers_max = (u_int)item.val - 1;
+
+	WT_RET(__wt_config_gets(session, cfg, "eviction.threads_min", &item));
+	WT_ASSERT(session, item.val > 0);
+	conn->evict_workers_min = (u_int)item.val - 1;
+
+	if (conn->evict_workers_min > conn->evict_workers_max)
+		WT_RET_MSG(session, EINVAL,
+		    "eviction=(threads_min) cannot be greater than "
+		    "eviction=(threads_max)");
+
+	return (0);
+}
+
+/*
+ * __wt_cache_create --
+ *	Create the underlying cache.
+ *
+ *	Allocates the connection's cache structure, applies the run-time
+ *	configuration, joins the cache pool if one is configured, and then
+ *	allocates the eviction condition variables, locks and LRU queue.
+ *	Returns 0 on success.  A failure after the eviction target/trigger
+ *	check begins destroys the cache again and returns the error that
+ *	caused the failure.
+ */
+int
+__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CACHE *cache;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	conn = S2C(session);
+
+	WT_ASSERT(session, conn->cache == NULL ||
+	    (F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL));
+
+	WT_RET(__wt_calloc_def(session, 1, &conn->cache));
+
+	cache = conn->cache;
+
+	/* Use a common routine for run-time configuration options. */
+	WT_RET(__wt_cache_config(session, cfg));
+
+	/* Add the configured cache to the cache pool. */
+	if (F_ISSET(conn, WT_CONN_CACHE_POOL))
+		WT_RET(__wt_conn_cache_pool_open(session));
+
+	/*
+	 * The target size must be lower than the trigger size or we will never
+	 * get any work done.
+	 */
+	if (cache->eviction_target >= cache->eviction_trigger)
+		WT_ERR_MSG(session, EINVAL,
+		    "eviction target must be lower than the eviction trigger");
+
+	WT_ERR(__wt_cond_alloc(session,
+	    "cache eviction server", 0, &cache->evict_cond));
+	WT_ERR(__wt_cond_alloc(session,
+	    "eviction waiters", 0, &cache->evict_waiter_cond));
+	WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
+	WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));
+
+	/* Allocate the LRU eviction queue. */
+	cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
+	WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));
+
+	/*
+	 * We get/set some values in the cache statistics (rather than have
+	 * two copies), configure them.
+	 */
+	__wt_cache_stats_update(session);
+	return (0);
+
+err:	/*
+	 * Tear down whatever was built.  Use WT_TRET rather than WT_RET so a
+	 * failure during cleanup doesn't replace the error that brought us
+	 * here.
+	 */
+	WT_TRET(__wt_cache_destroy(session));
+	return (ret);
+}
+
+/*
+ * __wt_cache_stats_update --
+ *	Copy the cache's current size and usage figures into the connection
+ *	statistics returned to the application.
+ */
+void
+__wt_cache_stats_update(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_CONNECTION_STATS *stats;
+	WT_CACHE *cache;
+
+	conn = S2C(session);
+	stats = &conn->stats;
+	cache = conn->cache;
+
+	/* Configured maximum. */
+	WT_STAT_SET(stats, cache_bytes_max, conn->cache_size);
+
+	/* Amount currently held. */
+	WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache));
+	WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
+
+	/* Amount currently dirty. */
+	WT_STAT_SET(stats, cache_bytes_dirty, cache->bytes_dirty);
+	WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty);
+}
+
+/*
+ * __wt_cache_destroy --
+ *	Release the eviction condition variables, locks and queue, then the
+ *	cache structure itself.  A connection without a cache is a no-op.
+ */
+int
+__wt_cache_destroy(WT_SESSION_IMPL *session)
+{
+	WT_CACHE *cache;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	conn = S2C(session);
+	if ((cache = conn->cache) == NULL)
+		return (0);
+
+	/* Keep going after a failure, remembering the first error. */
+	WT_TRET(__wt_cond_destroy(session, &cache->evict_cond));
+	WT_TRET(__wt_cond_destroy(session, &cache->evict_waiter_cond));
+
+	__wt_spin_destroy(session, &cache->evict_lock);
+	__wt_spin_destroy(session, &cache->evict_walk_lock);
+
+	/* The eviction queue goes first, it hangs off the cache. */
+	__wt_free(session, cache->evict);
+	__wt_free(session, conn->cache);
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
new file mode 100644
index 00000000000..ba80ac15267
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -0,0 +1,639 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Tuning constants.
+ */
+/*
+ * Threshold when a connection is allocated more cache. Each balance pass
+ * starts from this value and lowers it by one per adjustment round.
+ */
+#define WT_CACHE_POOL_BUMP_THRESHOLD 6
+/*
+ * Threshold when a connection is allocated less cache: it is compared
+ * against a connection's read pressure.
+ */
+#define WT_CACHE_POOL_REDUCE_THRESHOLD 2
+/* Balancing passes after a bump before a connection is a candidate. */
+#define WT_CACHE_POOL_BUMP_SKIPS 10
+/* Balancing passes after a reduction before a connection is a candidate. */
+#define WT_CACHE_POOL_REDUCE_SKIPS 5
+
+/* Forward declarations of the file-local balancing helpers. */
+static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *);
+static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *);
+static int __cache_pool_balance(WT_SESSION_IMPL *);
+
+/*
+ * __wt_cache_pool_config --
+ *	Parse and setup the cache pool options.
+ *
+ *	On the first call for a connection this creates the process-wide
+ *	cache pool, or joins the existing one, when "shared_cache.name" is
+ *	set; it returns 0 without doing anything when no pool is named.  On
+ *	later calls (the connection already has WT_CONN_CACHE_POOL set) it
+ *	updates the pool's size and chunk settings.  Returns 0 on success,
+ *	EINVAL for an inconsistent or over-subscribed configuration, or the
+ *	error from a failed allocation or lookup.  If a first call fails, the
+ *	pool it created or the reference it took is released again, because
+ *	the connection is never flagged as a pool member.
+ */
+int
+__wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
+{
+	WT_CACHE_POOL *cp;
+	WT_CONFIG_ITEM cval, probe;
+	WT_CONNECTION_IMPL *conn, *entry;
+	WT_DECL_RET;
+	char *pool_name;
+	int created, reconfiguring, ref_taken;
+	uint64_t chunk, reserve, size, used_cache;
+
+	conn = S2C(session);
+	created = reconfiguring = ref_taken = 0;
+	pool_name = NULL;
+	cp = NULL;
+	size = 0;
+
+	if (F_ISSET(conn, WT_CONN_CACHE_POOL))
+		reconfiguring = 1;
+	else {
+		WT_RET(
+		    __wt_config_gets(session, cfg, "shared_cache.name", &cval));
+		if (cval.len == 0) {
+			/*
+			 * Tell the user if they configured some shared cache
+			 * settings, but didn't enable it by naming it.
+			 */
+			if (__wt_config_gets(session,
+			    &cfg[1], "shared_cache", &probe) != WT_NOTFOUND)
+				WT_RET_MSG(session, EINVAL,
+				    "Shared cache configuration requires a "
+				    "pool name");
+			return (0);
+		}
+
+		/*
+		 * Probe into a separate item: cval still holds the pool name
+		 * that is copied below and must not be overwritten.
+		 */
+		if (__wt_config_gets(session,
+		    &cfg[1], "cache_size", &probe) != WT_NOTFOUND)
+			WT_RET_MSG(session, EINVAL,
+			    "Only one of cache_size and shared_cache can be "
+			    "in the configuration");
+
+		/*
+		 * NOTE: The allocations made when configuring and opening a
+		 * cache pool don't really belong to the connection that
+		 * allocates them. If a memory allocator becomes connection
+		 * specific in the future we will need a way to allocate memory
+		 * outside of the connection here.
+		 */
+		WT_RET(__wt_strndup(session, cval.str, cval.len, &pool_name));
+	}
+
+	__wt_spin_lock(session, &__wt_process.spinlock);
+	if (__wt_process.cache_pool == NULL) {
+		WT_ASSERT(session, !reconfiguring);
+		/* Create a cache pool. */
+		WT_ERR(__wt_calloc_def(session, 1, &cp));
+		created = 1;
+		cp->name = pool_name;
+		pool_name = NULL; /* Belongs to the cache pool now. */
+		TAILQ_INIT(&cp->cache_pool_qh);
+		WT_ERR(__wt_spin_init(
+		    session, &cp->cache_pool_lock, "cache shared pool"));
+		WT_ERR(__wt_cond_alloc(session,
+		    "cache pool server", 0, &cp->cache_pool_cond));
+
+		__wt_process.cache_pool = cp;
+		WT_ERR(__wt_verbose(session,
+		    WT_VERB_SHARED_CACHE, "Created cache pool %s", cp->name));
+	} else if (!reconfiguring && !WT_STRING_MATCH(
+	    __wt_process.cache_pool->name, pool_name, strlen(pool_name)))
+		/* Only a single cache pool is supported. */
+		WT_ERR_MSG(session, WT_ERROR,
+		    "Attempting to join a cache pool that does not exist: %s",
+		    pool_name);
+
+	cp = __wt_process.cache_pool;
+
+	/*
+	 * The cache pool requires a reference count to avoid a race between
+	 * configuration/open and destroy.
+	 */
+	if (!reconfiguring) {
+		++cp->refs;
+		ref_taken = 1;
+	}
+
+	/*
+	 * Cache pool configurations are optional when not creating. If
+	 * values aren't being changed, retrieve the current value so that
+	 * validation of settings works.
+	 */
+	if (!created) {
+		if (__wt_config_gets(session, &cfg[1],
+		    "shared_cache.size", &cval) == 0 && cval.val != 0)
+			size = (uint64_t)cval.val;
+		else
+			size = cp->size;
+		if (__wt_config_gets(session, &cfg[1],
+		    "shared_cache.chunk", &cval) == 0 && cval.val != 0)
+			chunk = (uint64_t)cval.val;
+		else
+			chunk = cp->chunk;
+	} else {
+		/*
+		 * The only time shared cache configuration uses default
+		 * values is when we are creating the pool.
+		 */
+		WT_ERR(__wt_config_gets(
+		    session, cfg, "shared_cache.size", &cval));
+		WT_ASSERT(session, cval.val != 0);
+		size = (uint64_t)cval.val;
+		WT_ERR(__wt_config_gets(
+		    session, cfg, "shared_cache.chunk", &cval));
+		WT_ASSERT(session, cval.val != 0);
+		chunk = (uint64_t)cval.val;
+	}
+
+	/*
+	 * Retrieve the reserve size here for validation of configuration.
+	 * Don't save it yet since the connections cache is not created if
+	 * we are opening. Cache configuration is responsible for saving the
+	 * setting.
+	 * The different conditions when reserved size are set are:
+	 *  - It's part of the users configuration - use that value.
+	 *  - We are reconfiguring - keep the previous value.
+	 *  - We are joining a cache pool for the first time (including
+	 *  creating the pool) - use the chunk size; that's the default.
+	 */
+	if (__wt_config_gets(session, &cfg[1],
+	    "shared_cache.reserve", &cval) == 0 && cval.val != 0)
+		reserve = (uint64_t)cval.val;
+	else if (reconfiguring)
+		reserve = conn->cache->cp_reserved;
+	else
+		reserve = chunk;
+
+	/*
+	 * Validate that size and reserve values don't cause the cache
+	 * pool to be over subscribed.
+	 */
+	used_cache = 0;
+	if (!created) {
+		TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq)
+			used_cache += entry->cache->cp_reserved;
+	}
+	if (used_cache + reserve > size)
+		WT_ERR_MSG(session, EINVAL,
+		    "Shared cache unable to accommodate this configuration. "
+		    "Shared cache size: %" PRIu64 ", reserved: %" PRIu64,
+		    size, used_cache + reserve);
+
+	/* The configuration is verified - it's safe to update the pool. */
+	cp->size = size;
+	cp->chunk = chunk;
+
+	/* Wake up the cache pool server so any changes are noticed. */
+	if (reconfiguring)
+		WT_ERR(__wt_cond_signal(
+		    session, __wt_process.cache_pool->cache_pool_cond));
+
+	WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+	    "Configured cache pool %s. Size: %" PRIu64
+	    ", chunk size: %" PRIu64, cp->name, cp->size, cp->chunk));
+
+	F_SET(conn, WT_CONN_CACHE_POOL);
+
+err:	/*
+	 * Undo our changes to process-wide state while the process lock is
+	 * still held. On failure the connection is not flagged as a pool
+	 * member, so the pool destroy path will not do it for us.
+	 *  - A pool we created is about to be freed: make sure the process
+	 *  no longer points at it.
+	 *  - A reference taken on an existing pool must be given back, or
+	 *  the pool's reference count can never reach zero.
+	 */
+	if (ret != 0) {
+		if (created) {
+			if (__wt_process.cache_pool == cp)
+				__wt_process.cache_pool = NULL;
+		} else if (ref_taken)
+			--cp->refs;
+	}
+	__wt_spin_unlock(session, &__wt_process.spinlock);
+
+	/* The name is ours to free unless the pool took ownership of it. */
+	if (!reconfiguring)
+		__wt_free(session, pool_name);
+	if (ret != 0 && created) {
+		__wt_free(session, cp->name);
+		WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+		__wt_free(session, cp);
+	}
+	return (ret);
+}
+
+/*
+ * __wt_conn_cache_pool_open --
+ * Add a connection to the cache pool.
+ *
+ * Opens an internal session for the pool thread, appends the
+ * connection to the pool's queue, starts this connection's pool
+ * manager thread and signals the pool condition variable. Returns 0
+ * on success or the first error encountered.
+ * NOTE(review): a failure after the session is opened returns without
+ * undoing the earlier steps here; presumably the connection close path
+ * cleans up - confirm.
+ */
+int
+__wt_conn_cache_pool_open(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CACHE_POOL *cp;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ cp = __wt_process.cache_pool;
+
+ /*
+ * Create a session that can be used by the cache pool thread, do
+ * it in the main thread to avoid shutdown races
+ */
+ if ((ret = __wt_open_internal_session(
+ conn, "cache-pool", 0, 0, &cache->cp_session)) != 0)
+ WT_RET_MSG(NULL, ret,
+ "Failed to create session for cache pool");
+
+ /*
+ * Add this connection into the cache pool connection queue. Figure
+ * out if a manager thread is needed while holding the lock. Don't
+ * start the thread until we have released the lock.
+ */
+ __wt_spin_lock(session, &cp->cache_pool_lock);
+ TAILQ_INSERT_TAIL(&cp->cache_pool_qh, conn, cpq);
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Added %s to cache pool %s", conn->home, cp->name));
+
+ /*
+ * Each connection participating in the cache pool starts a manager
+ * thread. Only one manager is active at a time, but having a thread
+ * in each connection saves having a complex election process when
+ * the active connection shuts down.
+ */
+ /* Both flags are set before the thread is created. */
+ F_SET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE);
+ F_SET(cache, WT_CACHE_POOL_RUN);
+ WT_RET(__wt_thread_create(session, &cache->cp_tid,
+ __wt_cache_pool_server, cache->cp_session));
+
+ /* Wake up the cache pool server to get our initial chunk. */
+ WT_RET(__wt_cond_signal(session, cp->cache_pool_cond));
+
+ return (0);
+}
+
+/*
+ * __wt_conn_cache_pool_destroy --
+ * Remove our resources from the shared cache pool. Remove the cache pool
+ * if we were the last connection.
+ *
+ * Does nothing for a connection that is not flagged as a cache pool
+ * member. Errors from the individual shutdown steps are accumulated
+ * with WT_TRET and the shutdown continues; the value returned is the
+ * accumulated result.
+ */
+int
+__wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CACHE_POOL *cp;
+ WT_CONNECTION_IMPL *conn, *entry;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ int cp_locked, found;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ cp_locked = found = 0;
+ cp = __wt_process.cache_pool;
+
+ if (!F_ISSET(conn, WT_CONN_CACHE_POOL))
+ return (0);
+
+ /* cp_locked tracks whether this thread holds the pool lock. */
+ __wt_spin_lock(session, &cp->cache_pool_lock);
+ cp_locked = 1;
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq)
+ if (entry == conn) {
+ found = 1;
+ break;
+ }
+
+ /*
+ * If there was an error during open, we may not have made it onto the
+ * queue. We did increment the reference count, so proceed regardless.
+ */
+ if (found) {
+ WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Removing %s from cache pool", entry->home));
+ TAILQ_REMOVE(&cp->cache_pool_qh, entry, cpq);
+
+ /* Give the connection's resources back to the pool. */
+ WT_ASSERT(session, cp->currently_used >= conn->cache_size);
+ cp->currently_used -= conn->cache_size;
+
+ /*
+ * Stop our manager thread - release the cache pool lock while
+ * joining the thread to allow it to complete any balance
+ * operation.
+ */
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+ cp_locked = 0;
+
+ /* Clear the run flag, then signal so the thread notices. */
+ F_CLR(cache, WT_CACHE_POOL_RUN);
+ WT_TRET(__wt_cond_signal(session, cp->cache_pool_cond));
+ WT_TRET(__wt_thread_join(session, cache->cp_tid));
+
+ /* Close the internal session that was opened for the thread. */
+ wt_session = &cache->cp_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+
+ /*
+ * Grab the lock again now to stop other threads joining the
+ * pool while we are figuring out whether we were the last
+ * participant.
+ */
+ __wt_spin_lock(session, &cp->cache_pool_lock);
+ cp_locked = 1;
+ }
+
+ /*
+ * If there are no references, we are cleaning up after a failed
+ * wiredtiger_open, there is nothing further to do.
+ */
+ if (cp->refs < 1) {
+ if (cp_locked)
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+ return (0);
+ }
+
+ /* Dropping the last reference marks the pool inactive. */
+ if (--cp->refs == 0) {
+ WT_ASSERT(session, TAILQ_EMPTY(&cp->cache_pool_qh));
+ F_CLR_ATOMIC(cp, WT_CACHE_POOL_ACTIVE);
+ }
+
+ if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE)) {
+ WT_TRET(__wt_verbose(
+ session, WT_VERB_SHARED_CACHE, "Destroying cache pool"));
+ __wt_spin_lock(session, &__wt_process.spinlock);
+ /*
+ * We have been holding the pool lock - no connections could
+ * have been added.
+ */
+ WT_ASSERT(session,
+ cp == __wt_process.cache_pool &&
+ TAILQ_EMPTY(&cp->cache_pool_qh));
+ __wt_process.cache_pool = NULL;
+ __wt_spin_unlock(session, &__wt_process.spinlock);
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+ cp_locked = 0;
+
+ /* Now free the pool. */
+ __wt_free(session, cp->name);
+
+ __wt_spin_destroy(session, &cp->cache_pool_lock);
+ WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+ __wt_free(session, cp);
+ }
+
+ /* Still locked means the pool was not freed above. */
+ if (cp_locked) {
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+
+ /* Notify other participants if we were managing */
+ if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) {
+ F_CLR_ATOMIC(cp, WT_CACHE_POOL_MANAGED);
+ WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Shutting down shared cache manager connection"));
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __cache_pool_balance --
+ *	Run one balancing pass over the pool's connections: assess eviction
+ *	activity once, then repeatedly adjust allocations until they are
+ *	stable and within the pool size, or the pool or this connection's
+ *	manager is asked to stop.
+ */
+static int
+__cache_pool_balance(WT_SESSION_IMPL *session)
+{
+	WT_CACHE_POOL *cp;
+	WT_DECL_RET;
+	uint64_t highest, threshold;
+	int changed;
+
+	cp = __wt_process.cache_pool;
+	changed = 0;
+	highest = 0;
+
+	__wt_spin_lock(NULL, &cp->cache_pool_lock);
+
+	/* Only do work when there are connections in the pool. */
+	if (!TAILQ_EMPTY(&cp->cache_pool_qh)) {
+		WT_ERR(__cache_pool_assess(session, &highest));
+
+		/*
+		 * Actively attempt to:
+		 * - Reduce the amount allocated, if we are over the budget
+		 * - Increase the amount used if there is capacity and any
+		 *   pressure.
+		 * The threshold is lowered one step per round, down to zero.
+		 */
+		threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
+		while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+		    F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) {
+			WT_ERR(__cache_pool_adjust(
+			    session, highest, threshold, &changed));
+
+			/* Done once nothing moved and we're within budget. */
+			if (!changed && cp->currently_used <= cp->size)
+				break;
+			if (threshold > 0)
+				--threshold;
+		}
+	}
+
+err:	__wt_spin_unlock(NULL, &cp->cache_pool_lock);
+	return (ret);
+}
+
+/*
+ * __cache_pool_assess --
+ *	Record, for every pool member that has a cache and a non-zero cache
+ *	size, how much was evicted since the previous assessment, and return
+ *	through phighest a normalized, never-zero form of the largest such
+ *	amount.
+ */
+static int
+__cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
+{
+	WT_CACHE *cache;
+	WT_CACHE_POOL *cp;
+	WT_CONNECTION_IMPL *member;
+	uint64_t evicted, members, peak;
+
+	cp = __wt_process.cache_pool;
+	members = 0;
+	peak = 0;
+
+	/* Generate read pressure information. */
+	TAILQ_FOREACH(member, &cp->cache_pool_qh, cpq) {
+		cache = member->cache;
+		if (member->cache_size == 0 || cache == NULL)
+			continue;
+		++members;
+
+		/*
+		 * Work out the change since the last pass; if the counter is
+		 * now below the saved value (it wrapped), use the new value
+		 * as is.
+		 */
+		evicted = cache->bytes_evict;
+		cache->cp_current_evict = evicted >= cache->cp_saved_evict ?
+		    evicted - cache->cp_saved_evict : evicted;
+		cache->cp_saved_evict = evicted;
+
+		if (peak < cache->cp_current_evict)
+			peak = cache->cp_current_evict;
+	}
+	WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+	    "Highest eviction count: %" PRIu64 ", entries: %" PRIu64,
+	    peak, members));
+
+	/*
+	 * Normalize eviction information across connections, then add one so
+	 * callers can divide by the result.
+	 */
+	*phighest = peak / (members + 1) + 1;
+	return (0);
+}
+
+/*
+ * __cache_pool_adjust --
+ *	Adjust the allocation of cache to each connection in the pool.
+ *
+ *	highest is the normalized eviction figure computed by
+ *	__cache_pool_assess (never zero), bump_threshold is the read pressure
+ *	an entry must exceed before it is given more cache, and *adjustedp is
+ *	set to 1 if any entry's allocation was changed.
+ *
+ *	If the pool is over-committed (currently_used > size) the function
+ *	"forces": cache load information is ignored and the allocation is
+ *	reduced for every connection allocated more than its reserved size.
+ */
+static int
+__cache_pool_adjust(WT_SESSION_IMPL *session,
+    uint64_t highest, uint64_t bump_threshold, int *adjustedp)
+{
+	WT_CACHE_POOL *cp;
+	WT_CACHE *cache;
+	WT_CONNECTION_IMPL *entry;
+	uint64_t adjusted, reserved, read_pressure;
+	int force, grew;
+
+	*adjustedp = 0;
+	cp = __wt_process.cache_pool;
+	force = (cp->currently_used > cp->size);
+	grew = 0;
+	if (WT_VERBOSE_ISSET(session, WT_VERB_SHARED_CACHE)) {
+		WT_RET(__wt_verbose(session,
+		    WT_VERB_SHARED_CACHE, "Cache pool distribution: "));
+		WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+		    "\t" "cache_size, read_pressure, skips: "));
+	}
+
+	TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+		cache = entry->cache;
+		reserved = cache->cp_reserved;
+		adjusted = 0;
+
+		read_pressure = cache->cp_current_evict / highest;
+		WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+		    "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32,
+		    entry->cache_size, read_pressure, cache->cp_skip_count));
+
+		/* Allow to stabilize after changes. */
+		if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0)
+			continue;
+		/*
+		 * If the entry is currently allocated less than the reserved
+		 * size, increase its allocation. This should only happen if:
+		 *  - It's the first time we've seen this member
+		 *  - The reserved size has been adjusted
+		 */
+		if (entry->cache_size < reserved) {
+			grew = 1;
+			adjusted = reserved - entry->cache_size;
+		/*
+		 * Conditions for reducing the amount of resources for an
+		 * entry:
+		 *  - If we are forcing and this entry has more than the
+		 *    minimum amount of space in use.
+		 *  - If the read pressure in this entry is below the
+		 *    threshold, other entries need more cache, the entry has
+		 *    more than the minimum space and there is no available
+		 *    space in the pool.
+		 */
+		} else if ((force && entry->cache_size > reserved) ||
+		    (read_pressure < WT_CACHE_POOL_REDUCE_THRESHOLD &&
+		    highest > 1 && entry->cache_size > reserved &&
+		    cp->currently_used >= cp->size)) {
+			grew = 0;
+			/*
+			 * Shrink by a chunk size if that doesn't drop us
+			 * below the reserved size.
+			 */
+			if (entry->cache_size > cp->chunk + reserved)
+				adjusted = cp->chunk;
+			else
+				adjusted = entry->cache_size - reserved;
+		/*
+		 * Conditions for increasing the amount of resources for an
+		 * entry:
+		 *  - There was some activity across the pool
+		 *  - This entry is using less than the entire cache pool
+		 *  - The connection is using enough cache to require eviction
+		 *  - There is space available in the pool
+		 *  - Additional cache would benefit the connection
+		 */
+		} else if (highest > 1 &&
+		    entry->cache_size < cp->size &&
+		    cache->bytes_inmem >=
+		    (entry->cache_size * cache->eviction_target) / 100 &&
+		    cp->currently_used < cp->size &&
+		    read_pressure > bump_threshold) {
+			grew = 1;
+			adjusted = WT_MIN(cp->chunk,
+			    cp->size - cp->currently_used);
+		}
+		if (adjusted > 0) {
+			*adjustedp = 1;
+			if (grew > 0) {
+				cache->cp_skip_count = WT_CACHE_POOL_BUMP_SKIPS;
+				entry->cache_size += adjusted;
+				cp->currently_used += adjusted;
+			} else {
+				cache->cp_skip_count =
+				    WT_CACHE_POOL_REDUCE_SKIPS;
+				WT_ASSERT(session,
+				    entry->cache_size >= adjusted &&
+				    cp->currently_used >= adjusted);
+				entry->cache_size -= adjusted;
+				cp->currently_used -= adjusted;
+			}
+			/*
+			 * The adjustment is an unsigned 64-bit quantity, so
+			 * print it with the unsigned conversion; the sign is
+			 * supplied separately by the "%s" prefix.
+			 */
+			WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+			    "Allocated %s%" PRIu64 " to %s",
+			    grew ? "" : "-", adjusted, entry->home));
+			/*
+			 * TODO: Add a loop waiting for connection to give up
+			 * cache.
+			 */
+		}
+	}
+	return (0);
+}
+
+/*
+ * __wt_cache_pool_server --
+ *	Thread to manage cache pool among connections.
+ *
+ *	arg is the WT_SESSION_IMPL the thread runs with. The thread loops
+ *	while both the pool's WT_CACHE_POOL_ACTIVE flag and this connection's
+ *	WT_CACHE_POOL_RUN flag are set; on each pass it tries to claim the
+ *	pool's manager role and, if it holds that role, rebalances the pool.
+ *	Always returns NULL; errors are reported through __wt_err.
+ */
+void *
+__wt_cache_pool_server(void *arg)
+{
+	WT_CACHE *cache;
+	WT_CACHE_POOL *cp;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	session = (WT_SESSION_IMPL *)arg;
+
+	cp = __wt_process.cache_pool;
+	cache = S2C(session)->cache;
+
+	while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+	    F_ISSET(cache, WT_CACHE_POOL_RUN)) {
+		/* Don't wait if the pool is over-committed. */
+		if (cp->currently_used <= cp->size)
+			WT_ERR(__wt_cond_wait(session,
+			    cp->cache_pool_cond, 1000000));
+
+		/*
+		 * Re-check the run flags after the wait, since we want to
+		 * avoid getting the lock on shutdown. Stop if either the pool
+		 * has been deactivated or this connection was told to stop,
+		 * that is, the exact negation of the loop condition.
+		 */
+		if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) ||
+		    !F_ISSET(cache, WT_CACHE_POOL_RUN))
+			break;
+
+		/* Try to become the managing thread */
+		F_CAS_ATOMIC(cp, WT_CACHE_POOL_MANAGED, ret);
+		if (ret == 0) {
+			F_SET(cache, WT_CACHE_POOL_MANAGER);
+			WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+			    "Cache pool switched manager thread"));
+		}
+
+		/*
+		 * Continue even if there was an error. Details of errors are
+		 * reported in the balance function.
+		 */
+		if (F_ISSET(cache, WT_CACHE_POOL_MANAGER))
+			(void)__cache_pool_balance(session);
+	}
+
+	if (0) {
+err:		__wt_err(session, ret, "cache pool manager server error");
+	}
+	return (NULL);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
new file mode 100644
index 00000000000..ab97d4ead46
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -0,0 +1,228 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_server_start(WT_CONNECTION_IMPL *);
+
+/*
+ * __ckpt_server_config --
+ *	Read the checkpoint server settings from the configuration, store
+ *	them in the connection and tell the caller (via startp) whether a
+ *	server thread is wanted.
+ */
+static int
+__ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *startp)
+{
+	WT_CONFIG_ITEM cval;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	char *name_cfg;
+
+	conn = S2C(session);
+
+	/* Periodic wait time, configured in seconds, kept in microseconds. */
+	WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
+	conn->ckpt_usecs = (long)cval.val * 1000000;
+
+	/* Log-size trigger. */
+	WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval));
+	conn->ckpt_logsize = (wt_off_t)cval.val;
+	__wt_log_written_reset(session);
+
+	/*
+	 * The server needs a wait time and/or a log size to run. With no
+	 * wait time, a log size only counts if logging is enabled.
+	 */
+	*startp = (conn->ckpt_usecs == 0 &&
+	    (conn->ckpt_logsize == 0 || !conn->logging)) ? 0 : 1;
+	if (*startp == 0)
+		return (0);
+
+	/*
+	 * The application can specify a checkpoint name, which we ignore if
+	 * it's our default.
+	 */
+	WT_RET(__wt_config_gets(session, cfg, "checkpoint.name", &cval));
+	if (cval.len != 0 &&
+	    !WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
+		WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len));
+
+		/* Build "name=<value>" in scratch space, then keep a copy. */
+		WT_RET(__wt_scr_alloc(session, cval.len + 20, &tmp));
+		WT_ERR(__wt_buf_fmt(
+		    session, tmp, "name=%.*s", (int)cval.len, cval.str));
+		WT_ERR(__wt_strdup(session, tmp->data, &name_cfg));
+
+		/* Replace any previous configuration string. */
+		__wt_free(session, conn->ckpt_config);
+		conn->ckpt_config = name_cfg;
+	}
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __ckpt_server --
+ *	Body of the checkpoint server thread: checkpoint, then sleep on the
+ *	server's condition variable, until told to stop.
+ */
+static void *
+__ckpt_server(void *arg)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_SESSION *wt_session;
+	WT_SESSION_IMPL *session;
+
+	session = arg;
+	wt_session = (WT_SESSION *)session;
+	conn = S2C(session);
+
+	for (;;) {
+		/* Stop once either run flag has been cleared. */
+		if (!F_ISSET(conn, WT_CONN_SERVER_RUN) ||
+		    !F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT))
+			break;
+
+		/* Checkpoint the database. */
+		WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config));
+
+		/* Reset the log-size trigger state, if one is configured. */
+		if (conn->ckpt_logsize) {
+			__wt_log_written_reset(session);
+			conn->ckpt_signalled = 0;
+		}
+
+		/*
+		 * Wait...
+		 * NOTE: If the user only configured logsize, then usecs
+		 * will be 0 and this wait won't return until signalled.
+		 */
+		WT_ERR(__wt_cond_wait(
+		    session, conn->ckpt_cond, conn->ckpt_usecs));
+	}
+
+	if (0) {
+err:		__wt_err(session, ret, "checkpoint server error");
+	}
+	return (NULL);
+}
+
+/*
+ * __ckpt_server_start --
+ *	Create the checkpoint server's session, condition variable and
+ *	thread, unless a server session already exists.
+ */
+static int
+__ckpt_server_start(WT_CONNECTION_IMPL *conn)
+{
+	WT_SESSION_IMPL *server_session;
+
+	if (conn->ckpt_session == NULL) {
+		F_SET(conn, WT_CONN_SERVER_CHECKPOINT);
+
+		/* The checkpoint server gets its own session. */
+		WT_RET(__wt_open_internal_session(
+		    conn, "checkpoint-server", 1, 1, &conn->ckpt_session));
+		server_session = conn->ckpt_session;
+
+		/*
+		 * Checkpoint does enough I/O it may be called upon to perform
+		 * slow operations for the block manager.
+		 */
+		F_SET(server_session, WT_SESSION_CAN_WAIT);
+
+		WT_RET(__wt_cond_alloc(server_session,
+		    "checkpoint server", 0, &conn->ckpt_cond));
+
+		/* Start the thread and remember that it needs joining. */
+		WT_RET(__wt_thread_create(server_session,
+		    &conn->ckpt_tid, __ckpt_server, server_session));
+		conn->ckpt_tid_set = 1;
+	}
+
+	/* Nothing to do if the server is already running. */
+	return (0);
+}
+
+/*
+ * __wt_checkpoint_server_create --
+ *	(Re)configure the checkpoint server, starting a thread if the
+ *	configuration asks for one.
+ */
+int
+__wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CONNECTION_IMPL *conn;
+	int want_server;
+
+	conn = S2C(session);
+	want_server = 0;
+
+	/* An existing server is shut down before being reconfigured. */
+	if (conn->ckpt_session != NULL)
+		WT_RET(__wt_checkpoint_server_destroy(session));
+
+	WT_RET(__ckpt_server_config(session, cfg, &want_server));
+
+	return (want_server ? __ckpt_server_start(conn) : 0);
+}
+
+/*
+ * __wt_checkpoint_server_destroy --
+ * Destroy the checkpoint server thread.
+ *
+ * Clears WT_CONN_SERVER_CHECKPOINT (one of the flags the server loop
+ * tests), signals and joins the thread if one was started, destroys the
+ * condition variable, frees the configuration string and closes the
+ * server's session. Each step's result is folded into the return value
+ * with WT_TRET, and the connection's checkpoint fields are reset before
+ * returning.
+ */
+int
+__wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
+ if (conn->ckpt_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->ckpt_cond)); /* Wake the server so it sees the cleared flag. */
+ WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
+ conn->ckpt_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond));
+
+ __wt_free(session, conn->ckpt_config);
+
+ /* Close the server thread's session. */
+ if (conn->ckpt_session != NULL) {
+ wt_session = &conn->ckpt_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+
+ /*
+ * Ensure checkpoint settings are cleared - so that reconfigure doesn't
+ * get confused.
+ */
+ conn->ckpt_session = NULL;
+ conn->ckpt_tid_set = 0;
+ conn->ckpt_cond = NULL;
+ conn->ckpt_config = NULL;
+ conn->ckpt_usecs = 0;
+
+ return (ret);
+}
+
+/*
+ * __wt_checkpoint_signal --
+ *	Wake the checkpoint thread once the amount of log written reaches
+ *	the configured log size, unless it has already been signalled.
+ */
+int
+__wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+	WT_ASSERT(session, WT_CKPT_LOGSIZE(conn));
+
+	/* Not enough log yet, or a signal is already outstanding. */
+	if (logsize < conn->ckpt_logsize || conn->ckpt_signalled)
+		return (0);
+
+	WT_RET(__wt_cond_signal(session, conn->ckpt_cond));
+	conn->ckpt_signalled = 1;
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
new file mode 100644
index 00000000000..f4f540e33c7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -0,0 +1,694 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __conn_dhandle_open_lock --
+ *	Spin on the current data handle until either (a) it is open, read
+ *	locked; or (b) it is closed, write locked. If exclusive access is
+ *	requested and cannot be granted immediately, fail with EBUSY.
+ *
+ *	flags is the caller's handle-open flag set; WT_DHANDLE_EXCLUSIVE
+ *	requests the write lock. Returns 0 with the handle locked, EBUSY if
+ *	the handle is in use in an incompatible mode, or the error returned
+ *	by the underlying lock call.
+ */
+static int
+__conn_dhandle_open_lock(
+    WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, uint32_t flags)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+
+	btree = dhandle->handle;
+
+	/*
+	 * Check that the handle is open.  We've already incremented
+	 * the reference count, so once the handle is open it won't be
+	 * closed by another thread.
+	 *
+	 * If we can see the WT_DHANDLE_OPEN flag set while holding a
+	 * lock on the handle, then it's really open and we can start
+	 * using it.  Alternatively, if we can get an exclusive lock
+	 * and WT_DHANDLE_OPEN is still not set, we need to do the open.
+	 */
+	for (;;) {
+		/*
+		 * A handle carrying special flags can only be shared with a
+		 * caller that asked for exclusive access.
+		 */
+		if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE) &&
+		    F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
+			return (EBUSY);
+
+		if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+		    !LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+			WT_RET(__wt_readlock(session, dhandle->rwlock));
+			/* Re-check under the lock: it may have been closed. */
+			if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
+				return (0);
+			WT_RET(__wt_readunlock(session, dhandle->rwlock));
+		}
+
+		/*
+		 * It isn't open or we want it exclusive: try to get an
+		 * exclusive lock.  There is some subtlety here: if we race
+		 * with another thread that successfully opens the file, we
+		 * don't want to block waiting to get exclusive access.
+		 */
+		if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) {
+			/*
+			 * If it was opened while we waited, drop the write
+			 * lock and get a read lock instead.
+			 */
+			if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+			    !LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+				WT_RET(
+				    __wt_writeunlock(session, dhandle->rwlock));
+				continue;
+			}
+
+			/* We have an exclusive lock, we're done. */
+			F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+			return (0);
+		} else if (ret != EBUSY || LF_ISSET(WT_DHANDLE_EXCLUSIVE))
+			/*
+			 * Either the lock call failed for a reason other than
+			 * contention, or exclusive access was requested and
+			 * the lock is busy.  Return the actual error rather
+			 * than disguising a real failure as EBUSY.
+			 */
+			return (ret);
+
+		/* Give other threads a chance to make progress. */
+		__wt_yield();
+	}
+}
+
+/*
+ * __conn_dhandle_get --
+ * Find an open btree file handle, otherwise create a new one, lock it
+ * exclusively, and return it linked into the connection's list.
+ *
+ * name is the object name and ckpt the checkpoint name (NULL for none);
+ * both must match an existing handle for it to be reused. On success the
+ * handle is stored in session->dhandle. An existing handle is locked via
+ * __conn_dhandle_open_lock according to flags; a newly created handle is
+ * always write locked and flagged WT_DHANDLE_EXCLUSIVE.
+ */
+static int
+__conn_dhandle_get(WT_SESSION_IMPL *session,
+ const char *name, const char *ckpt, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ uint64_t hash;
+
+ conn = S2C(session);
+
+ /* We must be holding the schema lock at a higher level. */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
+ !LF_ISSET(WT_DHANDLE_HAVE_REF));
+
+ /*
+ * Increment the reference count if we already have the btree open.
+ * The name hash is compared first so strcmp only runs on likely hits.
+ */
+ hash = __wt_hash_city64(name, strlen(name));
+ SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ if ((hash == dhandle->name_hash &&
+ strcmp(name, dhandle->name) == 0) &&
+ ((ckpt == NULL && dhandle->checkpoint == NULL) ||
+ (ckpt != NULL && dhandle->checkpoint != NULL &&
+ strcmp(ckpt, dhandle->checkpoint) == 0))) {
+ WT_RET(__conn_dhandle_open_lock(
+ session, dhandle, flags));
+ (void)WT_ATOMIC_ADD4(dhandle->session_ref, 1);
+ session->dhandle = dhandle;
+ return (0);
+ }
+
+ /*
+ * Allocate the data source handle and underlying btree handle, then
+ * initialize the data source handle. Exclusively lock the data
+ * source handle before inserting it in the list.
+ */
+ WT_RET(__wt_calloc_def(session, 1, &dhandle));
+
+ WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));
+ dhandle->session_ref = 1;
+
+ dhandle->name_hash = hash;
+ WT_ERR(__wt_strdup(session, name, &dhandle->name));
+ if (ckpt != NULL)
+ WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));
+
+ WT_ERR(__wt_calloc_def(session, 1, &btree));
+ dhandle->handle = btree;
+ btree->dhandle = dhandle;
+
+ WT_ERR(__wt_spin_init(
+ session, &dhandle->close_lock, "data handle close"));
+
+ F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_ERR(__wt_writelock(session, dhandle->rwlock));
+
+ /*
+ * Prepend the handle to the connection list, assuming we're likely to
+ * need new files again soon, until they are cached by all sessions.
+ *
+ * !!!
+ * We hold only the schema lock here, not the dhandle lock. Eviction
+ * walks this list only holding the dhandle lock. This works because
+ * we're inserting at the beginning of the list, and we're only
+ * publishing this one entry per lock acquisition. Eviction either
+ * sees our newly added entry or the former head of the list, and it
+ * doesn't matter which (if eviction only sees a single element in the
+ * list because the insert races, it will return without finding enough
+ * candidates for eviction, and will then retry).
+ */
+ SLIST_INSERT_HEAD(&conn->dhlh, dhandle, l);
+
+ session->dhandle = dhandle;
+ return (0);
+
+err: WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock)); /* Tear down the partially built handle. */
+ __wt_free(session, dhandle->name);
+ __wt_free(session, dhandle->checkpoint);
+ __wt_free(session, dhandle->handle); /* btree free */
+ __wt_spin_destroy(session, &dhandle->close_lock);
+ __wt_overwrite_and_free(session, dhandle);
+
+ return (ret);
+}
+
+/*
+ * __wt_conn_btree_sync_and_close --
+ * Sync and close the underlying btree handle.
+ *
+ * Operates on session->dhandle and is a no-op if that handle is not
+ * open. force is passed through to __wt_checkpoint_close. The handle's
+ * close lock is held for the duration of the close. If the checkpoint
+ * close fails the handle is left open and the error is returned.
+ */
+int
+__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force)
+{
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ int no_schema_lock;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ return (0);
+
+ /*
+ * If we don't already have the schema lock, make it an error to try
+ * to acquire it. The problem is that we are holding an exclusive
+ * lock on the handle, and if we attempt to acquire the schema lock
+ * we might deadlock with a thread that has the schema lock and wants
+ * a handle lock (specifically, checkpoint).
+ */
+ no_schema_lock = 0;
+ if (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
+ no_schema_lock = 1;
+ F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);
+ }
+
+ /*
+ * We may not be holding the schema lock, and threads may be walking
+ * the list of open handles (for example, checkpoint). Acquire the
+ * handle's close lock.
+ */
+ __wt_spin_lock(session, &dhandle->close_lock);
+
+ /*
+ * The close can fail if an update cannot be written, return the EBUSY
+ * error to our caller for eventual retry.
+ *
+ * Handles opened for salvage, upgrade or verify skip the checkpoint.
+ */
+ if (!F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+ WT_ERR(__wt_checkpoint_close(session, force));
+
+ if (dhandle->checkpoint == NULL)
+ --S2C(session)->open_btree_count; /* Only non-checkpoint handles are counted. */
+
+ WT_TRET(__wt_btree_close(session));
+ F_CLR(dhandle, WT_DHANDLE_OPEN);
+ F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
+
+err: __wt_spin_unlock(session, &dhandle->close_lock);
+
+ if (no_schema_lock)
+ F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK); /* Only clear the flag if this call set it. */
+
+ return (ret);
+}
+
+/*
+ * __conn_btree_config_clear --
+ *	Free each string in the current data handle's NULL-terminated
+ *	configuration array, then the array itself.
+ */
+static void
+__conn_btree_config_clear(WT_SESSION_IMPL *session)
+{
+	WT_DATA_HANDLE *dhandle;
+	const char **entry;
+
+	dhandle = session->dhandle;
+
+	/* Nothing to release if no configuration was ever set. */
+	if ((entry = dhandle->cfg) != NULL) {
+		while (*entry != NULL) {
+			__wt_free(session, *entry);
+			++entry;
+		}
+		__wt_free(session, dhandle->cfg);
+	}
+}
+
+/*
+ * __conn_btree_config_set --
+ *	Build the current data handle's configuration array from the base
+ *	file_meta configuration and the object's metadata entry.
+ */
+static int
+__conn_btree_config_set(WT_SESSION_IMPL *session)
+{
+	WT_DATA_HANDLE *dhandle;
+	WT_DECL_RET;
+	const char *metaconf;
+
+	dhandle = session->dhandle;
+
+	/*
+	 * Read the object's entry from the metadata file, we're done if we
+	 * don't find one; a missing entry is reported as ENOENT.
+	 */
+	ret = __wt_metadata_search(session, dhandle->name, &metaconf);
+	if (ret != 0)
+		return (ret == WT_NOTFOUND ? ENOENT : ret);
+
+	/*
+	 * The defaults are included because underlying objects have persistent
+	 * configuration information stored in the metadata file.  If defaults
+	 * are included in the configuration, we can add new configuration
+	 * strings without upgrading the metadata file or writing special code
+	 * in case a configuration string isn't initialized, as long as the new
+	 * configuration string has an appropriate default value.
+	 *
+	 * We own the memory in metaconf: once it has been stored in the
+	 * array the array owns it, but on any earlier failure it must be
+	 * freed here.
+	 */
+	if ((ret = __wt_calloc_def(session, 3, &dhandle->cfg)) == 0 &&
+	    (ret = __wt_strdup(session,
+	    WT_CONFIG_BASE(session, file_meta), &dhandle->cfg[0])) == 0) {
+		dhandle->cfg[1] = metaconf;
+		return (0);
+	}
+
+	__wt_free(session, metaconf);
+	return (ret);
+}
+
+/*
+ * __conn_btree_open --
+ * Open the current btree handle.
+ *
+ * Operates on session->dhandle, which the caller must hold exclusively
+ * while also holding the schema lock (both are asserted). op_cfg is
+ * passed through to __wt_btree_open; flags carries the caller's
+ * handle-open flags, including any WT_BTREE_SPECIAL_FLAGS to set on the
+ * btree. If the caller did not request WT_DHANDLE_EXCLUSIVE, the write
+ * lock is released after the open and the handle is re-locked through
+ * __conn_dhandle_open_lock.
+ */
+static int
+__conn_btree_open(
+ WT_SESSION_IMPL *session, const char *op_cfg[], uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
+ F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
+ !LF_ISSET(WT_DHANDLE_LOCK_ONLY));
+
+ /*
+ * If the handle is already open, it has to be closed so it can be
+ * reopened with a new configuration. We don't need to check again:
+ * this function isn't called if the handle is already open in the
+ * required mode.
+ *
+ * This call can return EBUSY if there's an update in the object that's
+ * not yet globally visible. That's not a problem because it can only
+ * happen when we're switching from a normal handle to a "special" one,
+ * so we're returning EBUSY to an attempt to verify or do other special
+ * operations. The reverse won't happen because when the handle from a
+ * verify or other special operation is closed, there won't be updates
+ * in the tree that can block the close.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ WT_RET(__wt_conn_btree_sync_and_close(session, 0));
+
+ /* Discard any previous configuration, set up the new configuration. */
+ __conn_btree_config_clear(session);
+ WT_RET(__conn_btree_config_set(session));
+
+ /* Set any special flags on the handle. */
+ F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));
+
+ do {
+ WT_ERR(__wt_btree_open(session, op_cfg));
+ F_SET(dhandle, WT_DHANDLE_OPEN);
+ /*
+ * Checkpoint handles are read only, so eviction calculations
+ * based on the number of btrees are better to ignore them.
+ */
+ if (dhandle->checkpoint == NULL)
+ ++S2C(session)->open_btree_count;
+
+ /*
+ * Drop back to a readlock if that is all that was needed. The
+ * enclosing loop repeats the open if the handle is no longer
+ * flagged open once it has been re-locked.
+ */
+ if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_ERR(__wt_writeunlock(session, dhandle->rwlock));
+ WT_ERR(
+ __conn_dhandle_open_lock(session, dhandle, flags));
+ }
+ } while (!F_ISSET(dhandle, WT_DHANDLE_OPEN));
+
+ if (0) {
+err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
+ /*
+ * If the open failed, close the handle. If there was no
+ * reference to the handle in this session, we incremented the
+ * session reference count, so decrement it here. Otherwise,
+ * just close the handle without decrementing.
+ */
+ if (!LF_ISSET(WT_DHANDLE_HAVE_REF))
+ __wt_conn_btree_close(session);
+ else if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_conn_btree_get --
+ * Get an open btree file handle, otherwise open a new one.
+ *
+ * With WT_DHANDLE_HAVE_REF set in flags, the handle already in
+ * session->dhandle is locked; otherwise the handle is looked up (or
+ * created) by name and ckpt. Unless WT_DHANDLE_LOCK_ONLY is set, the
+ * btree is then opened if it is not already open, or reopened if any
+ * WT_BTREE_SPECIAL_FLAGS are requested. If that open fails, the
+ * exclusive flag is cleared and the write lock released before the
+ * error is returned.
+ */
+int
+__wt_conn_btree_get(WT_SESSION_IMPL *session,
+ const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ if (LF_ISSET(WT_DHANDLE_HAVE_REF))
+ WT_RET(
+ __conn_dhandle_open_lock(session, session->dhandle, flags));
+ else
+ WT_RET(__conn_dhandle_get(session, name, ckpt, flags));
+ dhandle = session->dhandle;
+
+ if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
+ (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
+ LF_ISSET(WT_BTREE_SPECIAL_FLAGS)))
+ if ((ret = __conn_btree_open(session, op_cfg, flags)) != 0) {
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ }
+
+ WT_ASSERT(session, ret != 0 ||
+ LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
+ F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE)); /* On success the lock mode must match the request. */
+
+ return (ret);
+}
+
+/*
+ * __wt_conn_btree_apply --
+ * Apply a function to all open btree handles apart from the metadata
+ * file.
+ *
+ * Only handles whose name starts with "file:" are visited; checkpoint
+ * handles are included only if apply_checkpoints is non-zero. func is
+ * called with the handle installed in the session and cfg passed
+ * through. The caller must hold the schema lock (asserted). The walk
+ * stops at the first error, which is returned.
+ */
+int
+__wt_conn_btree_apply(WT_SESSION_IMPL *session,
+ int apply_checkpoints,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ WT_PREFIX_MATCH(dhandle->name, "file:") &&
+ (apply_checkpoints || dhandle->checkpoint == NULL) &&
+ !WT_IS_METADATA(dhandle)) {
+ /*
+ * We need to pull the handle into the session handle
+ * cache and make sure it's referenced to stop other
+ * internal code dropping the handle (e.g in LSM when
+ * cleaning up obsolete chunks). Holding the metadata
+ * lock isn't enough.
+ *
+ * If the handle can't be acquired because it is busy,
+ * fall back to applying the function under the
+ * handle's close lock.
+ */
+ ret = __wt_session_get_btree(session,
+ dhandle->name, dhandle->checkpoint, NULL, 0);
+ if (ret == 0) {
+ ret = func(session, cfg);
+ if (WT_META_TRACKING(session))
+ WT_TRET(__wt_meta_track_handle_lock(
+ session, 0));
+ else
+ WT_TRET(__wt_session_release_btree(
+ session));
+ } else if (ret == EBUSY)
+ ret = __wt_conn_btree_apply_single(
+ session, dhandle->name,
+ dhandle->checkpoint, func, cfg);
+ WT_RET(ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_conn_btree_apply_single --
+ *	Run a function against the handle(s) matching a URI and checkpoint
+ *	name, for use when the handle couldn't be locked normally (getting
+ *	it returned EBUSY). The session's current handle is restored before
+ *	returning.
+ */
+int
+__wt_conn_btree_apply_single(WT_SESSION_IMPL *session,
+    const char *uri, const char *checkpoint,
+    int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DATA_HANDLE *dhandle, *saved_dhandle;
+	WT_DECL_RET;
+
+	conn = S2C(session);
+	saved_dhandle = session->dhandle;
+
+	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+	SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+		/* Skip handles for other objects. */
+		if (strcmp(dhandle->name, uri) != 0)
+			continue;
+
+		/* Skip handles whose checkpoint name doesn't match. */
+		if (dhandle->checkpoint == NULL) {
+			if (checkpoint != NULL)
+				continue;
+		} else if (checkpoint == NULL ||
+		    strcmp(dhandle->checkpoint, checkpoint) != 0)
+			continue;
+
+		/*
+		 * We're holding the schema lock which locks out handle
+		 * open (which might change the state of the underlying
+		 * object).  However, closing a handle doesn't require
+		 * the schema lock, lock out closing the handle and then
+		 * confirm the handle is still open.
+		 */
+		__wt_spin_lock(session, &dhandle->close_lock);
+		if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+			session->dhandle = dhandle;
+			ret = func(session, cfg);
+		}
+		__wt_spin_unlock(session, &dhandle->close_lock);
+
+		/* Stop at the first failure. */
+		if (ret != 0)
+			break;
+	}
+
+	session->dhandle = saved_dhandle;
+	return (ret);
+}
+
+/*
+ * __wt_conn_btree_close --
+ *	Drop this session's reference to its current data handle by
+ *	atomically decrementing the handle's session reference count.
+ */
+void
+__wt_conn_btree_close(WT_SESSION_IMPL *session)
+{
+	WT_DATA_HANDLE *dhandle;
+
+	dhandle = session->dhandle;
+	(void)WT_ATOMIC_SUB4(dhandle->session_ref, 1);
+}
+
+/*
+ * __wt_conn_dhandle_close_all --
+ * Close all data handles with matching name (including all
+ * checkpoint handles).
+ *
+ * The caller must hold the schema lock and have no current data handle
+ * (both asserted). Each matching handle is locked exclusively, closed
+ * if open (force is passed to __wt_conn_btree_sync_and_close) and then
+ * released unless metadata tracking is on. The walk stops at the first
+ * error; session->dhandle is reset to NULL before returning.
+ */
+int
+__wt_conn_dhandle_close_all(
+ WT_SESSION_IMPL *session, const char *name, int force)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+ WT_ASSERT(session, session->dhandle == NULL);
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (strcmp(dhandle->name, name) != 0)
+ continue;
+
+ session->dhandle = dhandle;
+
+ /* Lock the handle exclusively. */
+ WT_ERR(__wt_session_get_btree(session,
+ dhandle->name, dhandle->checkpoint,
+ NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_handle_lock(session, 0));
+
+ /*
+ * We have an exclusive lock, which means there are no cursors
+ * open at this point. Close the handle, if necessary.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ if ((ret = __wt_meta_track_sub_on(session)) == 0)
+ ret = __wt_conn_btree_sync_and_close(
+ session, force);
+
+ /*
+ * If the close succeeded, drop any locks it acquired.
+ * If there was a failure, this function will fail and
+ * the whole transaction will be rolled back.
+ */
+ if (ret == 0)
+ ret = __wt_meta_track_sub_off(session);
+ }
+
+ if (!WT_META_TRACKING(session))
+ WT_TRET(__wt_session_release_btree(session));
+
+ WT_ERR(ret);
+ }
+
+err: session->dhandle = NULL;
+ return (ret);
+}
+
+/*
+ * __wt_conn_dhandle_discard_single --
+ * Close/discard a single data handle.
+ *
+ * final is non-zero for the final close and zero for the periodic
+ * sweep. With final zero, EBUSY is returned instead of closing an open
+ * handle, blocking on the dhandle lock, or discarding a handle that
+ * still has session references. On success the handle is removed from
+ * the connection's list and all of its memory is freed. The session's
+ * previous data handle is restored before returning.
+ */
+int
+__wt_conn_dhandle_discard_single(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, int final)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *save_dhandle;
+ WT_DECL_RET;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ conn = S2C(session);
+
+ save_dhandle = session->dhandle;
+ session->dhandle = dhandle;
+
+ /*
+ * We're called from the periodic sweep function and the final close;
+ * the former wants to continue if the handle is suddenly found to be
+ * busy, the latter wants to shut things down.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ if (!final)
+ WT_ERR(EBUSY);
+ WT_ERR(__wt_conn_btree_sync_and_close(session, 0));
+ }
+
+ /*
+ * Get the schema lock (required to remove entries from the data handle
+ * list), get the dhandle lock to block the eviction server from
+ * walking the list.
+ */
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED);
+ __wt_spin_lock(session, &conn->schema_lock);
+
+ /*
+ * If the eviction server is running, don't block waiting for it while
+ * holding the schema lock. The sweep server will try again.
+ */
+ if (final)
+ __wt_spin_lock(session, &conn->dhandle_lock);
+ else if ((ret =
+ __wt_spin_trylock(session, &conn->dhandle_lock, &id)) != 0)
+ goto unlock;
+
+ /*
+ * Check if the handle was reacquired by a session while we waited;
+ * this should only happen when called from the periodic sweep code, of
+ * course.
+ */
+ if (!final && dhandle->session_ref != 0)
+ ret = EBUSY;
+ else
+ SLIST_REMOVE(&conn->dhlh, dhandle, __wt_data_handle, l);
+
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+
+unlock: __wt_spin_unlock(session, &conn->schema_lock);
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
+
+ /*
+ * After successfully removing the handle, clean it up.
+ */
+ if (ret == 0) {
+ WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
+ __wt_free(session, dhandle->name);
+ __wt_free(session, dhandle->checkpoint);
+ __conn_btree_config_clear(session); /* Frees the cfg array of session->dhandle, i.e. this handle. */
+ __wt_free(session, dhandle->handle);
+ __wt_spin_destroy(session, &dhandle->close_lock);
+ __wt_overwrite_and_free(session, dhandle);
+
+ WT_CLEAR_BTREE_IN_SESSION(session);
+ }
+
+err: session->dhandle = save_dhandle;
+ WT_ASSERT(session, !final || ret == 0);
+ return (ret);
+}
+
+/*
+ * __wt_conn_dhandle_discard --
+ * Close/discard all data handles.
+ *
+ * Every handle is discarded through __wt_conn_dhandle_discard_single
+ * with final set. Errors are accumulated with WT_TRET rather than
+ * stopping the shutdown, and the accumulated result is returned.
+ */
+int
+__wt_conn_dhandle_discard(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ /*
+ * Close open data handles: first, everything but the metadata file
+ * (as closing a normal file may open and write the metadata file),
+ * then the metadata file. This function isn't called often, and I
+ * don't want to "know" anything about the metadata file's position on
+ * the list, so we do it the hard way.
+ *
+ * The walk restarts from the head after every discard because the
+ * discard removes the current entry from the list being walked.
+ */
+restart:
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (WT_IS_METADATA(dhandle))
+ continue;
+
+ WT_TRET(__wt_conn_dhandle_discard_single(session, dhandle, 1));
+ goto restart;
+ }
+
+ /*
+ * Closing the files may have resulted in entries on our default
+ * session's list of open data handles, specifically, we added the
+ * metadata file if any of the files were dirty. Clean up that list
+ * before we shut down the metadata entry, for good.
+ */
+ __wt_session_close_cache(session);
+ F_SET(session, WT_SESSION_NO_DATA_HANDLES);
+
+ /* Close the metadata file handle. */
+ while ((dhandle = SLIST_FIRST(&conn->dhlh)) != NULL)
+ WT_TRET(__wt_conn_dhandle_discard_single(session, dhandle, 1));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c
new file mode 100644
index 00000000000..e4f0a6ddd73
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_handle.c
@@ -0,0 +1,142 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_connection_init --
+ * Structure initialization for a just-created WT_CONNECTION_IMPL handle.
+ *
+ * Initializes the connection's lists and queues, its configuration and
+ * statistics, its spinlocks (including the per-page lock array and the
+ * LSM manager queue locks), the LSM worker condition variable, the split
+ * generation and the block manager list. Returns 0 on success; a failing
+ * step returns its error through WT_RET, and nothing initialized by the
+ * earlier steps is undone here.
+ */
+int
+__wt_connection_init(WT_CONNECTION_IMPL *conn)
+{
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = conn->default_session;
+
+ SLIST_INIT(&conn->dhlh); /* Data handle list */
+ TAILQ_INIT(&conn->dlhqh); /* Library list */
+ TAILQ_INIT(&conn->dsrcqh); /* Data source list */
+ TAILQ_INIT(&conn->fhqh); /* File list */
+ TAILQ_INIT(&conn->collqh); /* Collator list */
+ TAILQ_INIT(&conn->compqh); /* Compressor list */
+
+ TAILQ_INIT(&conn->lsmqh); /* WT_LSM_TREE list */
+
+ /* Setup the LSM work queues. */
+ TAILQ_INIT(&conn->lsm_manager.switchqh);
+ TAILQ_INIT(&conn->lsm_manager.appqh);
+ TAILQ_INIT(&conn->lsm_manager.managerqh);
+
+ /* Configuration. */
+ WT_RET(__wt_conn_config_init(session));
+
+ /* Statistics. */
+ __wt_stat_init_connection_stats(&conn->stats);
+
+ /* Locks. */
+ WT_RET(__wt_spin_init(session, &conn->api_lock, "api"));
+ WT_RET(__wt_spin_init(session, &conn->checkpoint_lock, "checkpoint"));
+ WT_RET(__wt_spin_init(session, &conn->dhandle_lock, "data handle"));
+ WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list"));
+ WT_RET(__wt_spin_init(session, &conn->hot_backup_lock, "hot backup"));
+ WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
+ WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema"));
+ WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS(conn), &conn->page_lock));
+ for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+ WT_RET(
+ __wt_spin_init(session, &conn->page_lock[i], "btree page"));
+
+ /* Setup the spin locks for the LSM manager queues. */
+ WT_RET(__wt_spin_init(session,
+ &conn->lsm_manager.app_lock, "LSM application queue lock"));
+ WT_RET(__wt_spin_init(session,
+ &conn->lsm_manager.manager_lock, "LSM manager queue lock"));
+ WT_RET(__wt_spin_init(
+ session, &conn->lsm_manager.switch_lock, "LSM switch queue lock"));
+ WT_RET(__wt_cond_alloc(
+ session, "LSM worker cond", 0, &conn->lsm_manager.work_cond));
+
+ /*
+ * Generation numbers.
+ *
+ * Start split generations at one. Threads publish this generation
+ * number before examining tree structures, and zero when they leave.
+ * We need to distinguish between threads that are in a tree before the
+ * first split has happened, and threads that are not in a tree.
+ */
+ conn->split_gen = 1;
+
+ /*
+ * Block manager.
+ * XXX
+ * If there's ever a second block manager, we'll want to make this
+ * more opaque, but for now this is simpler.
+ */
+ WT_RET(__wt_spin_init(session, &conn->block_lock, "block manager"));
+ TAILQ_INIT(&conn->blockqh); /* Block manager list */
+
+ return (0);
+}
+
+/*
+ * __wt_connection_destroy --
+ * Destroy the connection's underlying WT_CONNECTION_IMPL structure.
+ *
+ * A NULL conn is a no-op. Otherwise closes the lock file handle if one is
+ * open, unlinks the connection from the process-wide connection list,
+ * discards configuration and free-on-close state, destroys the spinlocks
+ * and frees the connection's memory, including conn itself. The only
+ * error that can be returned is one from closing the lock file.
+ */
+int
+__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ /* Check there's something to destroy. */
+ if (conn == NULL)
+ return (0);
+
+ session = conn->default_session;
+
+ /*
+ * Close remaining open files (before discarding the mutex, the
+ * underlying file-close code uses the mutex to guard lists of
+ * open files).
+ */
+ if (conn->lock_fh != NULL)
+ WT_TRET(__wt_close(session, conn->lock_fh));
+
+ /* Remove from the list of connections. */
+ __wt_spin_lock(session, &__wt_process.spinlock);
+ TAILQ_REMOVE(&__wt_process.connqh, conn, q);
+ __wt_spin_unlock(session, &__wt_process.spinlock);
+
+ /* Configuration */
+ __wt_conn_config_discard(session); /* configuration */
+
+ __wt_conn_foc_discard(session); /* free-on-close */
+
+ __wt_spin_destroy(session, &conn->api_lock);
+ __wt_spin_destroy(session, &conn->block_lock);
+ __wt_spin_destroy(session, &conn->checkpoint_lock);
+ __wt_spin_destroy(session, &conn->dhandle_lock);
+ __wt_spin_destroy(session, &conn->fh_lock);
+ __wt_spin_destroy(session, &conn->hot_backup_lock);
+ __wt_spin_destroy(session, &conn->reconfig_lock);
+ __wt_spin_destroy(session, &conn->schema_lock);
+ for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+ __wt_spin_destroy(session, &conn->page_lock[i]);
+ __wt_free(session, conn->page_lock);
+
+ /* Free allocated memory. */
+ __wt_free(session, conn->cfg);
+ __wt_free(session, conn->home);
+ __wt_free(session, conn->error_prefix);
+ __wt_free(session, conn->sessions);
+
+ __wt_free(NULL, conn); /* Freed with a NULL session handle. */
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
new file mode 100644
index 00000000000..e516fdc68d2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -0,0 +1,284 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __logmgr_sync_cfg --
+ *	Translate the transaction_sync configuration into the connection's
+ *	txn_logsync flags.
+ *
+ *	"transaction_sync.enabled" controls WT_LOG_FLUSH. The sync method flags
+ *	are always cleared first, then "transaction_sync.method" selects
+ *	WT_LOG_DSYNC or WT_LOG_FSYNC; any other value leaves both clear.
+ *	Returns 0, or the error from the configuration lookup.
+ */
+static int
+__logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg)
+{
+	WT_CONFIG_ITEM item;
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+
+	/* The enabled setting maps directly onto the flush flag. */
+	WT_RET(__wt_config_gets(
+	    session, cfg, "transaction_sync.enabled", &item));
+	if (item.val == 0)
+		FLD_CLR(conn->txn_logsync, WT_LOG_FLUSH);
+	else
+		FLD_SET(conn->txn_logsync, WT_LOG_FLUSH);
+
+	/* Start with no method selected, then set at most one. */
+	WT_RET(__wt_config_gets(
+	    session, cfg, "transaction_sync.method", &item));
+	FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FSYNC);
+	if (WT_STRING_MATCH("dsync", item.str, item.len)) {
+		FLD_SET(conn->txn_logsync, WT_LOG_DSYNC);
+		return (0);
+	}
+	if (WT_STRING_MATCH("fsync", item.str, item.len))
+		FLD_SET(conn->txn_logsync, WT_LOG_FSYNC);
+
+	return (0);
+}
+
+/*
+ * __logmgr_config --
+ *	Read the logging server options into the connection.
+ *
+ *	*runp is set to 1 when "log.enabled" is non-zero and to 0 otherwise; in
+ *	the latter case nothing else is read. When enabled, the archive
+ *	setting, maximum log file size, log path and transaction sync settings
+ *	are stored in the connection. Returns 0, or the first error from a
+ *	configuration lookup or the path copy.
+ */
+static int
+__logmgr_config(WT_SESSION_IMPL *session, const char **cfg, int *runp)
+{
+	WT_CONFIG_ITEM item;
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+
+	/* Logging is off by default: stop here unless it was requested. */
+	WT_RET(__wt_config_gets(session, cfg, "log.enabled", &item));
+	if (item.val == 0) {
+		*runp = 0;
+		return (0);
+	}
+	*runp = 1;
+
+	WT_RET(__wt_config_gets(session, cfg, "log.archive", &item));
+	conn->archive = item.val != 0;
+
+	WT_RET(__wt_config_gets(session, cfg, "log.file_max", &item));
+	conn->log_file_max = (wt_off_t)item.val;
+	WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max);
+
+	WT_RET(__wt_config_gets(session, cfg, "log.path", &item));
+	WT_RET(__wt_strndup(session, item.str, item.len, &conn->log_path));
+
+	return (__logmgr_sync_cfg(session, cfg));
+}
+
+/*
+ * __log_archive_server --
+ *	The log archiving server thread.
+ *
+ *	arg is the thread's WT_SESSION_IMPL. While WT_CONN_SERVER_RUN is set,
+ *	each pass takes the archive write lock, lists the log files, removes
+ *	those with a number lower than the checkpoint LSN's file (unless a hot
+ *	backup is in progress), records the new first LSN and then waits on
+ *	conn->arch_cond. Any error is reported with __wt_err and ends the
+ *	thread. Always returns NULL.
+ *
+ *	Both locks taken inside the loop are tracked so that an error exit
+ *	releases them: the thread terminates on error, and a lock left held
+ *	would never be released.
+ */
+static void *
+__log_archive_server(void *arg)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_LOG *log;
+	WT_LSN lsn;
+	WT_SESSION_IMPL *session;
+	uint32_t lognum;
+	u_int i, logcount;
+	int arch_locked, backup_locked;
+	char **logfiles;
+
+	session = arg;
+	conn = S2C(session);
+	log = conn->log;
+	logcount = 0;
+	logfiles = NULL;
+	arch_locked = backup_locked = 0;
+
+	while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+		/*
+		 * If archiving is reconfigured and turned off, wait until it
+		 * gets turned back on and check again.  Don't wait forever: if
+		 * a notification gets lost during close, we want to find out
+		 * eventually.
+		 */
+		if (conn->archive == 0 ||
+		    __wt_try_writelock(session, log->log_archive_lock) != 0) {
+			if (conn->archive != 0) {
+				WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+				    "log_archive: Blocked due to open log "
+				    "cursor holding archive lock"));
+			}
+			WT_ERR(
+			    __wt_cond_wait(session, conn->arch_cond, 1000000));
+			continue;
+		}
+		arch_locked = 1;
+
+		lsn = log->ckpt_lsn;
+		lsn.offset = 0;
+		WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+		    "log_archive: ckpt LSN %" PRIu32 ",%" PRIu64,
+		    lsn.file, lsn.offset));
+		/*
+		 * Main archive code.  Get the list of all log files and
+		 * remove any earlier than the checkpoint LSN.
+		 */
+		WT_ERR(__wt_dirlist(session, conn->log_path,
+		    WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount));
+
+		/*
+		 * We can only archive files if a hot backup is not in progress.
+		 */
+		__wt_spin_lock(session, &conn->hot_backup_lock);
+		backup_locked = 1;
+		for (i = 0; i < logcount; i++) {
+			if (conn->hot_backup == 0) {
+				WT_ERR(__wt_log_extract_lognum(
+				    session, logfiles[i], &lognum));
+				if (lognum < lsn.file)
+					WT_ERR(
+					    __wt_log_remove(session, lognum));
+			}
+		}
+		__wt_spin_unlock(session, &conn->hot_backup_lock);
+		backup_locked = 0;
+		__wt_log_files_free(session, logfiles, logcount);
+		logfiles = NULL;
+		logcount = 0;
+
+		/*
+		 * Indicate what is our new earliest LSN.  It is the start
+		 * of the log file containing the last checkpoint.
+		 */
+		log->first_lsn = lsn;
+		log->first_lsn.offset = 0;
+
+		/*
+		 * Clear the flag before unlocking so a failed unlock is not
+		 * retried on the error path.
+		 */
+		arch_locked = 0;
+		WT_ERR(__wt_writeunlock(session, log->log_archive_lock));
+
+		/* Wait until the next event. */
+		WT_ERR(__wt_cond_wait(session, conn->arch_cond, 1000000));
+	}
+
+	if (0) {
+err:		__wt_err(session, ret, "log archive server error");
+	}
+
+	/* Release whatever an error exit left held, inner lock first. */
+	if (backup_locked)
+		__wt_spin_unlock(session, &conn->hot_backup_lock);
+	if (arch_locked)
+		(void)__wt_writeunlock(session, log->log_archive_lock);
+	if (logfiles != NULL)
+		__wt_log_files_free(session, logfiles, logcount);
+	return (NULL);
+}
+
+/*
+ * __wt_logmgr_create --
+ * Start the log subsystem and archive server thread.
+ *
+ * Only parses the configuration when logging is not enabled. Otherwise
+ * sets conn->logging, allocates and initializes the connection's WT_LOG
+ * structure, opens the log and initializes the log slots. When archiving
+ * is configured it then either signals an existing archive thread or
+ * creates a session, condition variable and thread for a new one.
+ * Returns 0 on success or the first error reported through WT_RET.
+ */
+int
+__wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int run;
+
+ conn = S2C(session);
+
+ /* Handle configuration. */
+ WT_RET(__logmgr_config(session, cfg, &run));
+
+ /* If logging is not configured, we're done. */
+ if (!run)
+ return (0);
+
+ conn->logging = 1;
+ /*
+ * Logging is on, allocate the WT_LOG structure and open the log file.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_LOG), &conn->log));
+ log = conn->log;
+ WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
+ WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
+ WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
+ WT_RET(__wt_rwlock_alloc(session,
+ &log->log_archive_lock, "log archive lock"));
+ if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
+ log->allocsize =
+ WT_MAX((uint32_t)conn->buffer_alignment, LOG_ALIGN);
+ else
+ log->allocsize = LOG_ALIGN;
+ INIT_LSN(&log->alloc_lsn);
+ INIT_LSN(&log->ckpt_lsn);
+ INIT_LSN(&log->first_lsn);
+ INIT_LSN(&log->sync_lsn);
+ INIT_LSN(&log->trunc_lsn);
+ INIT_LSN(&log->write_lsn);
+ log->fileid = 0;
+ WT_RET(__wt_cond_alloc(session, "log sync", 0, &log->log_sync_cond));
+ WT_RET(__wt_log_open(session));
+ WT_RET(__wt_log_slot_init(session));
+
+ /* If archiving is not configured, we're done. */
+ if (!conn->archive)
+ return (0);
+
+ /*
+ * If an archive thread exists, the user may have reconfigured the
+ * archive thread. Signal the thread. Otherwise the user wants
+ * archiving and we need to start up the thread.
+ */
+ if (conn->arch_session != NULL) {
+ WT_ASSERT(session, conn->arch_cond != NULL);
+ WT_ASSERT(session, conn->arch_tid_set != 0);
+ WT_RET(__wt_cond_signal(session, conn->arch_cond));
+ } else {
+ /* The log archive server gets its own session. */
+ WT_RET(__wt_open_internal_session(
+ conn, "archive-server", 0, 0, &conn->arch_session));
+ WT_RET(__wt_cond_alloc(conn->arch_session,
+ "log archiving server", 0, &conn->arch_cond));
+
+ /*
+ * Start the thread.
+ */
+ WT_RET(__wt_thread_create(conn->arch_session,
+ &conn->arch_tid, __log_archive_server, conn->arch_session));
+ conn->arch_tid_set = 1;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_logmgr_destroy --
+ * Destroy the log archiving server thread and logging subsystem.
+ *
+ * A no-op unless conn->logging is set. Signals and joins the archive
+ * thread if one was started, closes the log, frees the log path, closes
+ * the archive thread's session, and then releases the log slots,
+ * condition variables, locks and the WT_LOG structure itself. Errors are
+ * accumulated with WT_TRET, so teardown continues after a failure.
+ */
+int
+__wt_logmgr_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ if (!conn->logging)
+ return (0);
+ if (conn->arch_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->arch_cond));
+ WT_TRET(__wt_thread_join(session, conn->arch_tid));
+ conn->arch_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->arch_cond));
+
+ WT_TRET(__wt_log_close(session));
+
+ __wt_free(session, conn->log_path);
+
+ /* Close the server thread's session. */
+ if (conn->arch_session != NULL) {
+ wt_session = &conn->arch_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ conn->arch_session = NULL;
+ }
+
+ WT_TRET(__wt_log_slot_destroy(session));
+ WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
+ WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
+ __wt_spin_destroy(session, &conn->log->log_lock);
+ __wt_spin_destroy(session, &conn->log->log_slot_lock);
+ __wt_spin_destroy(session, &conn->log->log_sync_lock);
+ __wt_free(session, conn->log);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
new file mode 100644
index 00000000000..41fc9809521
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -0,0 +1,244 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_connection_open --
+ * Open a connection.
+ *
+ * Sets WT_CONN_SERVER_RUN, allocates the session array, replaces the
+ * connection's default session with a newly opened internal session, and
+ * then configures the cache pool, creates the cache and initializes
+ * global transaction state. Returns 0 on success or the first error
+ * reported through WT_RET.
+ */
+int
+__wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
+{
+ WT_SESSION_IMPL *session;
+
+ /* Default session. */
+ session = conn->default_session;
+ WT_ASSERT(session, session->iface.connection == &conn->iface);
+
+ /*
+ * Tell internal server threads to run: this must be set before opening
+ * any sessions.
+ */
+ F_SET(conn, WT_CONN_SERVER_RUN);
+
+ /* WT_SESSION_IMPL array. */
+ WT_RET(__wt_calloc(session,
+ conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions));
+
+ /*
+ * Open the default session. We open this before starting service
+ * threads because those may allocate and use session resources that
+ * need to get cleaned up on close.
+ */
+ WT_RET(__wt_open_internal_session(conn, "connection", 1, 0, &session));
+
+ /*
+ * The connection's default session is originally a static structure,
+ * swap that out for a more fully-functional session. It's necessary
+ * to have this step: the session allocation code uses the connection's
+ * session, and if we pass a reference to the default session as the
+ * place to store the allocated session, things get confused and error
+ * handling can be corrupted. So, we allocate into a stack variable
+ * and then assign it on success.
+ */
+ conn->default_session = session;
+
+ /*
+ * Publish: there must be a barrier to ensure the connection structure
+ * fields are set before other threads read from the pointer.
+ */
+ WT_WRITE_BARRIER();
+
+ /* Connect to a cache pool. */
+ WT_RET(__wt_cache_pool_config(session, cfg));
+
+ /* Create the cache. */
+ WT_RET(__wt_cache_create(session, cfg));
+
+ /* Initialize transaction support. */
+ WT_RET(__wt_txn_global_init(session, cfg));
+
+ return (0);
+}
+
+/*
+ * __wt_connection_close --
+ *	Close a connection handle.
+ *
+ *	Waits for the oldest transaction ID to catch up with the current one,
+ *	flushes async operations, stops the server threads other than
+ *	eviction, discards data handles, shuts down logging, removes
+ *	collators, compressors and data sources, closes any file handles still
+ *	open (complaining about each), stops eviction, discards the cache and
+ *	transaction state, unloads extensions, closes the default session and
+ *	finally destroys the connection. Errors are accumulated with WT_TRET so
+ *	shutdown always runs to completion; the accumulated value is returned.
+ */
+int
+__wt_connection_close(WT_CONNECTION_IMPL *conn)
+{
+	WT_CONNECTION *wt_conn;
+	WT_DECL_RET;
+	WT_DLH *dlh;
+	WT_FH *fh, *fh_next;
+	WT_SESSION_IMPL *s, *session;
+	WT_TXN_GLOBAL *txn_global;
+	u_int i;
+
+	wt_conn = &conn->iface;
+	txn_global = &conn->txn_global;
+	session = conn->default_session;
+
+	/*
+	 * We're shutting down.  Make sure everything gets freed.
+	 *
+	 * It's possible that the eviction server is in the middle of a long
+	 * operation, with a transaction ID pinned.  In that case, we will loop
+	 * here until the transaction ID is released, when the oldest
+	 * transaction ID will catch up with the current ID.
+	 */
+	for (;;) {
+		__wt_txn_update_oldest(session);
+		if (txn_global->oldest_id == txn_global->current)
+			break;
+		__wt_yield();
+	}
+
+	/* Clear any pending async ops. */
+	WT_TRET(__wt_async_flush(session));
+
+	/*
+	 * Shut down server threads other than the eviction server, which is
+	 * needed later to close btree handles.  Some of these threads access
+	 * btree handles, so take care in ordering shutdown to make sure they
+	 * exit before files are closed.
+	 */
+	F_CLR(conn, WT_CONN_SERVER_RUN);
+	WT_TRET(__wt_async_destroy(session));
+	WT_TRET(__wt_lsm_manager_destroy(session));
+	WT_TRET(__wt_checkpoint_server_destroy(session));
+	WT_TRET(__wt_statlog_destroy(session, 1));
+	WT_TRET(__wt_sweep_destroy(session));
+
+	/* Close open data handles. */
+	WT_TRET(__wt_conn_dhandle_discard(session));
+
+	/*
+	 * Now that all data handles are closed, tell logging that a checkpoint
+	 * has completed then shut down the log manager (only after closing
+	 * data handles).
+	 */
+	if (conn->logging) {
+		WT_TRET(__wt_txn_checkpoint_log(
+		    session, 1, WT_TXN_LOG_CKPT_STOP, NULL));
+		WT_TRET(__wt_logmgr_destroy(session));
+	}
+
+	/* Free memory for collators, compressors, data sources. */
+	WT_TRET(__wt_conn_remove_collator(session));
+	WT_TRET(__wt_conn_remove_compressor(session));
+	WT_TRET(__wt_conn_remove_data_source(session));
+
+	/*
+	 * Complain if files weren't closed, ignoring the lock file, we'll
+	 * close it in a minute.
+	 *
+	 * Closing a handle may unlink and free it, so remember the successor
+	 * before the close.  Resetting the cursor to the head of the list from
+	 * inside TAILQ_FOREACH is not safe: the macro then advances past the
+	 * head, skipping an entry, and applies TAILQ_NEXT to NULL when the
+	 * list has become empty.
+	 */
+	for (fh = TAILQ_FIRST(&conn->fhqh); fh != NULL; fh = fh_next) {
+		fh_next = TAILQ_NEXT(fh, q);
+		if (fh == conn->lock_fh)
+			continue;
+
+		__wt_errx(session,
+		    "Connection has open file handles: %s", fh->name);
+		WT_TRET(__wt_close(session, fh));
+	}
+
+	/* Shut down the eviction server thread. */
+	WT_TRET(__wt_evict_destroy(session));
+
+	/* Disconnect from shared cache - must be before cache destroy. */
+	WT_TRET(__wt_conn_cache_pool_destroy(session));
+
+	/* Discard the cache. */
+	WT_TRET(__wt_cache_destroy(session));
+
+	/* Discard transaction state. */
+	__wt_txn_global_destroy(session);
+
+	/* Close extensions, first calling any unload entry point. */
+	while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
+		TAILQ_REMOVE(&conn->dlhqh, dlh, q);
+
+		if (dlh->terminate != NULL)
+			WT_TRET(dlh->terminate(wt_conn));
+		WT_TRET(__wt_dlclose(session, dlh));
+	}
+
+	/*
+	 * Close the internal (default) session, and switch back to the dummy
+	 * session in case of any error messages from the remaining operations
+	 * while destroying the connection handle.
+	 */
+	if (session != &conn->dummy_session) {
+		WT_TRET(session->iface.close(&session->iface, NULL));
+		session = conn->default_session = &conn->dummy_session;
+	}
+
+	/*
+	 * The session's split stash isn't discarded during normal session close
+	 * because it may persist past the life of the session.  Discard it now.
+	 */
+	if ((s = conn->sessions) != NULL)
+		for (i = 0; i < conn->session_size; ++s, ++i)
+			__wt_split_stash_discard_all(session, s);
+
+	/*
+	 * The session's hazard pointer memory isn't discarded during normal
+	 * session close because access to it isn't serialized.  Discard it
+	 * now.
+	 */
+	if ((s = conn->sessions) != NULL)
+		for (i = 0; i < conn->session_size; ++s, ++i)
+			if (s != session)
+				__wt_free(session, s->hazard);
+
+	/* Destroy the handle. */
+	WT_TRET(__wt_connection_destroy(conn));
+
+	return (ret);
+}
+
+/*
+ * __wt_connection_workers --
+ * Start the worker threads.
+ *
+ * Starts, in order: eviction, handle sweep, statistics logging, async,
+ * logging/archive and checkpoint. Returns 0 on success; the first
+ * failure is returned through WT_RET and later workers are not started.
+ */
+int
+__wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ /*
+ * Start the eviction thread.
+ */
+ WT_RET(__wt_evict_create(session));
+
+ /*
+ * Start the handle sweep thread.
+ */
+ WT_RET(__wt_sweep_create(session));
+
+ /*
+ * Start the optional statistics thread. Start statistics first so that
+ * other optional threads can know if statistics are enabled or not.
+ */
+ WT_RET(__wt_statlog_create(session, cfg));
+
+ /* Start the optional async threads. */
+ WT_RET(__wt_async_create(session, cfg));
+
+ /*
+ * Start the optional logging/archive thread.
+ * NOTE: The log manager must be started before checkpoints so that the
+ * checkpoint server knows if logging is enabled.
+ */
+ WT_RET(__wt_logmgr_create(session, cfg));
+
+ /* Start the optional checkpoint thread. */
+ WT_RET(__wt_checkpoint_server_create(session, cfg));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
new file mode 100644
index 00000000000..f7229504898
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -0,0 +1,540 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef __GNUC__
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1)
+/*
+ * !!!
+ * GCC with -Wformat-nonliteral complains about calls to strftime in this file.
+ * There's nothing wrong, this makes the warning go away.
+ */
+#pragma GCC diagnostic ignored "-Wformat-nonliteral"
+#endif
+#endif
+
+/*
+ * __stat_sources_free --
+ *	Release a NULL-terminated array of statistics source names.
+ *
+ *	sources points at the array pointer. Each string is freed, then the
+ *	array itself; a NULL array is ignored.
+ */
+static void
+__stat_sources_free(WT_SESSION_IMPL *session, char ***sources)
+{
+	char **entry;
+
+	/* Nothing was ever allocated. */
+	if (*sources == NULL)
+		return;
+
+	for (entry = *sources; *entry != NULL; ++entry)
+		__wt_free(session, *entry);
+	__wt_free(session, *sources);
+}
+
+/*
+ * __wt_conn_stat_init --
+ * Initialize the per-connection statistics.
+ *
+ * Calls the async, cache and transaction statistics update functions, in
+ * that order, on the given session.
+ */
+void
+__wt_conn_stat_init(WT_SESSION_IMPL *session)
+{
+ __wt_async_stats_update(session);
+ __wt_cache_stats_update(session);
+ __wt_txn_stats_update(session);
+}
+
+/*
+ * __statlog_config --
+ *	Parse and setup the statistics server options.
+ *
+ *	*runp is set to 1 when "statistics_log.wait" is non-zero, else 0. When
+ *	neither a wait time nor on-close logging is configured nothing else is
+ *	read. Otherwise the list of sources (only "file:" and "lsm:" objects
+ *	are accepted, anything else is EINVAL), the log path and the timestamp
+ *	format are stored in the connection. Returns 0 on success or the first
+ *	error; a partially built source list is freed on every error path.
+ */
+static int
+__statlog_config(WT_SESSION_IMPL *session, const char **cfg, int *runp)
+{
+	WT_CONFIG objectconf;
+	WT_CONFIG_ITEM cval, k, v;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	int cnt;
+	char **sources;
+
+	conn = S2C(session);
+	sources = NULL;
+
+	WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval));
+	/* Only start the server if wait time is non-zero */
+	*runp = (cval.val == 0) ? 0 : 1;
+	conn->stat_usecs = (long)cval.val * 1000000;
+
+	WT_RET(__wt_config_gets(
+	    session, cfg, "statistics_log.on_close", &cval));
+	if (cval.val != 0)
+		FLD_SET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE);
+
+	/*
+	 * Statistics logging configuration requires either a wait time or an
+	 * on-close setting.
+	 */
+	if (*runp == 0 && !FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE))
+		return (0);
+
+	/* First pass: count the configured sources. */
+	WT_RET(__wt_config_gets(session, cfg, "statistics_log.sources", &cval));
+	WT_RET(__wt_config_subinit(session, &objectconf, &cval));
+	for (cnt = 0; (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt)
+		;
+	WT_RET_NOTFOUND_OK(ret);
+	if (cnt != 0) {
+		WT_RET(__wt_calloc_def(session, cnt + 1, &sources));
+
+		/*
+		 * The array is allocated from here on: leave through the error
+		 * label, not by returning directly, so it is freed on failure.
+		 */
+		WT_ERR(__wt_config_subinit(session, &objectconf, &cval));
+		for (cnt = 0;
+		    (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt) {
+			/*
+			 * XXX
+			 * Only allow "file:" and "lsm:" for now: "file:" works
+			 * because it's been converted to data handles, "lsm:"
+			 * works because we can easily walk the list of open LSM
+			 * objects, even though it hasn't been converted.
+			 */
+			if (!WT_PREFIX_MATCH(k.str, "file:") &&
+			    !WT_PREFIX_MATCH(k.str, "lsm:"))
+				WT_ERR_MSG(session, EINVAL,
+				    "statistics_log sources configuration only "
+				    "supports objects of type \"file\" or "
+				    "\"lsm\"");
+			WT_ERR(
+			    __wt_strndup(session, k.str, k.len, &sources[cnt]));
+		}
+		WT_ERR_NOTFOUND_OK(ret);
+
+		/* Ownership moves to the connection. */
+		conn->stat_sources = sources;
+		sources = NULL;
+	}
+
+	WT_ERR(__wt_config_gets(session, cfg, "statistics_log.path", &cval));
+	WT_ERR(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path));
+
+	WT_ERR(__wt_config_gets(
+	    session, cfg, "statistics_log.timestamp", &cval));
+	WT_ERR(__wt_strndup(session, cval.str, cval.len, &conn->stat_format));
+
+err:	__stat_sources_free(session, &sources);
+	return (ret);
+}
+
+/*
+ * __statlog_dump --
+ *	Dump out handle/connection statistics.
+ *
+ *	name: label written on every output line; for a data-source dump it is
+ *	also appended to "statistics:" to form the cursor URI.
+ *	conn_stats: non-zero to dump the connection statistics, zero to dump
+ *	the statistics of the object called name.
+ *
+ *	One line is written to conn->stat_fp per statistic, prefixed with
+ *	conn->stat_stamp. EBUSY, ENOENT and WT_NOTFOUND from opening the
+ *	cursor are treated as success. Returns 0 on success, otherwise the
+ *	error from opening the cursor, writing or closing the cursor.
+ */
+static int
+__statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_CURSOR *cursor;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	WT_STATS *stats;
+	u_int i;
+	uint64_t max;
+	const char *uri;
+	const char *cfg[] = {
+	    WT_CONFIG_BASE(session, session_open_cursor), NULL };
+
+	conn = S2C(session);
+
+	/* Build URI and configuration string. */
+	if (conn_stats)
+		uri = "statistics:";
+	else {
+		WT_RET(__wt_scr_alloc(session, 0, &tmp));
+		WT_ERR(__wt_buf_fmt(session, tmp, "statistics:%s", name));
+		uri = tmp->data;
+	}
+
+	/*
+	 * Open the statistics cursor and dump the statistics.
+	 *
+	 * If we don't find an underlying object, silently ignore it, the object
+	 * may exist only intermittently.
+	 */
+	switch (ret = __wt_curstat_open(session, uri, cfg, &cursor)) {
+	case 0:
+		max = conn_stats ?
+		    sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS) :
+		    sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
+		for (i = 0,
+		    stats = WT_CURSOR_STATS(cursor); i < max; ++i, ++stats)
+			if (fprintf(conn->stat_fp,
+			    "%s %" PRIu64 " %s %s\n",
+			    conn->stat_stamp,
+			    stats->v, name, stats->desc) < 0) {
+				ret = __wt_errno();
+				break;
+			}
+
+		/*
+		 * Always close the cursor, including after a failed write, so
+		 * it isn't left open; WT_TRET keeps an earlier write error.
+		 */
+		WT_TRET(cursor->close(cursor));
+		break;
+	case EBUSY:
+	case ENOENT:
+	case WT_NOTFOUND:
+		ret = 0;
+		break;
+	default:
+		break;
+	}
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __statlog_apply --
+ *	Dump the statistics of the session's current data handle when its name
+ *	matches one of the configured statistics sources.
+ *
+ *	cfg is unused. Returns the result of the dump for the first matching
+ *	source, or 0 when no source matches.
+ */
+static int
+__statlog_apply(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	const char *name;
+	char **src;
+
+	WT_UNUSED(cfg);
+
+	name = session->dhandle->name;
+
+	/* The first configured source that prefixes the name wins. */
+	for (src = S2C(session)->stat_sources; *src != NULL; ++src) {
+		if (!WT_PREFIX_MATCH(name, *src))
+			continue;
+		return (__statlog_dump(session, name, 0));
+	}
+	return (0);
+}
+
+/*
+ * __statlog_lsm_apply --
+ *	Review the list of open LSM trees, and dump statistics on demand.
+ *
+ *	Collects references to at most WT_LSM_TREE_LIST_SLOTS trees whose name
+ *	matches a configured statistics source while holding the schema lock,
+ *	then dumps and releases each one after dropping the lock. Returns 0 on
+ *	success or the first error; references already taken are released on
+ *	error.
+ *
+ * XXX
+ * This code should be removed when LSM objects are converted to data handles.
+ */
+static int
+__statlog_lsm_apply(WT_SESSION_IMPL *session)
+{
+#define	WT_LSM_TREE_LIST_SLOTS	100
+	WT_LSM_TREE *lsm_tree, *list[WT_LSM_TREE_LIST_SLOTS];
+	WT_DECL_RET;
+	int cnt, locked;
+	char **p;
+
+	cnt = locked = 0;
+
+	/*
+	 * Walk the list of LSM trees, checking for a match on the set of
+	 * sources.
+	 *
+	 * XXX
+	 * We can't hold the schema lock for the traversal because the LSM
+	 * statistics code acquires the tree lock, and the LSM cursor code
+	 * acquires the tree lock and then acquires the schema lock, it's a
+	 * classic deadlock.  This is temporary code so I'm not going to do
+	 * anything fancy.
+	 * It is OK to not keep holding the schema lock after populating
+	 * the list of matching LSM trees, since the __wt_lsm_tree_get call
+	 * will bump a reference count, so the tree won't go away.
+	 */
+	__wt_spin_lock(session, &S2C(session)->schema_lock);
+	locked = 1;
+	TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
+		if (cnt == WT_LSM_TREE_LIST_SLOTS)
+			break;
+		for (p = S2C(session)->stat_sources; *p != NULL; ++p)
+			if (WT_PREFIX_MATCH(lsm_tree->name, *p)) {
+				/*
+				 * Count the slot only once the reference is
+				 * held: the error path releases every counted
+				 * slot and must not see an unset one.
+				 */
+				WT_ERR(__wt_lsm_tree_get(
+				    session, lsm_tree->name, 0, &list[cnt]));
+				++cnt;
+				break;
+			}
+	}
+	__wt_spin_unlock(session, &S2C(session)->schema_lock);
+	locked = 0;
+
+	while (cnt > 0) {
+		--cnt;
+		WT_TRET(__statlog_dump(session, list[cnt]->name, 0));
+		__wt_lsm_tree_release(session, list[cnt]);
+	}
+
+err:	if (locked)
+		__wt_spin_unlock(session, &S2C(session)->schema_lock);
+	/* Release any LSM trees on error. */
+	while (cnt > 0) {
+		--cnt;
+		__wt_lsm_tree_release(session, list[cnt]);
+	}
+	return (ret);
+}
+
+/*
+ * __statlog_log_one --
+ * Output a set of statistics into the current log file.
+ *
+ * path: buffer holding the name of the currently open log file; it is
+ * updated when the file is cycled. When NULL, the log file is closed (if
+ * open) and reopened on every call.
+ * tmp: buffer used to build the path and then the timestamp prefix;
+ * conn->stat_stamp points into it when this function returns.
+ *
+ * Writes the connection statistics, then the statistics of any open
+ * handle or LSM tree matching conn->stat_sources, and flushes the file.
+ * Returns 0 on success or the first error reported through WT_RET.
+ */
+static int
+__statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
+{
+ FILE *log_file;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ struct timespec ts;
+ struct tm *tm, _tm;
+
+ conn = S2C(session);
+
+ /* Get the current local time of day. */
+ WT_RET(__wt_epoch(session, &ts));
+ tm = localtime_r(&ts.tv_sec, &_tm);
+
+ /* Create the logging path name for this time of day. */
+ if (strftime(tmp->mem, tmp->memsize, conn->stat_path, tm) == 0)
+ WT_RET_MSG(session, ENOMEM, "strftime path conversion");
+
+ /* If the path has changed, cycle the log file. */
+ if ((log_file = conn->stat_fp) == NULL ||
+ path == NULL || strcmp(tmp->mem, path->mem) != 0) {
+ conn->stat_fp = NULL;
+ if (log_file != NULL)
+ WT_RET(fclose(log_file) == 0 ? 0 : __wt_errno());
+
+ if (path != NULL)
+ (void)strcpy(path->mem, tmp->mem);
+ WT_RET_TEST((log_file =
+ fopen(tmp->mem, "a")) == NULL, __wt_errno());
+ }
+ conn->stat_fp = log_file;
+
+ /* Create the entry prefix for this time of day. */
+ if (strftime(tmp->mem, tmp->memsize, conn->stat_format, tm) == 0)
+ WT_RET_MSG(session, ENOMEM, "strftime timestamp conversion");
+ conn->stat_stamp = tmp->mem;
+
+ /* Dump the connection statistics. */
+ WT_RET(__statlog_dump(session, conn->home, 1));
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ /* Dump the spinlock statistics. */
+ WT_RET(__wt_statlog_dump_spinlock(conn, conn->home));
+#endif
+
+ /*
+ * Lock the schema and walk the list of open handles, dumping
+ * any that match the list of object sources.
+ */
+ if (conn->stat_sources != NULL) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_conn_btree_apply(session, 0, __statlog_apply, NULL));
+ WT_RET(ret);
+ }
+
+ /*
+ * Walk the list of open LSM trees, dumping any that match the
+ * list of object sources.
+ *
+ * XXX
+ * This code should be removed when LSM objects are converted to
+ * data handles.
+ */
+ if (conn->stat_sources != NULL)
+ WT_RET(__statlog_lsm_apply(session));
+
+ /* Flush. */
+ WT_RET(fflush(conn->stat_fp) == 0 ? 0 : __wt_errno());
+
+ return (0);
+}
+
+/*
+ * __wt_statlog_log_one --
+ *	Write one set of statistics to the configured statistics log, for use
+ *	when the statistics server is not running.
+ *
+ *	Does nothing unless on-close logging is configured. Returns EINVAL if
+ *	the statistics server is still flagged as running, otherwise the
+ *	result of writing the statistics.
+ */
+int
+__wt_statlog_log_one(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_ITEM(scratch);
+	WT_DECL_RET;
+
+	conn = S2C(session);
+
+	/* Only log when on-close statistics were requested. */
+	if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE))
+		return (0);
+
+	if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+	    F_ISSET(conn, WT_CONN_SERVER_STATISTICS))
+		WT_RET_MSG(session, EINVAL,
+		    "Attempt to log statistics while a server is running");
+
+	/* The scratch buffer is released whether or not logging succeeds. */
+	WT_RET(__wt_scr_alloc(
+	    session, strlen(conn->stat_path) + 128, &scratch));
+	ret = __statlog_log_one(session, NULL, scratch);
+	__wt_scr_free(&scratch);
+
+	return (ret);
+}
+
+/*
+ * __statlog_server --
+ * The statistics server thread.
+ *
+ * arg is the server's WT_SESSION_IMPL. While both WT_CONN_SERVER_RUN and
+ * WT_CONN_SERVER_STATISTICS are set, logs one set of statistics (unless
+ * WT_CONN_STAT_NONE is set in conn->stat_flags) and then waits on
+ * conn->stat_cond, passing conn->stat_usecs as the wait time. An error
+ * is reported with __wt_err and ends the loop. Both buffers are freed
+ * before the thread exits. Always returns NULL.
+ */
+static void *
+__statlog_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_ITEM path, tmp;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+
+ WT_CLEAR(path);
+ WT_CLEAR(tmp);
+
+ /*
+ * We need a temporary place to build a path and an entry prefix.
+ * The length of the path plus 128 should be more than enough.
+ *
+ * We also need a place to store the current path, because that's
+ * how we know when to close/re-open the file.
+ */
+ WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128));
+ WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128));
+
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) {
+ if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_NONE))
+ WT_ERR(__statlog_log_one(session, &path, &tmp));
+
+ /* Wait until the next event. */
+ WT_ERR(
+ __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "statistics log server error");
+ }
+ __wt_buf_free(session, &path);
+ __wt_buf_free(session, &tmp);
+ return (NULL);
+}
+
+/*
+ * __statlog_start --
+ * Start the statistics server thread.
+ *
+ * Returns immediately if conn->stat_session is already set. Otherwise
+ * sets WT_CONN_SERVER_STATISTICS, opens an internal session for the
+ * server, allocates its condition variable and creates the thread.
+ * Returns 0 on success or the first error reported through WT_RET.
+ */
+static int
+__statlog_start(WT_CONNECTION_IMPL *conn)
+{
+ WT_SESSION_IMPL *session;
+
+ /* Nothing to do if the server is already running. */
+ if (conn->stat_session != NULL)
+ return (0);
+
+ F_SET(conn, WT_CONN_SERVER_STATISTICS);
+ /* The statistics log server gets its own session. */
+ WT_RET(__wt_open_internal_session(
+ conn, "statlog-server", 1, 1, &conn->stat_session));
+ session = conn->stat_session;
+
+ WT_RET(__wt_cond_alloc(
+ session, "statistics log server", 0, &conn->stat_cond));
+
+ /*
+ * Start the thread.
+ *
+ * Statistics logging creates a thread per database, rather than using
+ * a single thread to do logging for all of the databases. If we ever
+ * see lots of databases at a time, doing statistics logging, and we
+ * want to reduce the number of threads, there's no reason we have to
+ * have more than one thread, I just didn't feel like writing the code
+ * to figure out the scheduling.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->stat_tid, __statlog_server, session));
+ conn->stat_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_statlog_create --
+ *	Configure statistics logging and, when a wait time is configured,
+ *	start the statistics server thread.
+ *
+ *	An existing server is always destroyed first. Returns 0 on success or
+ *	the first error from destroying, configuring or starting the server.
+ */
+int
+__wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CONNECTION_IMPL *conn;
+	int run;
+
+	conn = S2C(session);
+	run = 0;
+
+	/*
+	 * Bounce any running server: every reconfigure call restarts it, even
+	 * when nothing changed, which keeps this code simple.
+	 */
+	if (conn->stat_session != NULL)
+		WT_RET(__wt_statlog_destroy(session, 0));
+
+	WT_RET(__statlog_config(session, cfg, &run));
+
+	return (run ? __statlog_start(conn) : 0);
+}
+
+/*
+ * __wt_statlog_destroy --
+ * Destroy the statistics server thread.
+ *
+ * is_close: when non-zero, __wt_statlog_log_one is called after the
+ * thread has been joined so a final set of statistics can be written.
+ *
+ * Clears WT_CONN_SERVER_STATISTICS, signals and joins the thread if it
+ * was started, releases the condition variable, source list, path and
+ * format strings, closes the server's session and the log file, and resets
+ * the related connection fields. Errors are accumulated with WT_TRET, so
+ * teardown continues after a failure.
+ */
+int
+__wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ F_CLR(conn, WT_CONN_SERVER_STATISTICS);
+ if (conn->stat_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->stat_cond));
+ WT_TRET(__wt_thread_join(session, conn->stat_tid));
+ conn->stat_tid_set = 0;
+ }
+
+ /* Log a set of statistics on shutdown if configured. */
+ if (is_close)
+ WT_TRET(__wt_statlog_log_one(session));
+
+ WT_TRET(__wt_cond_destroy(session, &conn->stat_cond));
+
+ __stat_sources_free(session, &conn->stat_sources);
+ __wt_free(session, conn->stat_path);
+ __wt_free(session, conn->stat_format);
+
+ /* Close the server thread's session. */
+ if (conn->stat_session != NULL) {
+ wt_session = &conn->stat_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+
+ /* Clear connection settings so reconfigure is reliable. */
+ conn->stat_session = NULL;
+ conn->stat_tid_set = 0;
+ conn->stat_format = NULL;
+ if (conn->stat_fp != NULL) {
+ WT_TRET(fclose(conn->stat_fp) == 0 ? 0 : __wt_errno());
+ conn->stat_fp = NULL;
+ }
+ conn->stat_path = NULL;
+ conn->stat_sources = NULL;
+ conn->stat_stamp = NULL;
+ conn->stat_usecs = 0;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
new file mode 100644
index 00000000000..3bccc5814be
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -0,0 +1,187 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __sweep --
+ * Close and discard unreferenced, idle dhandles on the connection list.
+ */
+static int
+__sweep(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle, *dhandle_next;
+ WT_DECL_RET;
+ time_t now;
+
+ conn = S2C(session);
+
+ /*
+ * Sessions cache handles unless the session itself is closed, at which
+ * time the handle reference counts are immediately decremented. Don't
+ * discard handles that have been open recently.
+ */
+ WT_RET(__wt_seconds(session, &now));
+
+ dhandle = SLIST_FIRST(&conn->dhlh);
+ for (; dhandle != NULL; dhandle = dhandle_next) {
+ dhandle_next = SLIST_NEXT(dhandle, l); /* Saved before any discard. */
+ if (dhandle->session_ref != 0 ||
+ now - dhandle->timeofdeath <= WT_DHANDLE_SWEEP_WAIT)
+ continue;
+
+ /*
+ * We have a candidate for closing; if it's open, flush dirty
+ * leaf pages, then acquire an exclusive lock on the handle
+ * and close it. We might be blocking opens for a long time
+ * (over disk I/O), but the handle was quiescent for a while.
+ *
+ * The close can fail if an update cannot be written (updates in
+ * a no-longer-referenced file might not yet be globally visible
+ * if sessions have disjoint sets of files open). If the handle
+ * is busy, skip it, we'll retry the close the next time, after
+ * the transaction state has progressed.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_cache_op(
+ session, NULL, WT_SYNC_WRITE_LEAVES));
+ WT_RET(ret);
+
+ /*
+ * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we
+ * want opens to block on us rather than returning an
+ * EBUSY error to the application.
+ */
+ ret = __wt_try_writelock(session, dhandle->rwlock);
+ if (ret == EBUSY) { /* Lock is held elsewhere: skip this handle. */
+ ret = 0;
+ continue;
+ }
+ WT_RET(ret);
+
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_conn_btree_sync_and_close(session, 0));
+ if (ret == EBUSY) /* Not an error; the discard below is still tried. */
+ ret = 0;
+
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ WT_RET(ret);
+ }
+
+ /*
+ * Attempt to discard the handle (the called function checks the
+ * handle-open flag after acquiring appropriate locks, which is
+ * why we don't do any special handling of EBUSY returns above,
+ * that path never cleared the handle-open flag).
+ */
+ ret = __wt_conn_dhandle_discard_single(session, dhandle, 0);
+ if (ret == EBUSY) /* Still in use: leave it for a later sweep. */
+ ret = 0;
+ WT_RET(ret);
+ }
+ return (0);
+}
+
+/*
+ * __sweep_server --
+ * The handle sweep server thread: wait, sweep, repeat until stopped.
+ */
+static void *
+__sweep_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = arg; /* The thread argument is the session to run with. */
+ conn = S2C(session);
+
+ /*
+ * Sweep for dead handles until either flag is cleared or an error occurs.
+ */
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_SWEEP)) {
+
+ /* Wait to be signalled, or for the 30 * WT_MILLION timeout. */
+ WT_ERR(
+ __wt_cond_wait(session, conn->sweep_cond, 30 * WT_MILLION));
+
+ /* Sweep the handles. */
+ WT_ERR(__sweep(session));
+ }
+
+ if (0) { /* Only entered through the err label. */
+err: __wt_err(session, ret, "handle sweep server error");
+ }
+ return (NULL); /* The error is reported above, not returned. */
+}
+
+/*
+ * __wt_sweep_create --
+ * Start the handle sweep thread, with its own session and condition.
+ */
+int
+__wt_sweep_create(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /* Set first, the thread might run before we finish up. */
+ F_SET(conn, WT_CONN_SERVER_SWEEP);
+
+ WT_RET(__wt_open_internal_session(
+ conn, "sweep-server", 1, 1, &conn->sweep_session));
+ session = conn->sweep_session; /* Use the server's session from here on. */
+
+ /*
+ * Handle sweep does enough I/O it may be called upon to perform slow
+ * operations for the block manager.
+ */
+ F_SET(session, WT_SESSION_CAN_WAIT);
+
+ WT_RET(__wt_cond_alloc(
+ session, "handle sweep server", 0, &conn->sweep_cond));
+
+ WT_RET(__wt_thread_create(
+ session, &conn->sweep_tid, __sweep_server, session));
+ conn->sweep_tid_set = 1; /* Tells __wt_sweep_destroy to join the thread. */
+
+ return (0);
+}
+
+/*
+ * __wt_sweep_destroy --
+ *	Stop the handle-sweep thread and release everything it was using.
+ */
+int
+__wt_sweep_destroy(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_SESSION *sweep_iface;
+
+	conn = S2C(session);
+
+	F_CLR(conn, WT_CONN_SERVER_SWEEP);
+	if (conn->sweep_tid_set) {
+		WT_TRET(__wt_cond_signal(session, conn->sweep_cond));
+		WT_TRET(__wt_thread_join(session, conn->sweep_tid));
+		conn->sweep_tid_set = 0;
+	}
+	WT_TRET(__wt_cond_destroy(session, &conn->sweep_cond));
+
+	/* Without a server session there is nothing left to close. */
+	if (conn->sweep_session == NULL)
+		return (ret);
+	sweep_iface = &conn->sweep_session->iface;
+	WT_TRET(sweep_iface->close(sweep_iface, NULL));
+	conn->sweep_session = NULL;
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
new file mode 100644
index 00000000000..85a85521213
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -0,0 +1,540 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __backup_all(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
+static int __backup_cleanup_handles(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
+static int __backup_file_create(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
+static int __backup_file_remove(WT_SESSION_IMPL *);
+static int __backup_list_all_append(WT_SESSION_IMPL *, const char *[]);
+static int __backup_list_append(
+ WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *);
+static int __backup_start(
+ WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[]);
+static int __backup_stop(WT_SESSION_IMPL *);
+static int __backup_uri(
+ WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[], int *);
+
+/*
+ * __curbackup_next --
+ * WT_CURSOR->next method for the backup cursor type.
+ */
+static int
+__curbackup_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cb = (WT_CURSOR_BACKUP *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ if (cb->list == NULL || cb->list[cb->next].name == NULL) { /* No more names. */
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+ WT_ERR(WT_NOTFOUND);
+ }
+
+ cb->iface.key.data = cb->list[cb->next].name; /* Key references the list entry. */
+ cb->iface.key.size = strlen(cb->list[cb->next].name) + 1; /* Count the nul. */
+ ++cb->next;
+
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbackup_reset --
+ * WT_CURSOR->reset method for the backup cursor type.
+ */
+static int
+__curbackup_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cb = (WT_CURSOR_BACKUP *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+
+ cb->next = 0; /* The next call to next starts at the first list entry. */
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbackup_close --
+ * WT_CURSOR->close method for the backup cursor type; ends the backup.
+ */
+static int
+__curbackup_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int tret; /* Carries the __backup_stop result out of the lock macro. */
+
+ cb = (WT_CURSOR_BACKUP *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ WT_TRET(__backup_cleanup_handles(session, cb)); /* Done without the schema lock. */
+ WT_TRET(__wt_cursor_close(cursor));
+ session->bkp_cursor = NULL; /* The session no longer has a backup cursor. */
+
+ WT_WITH_SCHEMA_LOCK(session,
+ tret = __backup_stop(session)); /* Stop the backup. */
+ WT_TRET(tret);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curbackup_open --
+ *	WT_SESSION->open_cursor method for the backup cursor type.
+ */
+int
+__wt_curbackup_open(WT_SESSION_IMPL *session,
+    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+	WT_CURSOR_STATIC_INIT(iface,
+	    __wt_cursor_get_key,	/* get-key */
+	    __wt_cursor_notsup,		/* get-value */
+	    __wt_cursor_notsup,		/* set-key */
+	    __wt_cursor_notsup,		/* set-value */
+	    __wt_cursor_notsup,		/* compare */
+	    __curbackup_next,		/* next */
+	    __wt_cursor_notsup,		/* prev */
+	    __curbackup_reset,		/* reset */
+	    __wt_cursor_notsup,		/* search */
+	    __wt_cursor_notsup,		/* search-near */
+	    __wt_cursor_notsup,		/* insert */
+	    __wt_cursor_notsup,		/* update */
+	    __wt_cursor_notsup,		/* remove */
+	    __curbackup_close);		/* close */
+	WT_CURSOR *cursor;
+	WT_CURSOR_BACKUP *cb;
+	WT_DECL_RET;
+
+	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BACKUP, iface) == 0);
+
+	cb = NULL;
+	WT_RET(__wt_calloc_def(session, 1, &cb));
+	cursor = &cb->iface;
+	*cursor = iface;
+	cursor->session = &session->iface;
+	session->bkp_cursor = cb;	/* Read by the list-append callbacks. */
+
+	cursor->key_format = "S";	/* Return the file names as the key. */
+	cursor->value_format = "";	/* No value. */
+
+	/*
+	 * Start the backup and fill in the cursor's list.  Acquire the schema
+	 * lock, we need a consistent view when creating a copy.
+	 */
+	WT_WITH_SCHEMA_LOCK(session, ret = __backup_start(session, cb, cfg));
+	WT_ERR(ret);
+
+	/* __wt_cursor_init is last so we don't have to clean up on error. */
+	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+	if (0) {
+		/* Don't leave the session referencing the freed cursor. */
+err:		session->bkp_cursor = NULL;
+		__wt_free(session, cb);
+	}
+	return (ret);
+}
+
+/*
+ * __backup_start --
+ * Start a backup: flag it, write the metadata file, build the file list.
+ */
+static int
+__backup_start(
+ WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ u_int i, logcount;
+ int exist, target_list;
+ char **logfiles;
+
+ conn = S2C(session);
+
+ cb->next = 0;
+ cb->list = NULL;
+ logfiles = NULL;
+ logcount = 0;
+
+ /*
+ * Single thread hot backups: we're holding the schema lock, so we
+ * know we'll serialize with other attempts to start a hot backup.
+ */
+ if (conn->hot_backup)
+ WT_RET_MSG(
+ session, EINVAL, "there is already a backup cursor open");
+
+ /*
+ * The hot backup copy is done outside of WiredTiger, which means file
+ * blocks can't be freed and re-allocated until the backup completes.
+ * The checkpoint code checks the backup flag, and if a backup cursor
+ * is open checkpoints aren't discarded. We release the lock as soon
+ * as we've set the flag, we don't want to block checkpoints, we just
+ * want to make sure no checkpoints are deleted. The checkpoint code
+ * holds the lock until it's finished the checkpoint, otherwise we
+ * could start a hot backup that would race with an already-started
+ * checkpoint.
+ */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ conn->hot_backup = 1;
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+
+ /* Create the hot backup file. */
+ WT_ERR(__backup_file_create(session, cb));
+
+ /* Build the list of objects to back up. */
+
+ /*
+ * If a list of targets was specified, work our way through them.
+ * Else, generate a list of all database objects.
+ *
+ * Include log files if doing a full backup, and copy them before
+ * copying data files to avoid rolling the metadata forward across
+ * a checkpoint that completes during the backup.
+ */
+ target_list = 0;
+ WT_ERR(__backup_uri(session, cb, cfg, &target_list));
+ if (!target_list) { /* No targets configured: back up everything. */
+ if (conn->log) {
+ WT_ERR(__wt_log_get_active_files(
+ session, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++)
+ WT_ERR(__backup_list_append(
+ session, cb, logfiles[i]));
+ }
+
+ WT_ERR(__backup_all(session, cb));
+ }
+
+ /* Add the hot backup and standard WiredTiger files to the list. */
+ WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
+ WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
+ if (exist) /* Listed only if the file is present. */
+ WT_ERR(__backup_list_append(session, cb, WT_BASECONFIG));
+ WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
+ if (exist) /* Listed only if the file is present. */
+ WT_ERR(__backup_list_append(session, cb, WT_USERCONFIG));
+ WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
+
+err: /* Close the hot backup file (reached on success and on failure). */
+ if (cb->bfp != NULL) {
+ WT_TRET(fclose(cb->bfp) == 0 ? 0 : __wt_errno());
+ cb->bfp = NULL;
+ }
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+
+ if (ret != 0) { /* On failure, undo: release handles and end the backup. */
+ WT_TRET(__backup_cleanup_handles(session, cb));
+ WT_TRET(__backup_stop(session));
+ }
+
+ return (ret);
+}
+
+/*
+ * __backup_cleanup_handles --
+ *	Give back every btree handle the backup holds and discard the list of
+ *	file names.  Deliberately not part of __backup_stop: callers may not
+ *	hold the schema lock when they get here.
+ */
+static int
+__backup_cleanup_handles(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
+{
+	WT_CURSOR_BACKUP_ENTRY *entry;
+	WT_DECL_RET;
+
+	if (cb->list != NULL) {
+		/* Walk to the NULL-name terminator, releasing as we go. */
+		entry = cb->list;
+		while (entry->name != NULL) {
+			if (entry->handle != NULL)
+				WT_WITH_DHANDLE(session, entry->handle, WT_TRET(
+				    __wt_session_release_btree(session)));
+			__wt_free(session, entry->name);
+			++entry;
+		}
+		__wt_free(session, cb->list);
+	}
+	return (ret);
+}
+
+/*
+ * __backup_stop --
+ *	End a backup: remove its metadata file and clear the hot backup flag.
+ */
+static int
+__backup_stop(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *connection;
+	int remove_status;
+
+	connection = S2C(session);
+
+	/* Discard the backup metadata file, remembering any failure. */
+	remove_status = __backup_file_remove(session);
+
+	/* Let checkpoint deletion, and the next hot backup, go ahead. */
+	__wt_spin_lock(session, &connection->hot_backup_lock);
+	connection->hot_backup = 0;
+	__wt_spin_unlock(session, &connection->hot_backup_lock);
+
+	return (remove_status);
+}
+
+/*
+ * __backup_all --
+ * Backup all objects in the database.
+ */
+static int
+__backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *key, *value;
+
+ cursor = NULL; /* So the err path knows whether there is a cursor to close. */
+
+ /*
+ * Open a cursor on the metadata file and copy all of the entries to
+ * the hot backup file.
+ */
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &key));
+ WT_ERR(cursor->get_value(cursor, &value));
+ WT_ERR_TEST((fprintf(
+ cb->bfp, "%s\n%s\n", key, value) < 0), __wt_errno());
+
+ /*
+ * While reading the metadata file, check there are no "sources"
+ * or "types" which can't support hot backup. This checks for
+ * a data source that's non-standard, which can't be backed up,
+ * but is also sanity checking: if there's an entry backed by
+ * anything other than a file or lsm entry, we're confused.
+ */
+ if ((ret = __wt_config_getones(
+ session, value, "type", &cval)) == 0 &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file") &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm"))
+ WT_ERR_MSG(session, ENOTSUP,
+ "hot backup is not supported for objects of "
+ "type %.*s", (int)cval.len, cval.str);
+ WT_ERR_NOTFOUND_OK(ret); /* An entry without a "type" is acceptable. */
+ if ((ret =__wt_config_getones(
+ session, value, "source", &cval)) == 0 &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file:") &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm:"))
+ WT_ERR_MSG(session, ENOTSUP,
+ "hot backup is not supported for objects of "
+ "source %.*s", (int)cval.len, cval.str);
+ WT_ERR_NOTFOUND_OK(ret); /* An entry without a "source" is acceptable. */
+ }
+ WT_ERR_NOTFOUND_OK(ret); /* WT_NOTFOUND here is the normal end of the walk. */
+
+ /* Build a list of the file objects that need to be copied. */
+ WT_ERR(__wt_meta_btree_apply(session, __backup_list_all_append, NULL));
+
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __backup_uri --
+ * Backup the list of objects named by the "target" configuration.
+ */
+static int
+__backup_uri(WT_SESSION_IMPL *session,
+ WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp)
+{
+ WT_CONFIG targetconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ int target_list;
+ const char *uri;
+
+ *foundp = target_list = 0; /* Both become 1 once a target entry is seen. */
+
+ /*
+ * If we find a non-empty target configuration string, we have a job,
+ * otherwise it's not our problem.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "target", &cval));
+ WT_RET(__wt_config_subinit(session, &targetconf, &cval));
+ for (cb->list_next = 0;
+ (ret = __wt_config_next(&targetconf, &k, &v)) == 0;) {
+ if (!target_list) { /* First entry: allocate the scratch buffer. */
+ target_list = *foundp = 1;
+
+ WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+ }
+
+ WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
+ uri = tmp->data;
+ if (v.len != 0) /* An entry that parsed with a value is rejected. */
+ WT_ERR_MSG(session, EINVAL,
+ "%s: invalid backup target: URIs may need quoting",
+ uri);
+
+ WT_ERR(__wt_schema_worker(
+ session, uri, NULL, __wt_backup_list_uri_append, cfg, 0));
+ }
+ WT_ERR_NOTFOUND_OK(ret); /* WT_NOTFOUND is the normal end of the list. */
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __backup_file_create --
+ *	Open the meta-data backup file for writing, leaving it in cb->bfp.
+ */
+static int
+__backup_file_create(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
+{
+	WT_DECL_RET;
+	char *backup_path;
+
+	WT_RET(__wt_filename(session, WT_METADATA_BACKUP, &backup_path));
+	/* Open the hot backup file; the path is freed whatever happens. */
+	if ((cb->bfp = fopen(backup_path, "w")) == NULL)
+		ret = __wt_errno();
+	__wt_free(session, backup_path);
+	return (ret);
+}
+
+/*
+ * __backup_file_remove --
+ * Remove the meta-data backup file (WT_METADATA_BACKUP).
+ */
+static int
+__backup_file_remove(WT_SESSION_IMPL *session)
+{
+ return (__wt_remove(session, WT_METADATA_BACKUP)); /* Result passed through. */
+}
+
+/*
+ * __wt_backup_list_uri_append --
+ *	Append a new file name to the list, allocate space as necessary.
+ *	Called via the schema_worker function.
+ */
+int
+__wt_backup_list_uri_append(
+    WT_SESSION_IMPL *session, const char *name, int *skip)
+{
+	WT_CURSOR_BACKUP *cb;
+	WT_DECL_RET;
+	const char *value;
+
+	cb = session->bkp_cursor;
+	WT_UNUSED(skip);
+
+	/* Add the metadata entry to the backup file; always free the value. */
+	WT_RET(__wt_metadata_search(session, name, &value));
+	ret = fprintf(cb->bfp, "%s\n%s\n", name, value) < 0 ? __wt_errno() : 0;
+	__wt_free(session, value);
+	WT_RET(ret);
+
+	/* Add file type objects to the list of files to be copied. */
+	if (WT_PREFIX_MATCH(name, "file:"))
+		ret = __backup_list_append(session, cb, name);
+	return (ret);
+}
+
+/*
+ * __backup_list_all_append --
+ *	Add the session's current file to the backup list, growing the list
+ *	as needed.  Invoked through __wt_meta_btree_apply.
+ */
+static int
+__backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CURSOR_BACKUP *backup;
+	int bulk_loading;
+
+	WT_UNUSED(cfg);
+
+	backup = session->bkp_cursor;
+	bulk_loading = F_ISSET(S2BT(session), WT_BTREE_BULK) ? 1 : 0;
+
+	/* Files still being bulk-loaded are ignored, all others are listed. */
+	if (bulk_loading)
+		return (0);
+	return (__backup_list_append(session, backup, session->dhandle->name));
+}
+
+/*
+ * __backup_list_append --
+ * Append a new file name to the list, allocate space as necessary.
+ */
+static int
+__backup_list_append(
+ WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *uri)
+{
+ WT_CURSOR_BACKUP_ENTRY *p;
+ WT_DATA_HANDLE *old_dhandle;
+ WT_DECL_RET;
+ const char *name;
+ int need_handle;
+
+ /* Leave a NULL at the end to mark the end of the list. */
+ WT_RET(__wt_realloc_def(session, &cb->list_allocated,
+ cb->list_next + 2, &cb->list));
+ p = &cb->list[cb->list_next];
+ p[0].name = p[1].name = NULL;
+ p[0].handle = p[1].handle = NULL;
+
+ need_handle = 0;
+ name = uri;
+ if (WT_PREFIX_MATCH(uri, "file:")) { /* Store the name without the prefix. */
+ need_handle = 1;
+ name += strlen("file:");
+ }
+
+ /*
+ * !!!
+ * Assumes metadata file entries map one-to-one to physical files.
+ * To support a block manager where that's not the case, we'd need
+ * to call into the block manager and get a list of physical files
+ * that map to this logical "file". I'm not going to worry about
+ * that for now, that block manager might not even support physical
+ * copying of files by applications.
+ */
+ WT_RET(__wt_strdup(session, name, &p->name));
+
+ /*
+ * If it's a file in the database, get a handle for the underlying
+ * object (this handle blocks schema level operations, for example
+ * WT_SESSION.drop or an LSM file discard after level merging).
+ */
+ if (need_handle) {
+ old_dhandle = session->dhandle; /* Restored below on success or failure. */
+ if ((ret =
+ __wt_session_get_btree(session, uri, NULL, NULL, 0)) == 0)
+ p->handle = session->dhandle;
+ session->dhandle = old_dhandle;
+ WT_RET(ret); /* On failure the name stays set; list_next is not advanced. */
+ }
+
+ ++cb->list_next;
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_bulk.c b/src/third_party/wiredtiger/src/cursor/cur_bulk.c
new file mode 100644
index 00000000000..96a45a7e629
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_bulk.c
@@ -0,0 +1,287 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curbulk_insert_fix --
+ * Fixed-length column-store bulk cursor insert.
+ */
+static int
+__curbulk_insert_fix(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_NEEDVALUE(cursor); /* Only a value is checked for here, no key. */
+
+ WT_ERR(__wt_bulk_insert_fix(session, cbulk));
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_insert_var --
+ * Variable-length column-store bulk cursor insert.
+ */
+static int
+__curbulk_insert_var(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int duplicate; /* Set when the new value equals the previous one. */
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ /*
+ * If this isn't the first value inserted, compare it against the last
+ * value and increment the RLE count.
+ *
+ * Instead of a "first time" variable, I'm using the RLE count, because
+ * it is only zero before the first row is inserted.
+ */
+ duplicate = 0;
+ if (cbulk->rle != 0) {
+ if (cbulk->last.size == cursor->value.size &&
+ memcmp(cbulk->last.data, cursor->value.data,
+ cursor->value.size) == 0) {
+ ++cbulk->rle;
+ duplicate = 1;
+ } else /* A different value: insert the saved value and its count. */
+ WT_ERR(__wt_bulk_insert_var(session, cbulk));
+ }
+
+ /*
+ * Save a copy of the value for the next comparison and reset the RLE
+ * counter.
+ */
+ if (!duplicate) {
+ WT_ERR(__wt_buf_set(session,
+ &cbulk->last, cursor->value.data, cursor->value.size));
+ cbulk->rle = 1;
+ }
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __bulk_row_keycmp_err --
+ * Error routine when keys inserted out-of-order; returns EINVAL.
+ */
+static int
+__bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(a); /* Printable form of the key being inserted. */
+ WT_DECL_ITEM(b); /* Printable form of the previously inserted key. */
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+ cursor = &cbulk->cbt.iface;
+
+ WT_ERR(__wt_scr_alloc(session, 512, &a));
+ WT_ERR(__wt_scr_alloc(session, 512, &b));
+
+ WT_ERR(__wt_buf_set_printable(
+ session, a, cursor->key.data, cursor->key.size));
+ WT_ERR(__wt_buf_set_printable(
+ session, b, cbulk->last.data, cbulk->last.size));
+
+ WT_ERR_MSG(session, EINVAL,
+ "bulk-load presented with out-of-order keys: %.*s compares smaller "
+ "than previously inserted key %.*s",
+ (int)a->size, (const char *)a->data,
+ (int)b->size, (const char *)b->data);
+
+err: __wt_scr_free(&a); /* Both scratch buffers are released on every path. */
+ __wt_scr_free(&b);
+ return (ret);
+}
+
+/*
+ * __curbulk_insert_row --
+ * Row-store bulk cursor insert, with key-sort checks.
+ */
+static int
+__curbulk_insert_row(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int cmp; /* Comparison of the new key against the last key. */
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_CHECKKEY(cursor);
+ WT_CURSOR_CHECKVALUE(cursor);
+
+ /*
+ * If this isn't the first key inserted, compare it against the last key
+ * to ensure the application doesn't accidentally corrupt the table.
+ *
+ * Instead of a "first time" variable, I'm using the RLE count, because
+ * it is only zero before the first row is inserted.
+ */
+ if (cbulk->rle != 0) {
+ WT_ERR(__wt_compare(session,
+ btree->collator, &cursor->key, &cbulk->last, &cmp));
+ if (cmp <= 0) /* Equal keys are rejected as well as smaller ones. */
+ WT_ERR(__bulk_row_keycmp_err(cbulk));
+ }
+
+ /*
+ * Save a copy of the key for the next comparison and set the RLE
+ * counter.
+ */
+ WT_ERR(__wt_buf_set(session,
+ &cbulk->last, cursor->key.data, cursor->key.size));
+ cbulk->rle = 1;
+
+ WT_ERR(__wt_bulk_insert_row(session, cbulk));
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_insert_row_skip_check --
+ * Row-store bulk cursor insert, without key-sort checks.
+ */
+static int
+__curbulk_insert_row_skip_check(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_NEEDKEY(cursor); /* NOTE(review): the checked variant uses CHECKKEY. */
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_ERR(__wt_bulk_insert_row(session, cbulk)); /* No comparison with cbulk->last. */
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_close --
+ * WT_CURSOR->close for the bulk cursor type.
+ */
+static int
+__curbulk_close(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ CURSOR_API_CALL(cursor, session, close, btree);
+
+ WT_TRET(__wt_bulk_wrapup(session, cbulk)); /* Finish the bulk load first. */
+ __wt_buf_free(session, &cbulk->last);
+
+ WT_TRET(__wt_session_release_btree(session));
+
+ /* The URI is owned by the btree handle. */
+ cursor->internal_uri = NULL;
+
+ WT_TRET(__wt_cursor_close(cursor)); /* cursor is not touched after this. */
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curbulk_init --
+ * Initialize a bulk cursor: install its insert and close methods.
+ */
+int
+__wt_curbulk_init(WT_SESSION_IMPL *session,
+ WT_CURSOR_BULK *cbulk, int bitmap, int skip_sort_check)
+{
+ WT_CURSOR *c;
+ WT_CURSOR_BTREE *cbt;
+
+ c = &cbulk->cbt.iface;
+ cbt = &cbulk->cbt;
+
+ /* Bulk cursors only support insert and close (reset is a no-op). */
+ __wt_cursor_set_notsup(c);
+ switch (cbt->btree->type) { /* Pick the insert method for the tree type. */
+ case BTREE_COL_FIX:
+ c->insert = __curbulk_insert_fix;
+ break;
+ case BTREE_COL_VAR:
+ c->insert = __curbulk_insert_var;
+ break;
+ case BTREE_ROW: /* skip_sort_check selects the variant without key checks. */
+ c->insert = skip_sort_check ?
+ __curbulk_insert_row_skip_check : __curbulk_insert_row;
+ break;
+ WT_ILLEGAL_VALUE(session); /* Presumably the default case -- confirm. */
+ }
+ c->close = __curbulk_close;
+
+ cbulk->bitmap = bitmap;
+ if (bitmap) /* Bitmap bulk loads run the cursor in raw mode. */
+ F_SET(c, WT_CURSTD_RAW);
+
+ return (__wt_bulk_init(session, cbulk));
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_config.c b/src/third_party/wiredtiger/src/cursor/cur_config.c
new file mode 100644
index 00000000000..868b144efc1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_config.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curconfig_close --
+ * WT_CURSOR->close method for the config cursor type.
+ */
+static int
+__curconfig_close(WT_CURSOR *cursor)
+{
+ return (__wt_cursor_close(cursor)); /* Nothing config-specific to release. */
+}
+
+/*
+ * __wt_curconfig_open --
+ * WT_SESSION->open_cursor method for config cursors.
+ */
+int
+__wt_curconfig_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __wt_cursor_notsup, /* next */
+ __wt_cursor_notsup, /* prev */
+ __wt_cursor_noop, /* reset */
+ __wt_cursor_notsup, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curconfig_close); /* close */
+ WT_CURSOR_CONFIG *cconfig;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_CONFIG, iface) == 0);
+
+ WT_UNUSED(uri); /* NOTE(review): uri is passed to __wt_cursor_init below. */
+
+ WT_RET(__wt_calloc_def(session, 1, &cconfig));
+
+ cursor = &cconfig->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->key_format = cursor->value_format = "S"; /* String key and value. */
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ if (0) { /* Only entered through the err label. */
+err: __wt_free(session, cconfig);
+ }
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c
new file mode 100644
index 00000000000..33e89764617
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c
@@ -0,0 +1,524 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curds_txn_enter --
+ *	Transactional setup at the start of a data-source cursor operation:
+ *	count the cursor as active in the session and notify the transaction
+ *	code of a cursor operation. Always returns 0.
+ */
+static int
+__curds_txn_enter(WT_SESSION_IMPL *session)
+{
+	++session->ncursors;				/* XXX */
+	__wt_txn_cursor_op(session);
+	return (0);
+}
+
+/*
+ * __curds_txn_leave --
+ *	Transactional cleanup at the end of a data-source cursor operation:
+ *	drop the session's active cursor count and, when it reaches zero, call
+ *	__wt_txn_read_last.
+ */
+static void
+__curds_txn_leave(WT_SESSION_IMPL *session)
+{
+	session->ncursors -= 1;				/* XXX */
+	if (session->ncursors != 0)
+		return;
+
+	__wt_txn_read_last(session);
+}
+
+/*
+ * __curds_key_set --
+ *	Hand the cursor's key to the underlying data-source cursor: after
+ *	checking a key is present, the source's record number and key
+ *	data/size are pointed at the cursor's.
+ */
+static int
+__curds_key_set(WT_CURSOR *cursor)
+{
+	WT_CURSOR_DATA_SOURCE *cds;
+	WT_CURSOR *source;
+	WT_DECL_RET;
+
+	cds = (WT_CURSOR_DATA_SOURCE *)cursor;
+	source = cds->source;
+
+	/* Fails (jumping to err) if the cursor doesn't have a key. */
+	WT_CURSOR_NEEDKEY(cursor);
+
+	source->key.size = cursor->key.size;
+	source->key.data = cursor->key.data;
+	source->recno = cursor->recno;
+
+err:	return (ret);
+}
+
+/*
+ * __curds_value_set --
+ *	Hand the cursor's value to the underlying data-source cursor: after
+ *	checking a value is present, the source's value data/size are pointed
+ *	at the cursor's.
+ */
+static int
+__curds_value_set(WT_CURSOR *cursor)
+{
+	WT_CURSOR_DATA_SOURCE *cds;
+	WT_CURSOR *source;
+	WT_DECL_RET;
+
+	cds = (WT_CURSOR_DATA_SOURCE *)cursor;
+	source = cds->source;
+
+	/* Fails (jumping to err) if the cursor doesn't have a value. */
+	WT_CURSOR_NEEDVALUE(cursor);
+
+	source->value.size = cursor->value.size;
+	source->value.data = cursor->value.data;
+
+err:	return (ret);
+}
+
+/*
+ * __curds_cursor_resolve --
+ *	Finish a data-source cursor operation: given the return value of the
+ *	underlying data-source call, update the cursor's key, value and flags
+ *	to match, and return the (possibly updated) error code.
+ */
+static int
+__curds_cursor_resolve(WT_CURSOR *cursor, int ret)
+{
+	WT_CURSOR *source;
+
+	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+	if (ret != 0) {
+		if (ret == WT_NOTFOUND)
+			F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+		else
+			F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+		/*
+		 * A failed cursor operation means the cursor position is lost
+		 * and a subsequent next/prev starts at the beginning/end of
+		 * the table. Reset the source cursor explicitly here so the
+		 * underlying data-source implementations don't have to.
+		 */
+		WT_TRET(source->reset(source));
+		return (ret);
+	}
+
+	/*
+	 * Success: reference the source cursor's key, value and record number
+	 * from the cursor. (The _INT flags are used in the same way as for
+	 * file objects: there's some chance the underlying data source is
+	 * passing us a reference to data only pinned per operation, so be
+	 * safe.)
+	 *
+	 * The underlying data-source is required never to return with the
+	 * cursor/source key referencing application memory. It would be nice
+	 * to copy as necessary here so the data-source needn't worry about
+	 * it, but there isn't enough information to know whether a cursor
+	 * points at application or data-source memory.
+	 */
+	cursor->recno = source->recno;
+	cursor->key.data = source->key.data;
+	cursor->key.size = source->key.size;
+	cursor->value.data = source->value.data;
+	cursor->value.size = source->value.size;
+
+	F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+	F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+	return (ret);
+}
+
+/*
+ * __curds_compare --
+ *	WT_CURSOR.compare method for the data-source cursor type. Both cursors
+ *	must reference the same object and have keys; *cmpp is set to -1, 0 or
+ *	1 for record-number cursors, or to the collator's result otherwise.
+ */
+static int
+__curds_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+	WT_COLLATOR *collator;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	CURSOR_API_CALL(a, session, compare, NULL);
+
+	/* The cursors must be open on the same object... */
+	if (strcmp(a->internal_uri, b->internal_uri) != 0)
+		WT_ERR_MSG(session, EINVAL,
+		    "Cursors must reference the same object");
+
+	/* ... and both must have a key. */
+	WT_CURSOR_NEEDKEY(a);
+	WT_CURSOR_NEEDKEY(b);
+
+	if (!WT_CURSOR_RECNO(a)) {
+		/*
+		 * Data-sources aren't expected to supply WT_CURSOR.compare
+		 * methods: the key/value are copied out of the underlying
+		 * data-source cursor, so any comparison can be done at this
+		 * level using the cursor's collator.
+		 */
+		collator = ((WT_CURSOR_DATA_SOURCE *)a)->collator;
+		WT_ERR(__wt_compare(
+		    session, collator, &a->key, &b->key, cmpp));
+	} else
+		*cmpp = a->recno < b->recno ? -1 :
+		    (a->recno == b->recno ? 0 : 1);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curds_next --
+ *	WT_CURSOR.next method for the data-source cursor type: forward the
+ *	call to the underlying data-source cursor and resolve the result into
+ *	this cursor.
+ */
+static int
+__curds_next(WT_CURSOR *cursor)
+{
+	WT_CURSOR *ds_cursor;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ds_cursor = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+	CURSOR_API_CALL(cursor, session, next, NULL);
+
+	WT_STAT_FAST_CONN_INCR(session, cursor_next);
+	WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+	WT_ERR(__curds_txn_enter(session));
+
+	/* Any key/value currently set is discarded before moving. */
+	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+	ret = ds_cursor->next(ds_cursor);
+	ret = __curds_cursor_resolve(cursor, ret);
+
+err:	__curds_txn_leave(session);
+
+	API_END_RET(session, ret);
+}
+
+/*
+ * __curds_prev --
+ *	WT_CURSOR.prev method for the data-source cursor type: forward the
+ *	call to the underlying data-source cursor and resolve the result into
+ *	this cursor.
+ */
+static int
+__curds_prev(WT_CURSOR *cursor)
+{
+	WT_CURSOR *ds_cursor;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ds_cursor = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+	CURSOR_API_CALL(cursor, session, prev, NULL);
+
+	WT_STAT_FAST_CONN_INCR(session, cursor_prev);
+	WT_STAT_FAST_DATA_INCR(session, cursor_prev);
+
+	WT_ERR(__curds_txn_enter(session));
+
+	/* Any key/value currently set is discarded before moving. */
+	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+	ret = ds_cursor->prev(ds_cursor);
+	ret = __curds_cursor_resolve(cursor, ret);
+
+err:	__curds_txn_leave(session);
+	API_END_RET(session, ret);
+}
+
+/*
+ * __curds_reset --
+ *	WT_CURSOR.reset method for the data-source cursor type: reset the
+ *	underlying data-source cursor and, if that succeeds, discard this
+ *	cursor's key and value.
+ */
+static int
+__curds_reset(WT_CURSOR *cursor)
+{
+	WT_CURSOR *ds_cursor;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ds_cursor = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+	CURSOR_API_CALL(cursor, session, reset, NULL);
+
+	WT_STAT_FAST_CONN_INCR(session, cursor_reset);
+	WT_STAT_FAST_DATA_INCR(session, cursor_reset);
+
+	/* The key/value flags are left alone if the source reset fails. */
+	if ((ret = ds_cursor->reset(ds_cursor)) == 0)
+		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curds_search --
+ *	WT_CURSOR.search method for the data-source cursor type: pass the
+ *	cursor's key to the underlying data-source cursor, search there and
+ *	resolve the result into this cursor.
+ */
+static int
+__curds_search(WT_CURSOR *cursor)
+{
+	WT_CURSOR *ds_cursor;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ds_cursor = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+	CURSOR_API_CALL(cursor, session, search, NULL);
+
+	WT_STAT_FAST_CONN_INCR(session, cursor_search);
+	WT_STAT_FAST_DATA_INCR(session, cursor_search);
+
+	WT_ERR(__curds_txn_enter(session));
+
+	WT_ERR(__curds_key_set(cursor));
+	ret = ds_cursor->search(ds_cursor);
+	ret = __curds_cursor_resolve(cursor, ret);
+
+err:	__curds_txn_leave(session);
+
+	API_END_RET(session, ret);
+}
+
+/*
+ * __curds_search_near --
+ *	WT_CURSOR.search_near method for the data-source cursor type: pass the
+ *	cursor's key to the underlying data-source cursor, search there (the
+ *	source fills in *exact) and resolve the result into this cursor.
+ */
+static int
+__curds_search_near(WT_CURSOR *cursor, int *exact)
+{
+	WT_CURSOR *ds_cursor;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ds_cursor = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+	CURSOR_API_CALL(cursor, session, search_near, NULL);
+
+	WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
+	WT_STAT_FAST_DATA_INCR(session, cursor_search_near);
+
+	WT_ERR(__curds_txn_enter(session));
+
+	WT_ERR(__curds_key_set(cursor));
+	ret = ds_cursor->search_near(ds_cursor, exact);
+	ret = __curds_cursor_resolve(cursor, ret);
+
+err:	__curds_txn_leave(session);
+
+	API_END_RET(session, ret);
+}
+
+/*
+ * __curds_insert --
+ *	WT_CURSOR.insert method for the data-source cursor type: pass the
+ *	cursor's key (unless appending) and value to the underlying
+ *	data-source cursor, insert there and resolve the result into this
+ *	cursor.
+ */
+static int
+__curds_insert(WT_CURSOR *cursor)
+{
+	WT_CURSOR *ds_cursor;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ds_cursor = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+	CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+
+	WT_ERR(__curds_txn_enter(session));
+
+	WT_STAT_FAST_CONN_INCR(session, cursor_insert);
+	WT_STAT_FAST_DATA_INCR(session, cursor_insert);
+	WT_STAT_FAST_DATA_INCRV(session,
+	    cursor_insert_bytes, cursor->key.size + cursor->value.size);
+
+	/* Cursors configured to append don't supply a key. */
+	if (!F_ISSET(cursor, WT_CURSTD_APPEND))
+		WT_ERR(__curds_key_set(cursor));
+	WT_ERR(__curds_value_set(cursor));
+	ret = ds_cursor->insert(ds_cursor);
+	ret = __curds_cursor_resolve(cursor, ret);
+
+err:	__curds_txn_leave(session);
+
+	CURSOR_UPDATE_API_END(session, ret);
+	return (ret);
+}
+
+/*
+ * __curds_update --
+ *	WT_CURSOR.update method for the data-source cursor type: pass the
+ *	cursor's key and value to the underlying data-source cursor, update
+ *	there and resolve the result into this cursor.
+ */
+static int
+__curds_update(WT_CURSOR *cursor)
+{
+	WT_CURSOR *ds_cursor;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ds_cursor = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+	CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+
+	WT_STAT_FAST_CONN_INCR(session, cursor_update);
+	WT_STAT_FAST_DATA_INCR(session, cursor_update);
+	WT_STAT_FAST_DATA_INCRV(
+	    session, cursor_update_bytes, cursor->value.size);
+
+	WT_ERR(__curds_txn_enter(session));
+
+	WT_ERR(__curds_key_set(cursor));
+	WT_ERR(__curds_value_set(cursor));
+	ret = ds_cursor->update(ds_cursor);
+	ret = __curds_cursor_resolve(cursor, ret);
+
+err:	__curds_txn_leave(session);
+
+	CURSOR_UPDATE_API_END(session, ret);
+	return (ret);
+}
+
+/*
+ * __curds_remove --
+ *	WT_CURSOR.remove method for the data-source cursor type: pass the
+ *	cursor's key to the underlying data-source cursor, remove there and
+ *	resolve the result into this cursor.
+ */
+static int
+__curds_remove(WT_CURSOR *cursor)
+{
+	WT_CURSOR *ds_cursor;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ds_cursor = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+	CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+
+	WT_STAT_FAST_CONN_INCR(session, cursor_remove);
+	WT_STAT_FAST_DATA_INCR(session, cursor_remove);
+	WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
+
+	WT_ERR(__curds_txn_enter(session));
+
+	WT_ERR(__curds_key_set(cursor));
+	ret = ds_cursor->remove(ds_cursor);
+	ret = __curds_cursor_resolve(cursor, ret);
+
+err:	__curds_txn_leave(session);
+
+	CURSOR_UPDATE_API_END(session, ret);
+	return (ret);
+}
+
+/*
+ * __curds_close --
+ *	WT_CURSOR.close method for the data-source cursor type: close the
+ *	underlying data-source cursor, terminate an owned collator, free the
+ *	allocated key/value formats and close the cursor itself.
+ */
+static int
+__curds_close(WT_CURSOR *cursor)
+{
+	WT_CURSOR *source;
+	WT_CURSOR_DATA_SOURCE *ds;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	ds = (WT_CURSOR_DATA_SOURCE *)cursor;
+
+	CURSOR_API_CALL(cursor, session, close, NULL);
+
+	/* There may be no source cursor if the open failed part-way. */
+	if ((source = ds->source) != NULL)
+		ret = source->close(source);
+
+	/* Only terminate a collator this cursor owns. */
+	if (ds->collator_owned && ds->collator->terminate != NULL)
+		WT_TRET(ds->collator->terminate(
+		    ds->collator, &session->iface));
+	ds->collator_owned = 0;
+	ds->collator = NULL;
+
+	/*
+	 * Unlike other cursor types, the key/value formats are in allocated
+	 * memory and have to be freed.
+	 */
+	__wt_free(session, cursor->key_format);
+	__wt_free(session, cursor->value_format);
+
+	WT_TRET(__wt_cursor_close(cursor));
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curds_open --
+ *	Initialize a data-source cursor.
+ *
+ *	Allocates a WT_CURSOR_DATA_SOURCE wrapping a cursor opened by the
+ *	data-source "dsrc" on "uri". The key/value formats are read from the
+ *	object's metadata and stored in allocated memory (released by
+ *	__curds_close). On success the new cursor is returned through cursorp
+ *	by __wt_cursor_init; on failure *cursorp is set to NULL and everything
+ *	allocated here is released.
+ */
+int
+__wt_curds_open(
+    WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
+    const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp)
+{
+	WT_CURSOR_STATIC_INIT(iface,
+	    __wt_cursor_get_key,	/* get-key */
+	    __wt_cursor_get_value,	/* get-value */
+	    __wt_cursor_set_key,	/* set-key */
+	    __wt_cursor_set_value,	/* set-value */
+	    __curds_compare,		/* compare */
+	    __curds_next,		/* next */
+	    __curds_prev,		/* prev */
+	    __curds_reset,		/* reset */
+	    __curds_search,		/* search */
+	    __curds_search_near,	/* search-near */
+	    __curds_insert,		/* insert */
+	    __curds_update,		/* update */
+	    __curds_remove,		/* remove */
+	    __curds_close);		/* close */
+	WT_CONFIG_ITEM cval;
+	WT_CURSOR *cursor, *source;
+	WT_CURSOR_DATA_SOURCE *data_source;
+	WT_DECL_RET;
+	const char *metaconf;
+
+	/* The cast between the two cursor types relies on this layout. */
+	WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0);
+
+	data_source = NULL;
+	metaconf = NULL;
+
+	WT_RET(__wt_calloc_def(session, 1, &data_source));
+	cursor = &data_source->iface;
+	*cursor = iface;
+	cursor->session = &session->iface;
+	F_SET(cursor, WT_CURSTD_DATA_SOURCE);
+
+	/*
+	 * XXX
+	 * The underlying data-source may require the object's key and value
+	 * formats. This isn't a particularly elegant way of getting that
+	 * information to the data-source, this feels like a layering problem
+	 * to me.
+	 */
+	WT_ERR(__wt_metadata_search(session, uri, &metaconf));
+	WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval));
+	WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format));
+	WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval));
+	WT_ERR(
+	    __wt_strndup(session, cval.str, cval.len, &cursor->value_format));
+
+	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));
+
+	/* Data-source cursors have a collator reference. */
+	WT_ERR(__wt_collator_config(session, cfg,
+	    &data_source->collator, &data_source->collator_owned));
+
+	/*
+	 * Open the underlying data-source cursor, then clear the fields of
+	 * the returned cursor that this layer reads and writes so they start
+	 * in a known state.
+	 */
+	WT_ERR(dsrc->open_cursor(dsrc,
+	    &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source));
+	source = data_source->source;
+	source->session = (WT_SESSION *)session;
+	memset(&source->q, 0, sizeof(source->q));
+	source->recno = 0;
+	memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
+	memset(&source->key, 0, sizeof(source->key));
+	memset(&source->value, 0, sizeof(source->value));
+	source->saved_err = 0;
+	source->flags = 0;
+
+	if (0) {
+err:		if (F_ISSET(cursor, WT_CURSTD_OPEN))
+			/* The close method releases everything we hold. */
+			WT_TRET(cursor->close(cursor));
+		else {
+			/*
+			 * The cursor was never opened, so its close method
+			 * won't run: free the key/value format strings here
+			 * (either may still be NULL) along with the cursor
+			 * structure, otherwise they are leaked.
+			 */
+			__wt_free(session, cursor->key_format);
+			__wt_free(session, cursor->value_format);
+			__wt_free(session, data_source);
+		}
+		*cursorp = NULL;
+	}
+
+	/* The metadata string is only needed while opening the cursor. */
+	__wt_free(session, metaconf);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_dump.c b/src/third_party/wiredtiger/src/cursor/cur_dump.c
new file mode 100644
index 00000000000..003b7e1f961
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_dump.c
@@ -0,0 +1,400 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __raw_to_dump --
+ *	Convert the raw value referenced by "from" into a printable string in
+ *	"to": plain hex when hexonly is set, escaped hex otherwise.
+ */
+static int
+__raw_to_dump(
+    WT_SESSION_IMPL *session, WT_ITEM *from, WT_ITEM *to, int hexonly)
+{
+	return (hexonly ?
+	    __wt_raw_to_hex(session, from->data, from->size, to) :
+	    __wt_raw_to_esc_hex(session, from->data, from->size, to));
+}
+
+/*
+ * __dump_to_raw --
+ *	Convert the dump string "src_arg" into a raw value in "item": the
+ *	string is plain hex when hexonly is set, escaped hex otherwise.
+ */
+static int
+__dump_to_raw(
+    WT_SESSION_IMPL *session, const char *src_arg, WT_ITEM *item, int hexonly)
+{
+	return (hexonly ?
+	    __wt_hex_to_raw(session, src_arg, item) :
+	    __wt_esc_hex_to_raw(session, src_arg, item));
+}
+
+/*
+ * __curdump_get_key --
+ * WT_CURSOR->get_key for dump cursors.
+ *
+ * Reads the key from the child cursor and returns it in dump form. With
+ * WT_CURSTD_DUMP_JSON set, the key is unpacked through
+ * __wt_json_alloc_unpack, which is handed the caller's argument list.
+ * Otherwise the key is formatted into cursor->key (a decimal string for
+ * non-raw record-number cursors, hex or escaped hex for everything else)
+ * and returned through the single variadic argument: a WT_ITEM * for raw
+ * cursors, a const char ** otherwise.
+ */
+static int
+__curdump_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR *child;
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR_JSON *json;
+ WT_DECL_RET;
+ WT_ITEM item, *itemp;
+ WT_SESSION_IMPL *session;
+ size_t size;
+ uint64_t recno;
+ const char *fmt;
+ const void *buffer;
+ va_list ap;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+
+ /* Started before the API call so the va_end at err is always paired. */
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_key, NULL);
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
+ json = (WT_CURSOR_JSON *)cursor->json_private;
+ WT_ASSERT(session, json != NULL);
+ if (WT_CURSOR_RECNO(cursor)) {
+ /* Record numbers are unpacked from the uint64_t itself. */
+ WT_ERR(child->get_key(child, &recno));
+ buffer = &recno;
+ size = sizeof(recno);
+ fmt = "R";
+ } else {
+ /* Raw cursors unpack as a single "u" item. */
+ WT_ERR(__wt_cursor_get_raw_key(child, &item));
+ buffer = item.data;
+ size = item.size;
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ fmt = "u";
+ else
+ fmt = cursor->key_format;
+ }
+ ret = __wt_json_alloc_unpack(session, buffer, size, fmt,
+ json, 1, ap);
+ } else {
+ if (WT_CURSOR_RECNO(cursor) &&
+ !F_ISSET(cursor, WT_CURSTD_RAW)) {
+ /* Print the record number as a decimal string. */
+ WT_ERR(child->get_key(child, &recno));
+
+ WT_ERR(__wt_buf_fmt(session, &cursor->key, "%"
+ PRIu64, recno));
+ } else {
+ WT_ERR(child->get_key(child, &item));
+
+ WT_ERR(__raw_to_dump(session, &item, &cursor->key,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+ }
+
+ /* Return a reference to the formatted key in cursor->key. */
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ itemp = va_arg(ap, WT_ITEM *);
+ itemp->data = cursor->key.data;
+ itemp->size = cursor->key.size;
+ } else
+ *va_arg(ap, const char **) = cursor->key.data;
+ }
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * str2recno --
+ *	Convert a string to a record number.
+ *
+ *	On success the value is stored in *recnop and 0 is returned. ERANGE is
+ *	returned if the conversion reports the value is out of range, EINVAL
+ *	if the string doesn't start with a digit or has trailing characters.
+ */
+static int
+str2recno(WT_SESSION_IMPL *session, const char *p, uint64_t *recnop)
+{
+	uint64_t recno;
+	char *endptr;
+
+	/*
+	 * The conversion routine accepts leading white-space, signs and so on
+	 * -- none of them are OK with us. Check the string starts with a
+	 * digit, that rules them out.
+	 *
+	 * The argument to isdigit must be representable as an unsigned char:
+	 * passing a plain char that happens to be negative is undefined
+	 * behavior, so cast explicitly.
+	 *
+	 * NOTE(review): the conversion is called with a base of 0; if
+	 * __wt_strtouq follows the strtoull conventions, a leading "0x" or
+	 * "0" still selects hexadecimal or octal -- confirm that's intended.
+	 */
+	if (!isdigit((unsigned char)p[0]))
+		goto format;
+
+	errno = 0;
+	recno = __wt_strtouq(p, &endptr, 0);
+	if (recno == ULLONG_MAX && errno == ERANGE)
+		WT_RET_MSG(session, ERANGE, "%s: invalid record number", p);
+	if (endptr[0] != '\0')
+format:		WT_RET_MSG(session, EINVAL, "%s: invalid record number", p);
+
+	*recnop = recno;
+	return (0);
+}
+
+/*
+ * __curdump_set_key --
+ * WT_CURSOR->set_key for dump cursors.
+ *
+ * Takes one variadic argument (a WT_ITEM * for raw cursors, a const char *
+ * string otherwise), converts it from dump form and sets the result as the
+ * child cursor's key. On failure the error is stored in cursor->saved_err
+ * and WT_CURSTD_KEY_SET is cleared.
+ */
+static void
+__curdump_set_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint64_t recno;
+ va_list ap;
+ const char *p;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+ CURSOR_API_CALL(cursor, session, set_key, NULL);
+
+ /* Fetch the dump string from the caller's single argument. */
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ p = va_arg(ap, WT_ITEM *)->data;
+ else
+ p = va_arg(ap, const char *);
+ va_end(ap);
+
+ if (WT_CURSOR_RECNO(cursor) && !F_ISSET(cursor, WT_CURSTD_RAW)) {
+ /* Non-raw record-number keys are decimal strings. */
+ WT_ERR(str2recno(session, p, &recno));
+
+ child->set_key(child, recno);
+ } else {
+ /* Convert into cursor->key, then pass that item to the child. */
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_to_item(session, p, cursor->key_format,
+ (WT_CURSOR_JSON *)cursor->json_private, 1,
+ &cursor->key));
+ else
+ WT_ERR(__dump_to_raw(session, p, &cursor->key,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+
+ child->set_key(child, &cursor->key);
+ }
+
+ /* This method returns void: errors are saved in the cursor instead. */
+ if (0) {
+err: cursor->saved_err = ret;
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+ }
+ API_END(session, ret);
+}
+
+/*
+ * __curdump_get_value --
+ * WT_CURSOR->get_value for dump cursors.
+ *
+ * Reads the value from the child cursor and returns it in dump form. With
+ * WT_CURSTD_DUMP_JSON set, the value is unpacked through
+ * __wt_json_alloc_unpack, which is handed the caller's argument list.
+ * Otherwise the value is converted to hex or escaped hex in cursor->value
+ * and returned through the single variadic argument: a WT_ITEM * for raw
+ * cursors, a const char ** otherwise.
+ */
+static int
+__curdump_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR_JSON *json;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_ITEM item, *itemp;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ const char *fmt;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+
+ /* Started before the API call so the va_end at err is always paired. */
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
+ json = (WT_CURSOR_JSON *)cursor->json_private;
+ WT_ASSERT(session, json != NULL);
+ WT_ERR(__wt_cursor_get_raw_value(child, &item));
+ /* Raw cursors unpack as a single "u" item. */
+ fmt = F_ISSET(cursor, WT_CURSTD_RAW) ?
+ "u" : cursor->value_format;
+ ret = __wt_json_alloc_unpack(session, item.data,
+ item.size, fmt, json, 0, ap);
+ } else {
+ WT_ERR(child->get_value(child, &item));
+
+ WT_ERR(__raw_to_dump(session, &item, &cursor->value,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+
+ /* Return a reference to the formatted value in cursor->value. */
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ itemp = va_arg(ap, WT_ITEM *);
+ itemp->data = cursor->value.data;
+ itemp->size = cursor->value.size;
+ } else
+ *va_arg(ap, const char **) = cursor->value.data;
+ }
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curdump_set_value --
+ * WT_CURSOR->set_value for dump cursors.
+ *
+ * Takes one variadic argument (a WT_ITEM * for raw cursors, a const char *
+ * string otherwise), converts it from dump form into cursor->value and
+ * sets that item as the child cursor's value. On failure the error is
+ * stored in cursor->saved_err and WT_CURSTD_VALUE_SET is cleared.
+ */
+static void
+__curdump_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ const char *p;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+
+ /* Fetch the dump string from the caller's single argument. */
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ p = va_arg(ap, WT_ITEM *)->data;
+ else
+ p = va_arg(ap, const char *);
+ va_end(ap);
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_to_item(session, p, cursor->value_format,
+ (WT_CURSOR_JSON *)cursor->json_private, 0, &cursor->value));
+ else
+ WT_ERR(__dump_to_raw(session, p, &cursor->value,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+
+ child->set_value(child, &cursor->value);
+
+ /* This method returns void: errors are saved in the cursor instead. */
+ if (0) {
+err: cursor->saved_err = ret;
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ }
+ API_END(session, ret);
+}
+
+/*
+ * WT_CURDUMP_PASS --
+ * Pass through a call to the underlying cursor: define a static function
+ * __curdump_<op> taking only the cursor, which calls the same-named method
+ * on the dump cursor's child and returns its result unchanged.
+ */
+#define WT_CURDUMP_PASS(op) \
+static int \
+__curdump_##op(WT_CURSOR *cursor) \
+{ \
+ WT_CURSOR *child; \
+ \
+ child = ((WT_CURSOR_DUMP *)cursor)->child; \
+ return (child->op(child)); \
+}
+
+/* Defines __curdump_next, __curdump_prev, __curdump_reset, __curdump_search. */
+WT_CURDUMP_PASS(next)
+WT_CURDUMP_PASS(prev)
+WT_CURDUMP_PASS(reset)
+WT_CURDUMP_PASS(search)
+
+/*
+ * __curdump_search_near --
+ *	WT_CURSOR::search_near for dump cursors: forward the call, including
+ *	the caller's "exact" pointer, to the child cursor.
+ */
+static int
+__curdump_search_near(WT_CURSOR *cursor, int *exact)
+{
+	WT_CURSOR *child;
+
+	child = ((WT_CURSOR_DUMP *)cursor)->child;
+	return (child->search_near(child, exact));
+}
+
+/* Defines __curdump_insert, __curdump_update and __curdump_remove. */
+WT_CURDUMP_PASS(insert)
+WT_CURDUMP_PASS(update)
+WT_CURDUMP_PASS(remove)
+
+/*
+ * __curdump_close --
+ *	WT_CURSOR::close for dump cursors: close the child cursor (if any),
+ *	release the JSON state and close the dump cursor itself. Returns 0 or
+ *	an error from one of those steps.
+ */
+static int
+__curdump_close(WT_CURSOR *cursor)
+{
+	WT_CURSOR_DUMP *cdump;
+	WT_CURSOR *child;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	cdump = (WT_CURSOR_DUMP *)cursor;
+	child = cdump->child;
+
+	/*
+	 * Enter the API as the close method, matching the other cursor close
+	 * implementations (this was previously entered as get_key).
+	 */
+	CURSOR_API_CALL(cursor, session, close, NULL);
+	if (child != NULL)
+		WT_TRET(child->close(child));
+	/*
+	 * We shared the child's URI: clear our reference before the generic
+	 * close runs so only the child's close deals with it.
+	 */
+	cursor->internal_uri = NULL;
+	__wt_json_close(session, cursor);
+	WT_TRET(__wt_cursor_close(cursor));
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curdump_create --
+ * initialize a dump cursor.
+ *
+ * Wraps "child" in a newly allocated dump cursor. The dump cursor shares
+ * the child's session, URI and key/value formats and copies the child's
+ * dump flags; compare is not supported. For JSON dumps a WT_CURSOR_JSON
+ * is allocated and referenced from both cursors' json_private fields.
+ */
+int
+__wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __curdump_get_key, /* get-key */
+ __curdump_get_value, /* get-value */
+ __curdump_set_key, /* set-key */
+ __curdump_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curdump_next, /* next */
+ __curdump_prev, /* prev */
+ __curdump_reset, /* reset */
+ __curdump_search, /* search */
+ __curdump_search_near, /* search-near */
+ __curdump_insert, /* insert */
+ __curdump_update, /* update */
+ __curdump_remove, /* remove */
+ __curdump_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR_JSON *json;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *cfg[2];
+
+ /* The cast between the two cursor types relies on this layout. */
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_DUMP, iface) == 0);
+
+ session = (WT_SESSION_IMPL *)child->session;
+
+ WT_RET(__wt_calloc_def(session, 1, &cdump));
+ cursor = &cdump->iface;
+ *cursor = iface;
+ /* These are references to the child's fields, not copies. */
+ cursor->session = child->session;
+ cursor->internal_uri = child->internal_uri;
+ cursor->key_format = child->key_format;
+ cursor->value_format = child->value_format;
+ cdump->child = child;
+
+ /* Copy the dump flags from the child cursor. */
+ F_SET(cursor, F_ISSET(child,
+ WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_JSON | WT_CURSTD_DUMP_PRINT));
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
+ WT_ERR(__wt_calloc_def(session, 1, &json));
+ cursor->json_private = child->json_private = json;
+ }
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = NULL;
+ WT_ERR(__wt_cursor_init(cursor, NULL, owner, cfg, cursorp));
+
+ /*
+ * NOTE(review): only the dump cursor structure is freed on error; the
+ * JSON state allocated above is not released here and the child's
+ * json_private still references it -- confirm who owns it.
+ */
+ if (0) {
+err: __wt_free(session, cursor);
+ }
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
new file mode 100644
index 00000000000..e5aaa19d0cc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -0,0 +1,471 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_BTREE_CURSOR_SAVE_AND_RESTORE
+ * Save the cursor's key/value data/size fields, call an underlying btree
+ * function, and then consistently handle failure and success.
+ *
+ * "f" is the expression to evaluate and "ret" the variable that receives
+ * its result. On success (0) the key/value _EXT flags are cleared and the
+ * _INT flags set. On failure, a key (with its record number) and/or value
+ * still flagged _EXT is restored from the saved copy, and the _INT flags
+ * are cleared.
+ */
+#define WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, f, ret) do { \
+ WT_ITEM __key_copy = (cursor)->key; \
+ uint64_t __recno = (cursor)->recno; \
+ WT_ITEM __value_copy = (cursor)->value; \
+ if (((ret) = (f)) == 0) { \
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \
+ } else { \
+ if (F_ISSET(cursor, WT_CURSTD_KEY_EXT)) { \
+ (cursor)->recno = __recno; \
+ WT_ITEM_SET((cursor)->key, __key_copy); \
+ } \
+ if (F_ISSET(cursor, WT_CURSTD_VALUE_EXT)) \
+ WT_ITEM_SET((cursor)->value, __value_copy); \
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \
+ } \
+} while (0)
+
+/*
+ * __curfile_compare --
+ *	WT_CURSOR->compare method for the btree cursor type. Both cursors must
+ *	reference the same object and have keys; the comparison itself is done
+ *	by __wt_btcur_compare, which sets *cmpp.
+ */
+static int
+__curfile_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+	WT_CURSOR_BTREE *cbt_a, *cbt_b;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	cbt_a = (WT_CURSOR_BTREE *)a;
+	cbt_b = (WT_CURSOR_BTREE *)b;
+	CURSOR_API_CALL(a, session, compare, cbt_a->btree);
+
+	/* The cursors must be open on the same object... */
+	if (strcmp(a->internal_uri, b->internal_uri) != 0)
+		WT_ERR_MSG(session, EINVAL,
+		    "Cursors must reference the same object");
+
+	/* ... and both must have a key before the btree layer is called. */
+	WT_CURSOR_CHECKKEY(a);
+	WT_CURSOR_CHECKKEY(b);
+
+	ret = __wt_btcur_compare(cbt_a, cbt_b, cmpp);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_next --
+ *	WT_CURSOR->next method for the btree cursor type: discard the current
+ *	key/value, move the btree cursor forward and, on success, mark the
+ *	key/value as set internally.
+ */
+static int
+__curfile_next(WT_CURSOR *cursor)
+{
+	WT_CURSOR_BTREE *btcur;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	btcur = (WT_CURSOR_BTREE *)cursor;
+	CURSOR_API_CALL(cursor, session, next, btcur->btree);
+
+	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+	ret = __wt_btcur_next(btcur, 0);
+	if (ret == 0)
+		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_next_random --
+ *	WT_CURSOR->next method for the btree cursor type when the cursor is
+ *	configured with next_random: same as the regular next method except
+ *	the position comes from __wt_btcur_next_random.
+ */
+static int
+__curfile_next_random(WT_CURSOR *cursor)
+{
+	WT_CURSOR_BTREE *btcur;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	btcur = (WT_CURSOR_BTREE *)cursor;
+	CURSOR_API_CALL(cursor, session, next, btcur->btree);
+
+	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+	ret = __wt_btcur_next_random(btcur);
+	if (ret == 0)
+		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_prev --
+ *	WT_CURSOR->prev method for the btree cursor type: discard the current
+ *	key/value, move the btree cursor backward and, on success, mark the
+ *	key/value as set internally.
+ */
+static int
+__curfile_prev(WT_CURSOR *cursor)
+{
+	WT_CURSOR_BTREE *btcur;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	btcur = (WT_CURSOR_BTREE *)cursor;
+	CURSOR_API_CALL(cursor, session, prev, btcur->btree);
+
+	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+	ret = __wt_btcur_prev(btcur, 0);
+	if (ret == 0)
+		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_reset --
+ *	WT_CURSOR->reset method for the btree cursor type: reset the btree
+ *	cursor and discard the key/value, whether or not the reset succeeded.
+ */
+static int
+__curfile_reset(WT_CURSOR *cursor)
+{
+	WT_CURSOR_BTREE *btcur;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	btcur = (WT_CURSOR_BTREE *)cursor;
+	CURSOR_API_CALL(cursor, session, reset, btcur->btree);
+
+	ret = __wt_btcur_reset(btcur);
+
+	/* The flags are cleared even if the btree reset returned an error. */
+	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_search --
+ *	WT_CURSOR->search method for the btree cursor type: requires a key,
+ *	then searches the btree with the standard key/value save-and-restore
+ *	handling.
+ */
+static int
+__curfile_search(WT_CURSOR *cursor)
+{
+	WT_CURSOR_BTREE *btcur;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	btcur = (WT_CURSOR_BTREE *)cursor;
+	CURSOR_API_CALL(cursor, session, search, btcur->btree);
+
+	WT_CURSOR_NEEDKEY(cursor);
+	WT_CURSOR_NOVALUE(cursor);
+
+	WT_BTREE_CURSOR_SAVE_AND_RESTORE(
+	    cursor, __wt_btcur_search(btcur), ret);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_search_near --
+ *	WT_CURSOR->search_near method for the btree cursor type: requires a
+ *	key, then searches the btree (which fills in *exact) with the standard
+ *	key/value save-and-restore handling.
+ */
+static int
+__curfile_search_near(WT_CURSOR *cursor, int *exact)
+{
+	WT_CURSOR_BTREE *btcur;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	btcur = (WT_CURSOR_BTREE *)cursor;
+	CURSOR_API_CALL(cursor, session, search_near, btcur->btree);
+
+	WT_CURSOR_NEEDKEY(cursor);
+	WT_CURSOR_NOVALUE(cursor);
+
+	WT_BTREE_CURSOR_SAVE_AND_RESTORE(
+	    cursor, __wt_btcur_search_near(btcur, exact), ret);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_insert --
+ * WT_CURSOR->insert method for the btree cursor type.
+ *
+ * Requires a value, and a key unless the cursor is configured to append,
+ * then inserts into the btree with the standard key/value save-and-restore
+ * handling.
+ */
+static int
+__curfile_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, cbt->btree);
+ /* No key is required from the caller when appending. */
+ if (!F_ISSET(cursor, WT_CURSTD_APPEND))
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_insert(cbt), ret);
+
+ /*
+ * Insert is the one cursor operation that doesn't end with the cursor
+ * pointing to an on-page item. The standard macro handles errors
+ * correctly, but we need to leave the application cursor unchanged in
+ * the case of success, except for column-store appends, where we are
+ * returning a key.
+ */
+ if (ret == 0) {
+ /* Undo the macro's success handling: switch _INT back to _EXT. */
+ if (!F_ISSET(cursor, WT_CURSTD_APPEND)) {
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+ F_CLR(cursor, WT_CURSTD_KEY_INT);
+ }
+ F_SET(cursor, WT_CURSTD_VALUE_EXT);
+ F_CLR(cursor, WT_CURSTD_VALUE_INT);
+ }
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curfile_update --
+ *	WT_CURSOR->update method for the btree cursor type: requires a key and
+ *	a value, then updates the btree with the standard key/value
+ *	save-and-restore handling.
+ */
+static int
+__curfile_update(WT_CURSOR *cursor)
+{
+	WT_CURSOR_BTREE *btcur;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	btcur = (WT_CURSOR_BTREE *)cursor;
+	CURSOR_UPDATE_API_CALL(cursor, session, update, btcur->btree);
+
+	WT_CURSOR_NEEDKEY(cursor);
+	WT_CURSOR_NEEDVALUE(cursor);
+
+	WT_BTREE_CURSOR_SAVE_AND_RESTORE(
+	    cursor, __wt_btcur_update(btcur), ret);
+
+err:	CURSOR_UPDATE_API_END(session, ret);
+	return (ret);
+}
+
+/*
+ * __wt_curfile_update_check --
+ *	WT_CURSOR->update_check method for the btree cursor type: requires a
+ *	key, then calls __wt_btcur_update_check with the standard key/value
+ *	save-and-restore handling. Entered through the API as an update.
+ */
+int
+__wt_curfile_update_check(WT_CURSOR *cursor)
+{
+	WT_CURSOR_BTREE *btcur;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	btcur = (WT_CURSOR_BTREE *)cursor;
+	CURSOR_UPDATE_API_CALL(cursor, session, update, btcur->btree);
+
+	WT_CURSOR_NEEDKEY(cursor);
+	WT_CURSOR_NOVALUE(cursor);
+
+	WT_BTREE_CURSOR_SAVE_AND_RESTORE(
+	    cursor, __wt_btcur_update_check(btcur), ret);
+
+err:	CURSOR_UPDATE_API_END(session, ret);
+	return (ret);
+}
+
+/*
+ * __curfile_remove --
+ * WT_CURSOR->remove method for the btree cursor type.
+ *
+ * Requires a key, then removes from the btree with the standard key/value
+ * save-and-restore handling. After a successful remove the cursor keeps
+ * its key but has no value.
+ */
+static int
+__curfile_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_remove(cbt), ret);
+
+ /*
+ * After a successful remove, copy the key: the value is not available.
+ */
+ if (ret == 0) {
+ /*
+ * Only copy when the key is flagged _INT and its data isn't already
+ * in the cursor's own key buffer; once copied, flag it _EXT.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_KEY_INT) &&
+ !WT_DATA_IN_ITEM(&(cursor)->key)) {
+ WT_ERR(__wt_buf_set(session, &cursor->key,
+ cursor->key.data, cursor->key.size));
+ F_CLR(cursor, WT_CURSTD_KEY_INT);
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+ }
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ }
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curfile_close --
+ *	WT_CURSOR->close method for the btree cursor type: close the btree
+ *	cursor, release the session's btree handle if there is one and close
+ *	the cursor itself. Every step runs even if an earlier one failed.
+ */
+static int
+__curfile_close(WT_CURSOR *cursor)
+{
+	WT_CURSOR_BTREE *btcur;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	btcur = (WT_CURSOR_BTREE *)cursor;
+	CURSOR_API_CALL(cursor, session, close, btcur->btree);
+
+	WT_TRET(__wt_btcur_close(btcur));
+	if (btcur->btree != NULL)
+		WT_TRET(__wt_session_release_btree(session));
+
+	/*
+	 * The URI is owned by the btree handle: clear our reference before
+	 * the generic close runs.
+	 */
+	cursor->internal_uri = NULL;
+	WT_TRET(__wt_cursor_close(cursor));
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curfile_create --
+ * Open a cursor for a given btree handle.
+ *
+ * The btree is the one currently referenced by the session (S2BT). The
+ * cursor takes its URI and key/value formats from that btree. When "bulk"
+ * is set, a larger WT_CURSOR_BULK structure is allocated and initialized,
+ * with "bitmap" passed through to the bulk initialization. The new cursor
+ * is returned through cursorp by __wt_cursor_init.
+ */
+int
+__wt_curfile_create(WT_SESSION_IMPL *session,
+ WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap,
+ WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curfile_compare, /* compare */
+ __curfile_next, /* next */
+ __curfile_prev, /* prev */
+ __curfile_reset, /* reset */
+ __curfile_search, /* search */
+ __curfile_search_near, /* search-near */
+ __curfile_insert, /* insert */
+ __curfile_update, /* update */
+ __curfile_remove, /* remove */
+ __curfile_close); /* close */
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor;
+ WT_CURSOR_BTREE *cbt;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ size_t csize;
+
+ /* The cast between the two cursor types relies on this layout. */
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);
+
+ cbt = NULL;
+
+ btree = S2BT(session);
+ WT_ASSERT(session, btree != NULL);
+
+ /* Bulk cursors use a different structure size than plain ones. */
+ csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
+ WT_RET(__wt_calloc(session, 1, csize, &cbt));
+
+ cursor = &cbt->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ /* These reference the btree handle's strings, they aren't copied. */
+ cursor->internal_uri = btree->dhandle->name;
+ cursor->key_format = btree->key_format;
+ cursor->value_format = btree->value_format;
+
+ cbt->btree = btree;
+ if (bulk) {
+ F_SET(cursor, WT_CURSTD_BULK);
+
+ cbulk = (WT_CURSOR_BULK *)cbt;
+
+ /* Optionally skip the validation of each bulk-loaded key. */
+ WT_ERR(__wt_config_gets_def(
+ session, cfg, "skip_sort_check", 0, &cval));
+ WT_ERR(__wt_curbulk_init(
+ session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
+ }
+
+ /*
+ * random_retrieval
+ * Random retrieval cursors only support next, reset and close.
+ */
+ WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+ if (cval.val != 0) {
+ /* Disable the methods, then re-install next and reset. */
+ __wt_cursor_set_notsup(cursor);
+ cursor->next = __curfile_next_random;
+ cursor->reset = __curfile_reset;
+ }
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(
+ cursor, cursor->internal_uri, owner, cfg, cursorp));
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_create);
+ WT_STAT_FAST_DATA_INCR(session, cursor_create);
+
+ /* On error, only the cursor structure allocated here is freed. */
+ if (0) {
+err: __wt_free(session, cbt);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_curfile_open --
+ *	WT_SESSION->open_cursor method for the btree cursor type: parse the
+ *	"bulk" configuration, get the btree handle for a "file:" URI and
+ *	create the cursor on it. The handle is released again if the cursor
+ *	can't be created.
+ */
+int
+__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
+    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	uint32_t flags;
+	int bitmap, bulk, is_bool;
+
+	flags = 0;
+
+	/* "bulk" is either a boolean (including 0 or 1) or "bitmap". */
+	WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
+	is_bool = cval.type == WT_CONFIG_ITEM_BOOL ||
+	    (cval.type == WT_CONFIG_ITEM_NUM &&
+	    (cval.val == 0 || cval.val == 1));
+	if (is_bool) {
+		bitmap = 0;
+		bulk = cval.val != 0 ? 1 : 0;
+	} else {
+		if (!WT_STRING_MATCH("bitmap", cval.str, cval.len))
+			WT_RET_MSG(session, EINVAL,
+			    "Value for 'bulk' must be a boolean or 'bitmap'");
+		bitmap = 1;
+		bulk = 1;
+	}
+
+	/* Bulk handles require exclusive access. */
+	if (bulk)
+		LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE);
+
+	/* Get the handle and lock it while the cursor is using it. */
+	if (WT_PREFIX_MATCH(uri, "file:"))
+		WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, flags));
+	else
+		WT_RET(__wt_bad_object_type(session, uri));
+
+	/* If the cursor could not be opened, release the handle. */
+	ret = __wt_curfile_create(session, owner, cfg, bulk, bitmap, cursorp);
+	if (ret != 0)
+		WT_TRET(__wt_session_release_btree(session));
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
new file mode 100644
index 00000000000..936337047b8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -0,0 +1,447 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curindex_get_value --
+ * WT_CURSOR->get_value implementation for index cursors.
+ *
+ * For raw cursors (WT_CURSTD_RAW) the column-group values are merged into
+ * cursor->value and the single WT_ITEM * argument is pointed at that
+ * buffer. Otherwise the values are projected from the column-group
+ * cursors straight into the caller's varargs using the value plan.
+ */
+static int
+__curindex_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ ret = __wt_schema_project_merge(session,
+ cindex->cg_cursors, cindex->value_plan,
+ cursor->value_format, &cursor->value);
+ if (ret == 0) {
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data;
+ item->size = cursor->value.size;
+ }
+ } else
+ ret = __wt_schema_project_out(session,
+ cindex->cg_cursors, cindex->value_plan, ap);
+ va_end(ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_set_value --
+ * WT_CURSOR->set_value implementation for index cursors.
+ *
+ * Setting a value is not supported on index cursors: the normal path
+ * stores ENOTSUP in cursor->saved_err and clears WT_CURSTD_VALUE_SET.
+ * The function returns void, so saved_err is the only way the failure
+ * is reported from here.
+ */
+static void
+__curindex_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+ ret = ENOTSUP;
+err: cursor->saved_err = ret;
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ API_END(session, ret);
+}
+
+/*
+ * __curindex_move --
+ * When an index cursor changes position, set the primary key in the
+ * associated column groups and update their positions to match.
+ *
+ * The public cursor's key is pointed at the child (index) cursor's key.
+ * The first open column-group cursor gets its key built from the index
+ * key by __wt_schema_project_slice; every later one shares that cursor's
+ * key data, size and recno. Each column-group cursor is then positioned
+ * with its search method. Returns the first error encountered.
+ */
+static int
+__curindex_move(WT_CURSOR_INDEX *cindex)
+{
+ WT_CURSOR **cp, *first;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = (WT_SESSION_IMPL *)cindex->iface.session;
+ first = NULL;
+
+ /* Point the public cursor to the key in the child. */
+ __wt_cursor_set_raw_key(&cindex->iface, &cindex->child->key);
+ F_CLR(&cindex->iface, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ for (i = 0, cp = cindex->cg_cursors;
+ i < WT_COLGROUPS(cindex->table);
+ i++, cp++) {
+ if (*cp == NULL)
+ continue;
+ if (first == NULL) {
+ /*
+ * Set the primary key -- note that we need the primary
+ * key columns, so we have to use the full key format,
+ * not just the public columns.
+ */
+ WT_RET(__wt_schema_project_slice(session,
+ cp, cindex->index->key_plan,
+ 1, cindex->index->key_format,
+ &cindex->iface.key));
+ first = *cp;
+ } else {
+ (*cp)->key.data = first->key.data;
+ (*cp)->key.size = first->key.size;
+ (*cp)->recno = first->recno;
+ }
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ WT_RET((*cp)->search(*cp));
+ }
+
+ F_SET(&cindex->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ return (0);
+}
+
+/*
+ * __curindex_next --
+ * WT_CURSOR->next method for index cursors.
+ *
+ * Advances the child (index) cursor and, only if that succeeds, moves the
+ * column-group cursors to the matching primary key via __curindex_move.
+ */
+static int
+__curindex_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ if ((ret = cindex->child->next(cindex->child)) == 0)
+ ret = __curindex_move(cindex);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_prev --
+ * WT_CURSOR->prev method for index cursors.
+ *
+ * Steps the child (index) cursor backwards and, only if that succeeds,
+ * moves the column-group cursors to the matching primary key via
+ * __curindex_move.
+ */
+static int
+__curindex_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ if ((ret = cindex->child->prev(cindex->child)) == 0)
+ ret = __curindex_move(cindex);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_reset --
+ * WT_CURSOR->reset method for index cursors.
+ *
+ * Clears the key/value "set" flags, then resets the child cursor and
+ * every open column-group cursor. Failures are collected with WT_TRET
+ * instead of being returned at once, presumably so that every cursor is
+ * still reset (confirm against the macro definition).
+ */
+static int
+__curindex_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR **cp;
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ WT_TRET(cindex->child->reset(cindex->child));
+ for (i = 0, cp = cindex->cg_cursors;
+ i < WT_COLGROUPS(cindex->table);
+ i++, cp++) {
+ if (*cp == NULL)
+ continue;
+ WT_TRET((*cp)->reset(*cp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_search --
+ * WT_CURSOR->search method for index cursors.
+ *
+ * The search key is treated as a prefix of the stored index key. The
+ * child is positioned with search_near and stepped forward once if it
+ * landed before the search key. WT_NOTFOUND is returned unless the
+ * child's key is at least as long as the search key and starts with the
+ * same bytes. On any failure the KEY_INT/VALUE_INT flags are cleared.
+ */
+static int
+__curindex_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR *child;
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int exact;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ child = cindex->child;
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ /*
+ * We expect partial matches, but we want the smallest item that
+ * matches the prefix. Fail if there is no matching item.
+ */
+ __wt_cursor_set_raw_key(child, &cursor->key);
+ WT_ERR(child->search_near(child, &exact));
+
+ /*
+ * We expect partial matches, and want the smallest record with a key
+ * greater than or equal to the search key. The only way for the key
+ * to be equal is if there is an index on the primary key, because
+ * otherwise the primary key columns will be appended to the index key,
+ * but we don't disallow that (odd) case.
+ */
+ if (exact < 0)
+ WT_ERR(child->next(child));
+
+ if (child->key.size < cursor->key.size ||
+ memcmp(child->key.data, cursor->key.data, cursor->key.size) != 0) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+
+ WT_ERR(__curindex_move(cindex));
+
+ if (0) {
+err: F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ }
+
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_search_near --
+ * WT_CURSOR->search_near method for index cursors.
+ *
+ * Hands the raw key to the child's search_near, which also fills in
+ * *exact. On success the column-group cursors are moved to the matching
+ * primary key; if the child search fails, the KEY_INT/VALUE_INT flags are
+ * cleared.
+ */
+static int
+__curindex_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ __wt_cursor_set_raw_key(cindex->child, &cursor->key);
+ if ((ret = cindex->child->search_near(cindex->child, exact)) == 0)
+ ret = __curindex_move(cindex);
+ else
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_close --
+ * WT_CURSOR->close method for index cursors.
+ *
+ * Closes every open column-group cursor and the child cursor, frees the
+ * cursor's key plan, value format and value plan only when they are not
+ * the copies owned by the index or table, releases the table, and
+ * finally closes the cursor itself. Errors are collected with WT_TRET
+ * while the teardown continues.
+ */
+static int
+__curindex_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_CURSOR **cp;
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ idx = cindex->index;
+
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ if ((cp = cindex->cg_cursors) != NULL)
+ for (i = 0, cp = cindex->cg_cursors;
+ i < WT_COLGROUPS(cindex->table); i++, cp++)
+ if (*cp != NULL) {
+ WT_TRET((*cp)->close(*cp));
+ *cp = NULL;
+ }
+
+ __wt_free(session, cindex->cg_cursors);
+ if (cindex->key_plan != idx->key_plan)
+ __wt_free(session, cindex->key_plan);
+ if (cursor->value_format != cindex->table->value_format)
+ __wt_free(session, cursor->value_format);
+ if (cindex->value_plan != idx->value_plan)
+ __wt_free(session, cindex->value_plan);
+
+ if (cindex->child != NULL)
+ WT_TRET(cindex->child->close(cindex->child));
+
+ __wt_schema_release_table(session, cindex->table);
+ /* The URI is owned by the index. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_open_colgroups --
+ * Open cursors on the column groups required for an index cursor.
+ *
+ * Allocates one cursor slot per column group of the table and stores the
+ * array in cindex->cg_cursors. The value plan is then parsed as a
+ * sequence of decimal numbers each followed by an operation character:
+ * for WT_PROJ_KEY and WT_PROJ_VALUE operations the number is used as a
+ * column-group index and a cursor is opened on that group's source if
+ * one is not open yet. If an open fails, cursors opened so far stay in
+ * cg_cursors; the caller's error path closes them via __curindex_close.
+ */
+static int
+__curindex_open_colgroups(
+ WT_SESSION_IMPL *session, WT_CURSOR_INDEX *cindex, const char *cfg_arg[])
+{
+ WT_TABLE *table;
+ WT_CURSOR **cp;
+ u_long arg;
+ /* Child cursors are opened with dump disabled. */
+ const char *cfg[] = { cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL };
+ char *proj;
+
+ table = cindex->table;
+ WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &cp));
+ cindex->cg_cursors = cp;
+
+ /* Work out which column groups we need. */
+ for (proj = (char *)cindex->value_plan; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+ if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) ||
+ cp[arg] != NULL)
+ continue;
+ WT_RET(__wt_open_cursor(session,
+ table->cgroups[arg]->source,
+ &cindex->iface, cfg, &cp[arg]));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_curindex_open --
+ * WT_SESSION->open_cursor method for index cursors.
+ *
+ * The URI has the form "index:<table>:<index>" with an optional
+ * "(<columns>)" projection suffix. The table is looked up, the index
+ * opened, and a WT_CURSOR_INDEX allocated whose plans and formats start
+ * out borrowed from the index and table; a projection replaces the value
+ * format and value plan with private copies. A child cursor is opened on
+ * the index source, followed by the column-group cursors the value plan
+ * needs.
+ *
+ * Returns EINVAL for a malformed URI or an unknown table. Once the cursor
+ * structure exists, any failure tears everything down through
+ * __curindex_close and sets *cursorp to NULL.
+ */
+int
+__wt_curindex_open(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __curindex_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __curindex_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curindex_next, /* next */
+ __curindex_prev, /* prev */
+ __curindex_reset, /* reset */
+ __curindex_search, /* search */
+ __curindex_search_near, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curindex_close); /* close */
+ WT_CURSOR_INDEX *cindex;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_TABLE *table;
+ const char *columns, *idxname, *tablename;
+ size_t namesize;
+
+ /* Split "index:<table>:<index>..." into table and index names. */
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "index:") ||
+ (idxname = strchr(tablename, ':')) == NULL)
+ WT_RET_MSG(session, EINVAL, "Invalid cursor URI: '%s'", uri);
+ namesize = (size_t)(idxname - tablename);
+ ++idxname;
+
+ if ((ret = __wt_schema_get_table(session,
+ tablename, namesize, 0, &table)) != 0) {
+ if (ret == WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "Cannot open cursor '%s' on unknown table", uri);
+ return (ret);
+ }
+
+ /* An optional "(col,...)" suffix selects a projection. */
+ columns = strchr(idxname, '(');
+ if (columns == NULL)
+ namesize = strlen(idxname);
+ else
+ namesize = (size_t)(columns - idxname);
+
+ /*
+ * The table obtained above is released by __curindex_close once the
+ * cursor owns it. Until the cursor structure exists, a failure has to
+ * release the table here or the reference is leaked.
+ */
+ if ((ret = __wt_schema_open_index(
+ session, table, idxname, namesize, &idx)) != 0 ||
+ (ret = __wt_calloc_def(session, 1, &cindex)) != 0) {
+ __wt_schema_release_table(session, table);
+ return (ret);
+ }
+
+ cursor = &cindex->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+
+ cindex->table = table;
+ cindex->index = idx;
+ cindex->key_plan = idx->key_plan;
+ cindex->value_plan = idx->value_plan;
+
+ cursor->internal_uri = idx->name;
+ cursor->key_format = idx->idxkey_format;
+ cursor->value_format = table->value_format;
+
+ /*
+ * XXX
+ * A very odd corner case is an index with a recno key.
+ * The only way to get here is by creating an index on a column store
+ * using only the primary's recno as the index key. Disallow that for
+ * now.
+ */
+ if (WT_CURSOR_RECNO(cursor))
+ WT_ERR_MSG(session, WT_ERROR,
+ "Column store indexes based on a record number primary "
+ "key are not supported.");
+
+ /* Handle projections. */
+ if (columns != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_struct_reformat(session, table,
+ columns, strlen(columns), NULL, 0, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &cursor->value_format));
+
+ WT_ERR(__wt_buf_init(session, tmp, 0));
+ WT_ERR(__wt_struct_plan(session, table,
+ columns, strlen(columns), 0, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &cindex->value_plan));
+ }
+
+ WT_ERR(__wt_cursor_init(
+ cursor, cursor->internal_uri, owner, cfg, cursorp));
+
+ WT_ERR(__wt_open_cursor(
+ session, idx->source, cursor, cfg, &cindex->child));
+
+ /* Open the column groups needed for this index cursor. */
+ WT_ERR(__curindex_open_colgroups(session, cindex, cfg));
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_column_init(cursor, table->key_format,
+ &idx->colconf, &table->colconf));
+
+ if (0) {
+err: WT_TRET(__curindex_close(cursor));
+ *cursorp = NULL;
+ }
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c
new file mode 100644
index 00000000000..f4459819259
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_json.c
@@ -0,0 +1,931 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static size_t __json_unpack_put(WT_SESSION_IMPL *, void *, u_char *, size_t,
+ WT_CONFIG_ITEM *);
+static inline int __json_struct_size(WT_SESSION_IMPL *, const void *, size_t,
+ const char *, WT_CONFIG_ITEM *, int, size_t *);
+static inline int __json_struct_unpackv(WT_SESSION_IMPL *, const void *, size_t,
+ const char *, WT_CONFIG_ITEM *, u_char *, size_t, int, va_list);
+static int json_string_arg(WT_SESSION_IMPL *, const char **, WT_ITEM *);
+static int json_int_arg(WT_SESSION_IMPL *, const char **, int64_t *);
+static int json_uint_arg(WT_SESSION_IMPL *, const char **, uint64_t *);
+static int __json_pack_struct(WT_SESSION_IMPL *, void *, size_t, const char *,
+ const char *);
+static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *,
+ int, const char *, size_t *);
+
+#define WT_PACK_JSON_GET(session, pv, jstr) do { /* Read the next JSON value from jstr into pv according to pv.type; a WT_RET failure returns from the enclosing function. */ \
+ switch (pv.type) { \
+ case 'x': /* No JSON token is consumed for 'x'. */ \
+ break; \
+ case 's': \
+ case 'S': \
+ WT_RET(json_string_arg(session, &jstr, &pv.u.item)); \
+ pv.type = pv.type == 's' ? 'j' : 'J'; /* Retag so the value is known to be a JSON-escaped string. */ \
+ break; \
+ case 'b': \
+ case 'h': \
+ case 'i': \
+ case 'l': \
+ case 'q': \
+ WT_RET(json_int_arg(session, &jstr, &pv.u.i)); \
+ break; \
+ case 'B': \
+ case 'H': \
+ case 'I': \
+ case 'L': \
+ case 'Q': \
+ case 'r': \
+ case 'R': \
+ case 't': \
+ WT_RET(json_uint_arg(session, &jstr, &pv.u.u)); \
+ break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
+ } \
+} while (0)
+
+/*
+ * __json_unpack_put --
+ * Format one unpacked value as a JSON member, "name" : value, and
+ * return the number of bytes the complete text requires.
+ *
+ * Output is written to buf only while it fits in bufsz; once it no
+ * longer fits, writing stops but the size keeps being counted, so the
+ * function can be called with a NULL buffer and bufsz of 0 purely for
+ * sizing. Strings and byte items are quoted and escaped through
+ * __wt_json_unpack_char (byte items always as \u00XX). A value of type
+ * 'x' reports a size of 0. An unknown type logs an error and returns
+ * (size_t)-1.
+ */
+static size_t
+__json_unpack_put(WT_SESSION_IMPL *session, void *voidpv,
+ u_char *buf, size_t bufsz, WT_CONFIG_ITEM *name)
+{
+ WT_PACK_VALUE *pv;
+ const char *p, *end;
+ size_t s, n;
+
+ pv = (WT_PACK_VALUE *)voidpv;
+ s = (size_t)snprintf((char *)buf, bufsz, "\"%.*s\" : ",
+ (int)name->len, name->str);
+ if (s <= bufsz) {
+ bufsz -= s;
+ buf += s;
+ }
+ else
+ bufsz = 0;
+
+ switch (pv->type) {
+ case 'x':
+ return (0);
+ case 's':
+ case 'S':
+ /* Account for '"' quote in front and back. */
+ s += 2;
+ p = (const char *)pv->u.s;
+ if (bufsz > 0) {
+ *buf++ = '"';
+ bufsz--;
+ }
+ if (pv->type == 's' || pv->havesize) {
+ /* The length is known: convert exactly pv->size bytes. */
+ end = p + pv->size;
+ for (; p < end; p++) {
+ n = __wt_json_unpack_char(*p, buf, bufsz, 0);
+ if (n > bufsz)
+ bufsz = 0;
+ else {
+ bufsz -= n;
+ buf += n;
+ }
+ s += n;
+ }
+ } else
+ /* No explicit length: convert up to the terminating NUL. */
+ for (; *p; p++) {
+ n = __wt_json_unpack_char(*p, buf, bufsz, 0);
+ if (n > bufsz)
+ bufsz = 0;
+ else {
+ bufsz -= n;
+ buf += n;
+ }
+ s += n;
+ }
+ if (bufsz > 0)
+ *buf++ = '"';
+ return (s);
+ case 'U':
+ case 'u':
+ s += 2;
+ p = (const char *)pv->u.item.data;
+ end = p + pv->u.item.size;
+ if (bufsz > 0) {
+ *buf++ = '"';
+ bufsz--;
+ }
+ for (; p < end; p++) {
+ n = __wt_json_unpack_char(*p, buf, bufsz, 1);
+ if (n > bufsz)
+ bufsz = 0;
+ else {
+ bufsz -= n;
+ buf += n;
+ }
+ s += n;
+ }
+ if (bufsz > 0)
+ *buf++ = '"';
+ return (s);
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ return (s +
+ (size_t)snprintf((char *)buf, bufsz, "%" PRId64, pv->u.i));
+ case 'B':
+ case 't':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ case 'R':
+ /*
+ * The value is unsigned: use the unsigned conversion so values
+ * of 2^63 and above are not printed as negative numbers.
+ */
+ return (s +
+ (size_t)snprintf((char *)buf, bufsz, "%" PRIu64, pv->u.u));
+ }
+ __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type);
+ return ((size_t)-1);
+}
+
+/*
+ * __json_struct_size --
+ * Calculate the size of a packed byte string as formatted for JSON.
+ *
+ * Walks the format, unpacking each value from buffer and adding the
+ * size __json_unpack_put reports for it (called with a NULL buffer, so
+ * nothing is written). Every value after the first adds 2 bytes for the
+ * ",\n" separator that __json_struct_unpackv emits. The result stored
+ * in *presult does not include a terminating NUL.
+ */
+static inline int
+__json_struct_size(WT_SESSION_IMPL *session, const void *buffer,
+ size_t size, const char *fmt, WT_CONFIG_ITEM *names, int iskey,
+ size_t *presult)
+{
+ WT_CONFIG_ITEM name;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ WT_PACK_NAME packname;
+ const uint8_t *p, *end;
+ size_t result;
+ int needcr;
+
+ p = buffer;
+ end = p + size;
+ result = 0;
+ needcr = 0;
+
+ WT_RET(__pack_name_init(session, names, iskey, &packname));
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ if (needcr)
+ result += 2;
+ needcr = 1;
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+ WT_RET(__pack_name_next(&packname, &name));
+ result += __json_unpack_put(session, &pv, NULL, 0, &name);
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ /* Be paranoid - __unpack_read should never read past the end. */
+ WT_ASSERT(session, p <= end);
+
+ *presult = result;
+ return (ret);
+}
+
+/*
+ * __json_struct_unpackv --
+ * Unpack a byte string to JSON (va_list version).
+ *
+ * The single vararg is a const char ** that is set to jbuf, the buffer
+ * the JSON text is written into. Values are formatted one at a time by
+ * __json_unpack_put and separated by ",\n". jbufsize is expected to be
+ * the size computed by __json_struct_size plus one: the final assertion
+ * checks that exactly one byte remains.
+ *
+ * NOTE(review): strncat appends at the first NUL found at or after jbuf,
+ * so the separator lands in the right place only if the byte at jbuf is
+ * NUL at that point. __json_unpack_put does not write a NUL after the
+ * closing quote of a string value -- confirm the buffer is zero-filled
+ * by the caller.
+ */
+static inline int
+__json_struct_unpackv(WT_SESSION_IMPL *session,
+ const void *buffer, size_t size, const char *fmt, WT_CONFIG_ITEM *names,
+ u_char *jbuf, size_t jbufsize, int iskey, va_list ap)
+{
+ WT_CONFIG_ITEM name;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ WT_PACK_NAME packname;
+ int needcr;
+ size_t jsize;
+ const uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+ needcr = 0;
+
+ /* Unpacking a cursor marked as json implies a single arg. */
+ *va_arg(ap, const char **) = (char *)jbuf;
+
+ WT_RET(__pack_name_init(session, names, iskey, &packname));
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ if (needcr) {
+ WT_ASSERT(session, jbufsize >= 3);
+ strncat((char *)jbuf, ",\n", jbufsize);
+ jbuf += 2;
+ jbufsize -= 2;
+ }
+ needcr = 1;
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+ WT_RET(__pack_name_next(&packname, &name));
+ jsize = __json_unpack_put(session,
+ (u_char *)&pv, jbuf, jbufsize, &name);
+ WT_ASSERT(session, jsize <= jbufsize);
+ jbuf += jsize;
+ jbufsize -= jsize;
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ /* Be paranoid - __unpack_read should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ WT_ASSERT(session, jbufsize == 1);
+
+ return (ret);
+}
+
+/*
+ * __wt_json_alloc_unpack --
+ * Allocate space for, and unpack an entry into JSON format.
+ *
+ * Selects the cursor's key or value name list and JSON buffer according
+ * to iskey, sizes the output with __json_struct_size, reallocates the
+ * buffer to that size plus one byte, and then formats the packed data
+ * into it. The buffer stays attached to the WT_CURSOR_JSON structure.
+ */
+int
+__wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer,
+ size_t size, const char *fmt, WT_CURSOR_JSON *json,
+ int iskey, va_list ap)
+{
+ WT_CONFIG_ITEM *names;
+ WT_DECL_RET;
+ size_t needed;
+ char **json_bufp;
+
+ if (iskey) {
+ names = &json->key_names;
+ json_bufp = &json->key_buf;
+ } else {
+ names = &json->value_names;
+ json_bufp = &json->value_buf;
+ }
+ needed = 0;
+ WT_RET(__json_struct_size(session, buffer, size, fmt, names,
+ iskey, &needed));
+ WT_RET(__wt_realloc(session, NULL, needed + 1, json_bufp));
+ WT_RET(__json_struct_unpackv(session, buffer, size, fmt,
+ names, (u_char *)*json_bufp, needed + 1, iskey, ap));
+
+ return (ret);
+}
+
+/*
+ * __wt_json_close --
+ * Release any json related resources.
+ *
+ * If the cursor has a WT_CURSOR_JSON structure in json_private, frees
+ * its key and value buffers and then the structure itself. This
+ * function does not assign to cursor->json_private.
+ */
+void
+__wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+ WT_CURSOR_JSON *json;
+
+ if ((json = (WT_CURSOR_JSON *)cursor->json_private) != NULL) {
+ __wt_free(session, json->key_buf);
+ __wt_free(session, json->value_buf);
+ __wt_free(session, json);
+ }
+ return;
+}
+
+/*
+ * __wt_json_unpack_char --
+ * Unpack a single character into JSON escaped format.
+ * Can be called with null buf for sizing.
+ *
+ * Returns the number of bytes the escaped form needs: 1 for a printable
+ * character other than backslash and double quote, 2 for the short
+ * escapes (\\, \", \f, \n, \r, \t) and 6 for the \u00XX form used for
+ * everything else, or for every character when force_unicode is set.
+ * The bytes are written to buf only when bufsz is large enough to hold
+ * the whole escape; no NUL terminator is written.
+ */
+size_t
+__wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, int force_unicode)
+{
+ char abbrev;
+ u_char h;
+
+ if (!force_unicode) {
+ /*
+ * The ctype functions are only defined for values representable
+ * as unsigned char (or EOF); plain char may be negative.
+ */
+ if (isprint((u_char)ch) && ch != '\\' && ch != '"') {
+ if (bufsz >= 1)
+ *buf = (u_char)ch;
+ return (1);
+ } else {
+ abbrev = '\0';
+ switch (ch) {
+ case '\\':
+ case '"':
+ abbrev = ch;
+ break;
+ case '\f':
+ abbrev = 'f';
+ break;
+ case '\n':
+ abbrev = 'n';
+ break;
+ case '\r':
+ abbrev = 'r';
+ break;
+ case '\t':
+ abbrev = 't';
+ break;
+ }
+ if (abbrev != '\0') {
+ if (bufsz >= 2) {
+ *buf++ = '\\';
+ *buf = (u_char)abbrev;
+ }
+ return (2);
+ }
+ }
+ }
+ /* Everything else is emitted as \u00XX with upper-case hex digits. */
+ if (bufsz >= 6) {
+ *buf++ = '\\';
+ *buf++ = 'u';
+ *buf++ = '0';
+ *buf++ = '0';
+ h = (((u_char)ch) >> 4) & 0xF;
+ if (h >= 10)
+ *buf++ = 'A' + (h - 10);
+ else
+ *buf++ = '0' + h;
+ h = ((u_char)ch) & 0xF;
+ if (h >= 10)
+ *buf++ = 'A' + (h - 10);
+ else
+ *buf++ = '0' + h;
+ }
+ return (6);
+}
+
+/*
+ * __wt_json_column_init --
+ * set json_key_names, json_value_names to comma separated lists
+ * of column names.
+ *
+ * The number of key columns is taken to be the number of non-digit
+ * characters in keyformat. The column list in colconf is scanned past
+ * that many commas: what follows becomes value_names. When idxconf is
+ * NULL, the part before that point (minus the trailing comma) becomes
+ * key_names and surrounding parentheses in colconf are stripped first;
+ * when idxconf is given, key_names comes from idxconf and colconf is
+ * used as is. The name lists point into the configuration strings; no
+ * copies are made.
+ */
+int
+__wt_json_column_init(WT_CURSOR *cursor, const char *keyformat,
+ const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf)
+{
+ WT_CURSOR_JSON *json;
+ const char *p, *end, *beginkey;
+ uint32_t keycnt, nkeys;
+
+ json = (WT_CURSOR_JSON *)cursor->json_private;
+ beginkey = colconf->str;
+ end = beginkey + colconf->len;
+
+ if (idxconf != NULL) {
+ json->key_names.str = idxconf->str;
+ json->key_names.len = idxconf->len;
+ } else if (colconf->len > 0 && *beginkey == '(') {
+ beginkey++;
+ if (end[-1] == ')')
+ end--;
+ }
+
+ for (nkeys = 0; *keyformat; keyformat++)
+ if (!isdigit(*keyformat))
+ nkeys++;
+
+ p = beginkey;
+ keycnt = 0;
+ while (p < end && keycnt < nkeys) {
+ if (*p == ',')
+ keycnt++;
+ p++;
+ }
+ json->value_names.str = p;
+ json->value_names.len = WT_PTRDIFF(end, p);
+ if (idxconf == NULL) {
+ if (p > beginkey)
+ p--;
+ json->key_names.str = beginkey;
+ json->key_names.len = WT_PTRDIFF(p, beginkey);
+ }
+ return (0);
+}
+
+#define MATCH_KEYWORD(session, in, result, keyword, matchval) do { /* If `in` starts with keyword and no alphanumeric follows, advance `in` past it and set result to matchval. */ \
+ size_t _kwlen = strlen(keyword); \
+ if (strncmp(in, keyword, _kwlen) == 0 && !isalnum(in[_kwlen])) { \
+ in += _kwlen; \
+ result = matchval; \
+ } else { /* No match: skip the alphanumeric run and report it; result is left unchanged. */ \
+ const char *_bad = in; \
+ while (isalnum(*in)) \
+ in++; \
+ __wt_errx(session, "unknown keyword \"%.*s\" in JSON", \
+ (int)(in - _bad), _bad); \
+ } \
+} while (0)
+
+/*
+ * __wt_json_token --
+ * Return the type, start position and length of the next JSON
+ * token in the input. String tokens include the quotes. JSON
+ * can be entirely parsed using calls to this tokenizer, each
+ * call using a src pointer that is the previously returned
+ * tokstart + toklen.
+ *
+ * The token type returned is one of:
+ * 0 : EOF
+ * 's' : string
+ * 'i' : intnum
+ * 'f' : floatnum
+ * ':' : colon
+ * ',' : comma
+ * '{' : lbrace
+ * '}' : rbrace
+ * '[' : lbracket
+ * ']' : rbracket
+ * 'N' : null
+ * 'T' : true
+ * 'F' : false
+ *
+ * Returns 0 on success. An unterminated string, unknown keyword or
+ * illegal token logs a message, stores -1 in *toktype and returns
+ * EINVAL; *toklen still covers the text that was skipped. An invalid
+ * \u escape inside a string returns -1 immediately with *toktype and
+ * *toklen left unset.
+ */
+int
+__wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype,
+ const char **tokstart, size_t *toklen)
+{
+ WT_SESSION_IMPL *session;
+ char ch;
+ const char *bad;
+ int backslash, isalph, isfloat, result;
+
+ result = -1;
+ session = (WT_SESSION_IMPL *)wt_session;
+ /*
+ * The ctype functions are only defined for values representable as
+ * unsigned char (or EOF), so plain char is cast before every call.
+ */
+ while (isspace((u_char)*src))
+ src++;
+ *tokstart = src;
+
+ if (*src == '\0') {
+ *toktype = 0;
+ *toklen = 0;
+ return (0);
+ }
+
+ /* JSON is specified in RFC 4627. */
+ switch (*src) {
+ case '"':
+ backslash = 0;
+ src++;
+ while ((ch = *src) != '\0') {
+ if (!backslash) {
+ if (ch == '"') {
+ src++;
+ result = 's';
+ break;
+ }
+ if (ch == '\\')
+ backslash = 1;
+ } else {
+ /* We validate Unicode on this pass. */
+ if (ch == 'u') {
+ u_char ignored;
+ const u_char *uc;
+
+ uc = (const u_char *)src;
+ if (__wt_hex2byte(&uc[1], &ignored) ||
+ __wt_hex2byte(&uc[3], &ignored)) {
+ __wt_errx(session,
+ "invalid Unicode within JSON string");
+ return (-1);
+ }
+ /*
+ * Leave src on the last hex digit: the
+ * increment at the bottom of the loop
+ * steps past it. Advancing by 5 here
+ * would skip the character following
+ * the escape, which may be the closing
+ * quote or the start of another escape.
+ */
+ src += 4;
+ }
+ backslash = 0;
+ }
+ src++;
+ }
+ if (result != 's')
+ __wt_errx(session, "unterminated string in JSON");
+ break;
+ case '-':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ isfloat = 0;
+ if (*src == '-')
+ src++;
+ while ((ch = *src) != '\0' && isdigit((u_char)ch))
+ src++;
+ if (*src == '.') {
+ isfloat = 1;
+ src++;
+ while ((ch = *src) != '\0' &&
+ isdigit((u_char)ch))
+ src++;
+ }
+ if (*src == 'e' || *src == 'E') {
+ isfloat = 1;
+ src++;
+ if (*src == '+' || *src == '-')
+ src++;
+ while ((ch = *src) != '\0' &&
+ isdigit((u_char)ch))
+ src++;
+ }
+ result = isfloat ? 'f' : 'i';
+ break;
+ case ':':
+ case ',':
+ case '{':
+ case '}':
+ case '[':
+ case ']':
+ result = *src++;
+ break;
+ case 'n':
+ MATCH_KEYWORD(session, src, result, "null", 'N');
+ break;
+ case 't':
+ MATCH_KEYWORD(session, src, result, "true", 'T');
+ break;
+ case 'f':
+ MATCH_KEYWORD(session, src, result, "false", 'F');
+ break;
+ default:
+ /* An illegal token, move past it anyway */
+ bad = src;
+ isalph = isalnum((u_char)*src);
+ src++;
+ if (isalph)
+ while (*src != '\0' && isalnum((u_char)*src))
+ src++;
+ __wt_errx(session, "unknown token \"%.*s\" in JSON",
+ (int)(src - bad), bad);
+ break;
+ }
+ *toklen = (size_t)(src - *tokstart);
+ *toktype = result;
+ return (result < 0 ? EINVAL : 0);
+}
+
+/*
+ * __wt_json_tokname
+ * Return a descriptive name from the token type returned by
+ * __wt_json_token
+ *
+ * The result is a string literal intended for error messages; any value
+ * that is not a known token type yields "<UNKNOWN>".
+ */
+const char *
+__wt_json_tokname(int toktype)
+{
+ switch (toktype) {
+ case 0: return ("<EOF>");
+ case 's': return ("<string>");
+ case 'i': return ("<integer>");
+ case 'f': return ("<float>");
+ case ':': return ("':'");
+ case ',': return ("','");
+ case '{': return ("'{'");
+ case '}': return ("'}'");
+ case '[': return ("'['");
+ case ']': return ("']'");
+ case 'N': return ("'null'");
+ case 'T': return ("'true'");
+ case 'F': return ("'false'");
+ default: return ("<UNKNOWN>");
+ }
+}
+
+/*
+ * json_string_arg --
+ * Returns a first cut of the needed string in item.
+ * The result has not been stripped of escapes.
+ *
+ * Reads the next token from *jstr. If it is a string, item->data and
+ * item->size are set to the text between the quotes (pointing into the
+ * input, not copied) and *jstr is advanced past the token. Any other
+ * token logs an error and returns EINVAL with *jstr unchanged.
+ */
+static int
+json_string_arg(WT_SESSION_IMPL *session, const char **jstr, WT_ITEM *item)
+{
+ const char *tokstart;
+ int tok;
+ WT_DECL_RET;
+
+ WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+ &item->size));
+ if (tok == 's') {
+ *jstr = tokstart + item->size;
+ /* The tokenizer includes the '"' chars */
+ item->data = tokstart + 1;
+ item->size -= 2;
+ ret = 0;
+ } else {
+ __wt_errx(session, "expected JSON <string>, got %s",
+ __wt_json_tokname(tok));
+ ret = EINVAL;
+ }
+ return (ret);
+}
+
+/*
+ * json_int_arg --
+ * Returns a signed integral value from the current position
+ * in the JSON string.
+ *
+ * The next token must be an integer. It is converted with strtoll, and
+ * the conversion must consume exactly the token, otherwise EINVAL is
+ * returned. On success *ip holds the value and *jstr is advanced past
+ * the token. The strtoll range error is not checked here.
+ */
+static int
+json_int_arg(WT_SESSION_IMPL *session, const char **jstr, int64_t *ip)
+{
+ char *end;
+ const char *tokstart;
+ int tok;
+ size_t toksize;
+
+ WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+ &toksize));
+ if (tok == 'i') {
+ /* JSON only allows decimal */
+ *ip = strtoll(tokstart, &end, 10);
+ if (end != tokstart + toksize)
+ WT_RET_MSG(session, EINVAL,
+ "JSON <int> extraneous input");
+ *jstr = tokstart + toksize;
+ } else {
+ __wt_errx(session, "expected JSON <int>, got %s",
+ __wt_json_tokname(tok));
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * json_uint_arg --
+ * Returns an unsigned integral value from the current position
+ * in the JSON string.
+ *
+ * The next token must be an integer that does not start with '-'. It is
+ * converted with strtoull, and the conversion must consume exactly the
+ * token, otherwise EINVAL is returned. On success *up holds the value
+ * and *jstr is advanced past the token. The strtoull range error is not
+ * checked here.
+ */
+static int
+json_uint_arg(WT_SESSION_IMPL *session, const char **jstr, uint64_t *up)
+{
+ char *end;
+ const char *tokstart;
+ int tok;
+ size_t toksize;
+
+ WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+ &toksize));
+ if (tok == 'i' && *tokstart != '-') {
+ /* JSON only allows decimal */
+ *up = strtoull(tokstart, &end, 10);
+ if (end != tokstart + toksize)
+ WT_RET_MSG(session, EINVAL,
+ "JSON <int> extraneous input");
+ *jstr = tokstart + toksize;
+ } else {
+ __wt_errx(session, "expected unsigned JSON <int>, got %s",
+ __wt_json_tokname(tok));
+ return (EINVAL);
+ }
+ return (0);
+}
+
+#define JSON_EXPECT_TOKEN_GET(session, jstr, tokval, start, sz) do { /* Read the next token into start/sz; return EINVAL from the enclosing function unless its type is tokval, else advance jstr past it. */ \
+ int __tok; \
+ WT_RET(__wt_json_token((WT_SESSION *)session, jstr, &__tok, &start, &sz));\
+ if (__tok != tokval) { \
+ __wt_errx(session, "expected JSON %s, got %s", \
+ __wt_json_tokname(tokval), __wt_json_tokname(__tok)); \
+ return (EINVAL); \
+ } \
+ jstr = start + sz; \
+} while (0)
+
+#define JSON_EXPECT_TOKEN(session, jstr, tokval) do { /* As JSON_EXPECT_TOKEN_GET, for callers that do not need the token's position or length. */ \
+ const char *__start; \
+ size_t __sz; \
+ JSON_EXPECT_TOKEN_GET(session, jstr, tokval, __start, __sz); \
+} while (0)
+
+/*
+ * __json_pack_struct --
+ * Pack a byte string from a JSON string.
+ *
+ * For each field of fmt the input must supply "name" : value, with
+ * fields separated by commas; each value is parsed by WT_PACK_JSON_GET
+ * and written to buffer with __pack_write. Field names are only
+ * tokenized here, not compared. A format of exactly one character takes
+ * a short path that sets the pack type directly instead of walking the
+ * format with __pack_init/__pack_next.
+ */
+static int
+__json_pack_struct(WT_SESSION_IMPL *session, void *buffer, size_t size,
+ const char *fmt, const char *jstr)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ const char *tokstart;
+ int multi;
+ size_t toksize;
+ uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+ multi = 0;
+
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
+ /* the key name was verified in __json_pack_size */
+ JSON_EXPECT_TOKEN(session, jstr, ':');
+ pv.type = fmt[0];
+ WT_PACK_JSON_GET(session, pv, jstr);
+ return (__pack_write(session, &pv, &p, size));
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ if (multi)
+ JSON_EXPECT_TOKEN(session, jstr, ',');
+ JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
+ /* the key name was verified in __json_pack_size */
+ JSON_EXPECT_TOKEN(session, jstr, ':');
+ WT_PACK_JSON_GET(session, pv, jstr);
+ WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p)));
+ multi = 1;
+ }
+
+ /* Be paranoid - __pack_write should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __json_pack_size --
+ * Calculate the size of a packed byte string from a JSON string.
+ * We verify that the names and value types provided in JSON match
+ * the column names and type from the schema format, returning error
+ * if not.
+ *
+ * Each field must appear as "name" : value in format order, separated
+ * by commas, and the input must end after the last field. The name is
+ * compared byte for byte, without the quotes, against the next expected
+ * column name. *sizep is set only on success.
+ */
+static int
+__json_pack_size(
+ WT_SESSION_IMPL *session, const char *fmt, WT_CONFIG_ITEM *names,
+ int iskey, const char *jstr, size_t *sizep)
+{
+ WT_CONFIG_ITEM name;
+ WT_DECL_PACK_VALUE(pv);
+ WT_PACK pack;
+ WT_PACK_NAME packname;
+ const char *tokstart;
+ int multi;
+ size_t toksize, total;
+
+ WT_RET(__pack_name_init(session, names, iskey, &packname));
+ multi = 0;
+ WT_RET(__pack_init(session, &pack, fmt));
+ for (total = 0; __pack_next(&pack, &pv) == 0;) {
+ if (multi)
+ JSON_EXPECT_TOKEN(session, jstr, ',');
+ JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
+ WT_RET(__pack_name_next(&packname, &name));
+ if (toksize - 2 != name.len ||
+ strncmp(tokstart + 1, name.str, toksize - 2) != 0) {
+ __wt_errx(session, "JSON expected %s name: \"%.*s\"",
+ iskey ? "key" : "value", (int)name.len, name.str);
+ return (EINVAL);
+ }
+ JSON_EXPECT_TOKEN(session, jstr, ':');
+ WT_PACK_JSON_GET(session, pv, jstr);
+ total += __pack_size(session, &pv);
+ multi = 1;
+ }
+ /* check end of string */
+ JSON_EXPECT_TOKEN(session, jstr, 0);
+
+ *sizep = total;
+ return (0);
+}
+
+/*
+ * __wt_json_to_item --
+ * Convert a JSON input string for either key/value to a raw WT_ITEM.
+ * Checks that the input matches the expected format.
+ *
+ * Makes two passes over jstr: __json_pack_size validates the input
+ * against the key or value column names and computes the packed size,
+ * then the item's buffer is sized to match and __json_pack_struct packs
+ * the values into item->mem.
+ */
+int
+__wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr,
+ const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item)
+{
+ size_t sz;
+ sz = 0; /* Initialize because GCC 4.1 is paranoid */
+
+ WT_RET(__json_pack_size(session, format,
+ iskey ? &json->key_names : &json->value_names, iskey, jstr, &sz));
+ WT_RET(__wt_buf_initsize(session, item, sz));
+ WT_RET(__json_pack_struct(session, item->mem, sz, format, jstr));
+ return (0);
+}
+
+/*
+ * __wt_json_strlen --
+ * Return the number of bytes represented by a string in JSON format,
+ * or -1 if the format is incorrect.
+ *
+ * src/srclen is the string body without the surrounding quotes. A
+ * \uXXXX escape counts as 1, 2 or 3 bytes, matching the UTF-8 encoding
+ * that __wt_json_strncpy produces; any other backslash escape counts as
+ * one byte. -1 is returned for invalid hex digits or when an escape
+ * runs past the end of the input.
+ */
+ssize_t
+__wt_json_strlen(const char *src, size_t srclen)
+{
+ const char *srcend;
+ size_t dstlen;
+ u_char hi, lo;
+
+ dstlen = 0;
+ srcend = src + srclen;
+ while (src < srcend) {
+ /* JSON can include any UTF-8 expressed in 4 hex chars. */
+ if (*src == '\\') {
+ if (*++src == 'u') {
+ if (__wt_hex2byte((const u_char *)++src, &hi))
+ return (-1);
+ src += 2;
+ if (__wt_hex2byte((const u_char *)src, &lo))
+ return (-1);
+ src += 2;
+ /* RFC 3629 */
+ if (hi >= 0x8) {
+ /* 3 bytes total */
+ dstlen += 2;
+ }
+ else if (hi != 0 || lo >= 0x80) {
+ /* 2 bytes total */
+ dstlen++;
+ }
+ /* else 1 byte total */
+ }
+ }
+ dstlen++;
+ src++;
+ }
+ if (src != srcend)
+ return (-1); /* invalid input, e.g. final char is '\\' */
+ return ((ssize_t)dstlen);
+}
+
+/*
+ * __wt_json_strncpy --
+ * Copy bytes of string in JSON format to a destination,
+ * up to dstlen bytes. If dstlen is greater than the needed size,
+ * the result is zero padded.
+ *
+ * \uXXXX escapes are written as 1, 2 or 3 UTF-8 bytes. For any other
+ * backslash escape the character after the backslash is copied as is
+ * (so "\n" yields the letter 'n', not a newline). Returns EINVAL for
+ * invalid hex digits and ENOMEM if the destination fills before the
+ * source is consumed. On success *pdst is advanced to just past the
+ * last byte converted, before any padding.
+ *
+ * NOTE(review): the dst < dstend test is made once per source character,
+ * while a \u escape may store up to three bytes; this looks safe only
+ * when dstlen is at least the length __wt_json_strlen reports for the
+ * same input -- confirm against callers.
+ */
+int
+__wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen)
+{
+ char *dst;
+ const char *dstend, *srcend;
+ u_char hi, lo;
+
+ dst = *pdst;
+ dstend = dst + dstlen;
+ srcend = src + srclen;
+ while (src < srcend && dst < dstend) {
+ /* JSON can include any UTF-8 expressed in 4 hex chars. */
+ if (*src == '\\') {
+ if (*++src == 'u') {
+ if (__wt_hex2byte((const u_char *)++src, &hi))
+ return (EINVAL);
+ src += 2;
+ if (__wt_hex2byte((const u_char *)src, &lo))
+ return (EINVAL);
+ src += 2;
+ /* RFC 3629 */
+ if (hi >= 0x8) {
+ /* 3 bytes total */
+ /* byte 0: 1110HHHH */
+ /* byte 1: 10HHHHLL */
+ /* byte 2: 10LLLLLL */
+ *dst++ = (char)(0xe0 |
+ ((hi >> 4) & 0x0f));
+ *dst++ = (char)(0x80 |
+ ((hi << 2) & 0x3c) |
+ ((lo >> 6) & 0x03));
+ *dst++ = (char)(0x80 | (lo & 0x3f));
+ } else if (hi != 0 || lo >= 0x80) {
+ /* 2 bytes total */
+ /* byte 0: 110HHHLL */
+ /* byte 1: 10LLLLLL */
+ *dst++ = (char)(0xc0 |
+ (hi << 2) |
+ ((lo >> 6) & 0x03));
+ *dst++ = (char)(0x80 | (lo & 0x3f));
+ } else
+ /* else 1 byte total */
+ /* byte 0: 0LLLLLLL */
+ *dst++ = (char)lo;
+ }
+ else
+ *dst++ = *src;
+ } else
+ *dst++ = *src;
+ src++;
+ }
+ if (src != srcend)
+ return (ENOMEM);
+ *pdst = dst;
+ while (dst < dstend)
+ *dst++ = '\0';
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c
new file mode 100644
index 00000000000..803d68e890c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_log.c
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curlog_logrec --
+ *	Log-scan callback: capture a single log record in the log cursor
+ *	passed as the cookie.
+ */
+static int
+__curlog_logrec(
+    WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+	WT_CURSOR_LOG *cl;
+
+	cl = cookie;
+
+	/*
+	 * Record where this log record lives and where the one after it
+	 * starts, then keep a private copy of the record's bytes.
+	 */
+	*cl->cur_lsn = *lsnp;
+	*cl->next_lsn = *lsnp;
+	cl->next_lsn->offset += (wt_off_t)logrec->size;
+	WT_RET(__wt_buf_set(session, cl->logrec, logrec->data, logrec->size));
+
+	/*
+	 * Point the step pointers just past the log header and at the end
+	 * of the copied record, then read the record type.
+	 */
+	cl->stepp = LOG_SKIP_HEADER(cl->logrec->data);
+	cl->stepp_end = (uint8_t *)cl->logrec->data + logrec->size;
+	WT_RET(__wt_logrec_read(session, &cl->stepp, cl->stepp_end,
+	    &cl->rectype));
+
+	/* A step count of 0 means the entire record. */
+	cl->step_count = 0;
+
+	if (cl->rectype != WT_LOGREC_COMMIT) {
+		/*
+		 * Not a commit: there is nothing to step through. A NULL
+		 * step pointer makes the next() method read a new record on
+		 * its next call.
+		 */
+		cl->stepp = NULL;
+		cl->txnid = 0;
+		return (0);
+	}
+
+	/*
+	 * Commit record: unpack the transaction ID so each individual
+	 * operation of the transaction can be returned.
+	 */
+	WT_RET(__wt_vunpack_uint(&cl->stepp,
+	    WT_PTRDIFF(cl->stepp_end, cl->stepp), &cl->txnid));
+	return (0);
+}
+
+/*
+ * __curlog_compare --
+ * WT_CURSOR.compare method for the log cursor type. Cursors are ordered
+ * by their current LSN and, when the LSNs are equal, by step count.
+ * The result is stored in *cmpp; the step-count comparison yields
+ * -1, 0 or 1.
+ */
+static int
+__curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR_LOG *acl, *bcl;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ /*
+ * NOTE(review): b is cast without checking that it is also a log
+ * cursor -- confirm callers guarantee that.
+ */
+ acl = (WT_CURSOR_LOG *)a;
+ bcl = (WT_CURSOR_LOG *)b;
+ WT_ASSERT(session, cmpp != NULL);
+ *cmpp = LOG_CMP(acl->cur_lsn, bcl->cur_lsn);
+ /*
+ * If both are on the same LSN, compare step counter.
+ */
+ if (*cmpp == 0)
+ *cmpp = (acl->step_count != bcl->step_count ?
+ (acl->step_count < bcl->step_count ? -1 : 1) : 0);
+err: API_END_RET(session, ret);
+
+}
+
+/*
+ * __curlog_op_read --
+ *	Decode one operation from a log record and copy its key/value into
+ *	the cursor's opkey/opvalue buffers. Column- and row-store put and
+ *	remove operations are unpacked (removes get an empty value); every
+ *	other operation type is returned raw in the value with an empty key
+ *	and a file ID of 0.
+ */
+static int
+__curlog_op_read(WT_SESSION_IMPL *session,
+    WT_CURSOR_LOG *cl, uint32_t optype, uint32_t opsize, uint32_t *fileid)
+{
+	WT_ITEM key, value;
+	uint64_t recno;
+	size_t ksize, vsize;
+	const uint8_t *end, *pp;
+	const void *kdata, *vdata;
+
+	/* Start from "empty key, empty value" and fill in what applies. */
+	kdata = vdata = NULL;
+	ksize = vsize = 0;
+
+	pp = cl->stepp;
+	end = pp + opsize;
+	switch (optype) {
+	case WT_LOGOP_COL_PUT:
+		WT_RET(__wt_logop_col_put_unpack(session, &pp, end,
+		    fileid, &recno, &value));
+		kdata = &recno;
+		ksize = sizeof(recno);
+		vdata = value.data;
+		vsize = value.size;
+		break;
+	case WT_LOGOP_COL_REMOVE:
+		WT_RET(__wt_logop_col_remove_unpack(session, &pp, end,
+		    fileid, &recno));
+		kdata = &recno;
+		ksize = sizeof(recno);
+		break;
+	case WT_LOGOP_ROW_PUT:
+		WT_RET(__wt_logop_row_put_unpack(session, &pp, end,
+		    fileid, &key, &value));
+		kdata = key.data;
+		ksize = key.size;
+		vdata = value.data;
+		vsize = value.size;
+		break;
+	case WT_LOGOP_ROW_REMOVE:
+		WT_RET(__wt_logop_row_remove_unpack(session, &pp, end,
+		    fileid, &key));
+		kdata = key.data;
+		ksize = key.size;
+		break;
+	default:
+		/* Anything else: the raw operation bytes are the value. */
+		*fileid = 0;
+		vdata = cl->stepp;
+		vsize = opsize;
+		break;
+	}
+
+	/* Copy the results into the cursor-owned buffers, key first. */
+	WT_RET(__wt_buf_set(session, cl->opkey, kdata, ksize));
+	WT_RET(__wt_buf_set(session, cl->opvalue, vdata, vsize));
+	return (0);
+}
+
+/*
+ * __curlog_kv --
+ *	Publish the log cursor's current position as the user-visible key
+ *	and value.
+ */
+static int
+__curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+	WT_CURSOR_LOG *cl;
+	uint32_t fileid, key_count, opsize, optype;
+
+	cl = (WT_CURSOR_LOG *)cursor;
+
+	/* The key's counter is the step count before this call bumps it. */
+	key_count = cl->step_count++;
+	if (key_count == 0) {
+		/*
+		 * First step on a record: return the whole record without
+		 * its header, with the size adjusted to match. One extra
+		 * byte is skipped for the record type, which is normally
+		 * consumed by __wt_logrec_read.
+		 */
+		optype = WT_LOGOP_INVALID;
+		fileid = 0;
+		cl->opkey->data = NULL;
+		cl->opkey->size = 0;
+		cl->opvalue->data = LOG_SKIP_HEADER(cl->logrec->data) + 1;
+		cl->opvalue->size = LOG_REC_SIZE(cl->logrec->size) - 1;
+	} else {
+		/*
+		 * Later steps: peek at the operation's type and size, read
+		 * out its key/value, then move to the start of the next
+		 * operation in the record.
+		 */
+		WT_RET(__wt_logop_read(session,
+		    &cl->stepp, cl->stepp_end, &optype, &opsize));
+		WT_RET(__curlog_op_read(session, cl, optype, opsize, &fileid));
+		cl->stepp += opsize;
+	}
+
+	/*
+	 * The cursor key is the LSN plus the step counter; the cursor value
+	 * carries the log record information together with any operation
+	 * key/value found in the record.
+	 */
+	__wt_cursor_set_key(cursor, cl->cur_lsn->file, cl->cur_lsn->offset,
+	    key_count);
+	__wt_cursor_set_value(cursor, cl->txnid, cl->rectype, optype,
+	    fileid, cl->opkey, cl->opvalue);
+	return (0);
+}
+
+/*
+ * __curlog_next --
+ * WT_CURSOR.next method for the step log cursor type. Reads a new log
+ * record when the current one is exhausted, then sets the cursor's
+ * key and value from the current step.
+ */
+static int
+__curlog_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ /*
+ * If we don't have a record, or went to the end of the record we
+ * have, or we are in the zero-fill portion of the record, get a
+ * new one.
+ */
+ if (cl->stepp == NULL || cl->stepp >= cl->stepp_end || !*cl->stepp) {
+ cl->txnid = 0;
+ /* Scan a single record starting at the saved next LSN. */
+ WT_ERR(__wt_log_scan(session, cl->next_lsn, WT_LOGSCAN_ONE,
+ __curlog_logrec, cl));
+ }
+ WT_ASSERT(session, cl->logrec->data != NULL);
+ WT_ERR(__curlog_kv(session, cursor));
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+err: API_END_RET(session, ret);
+
+}
+
+/*
+ * __curlog_search --
+ * WT_CURSOR.search method for the log cursor type. Positions the
+ * cursor on the log record at the LSN given in the cursor key and
+ * sets the cursor's key and value from it.
+ */
+static int
+__curlog_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_LSN key;
+ WT_SESSION_IMPL *session;
+ uint32_t counter;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ /*
+ * !!! We are ignoring the counter and only searching based on the LSN.
+ */
+ WT_ERR(__wt_cursor_get_key((WT_CURSOR *)cl,
+ &key.file, &key.offset, &counter));
+ /* Scan exactly one record, at the requested LSN. */
+ WT_ERR(__wt_log_scan(session, &key, WT_LOGSCAN_ONE,
+ __curlog_logrec, cl));
+ WT_ERR(__curlog_kv(session, cursor));
+ WT_STAT_FAST_CONN_INCR(session, cursor_search);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curlog_reset --
+ *	WT_CURSOR.reset method for the log cursor type: drop the stepping
+ *	state and re-initialize both LSNs. Always returns 0.
+ */
+static int
+__curlog_reset(WT_CURSOR *cursor)
+{
+	WT_CURSOR_LOG *cl;
+
+	cl = (WT_CURSOR_LOG *)cursor;
+
+	/* Forget any record we were stepping through. */
+	cl->stepp_end = NULL;
+	cl->stepp = NULL;
+	cl->step_count = 0;
+
+	/* Put the current and next LSNs back to their initial state. */
+	INIT_LSN(cl->cur_lsn);
+	INIT_LSN(cl->next_lsn);
+	return (0);
+}
+
+/*
+ * __curlog_close --
+ * WT_CURSOR.close method for the log cursor type. Releases the
+ * archive read lock taken at open, frees the cursor's LSNs and
+ * scratch buffers, and closes the underlying cursor.
+ */
+static int
+__curlog_close(WT_CURSOR *cursor)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(cursor, session, close, NULL);
+ cl = (WT_CURSOR_LOG *)cursor;
+ conn = S2C(session);
+ WT_ASSERT(session, conn->logging);
+ log = conn->log;
+ /* Drop the read lock on the log archive lock. */
+ WT_TRET(__wt_readunlock(session, log->log_archive_lock));
+ WT_TRET(__curlog_reset(cursor));
+ __wt_free(session, cl->cur_lsn);
+ __wt_free(session, cl->next_lsn);
+ __wt_scr_free(&cl->logrec);
+ __wt_scr_free(&cl->opkey);
+ __wt_scr_free(&cl->opvalue);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curlog_open --
+ * Initialize a log cursor. Fails with EINVAL when logging is not
+ * enabled on the connection. On success the cursor is read-only and
+ * holds a read lock on the log archive lock; on failure *cursorp is
+ * set to NULL.
+ */
+int
+__wt_curlog_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curlog_compare, /* compare */
+ __curlog_next, /* next */
+ __wt_cursor_notsup, /* prev */
+ __curlog_reset, /* reset */
+ __curlog_search, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curlog_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ /* The cursor is cast to and from WT_CURSOR_LOG, so iface is first. */
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);
+ conn = S2C(session);
+ if (!conn->logging)
+ WT_RET_MSG(session, EINVAL,
+ "Cannot open a log cursor without logging enabled");
+
+ log = conn->log;
+ cl = NULL;
+ WT_RET(__wt_calloc_def(session, 1, &cl));
+ cursor = &cl->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ /* Allocate the LSNs and the scratch buffers the cursor owns. */
+ WT_ERR(__wt_calloc_def(session, 1, &cl->cur_lsn));
+ WT_ERR(__wt_calloc_def(session, 1, &cl->next_lsn));
+ WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
+ WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
+ WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
+ cursor->key_format = LOGC_KEY_FORMAT;
+ cursor->value_format = LOGC_VALUE_FORMAT;
+
+ INIT_LSN(cl->cur_lsn);
+ INIT_LSN(cl->next_lsn);
+
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ /* Log cursors are read only. */
+ WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));
+ /* Log cursors block archiving. */
+ WT_ERR(__wt_readlock(session, log->log_archive_lock));
+
+ /* The block below is entered only through the err label. */
+ if (0) {
+err: if (F_ISSET(cursor, WT_CURSTD_OPEN))
+ WT_TRET(cursor->close(cursor));
+ else {
+ __wt_free(session, cl->cur_lsn);
+ __wt_free(session, cl->next_lsn);
+ __wt_scr_free(&cl->logrec);
+ __wt_scr_free(&cl->opkey);
+ __wt_scr_free(&cl->opvalue);
+ /*
+ * NOTE: We cannot get on the error path with the
+ * readlock held. No need to unlock it unless that
+ * changes above.
+ */
+ __wt_free(session, cl);
+ }
+ *cursorp = NULL;
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
new file mode 100644
index 00000000000..30fe3b28625
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
@@ -0,0 +1,444 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Custom NEED macros for metadata cursors - that copy the values into the
+ * backing metadata table cursor.
+ *
+ * Both macros reference a variable named "session" from the enclosing
+ * function, and use WT_ERR (presumably jumping to the caller's err label
+ * on failure -- confirm against the macro definition).
+ */
+#define WT_MD_CURSOR_NEEDKEY(cursor) do { \
+ WT_CURSOR_NEEDKEY(cursor); \
+ WT_ERR(__wt_buf_set(session, \
+ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->key, \
+ cursor->key.data, cursor->key.size)); \
+ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \
+ WT_CURSTD_KEY_EXT); \
+} while (0)
+
+#define WT_MD_CURSOR_NEEDVALUE(cursor) do { \
+ WT_CURSOR_NEEDVALUE(cursor); \
+ WT_ERR(__wt_buf_set(session, \
+ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->value, \
+ cursor->value.data, cursor->value.size)); \
+ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \
+ WT_CURSTD_VALUE_EXT); \
+} while (0)
+
+/*
+ * Point the metadata cursor (c) at the key/value currently held by the file
+ * cursor (fc) without copying, and mark the metadata cursor state (mc) as
+ * positioned on a regular entry rather than on the metadata entry itself.
+ */
+#define WT_MD_SET_KEY_VALUE(c, mc, fc) do { \
+ (c)->key.data = (fc)->key.data; \
+ (c)->key.size = (fc)->key.size; \
+ (c)->value.data = (fc)->value.data; \
+ (c)->value.size = (fc)->value.size; \
+ F_SET((c), WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \
+ F_CLR((mc), WT_MDC_ONMETADATA); \
+ F_SET((mc), WT_MDC_POSITIONED); \
+} while (0)
+
+/*
+ * Check if a key matches the metadata. The public value is "metadata:",
+ * but also check for the internal version of the URI.
+ *
+ * The comparison length is the key size minus one, presumably to exclude a
+ * trailing nul byte counted in the size -- confirm. NOTE(review): a key
+ * size of 0 would make "size - 1" wrap if size is unsigned.
+ */
+#define WT_KEY_IS_METADATA(key) \
+ (WT_STRING_MATCH(WT_METADATA_URI, (key)->data, (key)->size - 1) ||\
+ WT_STRING_MATCH(WT_METAFILE_URI, (key)->data, (key)->size - 1))
+
+/*
+ * __curmetadata_metadata_search --
+ *	Position the cursor on the entry describing the metadata table
+ *	itself: look up the metadata file's own metadata and expose it under
+ *	the public metadata URI.
+ */
+static int
+__curmetadata_metadata_search(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+	WT_CURSOR_METADATA *mdc;
+	WT_DECL_RET;
+	const char *value;
+
+	/* The metadata search interface allocates a new string in value. */
+	WT_RET(__wt_metadata_search(session, WT_METAFILE_URI, &value));
+
+	/*
+	 * Copy the string into the cursor's value item, and release the
+	 * allocated string whether or not the copy succeeded.
+	 */
+	ret = __wt_buf_setstr(session, &cursor->value, value);
+	__wt_free(session, value);
+	if (ret != 0)
+		return (ret);
+
+	/* The key returned to the caller is always the public URI. */
+	WT_RET(__wt_buf_setstr(session, &cursor->key, WT_METADATA_URI));
+
+	/* Mark the cursor as positioned on the metadata entry. */
+	mdc = (WT_CURSOR_METADATA *)cursor;
+	F_SET(mdc, WT_MDC_ONMETADATA | WT_MDC_POSITIONED);
+	F_SET(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+	return (0);
+}
+
+/*
+ * __curmetadata_compare --
+ * WT_CURSOR->compare method for the metadata cursor type. Fails with
+ * EINVAL unless both cursors are metadata cursors. A cursor positioned
+ * on the metadata entry itself compares greater than one that is not;
+ * otherwise the comparison is delegated to the underlying file cursors.
+ */
+static int
+__curmetadata_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR *a_file_cursor, *b_file_cursor;
+ WT_CURSOR_METADATA *a_mdc, *b_mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * NOTE(review): b is cast and dereferenced here, before the check
+ * below that it is a metadata cursor -- confirm this is safe for
+ * other cursor types.
+ */
+ a_mdc = ((WT_CURSOR_METADATA *)a);
+ b_mdc = ((WT_CURSOR_METADATA *)b);
+ a_file_cursor = a_mdc->file_cursor;
+ b_file_cursor = b_mdc->file_cursor;
+
+ CURSOR_API_CALL(a, session,
+ compare, ((WT_CURSOR_BTREE *)a_file_cursor)->btree);
+
+ /* The compare method identifies the cursor type. */
+ if (b->compare != __curmetadata_compare)
+ WT_ERR_MSG(session, EINVAL,
+ "Can only compare cursors of the same type");
+
+ WT_MD_CURSOR_NEEDKEY(a);
+ WT_MD_CURSOR_NEEDKEY(b);
+
+ if (F_ISSET(a_mdc, WT_MDC_ONMETADATA)) {
+ if (F_ISSET(b_mdc, WT_MDC_ONMETADATA))
+ *cmpp = 0;
+ else
+ *cmpp = 1;
+ } else if (F_ISSET(b_mdc, WT_MDC_ONMETADATA))
+ *cmpp = -1;
+ else
+ ret = a_file_cursor->compare(
+ a_file_cursor, b_file_cursor, cmpp);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_next --
+ * WT_CURSOR->next method for the metadata cursor type. An unpositioned
+ * cursor first returns the metadata entry itself; after that the call
+ * advances the underlying file cursor.
+ */
+static int
+__curmetadata_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ next, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ if (!F_ISSET(mdc, WT_MDC_POSITIONED))
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+ else {
+ WT_ERR(file_cursor->next(mdc->file_cursor));
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ }
+
+ /* On any failure the cursor is left unpositioned with no key/value. */
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_prev --
+ * WT_CURSOR->prev method for the metadata cursor type. Returns
+ * WT_NOTFOUND when already on the metadata entry; otherwise moves the
+ * underlying file cursor back and, when that runs off the start,
+ * positions on the metadata entry.
+ */
+static int
+__curmetadata_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ prev, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ /* The metadata entry is the first entry: nothing precedes it. */
+ if (F_ISSET(mdc, WT_MDC_ONMETADATA)) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+
+ ret = file_cursor->prev(file_cursor);
+ if (ret == 0) {
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ } else if (ret == WT_NOTFOUND)
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+
+ /* On any failure the cursor is left unpositioned with no key/value. */
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_reset --
+ * WT_CURSOR->reset method for the metadata cursor type. Resets the
+ * underlying file cursor if it was in use, then clears the position
+ * and key/value flags.
+ */
+static int
+__curmetadata_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ reset, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ /* The file cursor is only touched when positioned on a regular entry. */
+ if (F_ISSET(mdc, WT_MDC_POSITIONED) && !F_ISSET(mdc, WT_MDC_ONMETADATA))
+ ret = file_cursor->reset(file_cursor);
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_search --
+ * WT_CURSOR->search method for the metadata cursor type. A key naming
+ * the metadata itself is answered by the metadata-entry lookup; any
+ * other key is searched for in the underlying file cursor.
+ */
+static int
+__curmetadata_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ search, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ /* Requires a key, and copies it into the file cursor. */
+ WT_MD_CURSOR_NEEDKEY(cursor);
+
+ if (WT_KEY_IS_METADATA(&cursor->key))
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+ else {
+ WT_ERR(file_cursor->search(file_cursor));
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ }
+
+ /* On any failure the cursor is left unpositioned with no key/value. */
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_search_near --
+ * WT_CURSOR->search_near method for the metadata cursor type. A key
+ * naming the metadata itself is answered by the metadata-entry lookup;
+ * any other key is handed to the underlying file cursor's search_near,
+ * which also sets *exact.
+ */
+static int
+__curmetadata_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ search_near, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ /* Requires a key, and copies it into the file cursor. */
+ WT_MD_CURSOR_NEEDKEY(cursor);
+
+ if (WT_KEY_IS_METADATA(&cursor->key)) {
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+ /*
+ * NOTE(review): the key matched exactly, yet *exact is set to
+ * 1 rather than 0 -- confirm against the documented
+ * WT_CURSOR::search_near contract that this is intended.
+ */
+ *exact = 1;
+ } else {
+ WT_ERR(file_cursor->search_near(file_cursor, exact));
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ }
+
+ /* On any failure the cursor is left unpositioned with no key/value. */
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_insert --
+ * WT_CURSOR->insert method for the metadata cursor type. Requires both
+ * a key and a value, and passes them to __wt_metadata_insert.
+ */
+static int
+__curmetadata_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ insert, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+ WT_MD_CURSOR_NEEDVALUE(cursor);
+
+ /*
+ * Since the key/value formats are 'S' the WT_ITEMs must contain a
+ * NULL terminated string.
+ */
+ ret =
+ __wt_metadata_insert(session, cursor->key.data, cursor->value.data);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_update --
+ * WT_CURSOR->update method for the metadata cursor type. Requires both
+ * a key and a value, and passes them to __wt_metadata_update.
+ */
+static int
+__curmetadata_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ update, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+ WT_MD_CURSOR_NEEDVALUE(cursor);
+
+ /*
+ * Since the key/value formats are 'S' the WT_ITEMs must contain a
+ * NULL terminated string.
+ */
+ ret =
+ __wt_metadata_update(session, cursor->key.data, cursor->value.data);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_remove --
+ * WT_CURSOR->remove method for the metadata cursor type. Requires a
+ * key and passes it to __wt_metadata_remove.
+ */
+static int
+__curmetadata_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ remove, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+
+ /*
+ * Since the key format is 'S' the WT_ITEM must contain a NULL
+ * terminated string.
+ */
+ ret = __wt_metadata_remove(session, cursor->key.data);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_close --
+ * WT_CURSOR->close method for the metadata cursor type. Closes the
+ * underlying file cursor and then the metadata cursor itself; the
+ * first error encountered, if any, is returned.
+ */
+static int
+__curmetadata_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ close, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ /* Close both cursors even if closing the first one fails. */
+ ret = file_cursor->close(file_cursor);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curmetadata_open --
+ *	WT_SESSION->open_cursor method for metadata cursors.
+ *
+ * Metadata cursors are similar to a file cursor on the special metadata
+ * table, except that the metadata for the metadata table (which is stored
+ * in the turtle file) can also be queried.
+ *
+ * Metadata cursors are read-only by default.
+ *
+ * On failure every resource acquired here is released, including the
+ * underlying file cursor, and *cursorp is set to NULL.
+ */
+int
+__wt_curmetadata_open(WT_SESSION_IMPL *session,
+    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+	WT_CURSOR_STATIC_INIT(iface,
+	    __wt_cursor_get_key,	/* get-key */
+	    __wt_cursor_get_value,	/* get-value */
+	    __wt_cursor_set_key,	/* set-key */
+	    __wt_cursor_set_value,	/* set-value */
+	    __curmetadata_compare,	/* compare */
+	    __curmetadata_next,		/* next */
+	    __curmetadata_prev,		/* prev */
+	    __curmetadata_reset,	/* reset */
+	    __curmetadata_search,	/* search */
+	    __curmetadata_search_near,	/* search-near */
+	    __curmetadata_insert,	/* insert */
+	    __curmetadata_update,	/* update */
+	    __curmetadata_remove,	/* remove */
+	    __curmetadata_close);	/* close */
+	WT_CURSOR *cursor;
+	WT_CURSOR_METADATA *mdc;
+	WT_DECL_RET;
+
+	WT_RET(__wt_calloc(session, 1, sizeof(WT_CURSOR_METADATA), &mdc));
+
+	cursor = &mdc->iface;
+	*cursor = iface;
+	cursor->session = &session->iface;
+	cursor->key_format = "S";
+	cursor->value_format = "S";
+
+	/* Open the file cursor for operations on the regular metadata */
+	WT_ERR(__wt_metadata_cursor(session, cfg[1], &mdc->file_cursor));
+
+	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));
+
+	/* Metadata cursors default to read only. */
+	WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));
+
+	/* The block below is entered only through the err label. */
+	if (0) {
+		/*
+		 * Once the cursor has been opened it must go through its
+		 * close method, which also closes the file cursor. Before
+		 * that point, release the pieces by hand so the file cursor
+		 * is not leaked. This mirrors the cleanup done in
+		 * __wt_curlog_open.
+		 */
+err:		if (F_ISSET(cursor, WT_CURSTD_OPEN))
+			WT_TRET(cursor->close(cursor));
+		else {
+			if (mdc->file_cursor != NULL)
+				WT_TRET(mdc->file_cursor->close(
+				    mdc->file_cursor));
+			__wt_free(session, mdc);
+		}
+		*cursorp = NULL;
+	}
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
new file mode 100644
index 00000000000..c06efced369
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -0,0 +1,574 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/* Forward declarations of the cursor iteration methods defined below. */
+static int __curstat_next(WT_CURSOR *cursor);
+static int __curstat_prev(WT_CURSOR *cursor);
+
+/*
+ * The statistics identifier is an offset from a base to ensure the integer ID
+ * values don't overlap (the idea is if they overlap it's easy for application
+ * writers to confuse them).
+ *
+ * WT_STAT_KEY_MIN and WT_STAT_KEY_MAX give the inclusive range of valid
+ * keys for a cursor; WT_STAT_KEY_OFFSET converts the cursor's current key
+ * into a zero-based offset from the base.
+ */
+#define WT_STAT_KEY_MAX(cst) (((cst)->stats_base + (cst)->stats_count) - 1)
+#define WT_STAT_KEY_MIN(cst) ((cst)->stats_base)
+#define WT_STAT_KEY_OFFSET(cst) ((cst)->key - (cst)->stats_base)
+
+/*
+ * __curstat_print_value --
+ *	Format a statistics value for display: values of a billion or more
+ *	as "<n>B (<raw>)", values of a million or more as "<n>M (<raw>)",
+ *	and anything smaller as the plain number.
+ */
+static int
+__curstat_print_value(WT_SESSION_IMPL *session, uint64_t v, WT_ITEM *buf)
+{
+	if (v >= WT_BILLION)
+		return (__wt_buf_fmt(session, buf,
+		    "%" PRIu64 "B (%" PRIu64 ")", v / WT_BILLION, v));
+
+	if (v >= WT_MILLION)
+		return (__wt_buf_fmt(session, buf,
+		    "%" PRIu64 "M (%" PRIu64 ")", v / WT_MILLION, v));
+
+	return (__wt_buf_fmt(session, buf, "%" PRIu64, v));
+}
+
+/*
+ * __curstat_get_key --
+ * WT_CURSOR->get_key for statistics cursors. In raw mode the key is
+ * packed and returned through a WT_ITEM argument; otherwise it is
+ * stored through an int pointer argument.
+ */
+static int
+__curstat_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ size_t size;
+ va_list ap;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_key, NULL);
+
+ WT_CURSOR_NEEDKEY(cursor);
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ /* Size the packed key, then pack it into the cursor's key buffer. */
+ WT_ERR(__wt_struct_size(
+ session, &size, cursor->key_format, cst->key));
+ WT_ERR(__wt_buf_initsize(session, &cursor->key, size));
+ WT_ERR(__wt_struct_pack(session, cursor->key.mem, size,
+ cursor->key_format, cst->key));
+
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->key.data;
+ item->size = cursor->key.size;
+ } else
+ *va_arg(ap, int *) = cst->key;
+
+ /* The argument list is released on both success and error paths. */
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_get_value --
+ * WT_CURSOR->get_value for statistics cursors. The value consists of
+ * the statistic's description, its printable form and its numeric
+ * value. In raw mode the three are packed and returned through a
+ * WT_ITEM argument; otherwise each is stored through its own pointer
+ * argument, and NULL pointers are skipped.
+ */
+static int
+__curstat_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ size_t size;
+ uint64_t *v;
+ const char **p;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ /* Size the packed value, then pack it into the value buffer. */
+ WT_ERR(__wt_struct_size(session, &size, cursor->value_format,
+ cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->pv.data, cst->v));
+ WT_ERR(__wt_buf_initsize(session, &cursor->value, size));
+ WT_ERR(__wt_struct_pack(session, cursor->value.mem, size,
+ cursor->value_format,
+ cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->pv.data, cst->v));
+
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data;
+ item->size = cursor->value.size;
+ } else {
+ /*
+ * Don't drop core if the statistics value isn't requested; NULL
+ * pointer support isn't documented, but it's a cheap test.
+ */
+ if ((p = va_arg(ap, const char **)) != NULL)
+ *p = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc;
+ if ((p = va_arg(ap, const char **)) != NULL)
+ *p = cst->pv.data;
+ if ((v = va_arg(ap, uint64_t *)) != NULL)
+ *v = cst->v;
+ }
+
+ /* The argument list is released on both success and error paths. */
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_set_key --
+ * WT_CURSOR->set_key for statistics cursors. In raw mode the key is
+ * unpacked from a WT_ITEM argument; otherwise it is read as an int
+ * argument. Any failure is recorded in the cursor's saved_err field
+ * since the method has no return value.
+ */
+static void
+__curstat_set_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, set_key, NULL);
+ /* The key is considered unset until the new one is stored below. */
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ item = va_arg(ap, WT_ITEM *);
+ ret = __wt_struct_unpack(session, item->data, item->size,
+ cursor->key_format, &cst->key);
+ } else
+ cst->key = va_arg(ap, int);
+ va_end(ap);
+
+ if ((cursor->saved_err = ret) == 0)
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+
+err: API_END(session, ret);
+}
+
+/*
+ * __curstat_set_value --
+ *	WT_CURSOR->set_value for statistics cursors: a no-op, the arguments
+ *	are ignored.
+ */
+static void
+__curstat_set_value(WT_CURSOR *cursor, ...)
+{
+	WT_UNUSED(cursor);
+}
+
+/*
+ * __curstat_next --
+ * WT_CURSOR->next method for the statistics cursor type. Starts at the
+ * smallest key when the cursor is not positioned, and returns
+ * WT_NOTFOUND once the largest key has been passed.
+ */
+static int
+__curstat_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ /* Move to the next item. */
+ if (cst->notpositioned) {
+ cst->notpositioned = 0;
+ cst->key = WT_STAT_KEY_MIN(cst);
+ } else if (cst->key < WT_STAT_KEY_MAX(cst))
+ ++cst->key;
+ else {
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ WT_ERR(WT_NOTFOUND);
+ }
+ /* Load the statistic at the new key and build its printable form. */
+ cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_prev --
+ *	WT_CURSOR->prev method for the statistics cursor type. Starts at the
+ *	largest key when the cursor is not positioned, and returns
+ *	WT_NOTFOUND once the smallest key has been passed.
+ */
+static int
+__curstat_prev(WT_CURSOR *cursor)
+{
+	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	cst = (WT_CURSOR_STAT *)cursor;
+	CURSOR_API_CALL(cursor, session, prev, NULL);
+
+	/* Move to the previous item. */
+	if (cst->notpositioned) {
+		cst->notpositioned = 0;
+		cst->key = WT_STAT_KEY_MAX(cst);
+	} else if (cst->key > WT_STAT_KEY_MIN(cst))
+		--cst->key;
+	else {
+		/*
+		 * Clear every key/value flag when running off the start,
+		 * exactly as __curstat_next does when running off the end.
+		 */
+		F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+		WT_ERR(WT_NOTFOUND);
+	}
+
+	/* Load the statistic at the new key and build its printable form. */
+	cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+	WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
+	F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_reset --
+ * WT_CURSOR->reset method for the statistics cursor type. Marks the
+ * cursor as not positioned and clears the key/value flags.
+ */
+static int
+__curstat_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+
+ /* The next call to next or prev restarts from one end of the range. */
+ cst->notpositioned = 1;
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_search --
+ *	WT_CURSOR->search method for the statistics cursor type. Returns
+ *	WT_NOTFOUND when the key is outside the cursor's range of
+ *	statistics; otherwise loads the statistic for the key.
+ */
+static int
+__curstat_search(WT_CURSOR *cursor)
+{
+	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	cst = (WT_CURSOR_STAT *)cursor;
+	CURSOR_API_CALL(cursor, session, search, NULL);
+
+	WT_CURSOR_NEEDKEY(cursor);
+	/*
+	 * Discard any previous value. The original expression OR'ed
+	 * WT_CURSTD_VALUE_SET with itself, which is equivalent to this.
+	 * NOTE(review): WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET may have
+	 * been intended -- confirm before changing, since that would also
+	 * clear the key flags on a failed search.
+	 */
+	F_CLR(cursor, WT_CURSTD_VALUE_SET);
+
+	if (cst->key < WT_STAT_KEY_MIN(cst) || cst->key > WT_STAT_KEY_MAX(cst))
+		WT_ERR(WT_NOTFOUND);
+
+	/* Load the statistic at the key and build its printable form. */
+	cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+	WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
+	F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_close --
+ * WT_CURSOR->close method for the statistics cursor type. Frees the
+ * printable-value buffer and closes the cursor.
+ */
+static int
+__curstat_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ /* Release the buffer holding the printable form of the value. */
+ __wt_buf_free(session, &cst->pv);
+
+ WT_ERR(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_conn_init --
+ *	Set up a statistics cursor over the connection's statistics.
+ */
+static void
+__curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+
+	/* Bring the connection statistics up to date, then snapshot them. */
+	__wt_conn_stat_init(session);
+	cst->u.conn_stats = conn->stats;
+
+	/* Optionally clear the connection's own copy after the snapshot. */
+	if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+		__wt_stat_refresh_connection_stats(&conn->stats);
+
+	/* Point the cursor at its snapshot and describe the key range. */
+	cst->stats = (WT_STATS *)&cst->u.conn_stats;
+	cst->stats_first = cst->stats;
+	cst->stats_base = WT_CONNECTION_STATS_BASE;
+	cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS);
+}
+
+/*
+ * When returning the statistics for a file URI, we review open handles, and
+ * aggregate checkpoint handle statistics with the file URI statistics. To
+ * make that work, we have to pass information to the function reviewing the
+ * handles, this structure is what we pass.
+ *
+ * A pointer to this structure is passed to __curstat_checkpoint through the
+ * first slot of its configuration array.
+ */
+struct __checkpoint_args {
+ const char *name; /* Data source handle name */
+ WT_DSRC_STATS *stats; /* Stat structure being filled */
+ int clear; /* WT_STATISTICS_CLEAR */
+};
+
+/*
+ * __curstat_checkpoint --
+ *	Per-handle callback: fold the statistics of a checkpoint handle that
+ *	belongs to the named file into the caller's statistics. The
+ *	arguments arrive as a struct __checkpoint_args pointer in cfg[0].
+ *	Always returns 0.
+ */
+static int
+__curstat_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	struct __checkpoint_args *args;
+	WT_DATA_HANDLE *dhandle;
+
+	args = (struct __checkpoint_args *)cfg[0];
+	dhandle = session->dhandle;
+
+	/* Only checkpoint handles of the flagged file are of interest. */
+	if (dhandle->checkpoint == NULL ||
+	    strcmp(dhandle->name, args->name) != 0)
+		return (0);
+
+	/* Aggregate, then optionally clear the handle's statistics. */
+	__wt_stat_aggregate_dsrc_stats(&dhandle->stats, args->stats);
+	if (args->clear)
+		__wt_stat_refresh_dsrc_stats(&dhandle->stats);
+
+	return (0);
+}
+
+/*
+ * __curstat_file_init --
+ * Initialize the statistics for a file. Snapshots the file's
+ * statistics into the cursor and, when no checkpoint was specified,
+ * adds in the statistics of any open checkpoint handles for the
+ * same file.
+ */
+static int
+__curstat_file_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ struct __checkpoint_args args;
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
+ WT_DECL_RET;
+ const char *cfg_arg[] = { NULL, NULL };
+
+ WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, 0));
+ dhandle = session->dhandle;
+
+ /*
+ * Fill in the data source statistics, and copy them to the cursor.
+ * Optionally clear the data source statistics.
+ */
+ if ((ret = __wt_btree_stat_init(session, cst)) == 0) {
+ cst->u.dsrc_stats = dhandle->stats;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ __wt_stat_refresh_dsrc_stats(&dhandle->stats);
+ __wt_curstat_dsrc_final(cst);
+ }
+
+ /* Release the handle, we're done with it. */
+ WT_TRET(__wt_session_release_btree(session));
+ WT_RET(ret);
+
+ /*
+ * If no checkpoint was specified, review the open handles and aggregate
+ * the statistics from any checkpoint handles matching this file.
+ *
+ * NOTE(review): dhandle is still dereferenced below, after the handle
+ * was released above -- confirm the handle cannot go away here.
+ */
+ if (dhandle->checkpoint == NULL) {
+ args.name = dhandle->name;
+ args.stats = &cst->u.dsrc_stats;
+ args.clear = F_ISSET(cst, WT_CONN_STAT_CLEAR);
+ /* The arguments travel in the first configuration slot. */
+ cfg_arg[0] = (char *)&args;
+
+ /*
+ * We're likely holding the schema lock inside the statistics
+ * logging thread, not to mention calling __wt_conn_btree_apply
+ * from there as well. Save/restore the handle.
+ */
+ saved_dhandle = dhandle;
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_conn_btree_apply(
+ session, 1, __curstat_checkpoint, cfg_arg));
+ session->dhandle = saved_dhandle;
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_curstat_dsrc_final --
+ * Finalize a data-source statistics cursor.
+ *
+ * Points the cursor's iteration fields (stats_first/stats, stats_base,
+ * stats_count) at the data-source statistics already stored in
+ * cst->u.dsrc_stats. The statistics themselves are not modified.
+ */
+void
+__wt_curstat_dsrc_final(WT_CURSOR_STAT *cst)
+{
+
+ cst->stats_first = cst->stats = (WT_STATS *)&cst->u.dsrc_stats;
+ cst->stats_base = WT_DSRC_STATS_BASE;
+ cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
+}
+
+/*
+ * __wt_curstat_init --
+ * Initialize a statistics cursor.
+ *
+ * Marks the cursor as not positioned, then dispatches on the URI. The bare
+ * "statistics:" URI selects connection statistics; otherwise the text
+ * following the "statistics:" prefix must name a colgroup:, file:, index:,
+ * lsm: or table: object, each handled by its own initializer. Any other
+ * object type is rejected through __wt_bad_object_type.
+ */
+int
+__wt_curstat_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ const char *dsrc_uri;
+
+ cst->notpositioned = 1;
+
+ if (strcmp(uri, "statistics:") == 0) {
+ __curstat_conn_init(session, cst);
+ return (0);
+ }
+
+ dsrc_uri = uri + strlen("statistics:"); /* assumes uri starts with "statistics:" -- not checked here */
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:"))
+ return (
+ __wt_curstat_colgroup_init(session, dsrc_uri, cfg, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "file:"))
+ return (__curstat_file_init(session, dsrc_uri, cfg, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "index:"))
+ return (__wt_curstat_index_init(session, dsrc_uri, cfg, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "lsm:"))
+ return (__wt_curstat_lsm_init(session, dsrc_uri, cst)); /* the only initializer not given cfg */
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "table:"))
+ return (__wt_curstat_table_init(session, dsrc_uri, cfg, cst));
+
+ return (__wt_bad_object_type(session, uri));
+}
+
+/*
+ * __wt_curstat_open --
+ * WT_SESSION->open_cursor method for the statistics cursor type.
+ *
+ * Allocates a WT_CURSOR_STAT, validates the cursor's "statistics"
+ * configuration against the connection's statistics flags (falling back to
+ * the connection's settings when the cursor specifies none), loads the
+ * statistics for uri and finally registers the cursor with the session.
+ * On success *cursorp is set by __wt_cursor_init and 0 is returned. On any
+ * failure after the allocation, the cursor structure is freed and the
+ * error is returned.
+ */
+int
+__wt_curstat_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR_STATIC_INIT(iface,
+ __curstat_get_key, /* get-key */
+ __curstat_get_value, /* get-value */
+ __curstat_set_key, /* set-key */
+ __curstat_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curstat_next, /* next */
+ __curstat_prev, /* prev */
+ __curstat_reset, /* reset */
+ __curstat_search, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curstat_close); /* close */
+ WT_CONFIG_ITEM cval, sval;
+ WT_CURSOR *cursor;
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_STAT, iface) == 0);
+
+ conn = S2C(session);
+
+ /*
+ * Return directly if the allocation fails: there is nothing to clean
+ * up yet, and jumping to the error label would hand the still
+ * uninitialized cst pointer to __wt_free.
+ */
+ WT_RET(__wt_calloc_def(session, 1, &cst));
+ cursor = &cst->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+
+ /*
+ * Statistics cursor configuration: must match (and defaults to), the
+ * database configuration.
+ */
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_NONE))
+ goto config_err;
+ if ((ret = __wt_config_gets(session, cfg, "statistics", &cval)) == 0) {
+ if ((ret = __wt_config_subgets(
+ session, &cval, "all", &sval)) == 0 && sval.val != 0) {
+ if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL))
+ goto config_err;
+ F_SET(cst, WT_CONN_STAT_ALL | WT_CONN_STAT_FAST);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if ((ret = __wt_config_subgets(
+ session, &cval, "fast", &sval)) == 0 && sval.val != 0) {
+ if (F_ISSET(cst, WT_CONN_STAT_ALL))
+ WT_ERR_MSG(session, EINVAL,
+ "only one statistics configuration value "
+ "may be specified");
+ F_SET(cst, WT_CONN_STAT_FAST);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if ((ret = __wt_config_subgets(
+ session, &cval, "clear", &sval)) == 0 && sval.val != 0)
+ F_SET(cst, WT_CONN_STAT_CLEAR);
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* If no configuration, use the connection's configuration. */
+ if (cst->flags == 0) {
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL))
+ F_SET(cst, WT_CONN_STAT_ALL);
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_FAST))
+ F_SET(cst, WT_CONN_STAT_FAST);
+ }
+
+ /* If the connection configures clear, so do we. */
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ F_SET(cst, WT_CONN_STAT_CLEAR);
+ }
+
+ /*
+ * We return the statistics field's offset as the key, and a string
+ * description, a string value, and a uint64_t value as the value
+ * columns.
+ */
+ cursor->key_format = "i";
+ cursor->value_format = "SSq";
+ WT_ERR(__wt_curstat_init(session, uri, cfg, cst));
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ if (0) {
+config_err: WT_ERR_MSG(session, EINVAL,
+ "cursor's statistics configuration doesn't match the "
+ "database statistics configuration");
+ }
+
+ if (0) {
+err: __wt_free(session, cst);
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
new file mode 100644
index 00000000000..21d676d943a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -0,0 +1,625 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cursor_notsup --
+ * Unsupported cursor actions.
+ *
+ * Ignores the cursor and always returns ENOTSUP. Installed in cursor
+ * method slots for operations a cursor type does not provide.
+ */
+int
+__wt_cursor_notsup(WT_CURSOR *cursor)
+{
+ WT_UNUSED(cursor);
+
+ return (ENOTSUP);
+}
+
+/*
+ * __wt_cursor_noop --
+ * Cursor noop.
+ *
+ * Ignores the cursor and always returns 0 (success).
+ */
+int
+__wt_cursor_noop(WT_CURSOR *cursor)
+{
+ WT_UNUSED(cursor);
+
+ return (0);
+}
+
+/*
+ * __wt_cursor_set_notsup --
+ * Reset the cursor methods to not-supported.
+ *
+ * The get/set key/value methods and close are left as they were.
+ */
+void
+__wt_cursor_set_notsup(WT_CURSOR *cursor)
+{
+ /*
+ * Set all of the cursor methods (except for close and reset), to fail.
+ * Close is unchanged so the cursor can be discarded, reset defaults to
+ * a no-op because session transactional operations reset all of the
+ * cursors in a session, and random cursors shouldn't block transactions
+ * or checkpoints.
+ *
+ * compare and search_near take extra arguments, hence the casts of
+ * the single-argument __wt_cursor_notsup to their signatures.
+ */
+ cursor->compare =
+ (int (*)(WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup;
+ cursor->next = __wt_cursor_notsup;
+ cursor->prev = __wt_cursor_notsup;
+ cursor->reset = __wt_cursor_noop;
+ cursor->search = __wt_cursor_notsup;
+ cursor->search_near = (int (*)(WT_CURSOR *, int *))__wt_cursor_notsup;
+ cursor->insert = __wt_cursor_notsup;
+ cursor->update = __wt_cursor_notsup;
+ cursor->remove = __wt_cursor_notsup;
+}
+
+/*
+ * __wt_cursor_config_readonly --
+ * Look up the "readonly" configuration setting (using def when it is
+ * absent) and, if it is enabled, disable the cursor's data-modifying
+ * methods.
+ */
+int
+__wt_cursor_config_readonly(WT_CURSOR *cursor, const char *cfg[], int def)
+{
+ WT_CONFIG_ITEM readonly;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+ WT_RET(__wt_config_gets_def(session, cfg, "readonly", def, &readonly));
+
+ /* Not read-only: leave the cursor's methods untouched. */
+ if (readonly.val == 0)
+ return (0);
+
+ /* Read-only: every method that could modify data now fails. */
+ cursor->remove = __wt_cursor_notsup;
+ cursor->update = __wt_cursor_notsup;
+ cursor->insert = __wt_cursor_notsup;
+ return (0);
+}
+
+/*
+ * __wt_cursor_kv_not_set --
+ * Standard error message for key/values not set.
+ *
+ * key selects the wording: non-zero reports a missing key, zero a missing
+ * value. The error code reported is the cursor's saved_err if one was
+ * recorded by an earlier failed set call, otherwise EINVAL.
+ */
+int
+__wt_cursor_kv_not_set(WT_CURSOR *cursor, int key)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_RET_MSG(session,
+ cursor->saved_err == 0 ? EINVAL : cursor->saved_err,
+ "requires %s be set", key ? "key" : "value");
+}
+
+/*
+ * __wt_cursor_get_key --
+ * WT_CURSOR->get_key default implementation.
+ *
+ * Variadic front end: packages the caller's arguments into a va_list and
+ * hands them, with the cursor's own flags, to __wt_cursor_get_keyv.
+ */
+int
+__wt_cursor_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, cursor);
+ ret = __wt_cursor_get_keyv(cursor, cursor->flags, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_key --
+ * WT_CURSOR->set_key default implementation.
+ *
+ * Variadic front end for __wt_cursor_set_keyv, called with the cursor's
+ * own flags. There is no return value: the worker records any failure in
+ * cursor->saved_err.
+ */
+void
+__wt_cursor_set_key(WT_CURSOR *cursor, ...)
+{
+ va_list ap;
+
+ va_start(ap, cursor);
+ __wt_cursor_set_keyv(cursor, cursor->flags, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_cursor_get_raw_key --
+ * Fetch the cursor's key through its get_key method with raw mode in
+ * force, restoring the cursor's previous raw setting afterwards.
+ */
+int
+__wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key)
+{
+ WT_DECL_RET;
+
+ /* Already raw: nothing to toggle, call straight through. */
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ return (cursor->get_key(cursor, key));
+
+ /* Switch raw mode on for the duration of the call only. */
+ F_SET(cursor, WT_CURSTD_RAW);
+ ret = cursor->get_key(cursor, key);
+ F_CLR(cursor, WT_CURSTD_RAW);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_raw_key --
+ * Set the cursor's key through its set_key method with raw mode in
+ * force, restoring the cursor's previous raw setting afterwards.
+ */
+void
+__wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key)
+{
+ /* Already raw: nothing to toggle, call straight through. */
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ cursor->set_key(cursor, key);
+ return;
+ }
+
+ /* Switch raw mode on for the duration of the call only. */
+ F_SET(cursor, WT_CURSTD_RAW);
+ cursor->set_key(cursor, key);
+ F_CLR(cursor, WT_CURSTD_RAW);
+}
+
+/*
+ * __wt_cursor_get_raw_value --
+ * Fetch the cursor's value through its get_value method with raw mode
+ * in force, restoring the cursor's previous raw setting afterwards.
+ */
+int
+__wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value)
+{
+ WT_DECL_RET;
+
+ /* Already raw: nothing to toggle, call straight through. */
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ return (cursor->get_value(cursor, value));
+
+ /* Switch raw mode on for the duration of the call only. */
+ F_SET(cursor, WT_CURSTD_RAW);
+ ret = cursor->get_value(cursor, value);
+ F_CLR(cursor, WT_CURSTD_RAW);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_raw_value --
+ * Set the cursor's value through its set_value method with raw mode in
+ * force, restoring the cursor's previous raw setting afterwards.
+ */
+void
+__wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value)
+{
+ /* Already raw: nothing to toggle, call straight through. */
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ cursor->set_value(cursor, value);
+ return;
+ }
+
+ /* Switch raw mode on for the duration of the call only. */
+ F_SET(cursor, WT_CURSTD_RAW);
+ cursor->set_value(cursor, value);
+ F_CLR(cursor, WT_CURSTD_RAW);
+}
+
+/*
+ * __wt_cursor_get_keyv --
+ * WT_CURSOR->get_key worker function.
+ *
+ * flags is supplied by the caller rather than read from the cursor; the
+ * table cursor, for example, passes its own flags while operating on its
+ * primary column-group cursor. The raw checks below use LF_ISSET, which
+ * presumably tests that argument (confirm against the macro).
+ *
+ * Fails through __wt_cursor_kv_not_set if the cursor has no key. For
+ * record-number cursors the key is returned either as a packed WT_ITEM
+ * built in cursor->raw_recno_buf (raw) or as a uint64_t. Otherwise the key
+ * is returned as a WT_ITEM (raw or "u" format), a string pointer ("S"
+ * format), or unpacked according to the key format. Returned items and
+ * strings reference the cursor's memory; nothing is copied.
+ */
+int
+__wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
+{
+ WT_DECL_RET;
+ WT_ITEM *key;
+ WT_SESSION_IMPL *session;
+ size_t size;
+ const char *fmt;
+
+ CURSOR_API_CALL(cursor, session, get_key, NULL);
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT))
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 1));
+
+ if (WT_CURSOR_RECNO(cursor)) {
+ if (LF_ISSET(WT_CURSTD_RAW)) {
+ key = va_arg(ap, WT_ITEM *);
+ key->data = cursor->raw_recno_buf;
+ WT_ERR(__wt_struct_size(
+ session, &size, "q", cursor->recno));
+ key->size = size;
+ ret = __wt_struct_pack(session, cursor->raw_recno_buf,
+ sizeof(cursor->raw_recno_buf), "q", cursor->recno);
+ } else
+ *va_arg(ap, uint64_t *) = cursor->recno;
+ } else {
+ /* Fast path some common cases. */
+ fmt = cursor->key_format;
+ if (LF_ISSET(WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) {
+ key = va_arg(ap, WT_ITEM *);
+ key->data = cursor->key.data;
+ key->size = cursor->key.size;
+ } else if (WT_STREQ(fmt, "S"))
+ *va_arg(ap, const char **) = cursor->key.data;
+ else
+ ret = __wt_struct_unpackv(session,
+ cursor->key.data, cursor->key.size, fmt, ap);
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_cursor_set_keyv --
+ * WT_CURSOR->set_key worker function.
+ *
+ * flags is supplied by the caller rather than read from the cursor (see
+ * __wt_cursor_set_key and __wt_curtable_set_key).
+ *
+ * The key-set flags are cleared up front and WT_CURSTD_KEY_EXT is set only
+ * on success. There is no return value: on failure the error is stored in
+ * cursor->saved_err, and it is reset to 0 on success.
+ *
+ * For record-number cursors, the key references cursor->recno and a
+ * record number of zero is rejected. For raw, JSON-dump, "u" and "S" keys
+ * the cursor's key references the caller's memory directly (no copy is
+ * made here). Any other format is packed into the cursor's own key buffer.
+ * Empty keys and keys whose size does not fit in 32 bits are rejected.
+ */
+void
+__wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_ITEM *buf, *item;
+ size_t sz;
+ va_list ap_copy;
+ const char *fmt, *str;
+
+ CURSOR_API_CALL(cursor, session, set_key, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+
+ if (WT_CURSOR_RECNO(cursor)) {
+ if (LF_ISSET(WT_CURSTD_RAW)) {
+ item = va_arg(ap, WT_ITEM *);
+ WT_ERR(__wt_struct_unpack(session,
+ item->data, item->size, "q", &cursor->recno));
+ } else
+ cursor->recno = va_arg(ap, uint64_t);
+ if (cursor->recno == 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Record numbers must be greater than zero");
+ cursor->key.data = &cursor->recno;
+ sz = sizeof(cursor->recno);
+ } else {
+ /* Fast path some common cases and special case WT_ITEMs. */
+ fmt = cursor->key_format;
+ if (LF_ISSET(WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) ||
+ WT_STREQ(fmt, "u")) {
+ item = va_arg(ap, WT_ITEM *);
+ sz = item->size;
+ cursor->key.data = item->data;
+ } else if (WT_STREQ(fmt, "S")) {
+ str = va_arg(ap, const char *);
+ sz = strlen(str) + 1; /* size includes the terminating nul */
+ cursor->key.data = (void *)str;
+ } else {
+ buf = &cursor->key;
+
+ va_copy(ap_copy, ap); /* sizing consumes a va_list; packing below needs the original */
+ ret = __wt_struct_sizev(
+ session, &sz, cursor->key_format, ap_copy);
+ va_end(ap_copy);
+ WT_ERR(ret);
+
+ WT_ERR(__wt_buf_initsize(session, buf, sz));
+ WT_ERR(__wt_struct_packv(
+ session, buf->mem, sz, cursor->key_format, ap));
+ }
+ }
+ if (sz == 0)
+ WT_ERR_MSG(session, EINVAL, "Empty keys not permitted");
+ else if ((uint32_t)sz != sz)
+ WT_ERR_MSG(session, EINVAL,
+ "Key size (%" PRIu64 ") out of range", (uint64_t)sz);
+ cursor->saved_err = 0;
+ cursor->key.size = sz;
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+ if (0) {
+err: cursor->saved_err = ret;
+ }
+
+ API_END(session, ret);
+}
+
+/*
+ * __wt_cursor_get_value --
+ * WT_CURSOR->get_value default implementation.
+ *
+ * Variadic front end: packages the caller's arguments into a va_list and
+ * forwards them to __wt_cursor_get_valuev, returning its result.
+ */
+int
+__wt_cursor_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, cursor);
+ ret = __wt_cursor_get_valuev(cursor, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_get_valuev --
+ * WT_CURSOR->get_value worker implementation.
+ *
+ * Fails through __wt_cursor_kv_not_set if the cursor has no value.
+ * Otherwise the value is returned as a WT_ITEM (raw or "u" format), a
+ * string pointer ("S" format), a single byte ("t" or "<digit>t" format),
+ * or unpacked according to the value format. Returned items and strings
+ * reference the cursor's memory; nothing is copied.
+ */
+int
+__wt_cursor_get_valuev(WT_CURSOR *cursor, va_list ap)
+{
+ WT_DECL_RET;
+ WT_ITEM *value;
+ WT_SESSION_IMPL *session;
+ const char *fmt;
+
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+ if (!F_ISSET(cursor, WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT))
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 0));
+
+ /*
+ * Fast path some common cases.
+ *
+ * The format character is cast to unsigned char before being handed to
+ * isdigit: passing a negative plain char is undefined behavior.
+ */
+ fmt = cursor->value_format;
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) {
+ value = va_arg(ap, WT_ITEM *);
+ value->data = cursor->value.data;
+ value->size = cursor->value.size;
+ } else if (WT_STREQ(fmt, "S"))
+ *va_arg(ap, const char **) = cursor->value.data;
+ else if (WT_STREQ(fmt, "t") ||
+ (isdigit((unsigned char)fmt[0]) && WT_STREQ(fmt + 1, "t")))
+ *va_arg(ap, uint8_t *) = *(uint8_t *)cursor->value.data;
+ else
+ ret = __wt_struct_unpackv(session,
+ cursor->value.data, cursor->value.size, fmt, ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_cursor_set_value --
+ * WT_CURSOR->set_value default implementation.
+ *
+ * Variadic front end for __wt_cursor_set_valuev. There is no return value:
+ * the worker records any failure in cursor->saved_err.
+ */
+void
+__wt_cursor_set_value(WT_CURSOR *cursor, ...)
+{
+ va_list ap;
+
+ va_start(ap, cursor);
+ __wt_cursor_set_valuev(cursor, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_cursor_set_valuev --
+ * WT_CURSOR->set_value worker implementation.
+ *
+ * The value-set flags are cleared up front and WT_CURSTD_VALUE_EXT is set
+ * only on success. There is no return value: on failure the error is
+ * stored in cursor->saved_err.
+ *
+ * For raw, JSON-dump, "u" and "S" values the cursor's value references the
+ * caller's memory directly (no copy is made here). Single-byte ("t" or
+ * "<digit>t") and all other formats are stored in the cursor's own value
+ * buffer.
+ */
+void
+__wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap)
+{
+ WT_DECL_RET;
+ WT_ITEM *buf, *item;
+ WT_SESSION_IMPL *session;
+ const char *fmt, *str;
+ va_list ap_copy;
+ size_t sz;
+
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+
+ /*
+ * Fast path some common cases.
+ *
+ * The format character is cast to unsigned char before being handed to
+ * isdigit: passing a negative plain char is undefined behavior.
+ */
+ fmt = cursor->value_format;
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) ||
+ WT_STREQ(fmt, "u")) {
+ item = va_arg(ap, WT_ITEM *);
+ sz = item->size;
+ cursor->value.data = item->data;
+ } else if (WT_STREQ(fmt, "S")) {
+ str = va_arg(ap, const char *);
+ sz = strlen(str) + 1; /* size includes the terminating nul */
+ cursor->value.data = str;
+ } else if (WT_STREQ(fmt, "t") ||
+ (isdigit((unsigned char)fmt[0]) && WT_STREQ(fmt + 1, "t"))) {
+ sz = 1;
+ buf = &cursor->value;
+ WT_ERR(__wt_buf_initsize(session, buf, sz));
+ *(uint8_t *)buf->mem = (uint8_t)va_arg(ap, int);
+ } else {
+ va_copy(ap_copy, ap); /* sizing consumes a va_list; packing below needs the original */
+ ret = __wt_struct_sizev(session,
+ &sz, cursor->value_format, ap_copy);
+ va_end(ap_copy);
+ WT_ERR(ret);
+ buf = &cursor->value;
+ WT_ERR(__wt_buf_initsize(session, buf, sz));
+ WT_ERR(__wt_struct_packv(session, buf->mem, sz,
+ cursor->value_format, ap));
+ }
+ F_SET(cursor, WT_CURSTD_VALUE_EXT);
+ cursor->value.size = sz;
+
+ if (0) {
+err: cursor->saved_err = ret;
+ }
+ API_END(session, ret);
+}
+
+/*
+ * __wt_cursor_close --
+ * WT_CURSOR->close default implementation.
+ *
+ * Frees the cursor's key and value buffers. If the cursor was marked open
+ * (WT_CURSTD_OPEN), it is also removed from the session's cursor queue and
+ * the open-cursor statistics are decremented. The URI strings and the
+ * cursor structure itself are then freed, so the cursor must not be used
+ * after this call. ret is never assigned after its declaration, so this
+ * returns its initial value.
+ */
+int
+__wt_cursor_close(WT_CURSOR *cursor)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+ __wt_buf_free(session, &cursor->key);
+ __wt_buf_free(session, &cursor->value);
+
+ if (F_ISSET(cursor, WT_CURSTD_OPEN)) {
+ TAILQ_REMOVE(&session->cursors, cursor, q);
+
+ WT_STAT_FAST_DATA_DECR(session, session_cursor_open);
+ WT_STAT_FAST_CONN_ATOMIC_DECR(session, session_cursor_open);
+ }
+
+ __wt_free(session, cursor->internal_uri);
+ __wt_free(session, cursor->uri);
+ __wt_overwrite_and_free(session, cursor);
+ return (ret);
+}
+
+/*
+ * __cursor_runtime_config --
+ * Apply the cursor settings that are configurable at runtime; for now
+ * that is only "overwrite", which defaults to on.
+ */
+static int
+__cursor_runtime_config(WT_CURSOR *cursor, const char *cfg[])
+{
+ WT_CONFIG_ITEM overwrite;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ /*
+ * !!!
+ * Cursor flags cannot yet be reconfigured at runtime. If that ever
+ * changes, data-source cursors need equivalent support, or this call
+ * must return an error when handed a data-source cursor.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "overwrite", 1, &overwrite));
+ if (overwrite.val == 0) {
+ F_CLR(cursor, WT_CURSTD_OVERWRITE);
+ } else {
+ F_SET(cursor, WT_CURSTD_OVERWRITE);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_cursor_dup_position --
+ * Set a cursor to another cursor's position.
+ *
+ * to_dup is the already-positioned source cursor; cursor is the one being
+ * positioned. Returns 0 on success, or the error from reading the source
+ * key or from the search on the target cursor.
+ */
+int
+__wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor)
+{
+ WT_ITEM key;
+
+ /*
+ * Get a copy of the cursor's raw key, and set it in the new cursor,
+ * then search for that key to position the cursor.
+ *
+ * We don't clear the WT_ITEM structure: all that happens when getting
+ * and setting the key is the data/size fields are reset to reference
+ * the original cursor's key.
+ *
+ * That said, we're playing games with the cursor flags: setting the key
+ * sets the key/value application-set flags in the new cursor, which may
+ * or may not be correct, but there's nothing simple that fixes it. We
+ * depend on the subsequent cursor search to clean things up, as search
+ * is required to copy and/or reference private memory after success.
+ */
+ WT_RET(__wt_cursor_get_raw_key(to_dup, &key));
+ __wt_cursor_set_raw_key(cursor, &key);
+
+ /*
+ * We now have a reference to the raw key, but we don't know anything
+ * about the memory in which it's stored, it could be btree/file page
+ * memory in the cache, application memory or the original cursor's
+ * key/value WT_ITEMs. Memory allocated in support of another cursor
+ * could be discarded when that cursor is closed, so it's a problem.
+ * However, doing a search to position the cursor will fix the problem:
+ * cursors cannot reference application memory after cursor operations
+ * and that requirement will save the day.
+ */
+ WT_RET(cursor->search(cursor));
+
+ return (0);
+}
+
+/*
+ * __wt_cursor_init --
+ * Default cursor initialization.
+ *
+ * Applies the generic cursor configuration (overwrite, append, checkpoint,
+ * dump, raw, readonly) to an already-constructed cursor, links it into the
+ * session's cursor queue, marks it open and bumps the open-cursor
+ * statistics.
+ *
+ * owner, when non-NULL, is the containing cursor; the new cursor is queued
+ * after it. On success *cursorp is set to the cursor the application should
+ * use: the wrapping dump cursor if one was created, otherwise cursor.
+ * Returns 0 on success or the first error. Errors are returned directly,
+ * without undoing earlier steps in this function.
+ */
+int
+__wt_cursor_init(WT_CURSOR *cursor,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cdump;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ if (cursor->internal_uri == NULL) /* keep a URI the caller already set */
+ WT_RET(__wt_strdup(session, uri, &cursor->internal_uri));
+
+ /* Set runtime-configurable settings. */
+ WT_RET(__cursor_runtime_config(cursor, cfg));
+
+ /*
+ * append
+ * The append flag is only relevant to column stores.
+ */
+ if (WT_CURSOR_RECNO(cursor)) {
+ WT_RET(__wt_config_gets_def(session, cfg, "append", 0, &cval));
+ if (cval.val != 0)
+ F_SET(cursor, WT_CURSTD_APPEND);
+ }
+
+ /*
+ * checkpoint
+ * Checkpoint cursors are read-only.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0) {
+ cursor->insert = __wt_cursor_notsup;
+ cursor->update = __wt_cursor_notsup;
+ cursor->remove = __wt_cursor_notsup;
+ }
+
+ /*
+ * dump
+ * If an index cursor is opened with dump, then this
+ * function is called on the index files, with the dump
+ * config string, and with the index cursor as an owner.
+ * We don't want to create a dump cursor in that case, because
+ * we'll create the dump cursor on the index cursor itself.
+ *
+ * The dump format is "json" or "print" when named explicitly; any
+ * other non-empty value selects hex.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "dump", 0, &cval));
+ if (cval.len != 0 && owner == NULL) {
+ F_SET(cursor,
+ WT_STRING_MATCH("json", cval.str, cval.len) ?
+ WT_CURSTD_DUMP_JSON :
+ (WT_STRING_MATCH("print", cval.str, cval.len) ?
+ WT_CURSTD_DUMP_PRINT : WT_CURSTD_DUMP_HEX));
+ /*
+ * Dump cursors should not have owners: only the
+ * top-level cursor should be wrapped in a dump cursor.
+ */
+ WT_RET(__wt_curdump_create(cursor, owner, &cdump));
+ owner = cdump; /* the wrapped cursor is queued after its dump cursor below */
+ } else
+ cdump = NULL;
+
+ /* raw */
+ WT_RET(__wt_config_gets_def(session, cfg, "raw", 0, &cval));
+ if (cval.val != 0)
+ F_SET(cursor, WT_CURSTD_RAW);
+
+ /* readonly */
+ WT_RET(__wt_cursor_config_readonly(cursor, cfg, 0));
+
+ /*
+ * Cursors that are internal to some other cursor (such as file cursors
+ * inside a table cursor) should be closed after the containing cursor.
+ * Arrange for that to happen by putting internal cursors after their
+ * owners on the queue.
+ */
+ if (owner != NULL) {
+ WT_ASSERT(session, F_ISSET(owner, WT_CURSTD_OPEN));
+ TAILQ_INSERT_AFTER(&session->cursors, owner, cursor, q);
+ } else
+ TAILQ_INSERT_HEAD(&session->cursors, cursor, q);
+
+ F_SET(cursor, WT_CURSTD_OPEN);
+ WT_STAT_FAST_DATA_INCR(session, session_cursor_open);
+ WT_STAT_FAST_CONN_ATOMIC_INCR(session, session_cursor_open);
+
+ *cursorp = (cdump != NULL) ? cdump : cursor;
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
new file mode 100644
index 00000000000..ea267f96f9c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -0,0 +1,808 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __curtable_open_indices(WT_CURSOR_TABLE *ctable);
+static int __curtable_update(WT_CURSOR *cursor);
+
+#define APPLY_CG(ctable, f) do { /* Call cursor method f on every column-group cursor of ctable. */ \
+ WT_CURSOR **__cp; \
+ u_int __i; \
+ for (__i = 0, __cp = ctable->cg_cursors; \
+ __i < WT_COLGROUPS(ctable->table); \
+ __i++, __cp++) \
+ WT_TRET((*__cp)->f(*__cp)); /* result lands in the caller's ret; callers test ret afterwards */ \
+} while (0)
+
+#define APPLY_IDX(ctable, f) do { /* For each index of ctable: build its key, call method f, then reset. */ \
+ WT_INDEX *idx; \
+ WT_CURSOR **__cp; \
+ u_int __i; \
+ __cp = (ctable)->idx_cursors; \
+ for (__i = 0; __i < ctable->table->nindices; __i++, __cp++) { \
+ idx = ctable->table->indices[__i]; \
+ WT_ERR(__wt_schema_project_merge(session, /* uses the caller's session variable */ \
+ ctable->cg_cursors, \
+ idx->key_plan, idx->key_format, &(*__cp)->key)); \
+ F_SET(*__cp, WT_CURSTD_KEY_EXT | \
+ WT_CURSTD_VALUE_EXT); \
+ WT_ERR((*__cp)->f(*__cp)); /* unlike APPLY_CG this goes through WT_ERR, presumably stopping at the first failure */ \
+ WT_ERR((*__cp)->reset(*__cp)); \
+ } \
+} while (0)
+
+/*
+ * __wt_curtable_get_key --
+ * WT_CURSOR->get_key implementation for tables.
+ *
+ * The key is read from the primary (first) column-group cursor, but the
+ * table cursor's own flags are passed to the worker, so raw mode follows
+ * the table cursor's configuration.
+ */
+int
+__wt_curtable_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR *primary;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ va_list ap;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ primary = *ctable->cg_cursors;
+
+ va_start(ap, cursor);
+ ret = __wt_cursor_get_keyv(primary, cursor->flags, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_curtable_get_value --
+ * WT_CURSOR->get_value implementation for tables.
+ *
+ * Requires the primary column-group cursor to have a value. In raw mode
+ * the column-group values are merged into the table cursor's value buffer
+ * and returned as a single WT_ITEM referencing that buffer. Otherwise the
+ * columns are projected out into the caller's arguments according to the
+ * table's plan.
+ */
+int
+__wt_curtable_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR *primary;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ primary = *ctable->cg_cursors;
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+ WT_CURSOR_NEEDVALUE(primary);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) {
+ ret = __wt_schema_project_merge(session,
+ ctable->cg_cursors, ctable->plan,
+ cursor->value_format, &cursor->value);
+ if (ret == 0) {
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data;
+ item->size = cursor->value.size;
+ }
+ } else
+ ret = __wt_schema_project_out(session,
+ ctable->cg_cursors, ctable->plan, ap);
+ va_end(ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curtable_set_key --
+ * WT_CURSOR->set_key implementation for tables.
+ *
+ * Sets the key on the primary column-group cursor (using the table
+ * cursor's flags), then shares it with the remaining column-group
+ * cursors: they receive the primary's record number and a reference to
+ * its key data, not a copy. If the primary's key was not set, the other
+ * cursors are left untouched.
+ */
+void
+__wt_curtable_set_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR **cp, *primary;
+ WT_CURSOR_TABLE *ctable;
+ va_list ap;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ cp = ctable->cg_cursors;
+ primary = *cp++;
+
+ va_start(ap, cursor);
+ __wt_cursor_set_keyv(primary, cursor->flags, ap);
+ va_end(ap);
+
+ if (!F_ISSET(primary, WT_CURSTD_KEY_SET)) /* set failed; the error is in primary->saved_err */
+ return;
+
+ /* Copy the primary key to the other cursors. */
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->recno = primary->recno;
+ (*cp)->key.data = primary->key.data;
+ (*cp)->key.size = primary->key.size;
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ }
+}
+
+/*
+ * __wt_curtable_set_value --
+ * WT_CURSOR->set_value implementation for tables.
+ *
+ * In raw or JSON-dump mode the caller passes a single WT_ITEM holding the
+ * whole value, which is sliced across the column-group cursors. Otherwise
+ * the caller's arguments are projected into the column groups according to
+ * the table's plan. There is no return value: afterwards every
+ * column-group cursor is either marked as having a value, or has the error
+ * saved in saved_err and its value-set flags cleared.
+ */
+void
+__wt_curtable_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR **cp;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON)) {
+ item = va_arg(ap, WT_ITEM *);
+ cursor->value.data = item->data;
+ cursor->value.size = item->size;
+ ret = __wt_schema_project_slice(session,
+ ctable->cg_cursors, ctable->plan, 0,
+ cursor->value_format, &cursor->value);
+ } else
+ ret = __wt_schema_project_in(session,
+ ctable->cg_cursors, ctable->plan, ap);
+ va_end(ap);
+
+ for (i = 0, cp = ctable->cg_cursors;
+ i < WT_COLGROUPS(ctable->table); i++, cp++)
+ if (ret == 0)
+ F_SET(*cp, WT_CURSTD_VALUE_EXT);
+ else {
+ (*cp)->saved_err = ret;
+ F_CLR(*cp, WT_CURSTD_VALUE_SET);
+ }
+
+err: API_END(session, ret);
+}
+
+/*
+ * __curtable_compare --
+ * WT_CURSOR->compare implementation for tables.
+ *
+ * Fails with EINVAL unless both cursors have the same internal URI. The
+ * comparison itself is delegated to the compare method of a's primary
+ * cursor, applied to the two primary cursors, and its result is stored
+ * through cmpp.
+ */
+static int
+__curtable_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * call the underlying object's comparison routine.
+ */
+ if (strcmp(a->internal_uri, b->internal_uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "comparison method cursors must reference the same object");
+ WT_CURSOR_CHECKKEY(WT_CURSOR_PRIMARY(a));
+ WT_CURSOR_CHECKKEY(WT_CURSOR_PRIMARY(b));
+
+ ret = WT_CURSOR_PRIMARY(a)->compare(
+ WT_CURSOR_PRIMARY(a), WT_CURSOR_PRIMARY(b), cmpp);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_next --
+ * WT_CURSOR->next method for the table cursor type.
+ *
+ * Calls next on every column-group cursor (see APPLY_CG) and returns the
+ * accumulated result.
+ */
+static int
+__curtable_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ APPLY_CG(ctable, next);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_next_random --
+ * WT_CURSOR->next method for the table cursor type when configured with
+ * next_random.
+ *
+ * Only the primary column-group cursor picks a record; every other
+ * column-group cursor is then positioned on the same record by giving it
+ * the primary's key (by reference) and record number and searching for it.
+ */
+static int
+__curtable_next_random(WT_CURSOR *cursor)
+{
+ WT_CURSOR *primary, **cp;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ cp = ctable->cg_cursors;
+
+ /* Split out the first next, it retrieves the random record. */
+ primary = *cp++;
+ WT_ERR(primary->next(primary));
+
+ /* Fill in the rest of the columns. */
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->key.data = primary->key.data;
+ (*cp)->key.size = primary->key.size;
+ (*cp)->recno = primary->recno;
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ WT_ERR((*cp)->search(*cp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_prev --
+ * WT_CURSOR->prev method for the table cursor type.
+ *
+ * Calls prev on every column-group cursor (see APPLY_CG) and returns the
+ * accumulated result.
+ */
+static int
+__curtable_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ APPLY_CG(ctable, prev);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_reset --
+ * WT_CURSOR->reset method for the table cursor type.
+ *
+ * Calls reset on every column-group cursor (see APPLY_CG) and returns the
+ * accumulated result. Index cursors are not touched here.
+ */
+static int
+__curtable_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+ APPLY_CG(ctable, reset);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_search --
+ * WT_CURSOR->search method for the table cursor type.
+ *
+ * Calls search on every column-group cursor (see APPLY_CG) and returns the
+ * accumulated result. The key is expected to have been distributed to the
+ * column-group cursors already, as __wt_curtable_set_key does.
+ */
+static int
+__curtable_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, search, NULL);
+ APPLY_CG(ctable, search);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_search_near --
+ * WT_CURSOR->search_near method for the table cursor type.
+ *
+ * Only the primary column-group cursor performs the search-near; *exact is
+ * set by that call. Every other column-group cursor is then positioned on
+ * the record the primary landed on, by giving it the primary's key (by
+ * reference) and record number and doing an exact search.
+ */
+static int
+__curtable_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_CURSOR *primary, **cp;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ cp = ctable->cg_cursors;
+ primary = *cp;
+ WT_ERR(primary->search_near(primary, exact));
+
+ /*
+ * Advance cp along with the index, so each remaining column group is
+ * positioned, not just the second one over and over.
+ */
+ for (i = 1, ++cp; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->key.data = primary->key.data;
+ (*cp)->key.size = primary->key.size;
+ (*cp)->recno = primary->recno;
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ WT_ERR((*cp)->search(*cp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_insert --
+ * WT_CURSOR->insert method for the table cursor type.
+ *
+ * Opens the index cursors if needed, inserts into the primary column
+ * group, then into the remaining column groups (propagating the primary's
+ * record number), and finally inserts the index entries. When the table
+ * has indices and the primary insert finds an existing key on an overwrite
+ * cursor, the operation is redirected to __curtable_update so the old
+ * index entries get removed.
+ */
+static int
+__curtable_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR *primary, **cp;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint32_t flag_orig;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+ WT_ERR(__curtable_open_indices(ctable));
+
+ /*
+ * Split out the first insert, it may be allocating a recno.
+ *
+ * If the table has indices, we also need to know whether this record
+ * is replacing an existing record so that the existing index entries
+ * can be removed. We discover if this is an overwrite by configuring
+ * the primary cursor for no-overwrite, and checking if the insert
+ * detects a duplicate key.
+ */
+ cp = ctable->cg_cursors;
+ primary = *cp++;
+
+ flag_orig = F_ISSET(primary, WT_CURSTD_OVERWRITE); /* remembered so the flag can be put back after the insert */
+ if (ctable->table->nindices > 0)
+ F_CLR(primary, WT_CURSTD_OVERWRITE);
+ ret = primary->insert(primary);
+ F_SET(primary, flag_orig);
+
+ if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
+ /*
+ * !!!
+ * The insert failure clears these flags, but does not touch the
+ * items. We could make a copy each time for overwrite cursors,
+ * but for now we just reset the flags.
+ */
+ F_SET(primary, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ ret = __curtable_update(cursor);
+ goto err;
+ }
+ WT_ERR(ret);
+
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->recno = primary->recno;
+ WT_ERR((*cp)->insert(*cp));
+ }
+
+ APPLY_IDX(ctable, insert);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curtable_update --
+ * WT_CURSOR->update method for the table cursor type.
+ *
+ * Opens the index cursors if needed, then updates every column group.
+ * When the table has indices, the index entries for the old value are
+ * removed first and entries for the new value are inserted afterwards.
+ * Also called by __curtable_insert when an overwrite insert hits an
+ * existing key.
+ */
+static int
+__curtable_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+ WT_ERR(__curtable_open_indices(ctable));
+
+ /*
+ * If the table has indices, first delete any old index keys, then
+ * update the primary, then insert the new index keys. This is
+ * complicated by the fact that we need the old value to generate the
+ * old index keys, so we make a temporary copy of the new value.
+ *
+ * The copy lives in the table cursor's own value buffer: the new
+ * value is merged into it before the search overwrites the
+ * column-group values with the old record, and sliced back out once
+ * the old index entries are gone.
+ */
+ if (ctable->table->nindices > 0) {
+ WT_ERR(__wt_schema_project_merge(session,
+ ctable->cg_cursors, ctable->plan,
+ cursor->value_format, &cursor->value));
+ APPLY_CG(ctable, search);
+ /*
+ * Remove only if the key exists.
+ */
+ if (ret == 0) {
+ APPLY_IDX(ctable, remove);
+ WT_ERR(__wt_schema_project_slice(session,
+ ctable->cg_cursors, ctable->plan, 0,
+ cursor->value_format, &cursor->value));
+ } else if (ret == WT_NOTFOUND)
+ ret = 0;
+ else
+ WT_ERR(ret);
+ }
+ APPLY_CG(ctable, update);
+ WT_ERR(ret);
+ if (ctable->idx_cursors != NULL)
+ APPLY_IDX(ctable, insert);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curtable_remove --
+ * WT_CURSOR->remove method for the table cursor type.
+ *
+ * When the table has indices the existing record is looked up first so
+ * its index entries can be removed before the column group records are.
+ */
+static int
+__curtable_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+ WT_ERR(__curtable_open_indices(ctable));
+
+ /* Find the old record so it can be removed from indices */
+ if (ctable->table->nindices > 0) {
+ APPLY_CG(ctable, search);
+ /* A failed search ends the operation before anything is removed. */
+ WT_ERR(ret);
+ APPLY_IDX(ctable, remove);
+ }
+
+ /* Remove the record from every column group. */
+ APPLY_CG(ctable, remove);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __wt_table_range_truncate --
+ *	Truncate of a cursor range, table implementation.
+ *
+ *	Either "start" or "stop" may be NULL, but not both: the table and
+ *	session are taken from whichever cursor was supplied. If the table
+ *	has indices, the range is walked first to remove the index entries,
+ *	the walking cursor is put back on its original key, and then each
+ *	column group is truncated.
+ *
+ *	Returns 0 on success or an error code from the underlying calls.
+ */
+int
+__wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop)
+{
+	WT_CURSOR *wt_start, *wt_stop;
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_ITEM(key);
+	WT_DECL_RET;
+	WT_ITEM raw;
+	WT_SESSION_IMPL *session;
+	u_int i;
+	int cmp;
+
+	ctable = (start != NULL) ? start : stop;
+	session = (WT_SESSION_IMPL *)ctable->iface.session;
+
+	/*
+	 * Only take the address of the embedded WT_CURSOR for cursors that
+	 * were actually supplied: applying "->" to a NULL pointer is undefined
+	 * behavior, even when only the member's address is wanted.
+	 */
+	wt_start = (start == NULL) ? NULL : &start->iface;
+	wt_stop = (stop == NULL) ? NULL : &stop->iface;
+
+	/* Open any indices. */
+	WT_RET(__curtable_open_indices(ctable));
+	WT_RET(__wt_scr_alloc(session, 128, &key));
+
+	/*
+	 * Step through the cursor range, removing the index entries.
+	 *
+	 * If there are indices, copy the key we're using to step through the
+	 * cursor range (so we can reset the cursor to its original position),
+	 * then remove all of the index records in the truncated range. Copy
+	 * the raw key because the memory is only valid until the cursor moves.
+	 */
+	if (ctable->table->nindices > 0) {
+		if (start == NULL) {
+			/* No start cursor: walk backwards from the stop key. */
+			WT_ERR(__wt_cursor_get_raw_key(wt_stop, &raw));
+			WT_ERR(__wt_buf_set(session, key, raw.data, raw.size));
+
+			do {
+				APPLY_CG(stop, search);
+				WT_ERR(ret);
+				APPLY_IDX(stop, remove);
+			} while ((ret = wt_stop->prev(wt_stop)) == 0);
+			WT_ERR_NOTFOUND_OK(ret);
+
+			/* Put the stop cursor back where it began. */
+			__wt_cursor_set_raw_key(wt_stop, key);
+			APPLY_CG(stop, search);
+		} else {
+			/*
+			 * Walk forwards from the start key, either to the stop
+			 * cursor's key or, without a stop cursor, until the
+			 * start cursor runs out of records.
+			 */
+			WT_ERR(__wt_cursor_get_raw_key(wt_start, &raw));
+			WT_ERR(__wt_buf_set(session, key, raw.data, raw.size));
+
+			cmp = -1;
+			do {
+				APPLY_CG(start, search);
+				WT_ERR(ret);
+				APPLY_IDX(start, remove);
+				if (stop != NULL)
+					WT_ERR(wt_start->compare(
+					    wt_start, wt_stop,
+					    &cmp));
+			} while (cmp < 0 &&
+			    (ret = wt_start->next(wt_start)) == 0);
+			WT_ERR_NOTFOUND_OK(ret);
+
+			/* Put the start cursor back where it began. */
+			__wt_cursor_set_raw_key(wt_start, key);
+			APPLY_CG(start, search);
+		}
+	}
+
+	/* Truncate the column groups. */
+	for (i = 0; i < WT_COLGROUPS(ctable->table); i++)
+		WT_ERR(__wt_range_truncate(
+		    (start == NULL) ? NULL : start->cg_cursors[i],
+		    (stop == NULL) ? NULL : stop->cg_cursors[i]));
+
+err:	__wt_scr_free(&key);
+	return (ret);
+}
+
+/*
+ * __curtable_close --
+ *	WT_CURSOR->close method for the table cursor type.
+ *
+ *	Closes the column group and index cursors, frees the memory owned by
+ *	the table cursor, releases the table and closes the cursor itself.
+ *
+ *	This function is also the error-path cleanup for __wt_curtable_open,
+ *	which can fail before the column group cursor array or the saved
+ *	configuration array have been allocated; every array is therefore
+ *	checked for NULL before it is walked.
+ *
+ *	Returns 0 on success, otherwise the error kept by WT_TRET.
+ */
+static int
+__curtable_close(WT_CURSOR *cursor)
+{
+	WT_CURSOR_TABLE *ctable;
+	WT_CURSOR **cp;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	u_int i;
+
+	ctable = (WT_CURSOR_TABLE *)cursor;
+	CURSOR_API_CALL(cursor, session, close, NULL);
+
+	/* Slots may be NULL if opening the column groups failed part-way. */
+	if (ctable->cg_cursors != NULL) {
+		for (i = 0, cp = ctable->cg_cursors;
+		    i < WT_COLGROUPS(ctable->table); i++, cp++)
+			if (*cp != NULL) {
+				WT_TRET((*cp)->close(*cp));
+				*cp = NULL;
+			}
+	}
+
+	if (ctable->idx_cursors != NULL) {
+		for (i = 0, cp = ctable->idx_cursors;
+		    i < ctable->table->nindices; i++, cp++)
+			if (*cp != NULL) {
+				WT_TRET((*cp)->close(*cp));
+				*cp = NULL;
+			}
+	}
+
+	/* The plan is private only if a projection replaced the table's. */
+	if (ctable->plan != ctable->table->plan)
+		__wt_free(session, ctable->plan);
+
+	/* The saved configuration is a NULL-terminated array of copies. */
+	if (ctable->cfg != NULL) {
+		for (i = 0; ctable->cfg[i] != NULL; ++i)
+			__wt_free(session, ctable->cfg[i]);
+		__wt_free(session, ctable->cfg);
+	}
+
+	/* Likewise, the value format is private only for projections. */
+	if (cursor->value_format != ctable->table->value_format)
+		__wt_free(session, cursor->value_format);
+	__wt_free(session, ctable->cg_cursors);
+	__wt_free(session, ctable->idx_cursors);
+	__wt_schema_release_table(session, ctable->table);
+	/* The URI is owned by the table. */
+	cursor->internal_uri = NULL;
+	WT_TRET(__wt_cursor_close(cursor));
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_open_colgroups --
+ *	Open one cursor per column group of the table cursor's table, storing
+ *	them in a newly allocated ctable->cg_cursors array. Fails with EINVAL
+ *	if the table's column groups have not all been created.
+ */
+static int
+__curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[])
+{
+	WT_SESSION_IMPL *session;
+	WT_TABLE *table;
+	/*
+	 * Column group cursors never inherit the dump setting, and only the
+	 * first (primary) one may be opened with next_random: the fourth slot
+	 * is filled in once the primary has been opened.
+	 */
+	const char *cfg[] = {
+		cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL, NULL
+	};
+	u_int i;
+
+	table = ctable->table;
+	session = (WT_SESSION_IMPL *)ctable->iface.session;
+
+	if (!table->cg_complete)
+		WT_RET_MSG(session, EINVAL,
+		    "Can't use '%s' until all column groups are created",
+		    table->name);
+
+	WT_RET(__wt_calloc_def(session,
+	    WT_COLGROUPS(table), &ctable->cg_cursors));
+
+	for (i = 0; i < WT_COLGROUPS(table); i++) {
+		WT_RET(__wt_open_cursor(session, table->cgroups[i]->source,
+		    &ctable->iface, cfg, &ctable->cg_cursors[i]));
+		cfg[3] = "next_random=false";
+	}
+	return (0);
+}
+
+/*
+ * __curtable_open_indices --
+ *	Open one cursor per index of the table cursor's table, unless the
+ *	table has no indices or the cursors were opened by an earlier call.
+ *	Fails with ENOTSUP if the primary column group cursor is a bulk
+ *	cursor.
+ */
+static int
+__curtable_open_indices(WT_CURSOR_TABLE *ctable)
+{
+	WT_SESSION_IMPL *session;
+	WT_TABLE *table;
+	u_int i;
+
+	table = ctable->table;
+	session = (WT_SESSION_IMPL *)ctable->iface.session;
+
+	WT_RET(__wt_schema_open_indices(session, table));
+
+	/* Nothing to open, or already opened. */
+	if (table->nindices == 0)
+		return (0);
+	if (ctable->idx_cursors != NULL)
+		return (0);
+
+	/* Check for bulk cursors: the primary is the first column group. */
+	if (F_ISSET(ctable->cg_cursors[0], WT_CURSTD_BULK))
+		WT_RET_MSG(session, ENOTSUP,
+		    "Bulk load is not supported for tables with indices");
+
+	WT_RET(__wt_calloc_def(session, table->nindices, &ctable->idx_cursors));
+	for (i = 0; i < table->nindices; i++)
+		WT_RET(__wt_open_cursor(session, table->indices[i]->source,
+		    &ctable->iface, ctable->cfg, &ctable->idx_cursors[i]));
+	return (0);
+}
+
+/*
+ * __wt_curtable_open --
+ *	WT_SESSION->open_cursor method for table cursors.
+ *
+ *	uri must start with "table:" (EINVAL otherwise) and may end in a
+ *	parenthesized column list, which selects a projection. cfg is the
+ *	NULL-terminated configuration stack for the open. On success the new
+ *	cursor is returned through cursorp; for a simple table that is a
+ *	cursor opened directly on the table's single column group source.
+ *
+ *	Returns 0 on success or an error code; on failure after the table
+ *	cursor has been allocated, *cursorp is set to NULL.
+ */
+int
+__wt_curtable_open(WT_SESSION_IMPL *session,
+    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+	WT_CURSOR_STATIC_INIT(iface,
+	    __wt_curtable_get_key,		/* get-key */
+	    __wt_curtable_get_value,		/* get-value */
+	    __wt_curtable_set_key,		/* set-key */
+	    __wt_curtable_set_value,		/* set-value */
+	    __curtable_compare,			/* compare */
+	    __curtable_next,			/* next */
+	    __curtable_prev,			/* prev */
+	    __curtable_reset,			/* reset */
+	    __curtable_search,			/* search */
+	    __curtable_search_near,		/* search-near */
+	    __curtable_insert,			/* insert */
+	    __curtable_update,			/* update */
+	    __curtable_remove,			/* remove */
+	    __curtable_close);			/* close */
+	WT_CONFIG_ITEM cval;
+	WT_CURSOR *cursor;
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	WT_TABLE *table;
+	size_t size;
+	int cfg_cnt;
+	const char *tablename, *columns;
+
+	WT_STATIC_ASSERT(offsetof(WT_CURSOR_TABLE, iface) == 0);
+
+	ctable = NULL;
+
+	/* The table name runs up to any "(" that starts a column list. */
+	tablename = uri;
+	if (!WT_PREFIX_SKIP(tablename, "table:"))
+		return (EINVAL);
+	columns = strchr(tablename, '(');
+	if (columns == NULL)
+		size = strlen(tablename);
+	else
+		size = WT_PTRDIFF(columns, tablename);
+	WT_RET(__wt_schema_get_table(session, tablename, size, 0, &table));
+
+	if (table->is_simple) {
+		/* Just return a cursor on the underlying data source. */
+		ret = __wt_open_cursor(session,
+		    table->cgroups[0]->source, NULL, cfg, cursorp);
+
+		__wt_schema_release_table(session, table);
+		return (ret);
+	}
+
+	/*
+	 * Until the table is attached to a cursor nothing else will release
+	 * it, so give the reference back here if the allocation fails.
+	 */
+	if ((ret = __wt_calloc_def(session, 1, &ctable)) != 0) {
+		__wt_schema_release_table(session, table);
+		return (ret);
+	}
+
+	cursor = &ctable->iface;
+	*cursor = iface;
+	cursor->session = &session->iface;
+	cursor->internal_uri = table->name;
+	cursor->key_format = table->key_format;
+	cursor->value_format = table->value_format;
+
+	ctable->table = table;
+	ctable->plan = table->plan;
+
+	/*
+	 * Handle projections: the cursor gets its own value format and plan
+	 * built from the column list instead of sharing the table's.
+	 */
+	if (columns != NULL) {
+		WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+		WT_ERR(__wt_struct_reformat(session, table,
+		    columns, strlen(columns), NULL, 1, tmp));
+		WT_ERR(__wt_strndup(
+		    session, tmp->data, tmp->size, &cursor->value_format));
+
+		WT_ERR(__wt_buf_init(session, tmp, 0));
+		WT_ERR(__wt_struct_plan(session, table,
+		    columns, strlen(columns), 0, tmp));
+		WT_ERR(__wt_strndup(
+		    session, tmp->data, tmp->size, &ctable->plan));
+	}
+
+	/*
+	 * random_retrieval
+	 * Random retrieval cursors only support next, reset and close.
+	 */
+	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+	if (cval.val != 0) {
+		__wt_cursor_set_notsup(cursor);
+		cursor->next = __curtable_next_random;
+		cursor->reset = __curtable_reset;
+	}
+
+	WT_ERR(__wt_cursor_init(
+	    cursor, cursor->internal_uri, NULL, cfg, cursorp));
+
+	if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+		WT_ERR(__wt_json_column_init(cursor, table->key_format,
+		    NULL, &table->colconf));
+
+	/*
+	 * Open the colgroup cursors immediately: we're going to need them for
+	 * any operation.  We defer opening index cursors until we need them
+	 * for an update.  Note that this must come after the call to
+	 * __wt_cursor_init: the table cursor must already be on the list of
+	 * session cursors or we can't work out where to put the colgroup
+	 * cursor(s).
+	 */
+	WT_ERR(__curtable_open_colgroups(ctable, cfg));
+
+	/*
+	 * We'll need to squirrel away a copy of the cursor configuration
+	 * for if/when we open indices.
+	 *
+	 * cfg[0] is the baseline configuration for the cursor open and we can
+	 * acquire another copy from the configuration structures, so it would
+	 * be reasonable not to copy it here: but I'd rather be safe than sorry.
+	 *
+	 * Underlying indices are always opened without dump.
+	 */
+	for (cfg_cnt = 0; cfg[cfg_cnt] != NULL; ++cfg_cnt)
+		;
+	/* Room for the copies, the dump override and a terminating NULL. */
+	WT_ERR(__wt_calloc_def(session, cfg_cnt + 2, &ctable->cfg));
+	for (cfg_cnt = 0; cfg[cfg_cnt] != NULL; ++cfg_cnt)
+		WT_ERR(
+		    __wt_strdup(session, cfg[cfg_cnt], &ctable->cfg[cfg_cnt]));
+	WT_ERR(__wt_strdup(session, "dump=\"\"", &ctable->cfg[cfg_cnt]));
+
+	if (0) {
+err:		WT_TRET(__curtable_close(cursor));
+		*cursorp = NULL;
+	}
+
+	__wt_scr_free(&tmp);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
new file mode 100644
index 00000000000..e358d22b278
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Standard entry points to the API: declares/initializes local variables.
+ *
+ * API_SESSION_INIT saves the session's current data handle and name in the
+ * locals __olddh and __oldname (API_END puts them back), then points the
+ * session at the cursor and data handle for this call and sets both the
+ * session name and last-operation strings to "h.n".
+ */
+#define API_SESSION_INIT(s, h, n, cur, dh) \
+ WT_DATA_HANDLE *__olddh = (s)->dhandle; \
+ const char *__oldname = (s)->name; \
+ (s)->cursor = (cur); \
+ (s)->dhandle = (dh); \
+ (s)->name = (s)->lastop = #h "." #n; \
+
+/*
+ * Start an API call that takes no configuration string: initialize the
+ * session, fail if the connection has the WT_CONN_PANIC flag set, and emit
+ * a verbose API message. The "do {" opened here is closed by API_END.
+ */
+#define API_CALL_NOCONF(s, h, n, cur, dh) do { \
+ API_SESSION_INIT(s, h, n, cur, dh); \
+ WT_ERR(F_ISSET(S2C(s), WT_CONN_PANIC) ? __wt_panic(s) : 0); \
+ WT_ERR(__wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n))
+
+/*
+ * Start an API call that takes a configuration string: declare the local
+ * configuration stack "cfg" (the method's base configuration followed by
+ * the caller's string), initialize the session, fail if the connection has
+ * the WT_CONN_PANIC flag set, check a non-NULL configuration string against
+ * the method's configuration entry, and emit a verbose API message. The
+ * "do {" opened here is closed by API_END.
+ *
+ * Every session reference goes through the macro's own "s" argument, so the
+ * macro does not depend on the caller having a variable named "session".
+ */
+#define	API_CALL(s, h, n, cur, dh, config, cfg) do {			\
+	const char *cfg[] =						\
+	    { WT_CONFIG_BASE(s, h##_##n), config, NULL };		\
+	API_SESSION_INIT(s, h, n, cur, dh);				\
+	WT_ERR(F_ISSET(S2C(s), WT_CONN_PANIC) ? __wt_panic(s) : 0);	\
+	WT_ERR(((config) != NULL) ?					\
+	    __wt_config_check((s),					\
+	    WT_CONFIG_REF(s, h##_##n), (config), 0) : 0);		\
+	WT_ERR(__wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n))
+
+/*
+ * End an API call: restore the session's saved data handle and name and, if
+ * a transaction is running and the call failed with anything other than
+ * WT_NOTFOUND or WT_DUPLICATE_KEY, flag the transaction with TXN_ERROR.
+ * Closes the "do {" opened by API_CALL or API_CALL_NOCONF.
+ */
+#define API_END(s, ret) \
+ if ((s) != NULL) { \
+ (s)->dhandle = __olddh; \
+ (s)->name = __oldname; \
+ if (F_ISSET(&(s)->txn, TXN_RUNNING) && \
+ (ret) != 0 && \
+ (ret) != WT_NOTFOUND && \
+ (ret) != WT_DUPLICATE_KEY) \
+ F_SET(&(s)->txn, TXN_ERROR); \
+ } \
+} while (0)
+
+/*
+ * An API call wrapped in a transaction if necessary.
+ *
+ * __autotxn is set when the session has neither TXN_AUTOCOMMIT nor
+ * TXN_RUNNING set on entry; in that case TXN_AUTOCOMMIT is set for the
+ * duration of the call. The cursor and data handle are passed to API_CALL
+ * in the order it declares them (cursor first), matching
+ * TXN_API_CALL_NOCONF.
+ */
+#define	TXN_API_CALL(s, h, n, cur, bt, config, cfg) do {		\
+	int __autotxn = 0;						\
+	API_CALL(s, h, n, cur, bt, config, cfg);			\
+	__autotxn = !F_ISSET(&(s)->txn, TXN_AUTOCOMMIT | TXN_RUNNING);	\
+	if (__autotxn)							\
+		F_SET(&(s)->txn, TXN_AUTOCOMMIT)
+
+/*
+ * An API call wrapped in a transaction if necessary, for methods that take
+ * no configuration string.
+ *
+ * __autotxn is set when the session has neither TXN_AUTOCOMMIT nor
+ * TXN_RUNNING set on entry; in that case TXN_AUTOCOMMIT is set for the
+ * duration of the call.
+ */
+#define TXN_API_CALL_NOCONF(s, h, n, cur, bt) do { \
+ int __autotxn = 0; \
+ API_CALL_NOCONF(s, h, n, cur, bt); \
+ __autotxn = !F_ISSET(&(s)->txn, TXN_AUTOCOMMIT | TXN_RUNNING); \
+ if (__autotxn) \
+ F_SET(&(s)->txn, TXN_AUTOCOMMIT)
+
+/*
+ * End a transactional API call, optional retry on deadlock.
+ *
+ * Only acts if this call set TXN_AUTOCOMMIT itself (__autotxn):
+ * - if TXN_AUTOCOMMIT is still set, it is simply cleared;
+ * - otherwise, on success with no TXN_ERROR flag, the transaction is
+ *   committed;
+ * - otherwise the transaction is rolled back and, if the result is 0 or
+ *   WT_ROLLBACK and "retry" is true, ret is cleared and the enclosing
+ *   do/while loop runs the call again; if not retrying, the session's
+ *   cursors are reset.
+ * Closes the outer "do {" opened by TXN_API_CALL or TXN_API_CALL_NOCONF.
+ */
+#define TXN_API_END_RETRY(s, ret, retry) \
+ API_END(s, ret); \
+ if (__autotxn) { \
+ if (F_ISSET(&(s)->txn, TXN_AUTOCOMMIT)) \
+ F_CLR(&(s)->txn, TXN_AUTOCOMMIT); \
+ else if (ret == 0 && !F_ISSET(&(s)->txn, TXN_ERROR)) \
+ ret = __wt_txn_commit((s), NULL); \
+ else { \
+ WT_TRET(__wt_txn_rollback((s), NULL)); \
+ if ((ret == 0 || ret == WT_ROLLBACK) && \
+ (retry)) { \
+ ret = 0; \
+ continue; \
+ } \
+ WT_TRET(__wt_session_reset_cursors(s)); \
+ } \
+ } \
+ break; \
+} while (ret == 0)
+
+/* End a transactional API call, retry on deadlock. */
+#define TXN_API_END(s, ret) TXN_API_END_RETRY(s, ret, 1)
+
+/*
+ * In almost all cases the function returns immediately after API_END, so
+ * provide macros that do both.
+ *
+ * API_END_RET returns the result unchanged. API_END_RET_NOTFOUND_MAP is for
+ * session and connection methods: if one is about to return WT_NOTFOUND
+ * (some underlying object was not found), the result is mapped to ENOENT,
+ * because only cursor methods return WT_NOTFOUND.
+ */
+#define API_END_RET(s, ret) \
+ API_END(s, ret); \
+ return (ret)
+#define API_END_RET_NOTFOUND_MAP(s, ret) \
+ API_END(s, ret); \
+ return ((ret) == WT_NOTFOUND ? ENOENT : (ret))
+
+/*
+ * Per-handle wrappers around the generic API entry macros.
+ *
+ * Connection methods run in the connection's default session, which these
+ * macros assign to "s" before starting the call.
+ */
+#define CONNECTION_API_CALL(conn, s, n, config, cfg) \
+ s = (conn)->default_session; \
+ API_CALL(s, connection, n, NULL, NULL, config, cfg)
+
+#define CONNECTION_API_CALL_NOCONF(conn, s, n) \
+ s = (conn)->default_session; \
+ API_CALL_NOCONF(s, connection, n, NULL, NULL)
+
+/* Session methods: no cursor and no data handle. */
+#define SESSION_API_CALL(s, n, config, cfg) \
+ API_CALL(s, session, n, NULL, NULL, config, cfg)
+
+#define SESSION_API_CALL_NOCONF(s, n) \
+ API_CALL_NOCONF(s, session, n, NULL, NULL)
+
+#define SESSION_TXN_API_CALL(s, n, config, cfg) \
+ TXN_API_CALL(s, session, n, NULL, NULL, config, cfg)
+
+/*
+ * Cursor methods: the session is taken from the cursor and assigned to "s";
+ * if "bt" is not NULL, its data handle becomes the session's data handle.
+ */
+#define CURSOR_API_CALL(cur, s, n, bt) \
+ (s) = (WT_SESSION_IMPL *)(cur)->session; \
+ API_CALL_NOCONF(s, cursor, n, cur, \
+ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+
+/* Cursor methods that modify data use the transactional entry and exit. */
+#define CURSOR_UPDATE_API_CALL(cur, s, n, bt) \
+ (s) = (WT_SESSION_IMPL *)(cur)->session; \
+ TXN_API_CALL_NOCONF(s, cursor, n, cur, \
+ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+
+#define CURSOR_UPDATE_API_END(s, ret) \
+ TXN_API_END(s, ret)
+
+/* Async operation methods also run in the connection's default session. */
+#define ASYNCOP_API_CALL(conn, s, n) \
+ s = (conn)->default_session; \
+ API_CALL_NOCONF(s, asyncop, n, NULL, NULL)
diff --git a/src/third_party/wiredtiger/src/include/async.h b/src/third_party/wiredtiger/src/include/async.h
new file mode 100644
index 00000000000..8565874c2f3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/async.h
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_ASYNC_STATE --
+ * The state of an async operation handle (see WT_ASYNC_OP_IMPL.state).
+ */
+typedef enum {
+ WT_ASYNCOP_ENQUEUED, /* Placed on the work queue */
+ WT_ASYNCOP_FREE, /* Able to be allocated to user */
+ WT_ASYNCOP_READY, /* Allocated and ready for user to use */
+ WT_ASYNCOP_WORKING /* Operation in progress by worker */
+} WT_ASYNC_STATE;
+
+/*
+ * WT_ASYNC_FLUSH_STATE --
+ * The state of a flush of the async work queue (see
+ * WT_ASYNC.flush_state).
+ */
+typedef enum {
+ WT_ASYNC_FLUSH_NONE=0, /* No flush in progress */
+ WT_ASYNC_FLUSH_COMPLETE, /* Notify flush caller it's done */
+ WT_ASYNC_FLUSH_IN_PROGRESS, /* Prevent other callers */
+ WT_ASYNC_FLUSHING /* Notify workers */
+} WT_ASYNC_FLUSH_STATE;
+
+#define MAX_ASYNC_SLEEP_USECS 100000 /* Maximum sleep waiting for work */
+#define MAX_ASYNC_YIELD 200 /* Maximum number of yields for work */
+
+/* Map an async operation to its connection. */
+#define O2C(op) ((WT_CONNECTION_IMPL *)(op)->iface.connection)
+/* Map an async operation to its connection's default session. */
+#define O2S(op) \
+ (((WT_CONNECTION_IMPL *)(op)->iface.connection)->default_session)
+/*
+ * WT_ASYNC_FORMAT --
+ * The URI/config/format cache.
+ *
+ * Entries are linked on a queue through "q"; the only queue head of this
+ * type visible here is WT_ASYNC.formatqh.
+ */
+struct __wt_async_format {
+ STAILQ_ENTRY(__wt_async_format) q;
+ const char *config;
+ uint64_t cfg_hash; /* Config hash */
+ const char *uri;
+ uint64_t uri_hash; /* URI hash */
+ /* NOTE(review): presumably the formats for this uri/config; confirm. */
+ const char *key_format;
+ const char *value_format;
+};
+
+/*
+ * WT_ASYNC_OP_IMPL --
+ * Implementation of the WT_ASYNC_OP.
+ *
+ * The public WT_ASYNC_OP handle is the first member; the O2C and O2S
+ * macros reach the connection through iface.connection.
+ */
+struct __wt_async_op_impl {
+ WT_ASYNC_OP iface;
+
+ /* NOTE(review): presumably the application's callback; confirm. */
+ WT_ASYNC_CALLBACK *cb;
+
+ uint32_t internal_id; /* Array position id. */
+ uint64_t unique_id; /* Unique identifier. */
+
+ WT_ASYNC_FORMAT *format; /* Format structure */
+ WT_ASYNC_STATE state; /* Op state */
+ WT_ASYNC_OPTYPE optype; /* Operation type */
+};
+
+/*
+ * WT_ASYNC --
+ * Definition of the async subsystem: the array of operation handles, the
+ * work queue and its head/tail counters, the format cache, flush state
+ * and the worker threads with their sessions.
+ */
+struct __wt_async {
+ /*
+ * Ops array protected by the ops_lock.
+ */
+ WT_SPINLOCK ops_lock; /* Locked: ops array */
+ WT_ASYNC_OP_IMPL *async_ops; /* Async ops */
+#define OPS_INVALID_INDEX 0xffffffff
+ uint32_t ops_index; /* Active slot index */
+ uint64_t op_id; /* Unique ID counter */
+ WT_ASYNC_OP_IMPL **async_queue; /* Async ops work queue */
+ uint32_t async_qsize; /* Async work queue size */
+ /*
+ * We need to have two head and two tail values. All but one are
+ * maintained as ever-increasing values to ease wrap around.
+ *
+ * alloc_head: the next one to allocate for producers.
+ * head: the current head visible to consumers.
+ * head is always <= alloc_head.
+ * alloc_tail: the next slot for consumers to dequeue.
+ * alloc_tail is always <= head.
+ * tail_slot: the last slot consumed.
+ * A producer may need to wait for tail_slot to advance.
+ */
+ uint64_t alloc_head; /* Next slot to enqueue */
+ uint64_t head; /* Head visible to worker */
+ uint64_t alloc_tail; /* Next slot to dequeue */
+ uint64_t tail_slot; /* Worker slot consumed */
+
+ /* Queue of WT_ASYNC_FORMAT entries (the URI/config/format cache). */
+ STAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh;
+ int cur_queue; /* Currently enqueued */
+ int max_queue; /* Maximum enqueued */
+ WT_ASYNC_FLUSH_STATE flush_state; /* Queue flush state */
+ /* Notify any waiting threads when flushing is done. */
+ WT_CONDVAR *flush_cond;
+ WT_ASYNC_OP_IMPL flush_op; /* Special flush op */
+ uint32_t flush_count; /* Worker count */
+ uint64_t flush_gen; /* Flush generation number */
+
+#define WT_ASYNC_MAX_WORKERS 20
+ /* One session slot per possible worker thread. */
+ WT_SESSION_IMPL *worker_sessions[WT_ASYNC_MAX_WORKERS];
+ /* Async worker threads */
+ wt_thread_t worker_tids[WT_ASYNC_MAX_WORKERS];
+
+ uint32_t flags; /* Currently unused. */
+};
+
+/*
+ * WT_ASYNC_CURSOR --
+ * Async container for a cursor. Each async worker thread
+ * has a cache of async cursors to reuse for operations.
+ *
+ * Entries are linked through "q" on a worker's cursor queue (see
+ * WT_ASYNC_WORKER_STATE.cursorqh) and carry the config and URI hashes
+ * alongside the cursor.
+ */
+struct __wt_async_cursor {
+ STAILQ_ENTRY(__wt_async_cursor) q; /* Worker cache */
+ uint64_t cfg_hash; /* Config hash */
+ uint64_t uri_hash; /* URI hash */
+ WT_CURSOR *c; /* WT cursor */
+};
+
+/*
+ * WT_ASYNC_WORKER_STATE --
+ * State for an async worker thread.
+ */
+struct __wt_async_worker_state {
+ /* NOTE(review): presumably the worker's identifier; confirm. */
+ uint32_t id;
+ /* Queue of WT_ASYNC_CURSOR entries: the worker's cursor cache. */
+ STAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor) cursorqh;
+ /* NOTE(review): presumably the number of entries on cursorqh; confirm. */
+ uint32_t num_cursors;
+};
diff --git a/src/third_party/wiredtiger/src/include/bitstring.i b/src/third_party/wiredtiger/src/include/bitstring.i
new file mode 100644
index 00000000000..95af6731bf9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/bitstring.i
@@ -0,0 +1,316 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Paul Vixie.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/sys/bitstring.h,v 1.5 2005/01/07 02:29:23 imp Exp $
+ */
+
+ /* Index of the byte that holds the given bit (bit / 8). */
+#define __bit_byte(bit) ((bit) >> 3)
+
+ /* Mask selecting the given bit within its byte (bit 0 is 0x01). */
+#define __bit_mask(bit) (1 << ((bit) & 0x7))
+
+ /* Number of bytes needed for a bitstring of nbits bits, rounded up. */
+#define __bitstr_size(nbits) (((nbits) + 7) >> 3)
+
+/*
+ * __bit_alloc --
+ *	Allocate a bitstring large enough to hold nbits bits, returning the
+ *	result of the underlying __wt_calloc call.
+ */
+static inline int
+__bit_alloc(WT_SESSION_IMPL *session, uint64_t nbits, void *retp)
+{
+	size_t bytes;
+
+	bytes = (size_t)__bitstr_size(nbits);
+	return (__wt_calloc(session, bytes, sizeof(uint8_t), retp));
+}
+
+/*
+ * __bit_test --
+ *	Return 1 if the given bit is set in the bitstring, 0 if it is clear.
+ */
+static inline int
+__bit_test(uint8_t *bitf, uint64_t bit)
+{
+	return ((bitf[__bit_byte(bit)] & __bit_mask(bit)) != 0);
+}
+
+/*
+ * __bit_set --
+ *	Set the given bit in the bitstring.
+ */
+static inline void
+__bit_set(uint8_t *bitf, uint64_t bit)
+{
+	uint8_t *bytep;
+
+	bytep = &bitf[__bit_byte(bit)];
+	*bytep |= __bit_mask(bit);
+}
+
+/*
+ * __bit_clear --
+ *	Clear the given bit in the bitstring.
+ */
+static inline void
+__bit_clear(uint8_t *bitf, uint64_t bit)
+{
+	uint8_t *bytep;
+
+	bytep = &bitf[__bit_byte(bit)];
+	*bytep &= ~__bit_mask(bit);
+}
+
+/*
+ * __bit_nclr --
+ *	Clear the bits from start to stop, inclusive, in the bitstring.
+ */
+static inline void
+__bit_nclr(uint8_t *bitf, uint64_t start, uint64_t stop)
+{
+	uint64_t byte, first, last;
+	int head_keep, tail_keep;
+
+	first = __bit_byte(start);
+	last = __bit_byte(stop);
+
+	/* Bits to preserve below "start" in the first byte ... */
+	head_keep = 0xff >> (8 - (start & 0x7));
+	/* ... and above "stop" in the last byte. */
+	tail_keep = 0xff << ((stop & 0x7) + 1);
+
+	/* The whole range lies within a single byte. */
+	if (first == last) {
+		bitf[first] &= (head_keep | tail_keep);
+		return;
+	}
+
+	/* Partial first byte, whole bytes in between, partial last byte. */
+	bitf[first] &= head_keep;
+	for (byte = first + 1; byte < last; ++byte)
+		bitf[byte] = 0;
+	bitf[last] &= tail_keep;
+}
+
+/*
+ * __bit_nset --
+ *	Set the bits from start to stop, inclusive, in the bitstring.
+ */
+static inline void
+__bit_nset(uint8_t *bitf, uint64_t start, uint64_t stop)
+{
+	uint64_t byte, first, last;
+	int head_bits, tail_bits;
+
+	first = __bit_byte(start);
+	last = __bit_byte(stop);
+
+	/* Bits at and above "start" in the first byte ... */
+	head_bits = 0xff << (start & 0x7);
+	/* ... and at and below "stop" in the last byte. */
+	tail_bits = 0xff >> (7 - (stop & 0x7));
+
+	/* The whole range lies within a single byte. */
+	if (first == last) {
+		bitf[first] |= (head_bits & tail_bits);
+		return;
+	}
+
+	/* Partial first byte, whole bytes in between, partial last byte. */
+	bitf[first] |= head_bits;
+	for (byte = first + 1; byte < last; ++byte)
+		bitf[byte] = 0xff;
+	bitf[last] |= tail_bits;
+}
+
+/*
+ * __bit_ffc --
+ *	Find the first clear bit among the first nbits bits of the bitstring.
+ *	On success store its index in *retp and return 0; return -1 if
+ *	nbits is 0 or no bit in range is clear (*retp is then untouched).
+ */
+static inline int
+__bit_ffc(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
+{
+	uint8_t lb;
+	uint64_t byte, last, value;
+
+	if (nbits == 0)
+		return (-1);
+
+	last = __bit_byte(nbits - 1);
+	for (byte = 0; byte <= last; ++byte) {
+		/* Skip bytes with every bit set. */
+		if (bitf[byte] == 0xff)
+			continue;
+
+		/* Count the set bits below the first clear one. */
+		value = byte << 3;
+		for (lb = bitf[byte]; lb & 0x01; lb >>= 1)
+			++value;
+
+		/* A clear bit past the end of the bitstring doesn't count. */
+		if (value >= nbits)
+			return (-1);
+
+		*retp = value;
+		return (0);
+	}
+	return (-1);
+}
+
+/*
+ * __bit_ffs --
+ *	Find the first set bit among the first nbits bits of the bitstring.
+ *	On success store its index in *retp and return 0; return -1 if
+ *	nbits is 0 or no bit in range is set (*retp is then untouched).
+ */
+static inline int
+__bit_ffs(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
+{
+	uint8_t lb;
+	uint64_t byte, last, value;
+
+	if (nbits == 0)
+		return (-1);
+
+	last = __bit_byte(nbits - 1);
+	for (byte = 0; byte <= last; ++byte) {
+		/* Skip bytes with no bit set. */
+		if (bitf[byte] == 0)
+			continue;
+
+		/* Count the clear bits below the first set one. */
+		value = byte << 3;
+		for (lb = bitf[byte]; !(lb & 0x01); lb >>= 1)
+			++value;
+
+		/* A set bit past the end of the bitstring doesn't count. */
+		if (value >= nbits)
+			return (-1);
+
+		*retp = value;
+		return (0);
+	}
+	return (-1);
+}
+
+/*
+ * __bit_getv --
+ * Return a fixed-length column store bit-field value.
+ *
+ * The value is the "width"-bit field starting at bit entry * width. A
+ * width of 8 returns the containing byte directly; widths 1 through 7
+ * are assembled one bit at a time, the first bit of the field becoming
+ * the most significant bit of the result. Any other width returns 0.
+ */
+static inline uint8_t
+__bit_getv(uint8_t *bitf, uint64_t entry, uint8_t width)
+{
+ uint8_t value;
+ uint64_t bit;
+
+ /*
+ * One switch case per field width: test the current bit, OR the mask
+ * into the result if it is set, advance to the next bit and fall
+ * through to the case for the next-smaller width.
+ */
+#define __BIT_GET(len, mask) \
+ case len: \
+ if (__bit_test(bitf, bit)) \
+ value |= mask; \
+ ++bit \
+ /* FALLTHROUGH */
+
+ value = 0;
+ bit = entry * width;
+
+ /*
+ * Fast-path single bytes, do repeated tests for the rest: we could
+ * slice-and-dice instead, but the compiler is probably going to do
+ * a better job than I will.
+ */
+ switch (width) {
+ case 8:
+ return (bitf[__bit_byte(bit)]);
+ __BIT_GET(7, 0x40);
+ __BIT_GET(6, 0x20);
+ __BIT_GET(5, 0x10);
+ __BIT_GET(4, 0x08);
+ __BIT_GET(3, 0x04);
+ __BIT_GET(2, 0x02);
+ __BIT_GET(1, 0x01);
+ }
+ return (value);
+}
+
+/*
+ * __bit_getv_recno --
+ *	Return the bit-field value for a record number, using the page's
+ *	bitstring and the record's offset from the page's starting record
+ *	number.
+ */
+static inline uint8_t
+__bit_getv_recno(WT_PAGE *page, uint64_t recno, uint8_t width)
+{
+	uint64_t entry;
+
+	entry = recno - page->pg_fix_recno;
+	return (__bit_getv(page->pg_fix_bitf, entry, width));
+}
+
+/*
+ * __bit_setv --
+ * Set a fixed-length column store bit-field value.
+ *
+ * The field is the "width" bits starting at bit entry * width. A width
+ * of 8 stores the value into the containing byte directly; widths 1
+ * through 7 are stored one bit at a time, the most significant bit of
+ * the field first. Any other width leaves the bitstring unchanged.
+ */
+static inline void
+__bit_setv(uint8_t *bitf, uint64_t entry, uint8_t width, uint8_t value)
+{
+ uint64_t bit;
+
+ /*
+ * One switch case per field width: set or clear the current bit to
+ * match the masked bit of the value, advance to the next bit and fall
+ * through to the case for the next-smaller width.
+ */
+#define __BIT_SET(len, mask) \
+ case len: \
+ if (value & (mask)) \
+ __bit_set(bitf, bit); \
+ else \
+ __bit_clear(bitf, bit); \
+ ++bit \
+ /* FALLTHROUGH */
+
+ bit = entry * width;
+
+ /*
+ * Fast-path single bytes, do repeated tests for the rest: we could
+ * slice-and-dice instead, but the compiler is probably going to do
+ * a better job than I will.
+ */
+ switch (width) {
+ case 8:
+ bitf[__bit_byte(bit)] = value;
+ return;
+ __BIT_SET(7, 0x40);
+ __BIT_SET(6, 0x20);
+ __BIT_SET(5, 0x10);
+ __BIT_SET(4, 0x08);
+ __BIT_SET(3, 0x04);
+ __BIT_SET(2, 0x02);
+ __BIT_SET(1, 0x01);
+ }
+}
+
+/*
+ * __bit_setv_recno --
+ *	Set the bit-field value for a record number, using the page's
+ *	bitstring and the record's offset from the page's starting record
+ *	number.
+ */
+static inline void
+__bit_setv_recno(WT_PAGE *page, uint64_t recno, uint8_t width, uint8_t value)
+{
+	uint64_t entry;
+
+	entry = recno - page->pg_fix_recno;
+	__bit_setv(page->pg_fix_bitf, entry, width, value);
+}
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
new file mode 100644
index 00000000000..10fa51243ac
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -0,0 +1,337 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WiredTiger's block manager interface.
+ */
+
+/*
+ * The file's description is written into the first block of the file, which
+ * means we can use an offset of 0 as an invalid offset.
+ */
+#define WT_BLOCK_INVALID_OFFSET 0
+
+/*
+ * The block manager maintains three per-checkpoint extent lists:
+ * alloc: the extents allocated in this checkpoint
+ * avail: the extents available for allocation
+ * discard: the extents freed in this checkpoint
+ *
+ * An extent list is based on two skiplists: first, a by-offset list linking
+ * WT_EXT elements and sorted by file offset (low-to-high), second, a by-size
+ * list linking WT_SIZE elements and sorted by chunk size (low-to-high).
+ *
+ * Additionally, each WT_SIZE element on the by-size list has a skiplist of its
+ * own, linking WT_EXT elements and sorted by file offset (low-to-high). This
+ * list has an entry for extents of a particular size.
+ *
+ * The trickiness is each individual WT_EXT element appears on two skiplists.
+ * In order to minimize allocation calls, we allocate a single array of WT_EXT
+ * pointers at the end of the WT_EXT structure, for both skiplists, and store
+ * the depth of the skiplist in the WT_EXT structure. The skiplist entries for
+ * the offset skiplist start at WT_EXT.next[0] and the entries for the size
+ * skiplist start at WT_EXT.next[WT_EXT.depth].
+ *
+ * One final complication: we only maintain the per-size skiplist for the avail
+ * list, the alloc and discard extent lists are not searched based on size.
+ */
+
+/*
+ * WT_EXTLIST --
+ * An extent list: the heads of a by-offset skiplist of WT_EXT elements
+ * and of a by-size skiplist of WT_SIZE elements, plus summary counts and
+ * the location of the written extent.
+ */
+struct __wt_extlist {
+ char *name; /* Name */
+
+ uint64_t bytes; /* Byte count */
+ uint32_t entries; /* Entry count */
+
+ wt_off_t offset; /* Written extent offset */
+ uint32_t cksum, size; /* Written extent cksum, size */
+
+ int track_size; /* Maintain per-size skiplist */
+
+ WT_EXT *last; /* Cached last element */
+
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* By-offset skiplist head */
+ WT_SIZE *sz[WT_SKIP_MAXDEPTH]; /* By-size skiplist head */
+};
+
+/*
+ * WT_EXT --
+ * Encapsulation of an extent, either allocated or freed within the
+ * checkpoint.
+ */
+struct __wt_ext {
+ wt_off_t off; /* Extent's file offset */
+ wt_off_t size; /* Extent's size */
+
+ uint8_t depth; /* Skip list depth */
+
+ /*
+ * Variable-length array, sized by the number of skiplist elements.
+ * The first depth array entries (starting at next[0]) are the offset
+ * skiplist elements, the second depth array entries (starting at
+ * next[depth]) are the size skiplist elements.
+ */
+ WT_EXT *next[0]; /* Offset, size skiplists */
+};
+
+/*
+ * WT_SIZE --
+ * Encapsulation of a block size skiplist entry: the entry for one extent
+ * size, which is also the head of a by-offset skiplist of WT_EXT elements.
+ */
+struct __wt_size {
+ wt_off_t size; /* Size */
+
+ uint8_t depth; /* Skip list depth */
+
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */
+
+ /*
+ * We don't use a variable-length array for the size skiplist, we want
+ * to be able to use any cached WT_SIZE structure as the head of a list,
+ * and we don't know the related WT_EXT structure's depth. The array is
+ * instead always WT_SKIP_MAXDEPTH entries.
+ */
+ WT_SIZE *next[WT_SKIP_MAXDEPTH]; /* Size skiplist */
+};
+
+/*
+ * WT_EXT_FOREACH --
+ * Walk a block manager skiplist: start at the first entry of the head
+ * array and follow each element's next[0] pointer until NULL.
+ * WT_EXT_FOREACH_OFF --
+ * Walk a block manager skiplist where the WT_EXT.next entries are offset
+ * by the depth: start at the first entry of the head array and follow
+ * each element's next[depth] pointer until NULL.
+ */
+#define WT_EXT_FOREACH(skip, head) \
+ for ((skip) = (head)[0]; \
+ (skip) != NULL; (skip) = (skip)->next[0])
+#define WT_EXT_FOREACH_OFF(skip, head) \
+ for ((skip) = (head)[0]; \
+ (skip) != NULL; (skip) = (skip)->next[(skip)->depth])
+
+/*
+ * Checkpoint cookie: carries a version number as I don't want to rev the schema
+ * file version should the default block manager checkpoint format change.
+ *
+ * Version #1 checkpoint cookie format:
+ * [1] [root addr] [alloc addr] [avail addr] [discard addr]
+ * [file size] [checkpoint size] [write generation]
+ *
+ * WT_BLOCK_CKPT --
+ * The in-memory checkpoint information: the root's location, the alloc,
+ * avail and discard extent lists, the checkpoint's sizes, and the extent
+ * lists held until the checkpoint resolves.
+ */
+#define WT_BM_CHECKPOINT_VERSION 1 /* Checkpoint format version */
+#define WT_BLOCK_EXTLIST_MAGIC 71002 /* Identify a list */
+struct __wt_block_ckpt {
+ uint8_t version; /* Version */
+
+ wt_off_t root_offset; /* The root */
+ uint32_t root_cksum, root_size; /* The root's cksum, size */
+
+ WT_EXTLIST alloc; /* Extents allocated */
+ WT_EXTLIST avail; /* Extents available */
+ WT_EXTLIST discard; /* Extents discarded */
+
+ wt_off_t file_size; /* Checkpoint file size */
+ uint64_t ckpt_size; /* Checkpoint byte count */
+
+ WT_EXTLIST ckpt_avail; /* Checkpoint freed extents */
+
+ /*
+ * Checkpoint archive: the block manager may potentially free a lot of
+ * memory from the allocation and discard extent lists when checkpoint
+ * completes. Put it off until the checkpoint resolves, that lets the
+ * upper btree layer continue eviction sooner.
+ */
+ WT_EXTLIST ckpt_alloc; /* Checkpoint archive */
+ WT_EXTLIST ckpt_discard; /* Checkpoint archive */
+};
+
+/*
+ * WT_BM --
+ * Block manager handle, references a single checkpoint in a file. The
+ * handle is a table of method pointers followed by the underlying
+ * WT_BLOCK, the mapped region information and the live-system flag.
+ */
+struct __wt_bm {
+ /* Methods: each takes the WT_BM handle as its first argument. */
+ int (*addr_string)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
+ int (*addr_valid)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ u_int (*block_header)(WT_BM *);
+ int (*checkpoint)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int);
+ int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *,
+ const uint8_t *, size_t, uint8_t *, size_t *, int);
+ int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *);
+ int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *);
+ int (*close)(WT_BM *, WT_SESSION_IMPL *);
+ int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*compact_page_skip)
+ (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, int *);
+ int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, int *);
+ int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
+ int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*read)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
+ int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*salvage_next)
+ (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, int *);
+ int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
+ int (*salvage_valid)
+ (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, int);
+ int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
+ int (*sync)(WT_BM *, WT_SESSION_IMPL *, int);
+ int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*verify_start)(WT_BM *, WT_SESSION_IMPL *, WT_CKPT *);
+ int (*write) (WT_BM *,
+ WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, int);
+ int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);
+
+ WT_BLOCK *block; /* Underlying file */
+
+ void *map; /* Mapped region */
+ size_t maplen;
+ void *mappingcookie;
+
+ /*
+ * There's only a single block manager handle that can be written, all
+ * others are checkpoints.
+ */
+ int is_live; /* The live system */
+};
+
+/*
+ * WT_BLOCK --
+ * Block manager handle, references a single file. The handle carries a
+ * reference count and a queue linkage, the configuration set when the
+ * file is opened, the live checkpoint, and salvage/verify state.
+ */
+struct __wt_block {
+ const char *name; /* Name */
+
+ /* A list of block manager handles, sharing a file descriptor. */
+ uint32_t ref; /* References */
+ WT_FH *fh; /* Backing file handle */
+ TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */
+
+ /* Configuration information, set when the file is opened. */
+ int allocfirst; /* Allocation is first-fit */
+ int allocfirst_save; /* Allocation is first-fit, saved */
+ uint32_t allocsize; /* Allocation size */
+ size_t os_cache; /* System buffer cache flush max */
+ size_t os_cache_max;
+ size_t os_cache_dirty; /* System buffer cache write max */
+ size_t os_cache_dirty_max;
+
+ u_int block_header; /* Header length */
+
+ /*
+ * There is only a single checkpoint in a file that can be written. The
+ * information could logically live in the WT_BM structure, but then we
+ * would be re-creating it every time we opened a new checkpoint and I'd
+ * rather not do that. So, it's stored here, only accessed by one WT_BM
+ * handle.
+ */
+ WT_SPINLOCK live_lock; /* Live checkpoint lock */
+ WT_BLOCK_CKPT live; /* Live checkpoint */
+ int ckpt_inprogress;/* Live checkpoint in progress */
+
+ /* Salvage support */
+ wt_off_t slvg_off; /* Salvage file offset */
+
+ /* Verification support */
+ int verify; /* If performing verification */
+ wt_off_t verify_size; /* Checkpoint's file size */
+ WT_EXTLIST verify_alloc; /* Verification allocation list */
+ uint64_t frags; /* Maximum frags in the file */
+ uint8_t *fragfile; /* Per-file frag tracking list */
+ uint8_t *fragckpt; /* Per-checkpoint frag tracking list */
+};
+
+/*
+ * WT_BLOCK_DESC --
+ * The file's description. The byte ranges in the field comments are the
+ * fields' offsets within the structure, 16 bytes in total.
+ */
+struct __wt_block_desc {
+#define WT_BLOCK_MAGIC 120897
+ uint32_t magic; /* 00-03: Magic number */
+#define WT_BLOCK_MAJOR_VERSION 1
+ uint16_t majorv; /* 04-05: Major version */
+#define WT_BLOCK_MINOR_VERSION 0
+ uint16_t minorv; /* 06-07: Minor version */
+
+ uint32_t cksum; /* 08-11: Description block checksum */
+
+ uint32_t unused; /* 12-15: Padding */
+};
+/*
+ * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
+ * ensure the compiler hasn't inserted padding (padding won't cause failure,
+ * we reserve the first allocation-size block of the file for this information,
+ * but it would be worth investigation, regardless).
+ */
+#define WT_BLOCK_DESC_SIZE 16
+
+/*
+ * WT_BLOCK_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default.
+ * The byte ranges in the field comments are the fields' offsets within
+ * the structure, 12 bytes in total.
+ */
+struct __wt_block_header {
+ /*
+ * We write the page size in the on-disk page header because it makes
+ * salvage easier. (If we don't know the expected page length, we'd
+ * have to read increasingly larger chunks from the file until we find
+ * one that checksums, and that's going to be harsh given WiredTiger's
+ * potentially large page sizes.)
+ */
+ uint32_t disk_size; /* 00-03: on-disk page size */
+
+ /*
+ * Page checksums are stored in two places. First, the page checksum
+ * is written within the internal page that references it as part of
+ * the address cookie. This is done to improve the chances of detecting
+ * not only disk corruption but other bugs (for example, overwriting a
+ * page with another valid page image). Second, a page's checksum is
+ * stored in the disk header. This is for salvage, so salvage knows it
+ * has found a page that may be useful.
+ */
+ uint32_t cksum; /* 04-07: checksum */
+
+#define WT_BLOCK_DATA_CKSUM 0x01 /* Block data is part of the checksum */
+ uint8_t flags; /* 08: flags */
+
+ /*
+ * End the structure with 3 bytes of padding: it wastes space, but it
+ * leaves the structure 32-bit aligned and having a few bytes to play
+ * with in the future can't hurt.
+ */
+ uint8_t unused[3]; /* 09-11: unused padding */
+};
+/*
+ * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
+ */
+#define WT_BLOCK_HEADER_SIZE 12
+
+/*
+ * WT_BLOCK_HEADER_BYTE --
+ * WT_BLOCK_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers):
+ * WT_BLOCK_HEADER_BYTE_SIZE is the combined size of the WT_PAGE_HEADER and
+ * WT_BLOCK_HEADER structures, WT_BLOCK_HEADER_BYTE is a pointer that many
+ * bytes past the start of the block.
+ */
+#define WT_BLOCK_HEADER_BYTE_SIZE \
+ (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE)
+#define WT_BLOCK_HEADER_BYTE(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE))
+
+/*
+ * Don't compress the block's WT_PAGE_HEADER and WT_BLOCK_HEADER structures.
+ * We need the WT_PAGE_HEADER in-memory size, and the WT_BLOCK_HEADER checksum
+ * and on-disk size to be immediately available without decompression. We use
+ * the on-disk size and checksum during salvage to figure out where the blocks
+ * are, and the in-memory size tells us how large a buffer we need to decompress
+ * the block. We could skip less than 64B, but a 64B boundary may offer better
+ * alignment for the underlying compression engine, and skipping 64B won't make
+ * a difference in terms of compression efficiency.
+ */
+#define WT_BLOCK_COMPRESS_SKIP 64
diff --git a/src/third_party/wiredtiger/src/include/bloom.h b/src/third_party/wiredtiger/src/include/bloom.h
new file mode 100644
index 00000000000..4ae6d96b935
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/bloom.h
@@ -0,0 +1,28 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+/*
+ * REFERENCES:
+ * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
+ * http://code.google.com/p/cityhash-c/
+ */
+
+struct __wt_bloom { /* Bloom filter handle */
+ const char *uri;
+ char *config;
+ uint8_t *bitstring; /* For in memory representation. */
+ WT_SESSION_IMPL *session;
+ WT_CURSOR *c;
+
+ uint32_t k; /* The number of hash functions used. */
+ uint32_t factor; /* The number of bits per item inserted. */
+ uint64_t m; /* The number of slots in the bit string. */
+ uint64_t n; /* The number of items to be inserted. */
+};
+
+struct __wt_bloom_hash { /* A pair of hash values */
+ uint64_t h1, h2; /* The two hashes used to calculate bits. */
+};
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
new file mode 100644
index 00000000000..0c4fe876e5e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -0,0 +1,1015 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_PAGE_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure. The byte ranges in the field comments
+ * are the fields' offsets within the structure, 28 bytes in total.
+ */
+struct __wt_page_header {
+ /*
+ * The record number of the first record of the page is stored on disk
+ * so we can figure out where the column-store leaf page fits into the
+ * key space during salvage.
+ */
+ uint64_t recno; /* 00-07: column-store starting recno */
+
+ /*
+ * We maintain page write-generations in the non-transactional case
+ * as that's how salvage can determine the most recent page between
+ * pages overlapping the same key range.
+ */
+ uint64_t write_gen; /* 08-15: write generation */
+
+ /*
+ * The page's in-memory size isn't rounded or aligned, it's the actual
+ * number of bytes the disk-image consumes when instantiated in memory.
+ */
+ uint32_t mem_size; /* 16-19: in-memory page size */
+
+ union { /* Both members occupy bytes 20-23 */
+ uint32_t entries; /* 20-23: number of cells on page */
+ uint32_t datalen; /* 20-23: overflow data length */
+ } u;
+
+ uint8_t type; /* 24: page type */
+
+#define WT_PAGE_COMPRESSED 0x01 /* Page is compressed on disk */
+#define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */
+#define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */
+ uint8_t flags; /* 25: flags */
+
+ /*
+ * End the structure with 2 bytes of padding: it wastes space, but it
+ * leaves the structure 32-bit aligned and having a few bytes to play
+ * with in the future can't hurt.
+ */
+ uint8_t unused[2]; /* 26-27: unused padding */
+};
+/*
+ * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
+ */
+#define WT_PAGE_HEADER_SIZE 28
+
+/*
+ * The block-manager specific information immediately follows the WT_PAGE_HEADER
+ * structure: WT_BLOCK_HEADER_REF is a pointer WT_PAGE_HEADER_SIZE bytes past
+ * the start of the disk image.
+ */
+#define WT_BLOCK_HEADER_REF(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE))
+
+/*
+ * WT_PAGE_HEADER_BYTE --
+ * WT_PAGE_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers):
+ * WT_PAGE_HEADER_BYTE_SIZE is WT_PAGE_HEADER_SIZE plus the btree's
+ * block_header value, WT_PAGE_HEADER_BYTE is a pointer that many bytes past
+ * the start of the disk image.
+ */
+#define WT_PAGE_HEADER_BYTE_SIZE(btree) \
+ ((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header))
+#define WT_PAGE_HEADER_BYTE(btree, dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree)))
+
+/*
+ * WT_ADDR --
+ * An in-memory structure to hold a block's location.
+ */
+struct __wt_addr {
+ uint8_t *addr; /* Block-manager's cookie */
+ uint8_t size; /* Block-manager's cookie length */
+
+#define WT_ADDR_INT 1 /* Internal page */
+#define WT_ADDR_LEAF 2 /* Leaf page */
+#define WT_ADDR_LEAF_NO 3 /* Leaf page, no overflow */
+ uint8_t type; /* One of the WT_ADDR_XXX values */
+
+ /*
+ * If an address is used both as an address for the previous and the
+ * current multi-block reconciliations, that is, a block we're writing
+ * matches the block written the last time, it will appear in both the
+ * current boundary points as well as the page modification's list of
+ * previous blocks. The reuse flag is how we know that's happening so
+ * the block is treated correctly (not free'd on error, for example).
+ */
+ uint8_t reuse;
+};
+
+/*
+ * Overflow tracking for reuse: When a page is reconciled, we write new K/V
+ * overflow items. If pages are reconciled multiple times, we need to know
+ * if we've already written a particular overflow record (so we don't write
+ * it again), as well as if we've modified an overflow record previously
+ * written (in which case we want to write a new record and discard blocks
+ * used by the previously written record). Track overflow records written
+ * for the page, storing the values in a skiplist with the record's value as
+ * the "key".
+ */
+struct __wt_ovfl_reuse {
+ uint32_t value_offset; /* Overflow value offset */
+ uint32_t value_size; /* Overflow value size */
+ uint8_t addr_offset; /* Overflow addr offset */
+ uint8_t addr_size; /* Overflow addr size */
+
+ /*
+ * On each page reconciliation, we clear the entry's in-use flag, and
+ * reset it as the overflow record is re-used. After reconciliation
+ * completes, unused skiplist entries are discarded, along with their
+ * underlying blocks.
+ *
+ * On each page reconciliation, set the just-added flag for each new
+ * skiplist entry; if reconciliation fails for any reason, discard the
+ * newly added skiplist entries, along with their underlying blocks.
+ */
+#define WT_OVFL_REUSE_INUSE 0x01
+#define WT_OVFL_REUSE_JUST_ADDED 0x02
+ uint8_t flags;
+
+ /*
+ * The untyped address immediately follows the WT_OVFL_REUSE structure,
+ * the untyped value immediately follows the address. Both are found
+ * by adding the stored byte offset (addr_offset or value_offset) to
+ * the address of the structure itself.
+ */
+#define WT_OVFL_REUSE_ADDR(p) \
+ ((void *)((uint8_t *)(p) + (p)->addr_offset))
+#define WT_OVFL_REUSE_VALUE(p) \
+ ((void *)((uint8_t *)(p) + (p)->value_offset))
+
+ WT_OVFL_REUSE *next[0]; /* Forward-linked skip list */
+};
+
+/*
+ * Overflow tracking for cached values: When a page is reconciled, we write new
+ * K/V overflow items, and discard previous underlying blocks. If there's a
+ * transaction in the system that needs to read the previous value, we have to
+ * cache the old value until no running transaction needs it.
+ */
+struct __wt_ovfl_txnc {
+ uint64_t current; /* Maximum transaction ID at store */
+
+ uint32_t value_offset; /* Overflow value offset */
+ uint32_t value_size; /* Overflow value size */
+ uint8_t addr_offset; /* Overflow addr offset */
+ uint8_t addr_size; /* Overflow addr size */
+
+ /*
+ * The untyped address immediately follows the WT_OVFL_TXNC
+ * structure, the untyped value immediately follows the address.
+ * Both are found by adding the stored byte offset (addr_offset or
+ * value_offset) to the address of the structure itself.
+ */
+#define WT_OVFL_TXNC_ADDR(p) \
+ ((void *)((uint8_t *)(p) + (p)->addr_offset))
+#define WT_OVFL_TXNC_VALUE(p) \
+ ((void *)((uint8_t *)(p) + (p)->value_offset))
+
+ WT_OVFL_TXNC *next[0]; /* Forward-linked skip list */
+};
+
+/*
+ * WT_PAGE_MODIFY --
+ * When a page is modified, there's additional information to maintain.
+ * The structure holds transaction ID tracking, the results of the page's
+ * reconciliation (union u1), page-type specific modification state
+ * (union u2), overflow record tracking, the write generation, the page's
+ * lock and the reconciliation flags.
+ */
+struct __wt_page_modify {
+ /*
+ * Track the highest transaction ID at which the page was written to
+ * disk. This can be used to avoid trying to write the page multiple
+ * times if a snapshot is keeping old versions pinned (e.g., in a
+ * checkpoint).
+ */
+ uint64_t disk_snap_min;
+
+ /* The largest transaction ID seen on the page by reconciliation. */
+ uint64_t rec_max_txn;
+
+ /* The first unwritten transaction ID (approximate). */
+ uint64_t first_dirty_txn;
+
+ /* The largest update transaction ID (approximate). */
+ uint64_t update_txn;
+
+ /* Dirty bytes added to the cache. */
+ uint64_t bytes_dirty;
+
+ /*
+ * When pages are reconciled, the result is one or more replacement
+ * blocks. A replacement block can be in one of two states: it was
+ * written to disk, and so we have a block address, or it contained
+ * unresolved modifications and we have a disk image for it with a
+ * list of those unresolved modifications. The former is the common
+ * case: we only build lists of unresolved modifications when we're
+ * evicting a page, and we only expect to see unresolved modifications
+ * on a page being evicted in the case of a hot page that's too large
+ * to keep in memory as it is. In other words, checkpoints will skip
+ * unresolved modifications, and will write the blocks rather than
+ * build lists of unresolved modifications.
+ *
+ * Ugly union/struct layout to conserve memory, we never have both
+ * a replace address and multiple replacement blocks.
+ */
+ union {
+ WT_ADDR replace; /* Single, written replacement block */
+#define mod_replace u1.replace
+
+ struct { /* Multiple replacement blocks */
+ struct __wt_multi {
+ /*
+ * Block's key: either a column-store record number or a
+ * row-store variable length byte string.
+ */
+ union {
+ uint64_t recno;
+ WT_IKEY *ikey;
+ } key;
+
+ /*
+ * Eviction, but block wasn't written: unresolved updates and
+ * associated disk image.
+ *
+ * Skipped updates are either a WT_INSERT, or a row-store leaf
+ * page entry.
+ */
+ struct __wt_upd_skipped {
+ WT_INSERT *ins;
+ WT_ROW *rip;
+ } *skip;
+ uint32_t skip_entries;
+ void *skip_dsk;
+
+ /*
+ * Block was written: address, size and checksum.
+ * On subsequent reconciliations of this page, we avoid writing
+ * the block if it's unchanged by comparing size and checksum;
+ * the reuse flag is set when the block is unchanged and we're
+ * reusing a previous address.
+ */
+ WT_ADDR addr;
+ uint32_t size;
+ uint32_t cksum;
+ } *multi;
+ uint32_t multi_entries; /* Multiple blocks element count */
+ } m;
+#define mod_multi u1.m.multi
+#define mod_multi_entries u1.m.multi_entries
+ } u1;
+
+ /*
+ * Internal pages need to be able to chain root-page splits and have a
+ * special transactional eviction requirement. Column-store leaf pages
+ * need update and append lists.
+ *
+ * Ugly union/struct layout to conserve memory, a page is either a leaf
+ * page or an internal page.
+ */
+ union {
+ struct {
+ /*
+ * When a root page splits, we create a new page and write it;
+ * the new page can also split and so on, and we continue this
+ * process until we write a single replacement root page. We
+ * use the root split field to track the list of created pages
+ * so they can be discarded when no longer needed.
+ */
+ WT_PAGE *root_split; /* Linked list of root split pages */
+
+ /*
+ * When we deepen the tree, newly created internal pages cannot
+ * be evicted until all threads have exited the original page
+ * index structure. We set a transaction value during the split
+ * that's checked during eviction.
+ */
+ uint64_t split_txn; /* Split eviction transaction value */
+ } intl;
+#define mod_root_split u2.intl.root_split
+#define mod_split_txn u2.intl.split_txn
+ struct {
+ /*
+ * Appended items to column-stores: there is only a single one
+ * of these per column-store tree.
+ */
+ WT_INSERT_HEAD **append;
+
+ /*
+ * Updated items in column-stores: variable-length RLE entries
+ * can expand to multiple entries which requires some kind of
+ * list we can expand on demand. Updated items in fixed-length
+ * files could be done based on a WT_UPDATE array as in
+ * row-stores, but there can be a very large number of bits on
+ * a single page, and the cost of the WT_UPDATE array would be
+ * huge.
+ */
+ WT_INSERT_HEAD **update;
+ } leaf;
+#define mod_append u2.leaf.append
+#define mod_update u2.leaf.update
+ } u2;
+
+ /*
+ * Overflow record tracking for reconciliation. We assume overflow
+ * records are relatively rare, so we don't allocate the structures
+ * to track them until we actually see them in the data.
+ */
+ struct __wt_ovfl_track {
+ /*
+ * Overflow key/value address/byte-string pairs we potentially
+ * reuse each time we reconcile the page.
+ */
+ WT_OVFL_REUSE *ovfl_reuse[WT_SKIP_MAXDEPTH];
+
+ /*
+ * Overflow value address/byte-string pairs cached until no
+ * running transaction will possibly read them.
+ */
+ WT_OVFL_TXNC *ovfl_txnc[WT_SKIP_MAXDEPTH];
+
+ /*
+ * Overflow key/value addresses to be discarded from the block
+ * manager after reconciliation completes successfully.
+ */
+ WT_CELL **discard;
+ size_t discard_entries;
+ size_t discard_allocated;
+ } *ovfl_track;
+
+ /*
+ * The write generation is incremented when a page is modified, a page
+ * is clean if the write generation is 0.
+ *
+ * !!!
+ * 4B values are probably larger than required, but I'm more confident
+ * 4B types will always be backed by atomic writes to memory.
+ */
+ uint32_t write_gen;
+
+#define WT_PAGE_LOCK(s, p) \
+ __wt_spin_lock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
+#define WT_PAGE_UNLOCK(s, p) \
+ __wt_spin_unlock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
+ uint8_t page_lock; /* Page's spinlock: S2C(s)->page_lock index */
+
+#define WT_PM_REC_EMPTY 0x01 /* Reconciliation: no replacement */
+#define WT_PM_REC_MULTIBLOCK 0x02 /* Reconciliation: multiple blocks */
+#define WT_PM_REC_REPLACE 0x04 /* Reconciliation: single block */
+#define WT_PM_REC_MASK \
+ (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | WT_PM_REC_REPLACE)
+ uint8_t flags; /* Page flags */
+};
+
+/*
+ * WT_PAGE --
+ * The WT_PAGE structure describes the in-memory page information: a union
+ * of per-page-type information (internal, row-store leaf, fixed-length and
+ * variable-length column-store leaf), followed by the fields common to all
+ * page types. The pg_XXX macros name the union members.
+ */
+struct __wt_page {
+ /* Per page-type information. */
+ union {
+ /*
+ * Internal pages (both column- and row-store).
+ *
+ * The page record number is only used by column-store, but it
+ * makes some things simpler and it doesn't cost us any memory,
+ * other structures in this union are still as large.
+ *
+ * In-memory internal pages have an array of pointers to child
+ * structures, maintained in collated order. When a page is
+ * read into memory, the initial list of children is stored in
+ * the "orig_index" field, and it and the collated order are
+ * the same. After a page splits, the collated order and the
+ * original order will differ.
+ * (NOTE(review): no "orig_index" field is declared in this
+ * structure, the only child array is __index -- confirm whether
+ * the previous sentences are stale.)
+ *
+ * Multiple threads of control may be searching the in-memory
+ * internal page and a child page of the internal page may
+ * cause a split at any time. When a page splits, a new array
+ * is allocated and atomically swapped into place. Threads in
+ * the old array continue without interruption (the old array is
+ * still valid), but have to avoid racing. No barrier is needed
+ * because the array reference is updated atomically, but code
+ * reading the fields multiple times would be a very bad idea.
+ * Specifically, do not do this:
+ * WT_REF **refp = page->u.intl.__index->index;
+ * uint32_t entries = page->u.intl.__index->entries;
+ *
+ * The field is declared volatile (so the compiler knows not to
+ * read it multiple times), and we obscure the field name and
+ * use a copy macro in all references to the field (so the code
+ * doesn't read it multiple times).
+ */
+ struct {
+ uint64_t recno; /* Starting recno */
+ WT_REF *parent_ref; /* Parent reference */
+
+ struct __wt_page_index {
+ uint32_t entries;
+ WT_REF **index;
+ } * volatile __index; /* Collated children */
+ } intl;
+#undef pg_intl_recno
+#define pg_intl_recno u.intl.recno
+#undef pg_intl_parent_ref
+#define pg_intl_parent_ref u.intl.parent_ref
+
+ /*
+ * Macros to copy/set the index because the name is obscured to ensure
+ * the field isn't read multiple times.
+ */
+#define WT_INTL_INDEX_COPY(page) ((page)->u.intl.__index)
+#define WT_INTL_INDEX_SET(page, v) do { \
+ WT_WRITE_BARRIER(); \
+ ((page)->u.intl.__index) = (v); \
+} while (0)
+
+ /*
+ * Macro to walk the list of references in an internal page.
+ *
+ * The session argument is evaluated once and cached in __session, which
+ * is used for both the enter and the leave calls (WT_INTL_FOREACH_END
+ * takes no arguments). The page index is copied once, so the walk is
+ * over a single array even if the index is replaced during the walk.
+ */
+#define WT_INTL_FOREACH_BEGIN(session, page, ref) do { \
+ WT_PAGE_INDEX *__pindex; \
+ WT_REF **__refp; \
+ WT_SESSION_IMPL *__session = (session); \
+ uint32_t __entries; \
+ WT_ENTER_PAGE_INDEX(__session); \
+ for (__pindex = WT_INTL_INDEX_COPY(page), \
+ __refp = __pindex->index, \
+ __entries = __pindex->entries; __entries > 0; --__entries) {\
+ (ref) = *__refp++;
+#define WT_INTL_FOREACH_END \
+ } \
+ WT_LEAVE_PAGE_INDEX(__session); \
+ } while (0)
+
+ /* Row-store leaf page. */
+ struct {
+ WT_ROW *d; /* Key/value pairs */
+
+ /*
+ * The column-store leaf page modification structures
+ * live in the WT_PAGE_MODIFY structure to keep the
+ * WT_PAGE structure as small as possible for read-only
+ * pages. For consistency, we could move the row-store
+ * modification structures into WT_PAGE_MODIFY too, but
+ * that doesn't shrink WT_PAGE any further and it would
+ * require really ugly naming inside of WT_PAGE_MODIFY
+ * to avoid growing that structure.
+ */
+ WT_INSERT_HEAD **ins; /* Inserts */
+ WT_UPDATE **upd; /* Updates */
+
+ uint32_t entries; /* Entries */
+ } row;
+#undef pg_row_d
+#define pg_row_d u.row.d
+#undef pg_row_ins
+#define pg_row_ins u.row.ins
+#undef pg_row_upd
+#define pg_row_upd u.row.upd
+#undef pg_row_entries
+#define pg_row_entries u.row.entries
+
+ /* Fixed-length column-store leaf page. */
+ struct {
+ uint64_t recno; /* Starting recno */
+
+ uint8_t *bitf; /* Values */
+ uint32_t entries; /* Entries */
+ } col_fix;
+#undef pg_fix_recno
+#define pg_fix_recno u.col_fix.recno
+#undef pg_fix_bitf
+#define pg_fix_bitf u.col_fix.bitf
+#undef pg_fix_entries
+#define pg_fix_entries u.col_fix.entries
+
+ /* Variable-length column-store leaf page. */
+ struct {
+ uint64_t recno; /* Starting recno */
+
+ WT_COL *d; /* Values */
+
+ /*
+ * Variable-length column-store files maintain a list of
+ * RLE entries on the page so it's unnecessary to walk
+ * the page counting records to find a specific entry.
+ */
+ WT_COL_RLE *repeats; /* RLE array for lookups */
+ uint32_t nrepeats; /* Number of repeat slots */
+
+ uint32_t entries; /* Entries */
+ } col_var;
+#undef pg_var_recno
+#define pg_var_recno u.col_var.recno
+#undef pg_var_d
+#define pg_var_d u.col_var.d
+#undef pg_var_repeats
+#define pg_var_repeats u.col_var.repeats
+#undef pg_var_nrepeats
+#define pg_var_nrepeats u.col_var.nrepeats
+#undef pg_var_entries
+#define pg_var_entries u.col_var.entries
+ } u;
+
+ /* Page's on-disk representation: NULL for pages created in memory. */
+ const WT_PAGE_HEADER *dsk;
+
+ /* If/when the page is modified, we need lots more information. */
+ WT_PAGE_MODIFY *modify;
+
+ /*
+ * The page's read generation acts as an LRU value for each page in the
+ * tree; it is used by the eviction server thread to select pages to be
+ * discarded from the in-memory tree.
+ *
+ * The read generation is a 64-bit value, if incremented frequently, a
+ * 32-bit value could overflow.
+ *
+ * The read generation is a piece of shared memory potentially read
+ * by many threads. We don't want to update page read generations for
+ * in-cache workloads and suffer the cache misses, so we don't simply
+ * increment the read generation value on every access. Instead, the
+ * read generation is incremented by the eviction server each time it
+ * becomes active. To avoid incrementing a page's read generation too
+ * frequently, it is set to a future point.
+ */
+#define WT_READGEN_NOTSET 0
+#define WT_READGEN_OLDEST 1
+#define WT_READGEN_STEP 100
+ uint64_t read_gen;
+
+ uint64_t memory_footprint; /* Memory attached to the page */
+
+#define WT_PAGE_IS_INTERNAL(page) \
+ ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT)
+#define WT_PAGE_INVALID 0 /* Invalid page */
+#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */
+#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */
+#define WT_PAGE_COL_INT 3 /* Col-store internal page */
+#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */
+#define WT_PAGE_OVFL 5 /* Overflow page */
+#define WT_PAGE_ROW_INT 6 /* Row-store internal page */
+#define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */
+ uint8_t type; /* Page type */
+
+#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */
+#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
+#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
+#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
+#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
+#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing. */
+ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
+};
+
+/*
+ * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET --
+ * Convert between pointers into a page's disk image and byte offsets from
+ * the start of that image (page->dsk). WT_PAGE_DISK_OFFSET turns a pointer
+ * into its offset using WT_PTRDIFF32; WT_PAGE_REF_OFFSET turns an offset back
+ * into a (void *) pointer.
+ */
+#define WT_PAGE_DISK_OFFSET(page, p) \
+ WT_PTRDIFF32(p, (page)->dsk)
+#define WT_PAGE_REF_OFFSET(page, o) \
+ ((void *)((uint8_t *)((page)->dsk) + (o)))
+
+/*
+ * Page state.
+ *
+ * Synchronization is based on the WT_REF->state field, which has a number of
+ * possible states:
+ *
+ * WT_REF_DISK:
+ * The initial setting before a page is brought into memory, and set as a
+ * result of page eviction; the page is on disk, and must be read into
+ * memory before use. WT_REF_DISK has a value of 0 (the default state
+ * after allocating cleared memory).
+ *
+ * WT_REF_DELETED:
+ * The page is on disk, but has been deleted from the tree; we can delete
+ * row-store leaf pages without reading them if they don't reference
+ * overflow items.
+ *
+ * WT_REF_LOCKED:
+ * Locked for exclusive access. In eviction, this page or a parent has
+ * been selected for eviction; once hazard pointers are checked, the page
+ * will be evicted. When reading a page that was previously deleted, it
+ * is locked until the page is in memory with records marked deleted. The
+ * thread that set the page to WT_REF_LOCKED has exclusive access, no
+ * other thread may use the WT_REF until the state is changed.
+ *
+ * WT_REF_MEM:
+ * Set by a reading thread once the page has been read from disk; the page
+ * is in the cache and the page reference is OK.
+ *
+ * WT_REF_READING:
+ * Set by a reading thread before reading an ordinary page from disk;
+ * other readers of the page wait until the read completes. Sync can
+ * safely skip over such pages: they are clean by definition.
+ *
+ * WT_REF_SPLIT:
+ * Set when the page is split; the WT_REF is dead and can no longer be
+ * used.
+ *
+ * The life cycle of a typical page goes like this: pages are read into memory
+ * from disk and their state set to WT_REF_MEM. When the page is selected for
+ * eviction, the page state is set to WT_REF_LOCKED. In all cases, evicting
+ * threads reset the page's state when finished with the page: if eviction was
+ * successful (a clean page was discarded, or a dirty page was written to disk
+ * and then discarded), the page state is set to WT_REF_DISK; if eviction failed
+ * because the page was busy, page state is reset to WT_REF_MEM.
+ *
+ * Readers check the state field and if it's WT_REF_MEM, they set a hazard
+ * pointer to the page, flush memory and re-confirm the page state. If the
+ * page state is unchanged, the reader has a valid reference and can proceed.
+ *
+ * When an evicting thread wants to discard a page from the tree, it sets the
+ * WT_REF_LOCKED state, flushes memory, then checks hazard pointers. If a
+ * hazard pointer is found, state is reset to WT_REF_MEM, restoring the page
+ * to the readers. If the evicting thread does not find a hazard pointer,
+ * the page is evicted.
+ */
+typedef enum __wt_page_state {
+ WT_REF_DISK = 0, /* On disk; also the value of cleared memory */
+ WT_REF_DELETED = 1, /* On disk, but deleted from the tree */
+ WT_REF_LOCKED = 2, /* Held for exclusive access */
+ WT_REF_MEM = 3, /* In cache, reference is valid */
+ WT_REF_READING = 4, /* Read from disk in progress */
+ WT_REF_SPLIT = 5 /* Page was split, the WT_REF is dead */
+} WT_PAGE_STATE;
+
+/*
+ * WT_PAGE_DELETED --
+ * Related information for fast-delete, on-disk pages: the ID of the
+ * transaction recorded for the delete and the updates kept in case that
+ * delete has to be aborted.
+ */
+struct __wt_page_deleted {
+ uint64_t txnid; /* Transaction ID */
+
+ WT_UPDATE **update_list; /* List of updates for abort */
+};
+
+/*
+ * WT_REF --
+ * A single in-memory page and the state information used to determine if
+ * it's OK to dereference the pointer to the page. The home and state fields
+ * are declared volatile, so every access re-reads them from memory.
+ */
+struct __wt_ref {
+ WT_PAGE *page; /* Page */
+
+ /*
+ * When the tree deepens as a result of a split, the home page value
+ * changes. Don't cache it, we need to see that change when looking
+ * up our slot in the page's index structure.
+ */
+ WT_PAGE * volatile home; /* Reference page */
+ uint32_t ref_hint; /* Reference page index hint */
+
+ volatile WT_PAGE_STATE state; /* Page state */
+
+ /*
+ * Address: on-page cell if read from backing block, off-page WT_ADDR
+ * if instantiated in-memory, or NULL if page created in-memory.
+ */
+ void *addr;
+
+ /*
+ * The child page's key. Do NOT change this union without reviewing
+ * __wt_ref_key.
+ */
+ union {
+ uint64_t recno; /* Column-store: starting recno */
+ void *ikey; /* Row-store: key */
+ } key;
+
+ WT_PAGE_DELETED *page_del; /* Deleted on-disk page information */
+};
+/*
+ * WT_REF_SIZE is the expected structure size -- we verify the build to ensure
+ * the compiler hasn't inserted padding which would break the world.
+ *
+ * NOTE(review): with 8-byte pointers and a 4-byte enum the fields add up to
+ * 8 + 8 + 4 + 4 + 8 + 8 + 8 = 48 bytes; adding or resizing a field requires
+ * updating this value.
+ */
+#define WT_REF_SIZE 48
+
+/*
+ * WT_ROW --
+ * Each in-memory row-store leaf page has an array of WT_ROW structures:
+ * this is created from on-page data when a page is read from the file. It's
+ * sorted by key, fixed in size, and starts with a reference to on-page data.
+ *
+ * Multiple threads of control may be searching the in-memory row-store pages,
+ * and the key may be instantiated at any time. Code must be able to handle
+ * both when the key has not been instantiated (the key field points into the
+ * page's disk image), and when the key has been instantiated (the key field
+ * points outside the page's disk image). We don't need barriers because the
+ * key is updated atomically, but code that reads the key field multiple times
+ * is a very, very bad idea. Specifically, do not do this:
+ *
+ * key = rip->key;
+ * if (key_is_on_page(key)) {
+ * cell = rip->key;
+ * }
+ *
+ * The field is declared volatile (so the compiler knows it shouldn't read it
+ * multiple times), and we obscure the field name and use a copy macro in all
+ * references to the field (so the code doesn't read it multiple times), all
+ * to make sure we don't introduce this bug (again).
+ *
+ * WT_ROW_KEY_COPY reads the field once; WT_ROW_KEY_SET stores a new value.
+ * Both expand to a single parenthesized expression so they remain correct
+ * when used inside a larger expression.
+ */
+struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */
+ void * volatile __key;
+};
+#define WT_ROW_KEY_COPY(rip) ((rip)->__key)
+#define WT_ROW_KEY_SET(rip, v) ((rip)->__key = (void *)(v))
+
+/*
+ * WT_ROW_FOREACH --
+ * Walk the entries of an in-memory row-store leaf page.
+ *
+ * In both directions i is a countdown of the entries left to visit (it starts
+ * at the page's entry count and the loop stops when it reaches 0), it is not
+ * the slot number; use WT_ROW_SLOT(page, rip) to get the slot. The forward
+ * walk starts rip at the first WT_ROW, the reverse walk starts it at the last.
+ */
+#define WT_ROW_FOREACH(page, rip, i) \
+ for ((i) = (page)->pg_row_entries, \
+ (rip) = (page)->pg_row_d; (i) > 0; ++(rip), --(i))
+#define WT_ROW_FOREACH_REVERSE(page, rip, i) \
+ for ((i) = (page)->pg_row_entries, \
+ (rip) = (page)->pg_row_d + ((page)->pg_row_entries - 1); \
+ (i) > 0; --(rip), --(i))
+
+/*
+ * WT_ROW_SLOT --
+ * Return the 0-based array offset based on a WT_ROW reference. The offset
+ * is the pointer difference from the start of the page's WT_ROW array, cast
+ * to uint32_t.
+ */
+#define WT_ROW_SLOT(page, rip) \
+ ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row_d))
+
+/*
+ * WT_COL --
+ * Each in-memory variable-length column-store leaf page has an array of WT_COL
+ * structures: this is created from on-page data when a page is read from the
+ * file. It's fixed in size, and references data on the page. Each element
+ * holds a single 32-bit offset into the page's disk image.
+ */
+struct __wt_col {
+ /*
+ * Variable-length column-store data references are page offsets, not
+ * pointers (we boldly re-invent short pointers). The trade-off is 4B
+ * per K/V pair on a 64-bit machine vs. a single cycle for the addition
+ * of a base pointer. The on-page data is a WT_CELL (same as row-store
+ * pages).
+ *
+ * If the value is 0, it's a single, deleted record.
+ *
+ * Obscure the field name, code shouldn't use WT_COL->__col_value, the
+ * public interface is WT_COL_PTR and WT_COL_PTR_SET.
+ */
+ uint32_t __col_value;
+};
+
+/*
+ * WT_COL_RLE --
+ * In variable-length column store leaf pages, we build an array of entries
+ * with RLE counts greater than 1 when reading the page. We can do a binary
+ * search in this array, then an offset calculation to find the cell.
+ *
+ * NOTE(review): the structure is declared packed, presumably so the trailing
+ * 32-bit indx is not followed by padding in each array element; confirm
+ * before reordering or adding fields.
+ */
+struct __wt_col_rle {
+ uint64_t recno; /* Record number of first repeat. */
+ uint64_t rle; /* Repeat count. */
+ uint32_t indx; /* Slot of entry in col_var.d */
+} WT_GCC_ATTRIBUTE((packed));
+
+/*
+ * WT_COL_PTR, WT_COL_PTR_SET --
+ * Return/Set a pointer corresponding to the data offset. (If the item does
+ * not exist on the page, return a NULL.) An offset of 0 is the "no item"
+ * value; any other offset is converted with WT_PAGE_REF_OFFSET.
+ * WT_COL_PTR_SET expands to a single parenthesized assignment expression so
+ * it is safe inside a larger expression.
+ */
+#define WT_COL_PTR(page, cip) \
+ ((cip)->__col_value == 0 ? \
+ NULL : WT_PAGE_REF_OFFSET(page, (cip)->__col_value))
+#define WT_COL_PTR_SET(cip, value) \
+ ((cip)->__col_value = (value))
+
+/*
+ * WT_COL_FOREACH --
+ * Walk the entries of variable-length column-store leaf page. The i
+ * argument counts down the entries left to visit, it is not the slot number.
+ */
+#define WT_COL_FOREACH(page, cip, i) \
+ for ((i) = (page)->pg_var_entries, \
+ (cip) = (page)->pg_var_d; (i) > 0; ++(cip), --(i))
+
+/*
+ * WT_COL_SLOT --
+ * Return the 0-based array offset based on a WT_COL reference. The
+ * argument is parenthesized before the cast so an expression argument (for
+ * example, "cip + 1") is converted as a whole.
+ */
+#define WT_COL_SLOT(page, cip) \
+ ((uint32_t)(((WT_COL *)(cip)) - (page)->pg_var_d))
+
+/*
+ * WT_IKEY --
+ * Instantiated key: row-store keys are usually prefix compressed and sometimes
+ * Huffman encoded or overflow objects. Normally, a row-store page in-memory
+ * key points to the on-page WT_CELL, but in some cases, we instantiate the key
+ * in memory, in which case the row-store page in-memory key points to a WT_IKEY
+ * structure.
+ *
+ * The structure is a fixed header only: WT_IKEY_DATA returns the address
+ * sizeof(WT_IKEY) bytes past the start of the structure, which is where the
+ * key bytes are expected to be.
+ */
+struct __wt_ikey {
+ uint32_t size; /* Key length */
+
+ /*
+ * If we no longer point to the key's on-page WT_CELL, we can't find its
+ * related value. Save the offset of the key cell in the page.
+ *
+ * Row-store cell references are page offsets, not pointers (we boldly
+ * re-invent short pointers). The trade-off is 4B per K/V pair on a
+ * 64-bit machine vs. a single cycle for the addition of a base pointer.
+ */
+ uint32_t cell_offset;
+
+ /* The key bytes immediately follow the WT_IKEY structure. */
+#define WT_IKEY_DATA(ikey) \
+ ((void *)((uint8_t *)(ikey) + sizeof(WT_IKEY)))
+};
+
+/*
+ * WT_UPDATE --
+ * Entries on leaf pages can be updated, either modified or deleted. Updates
+ * to entries referenced from the WT_ROW and WT_COL arrays are stored in the
+ * page's WT_UPDATE array. When the first element on a page is updated, the
+ * WT_UPDATE array is allocated, with one slot for every existing element in
+ * the page. A slot points to a WT_UPDATE structure; if more than one update
+ * is done for an entry, WT_UPDATE structures are formed into a forward-linked
+ * list.
+ *
+ * The structure is declared packed and WT_UPDATE_DATA locates the value at
+ * sizeof(WT_UPDATE) bytes past the start of the structure, so changing the
+ * fields changes where the value is expected to be.
+ */
+struct __wt_update {
+ uint64_t txnid; /* update transaction */
+
+ WT_UPDATE *next; /* forward-linked list */
+
+ /*
+ * We use the maximum size as an is-deleted flag, which means we can't
+ * store 4GB objects; I'd rather do that than increase the size of this
+ * structure for a flag bit.
+ */
+#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX)
+#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX)
+ uint32_t size; /* update length */
+
+ /* The untyped value immediately follows the WT_UPDATE structure. */
+#define WT_UPDATE_DATA(upd) \
+ ((void *)((uint8_t *)(upd) + sizeof(WT_UPDATE)))
+} WT_GCC_ATTRIBUTE((packed));
+
+/*
+ * WT_INSERT --
+ *
+ * Row-store leaf pages support inserts of new K/V pairs. When the first K/V
+ * pair is inserted, the WT_INSERT_HEAD array is allocated, with one slot for
+ * every existing element in the page, plus one additional slot. A slot points
+ * to a WT_INSERT_HEAD structure for the items which sort after the WT_ROW
+ * element that references it and before the subsequent WT_ROW element; the
+ * skiplist structure has a randomly chosen depth of next pointers in each
+ * inserted node.
+ *
+ * The additional slot is because it's possible to insert items smaller than any
+ * existing key on the page: for that reason, the additional slot of the insert
+ * array (the one past the per-WT_ROW slots, see WT_ROW_INSERT_SMALLEST) holds
+ * keys smaller than any other key on the page.
+ *
+ * In column-store variable-length run-length encoded pages, a single indx
+ * entry may reference a large number of records, because there's a single
+ * on-page entry representing many identical records. (We don't expand those
+ * entries when the page comes into memory, as that would require resources as
+ * pages are moved to/from the cache, including read-only files.) Instead, a
+ * single indx entry represents all of the identical records originally found
+ * on the page.
+ *
+ * Modifying (or deleting) run-length encoded column-store records is hard
+ * because the page's entry no longer references a set of identical items. We
+ * handle this by "inserting" a new entry into the insert array, with its own
+ * record number. (This is the only case where it's possible to insert into a
+ * column-store: only appends are allowed, as insert requires re-numbering
+ * subsequent records. Berkeley DB did support mutable records, but it won't
+ * scale and it isn't useful enough to re-implement, IMNSHO.)
+ */
+struct __wt_insert {
+ WT_UPDATE *upd; /* value */
+
+ union {
+ uint64_t recno; /* column-store record number */
+ struct {
+ uint32_t offset; /* row-store key data start */
+ uint32_t size; /* row-store key data size */
+ } key;
+ } u;
+
+ /*
+ * Accessors for the union. The row-store key is found by adding the
+ * stored offset to the address of the WT_INSERT itself. The argument
+ * is parenthesized before it is cast so an expression argument is
+ * converted as a whole.
+ */
+#define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)(ins))->u.key.size)
+#define WT_INSERT_KEY(ins) \
+ ((void *)((uint8_t *)(ins) + ((WT_INSERT *)(ins))->u.key.offset))
+#define WT_INSERT_RECNO(ins) (((WT_INSERT *)(ins))->u.recno)
+
+ WT_INSERT *next[0]; /* forward-linked skip list */
+};
+
+/*
+ * Skiplist helper macros.
+ *
+ * WT_SKIP_FIRST and WT_SKIP_LAST return the first/last item on the level-0
+ * list of a WT_INSERT_HEAD, or NULL if the head itself is NULL. WT_SKIP_NEXT
+ * follows the level-0 next pointer, and WT_SKIP_FOREACH walks the level-0
+ * list from first to last. The head argument is parenthesized before it is
+ * cast so an expression argument is converted as a whole.
+ */
+#define WT_SKIP_FIRST(ins_head) \
+ (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->head[0])
+#define WT_SKIP_LAST(ins_head) \
+ (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->tail[0])
+#define WT_SKIP_NEXT(ins) ((ins)->next[0])
+#define WT_SKIP_FOREACH(ins, ins_head) \
+ for ((ins) = WT_SKIP_FIRST(ins_head); \
+ (ins) != NULL; \
+ (ins) = WT_SKIP_NEXT(ins))
+
+/*
+ * Atomically allocate and swap a structure or array into place.
+ *
+ * If dest is already set, v is simply loaded from it. Otherwise count
+ * elements are allocated into v and installed in dest with a compare-and-swap
+ * against NULL: on success the allocation is added to the page's memory
+ * footprint, on failure (dest was set in the meantime) the local allocation
+ * is freed. After a lost swap v has been passed to __wt_free, so callers
+ * should read dest, not v, afterwards.
+ *
+ * The allocation is wrapped in WT_ERR, so an allocation failure leaves the
+ * macro through WT_ERR's error path.
+ * NOTE(review): that presumably requires the caller to provide whatever
+ * WT_ERR expects in scope -- confirm against the WT_ERR definition.
+ */
+#define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \
+ if (((v) = (dest)) == NULL) { \
+ WT_ERR(__wt_calloc_def(s, count, &(v))); \
+ if (WT_ATOMIC_CAS8(dest, NULL, v)) \
+ __wt_cache_page_inmem_incr( \
+ s, page, (count) * sizeof(*(v))); \
+ else \
+ __wt_free(s, v); \
+ } \
+} while (0)
+
+/*
+ * WT_INSERT_HEAD --
+ * The head of a skiplist of WT_INSERT items: one first-item and one
+ * last-item pointer for each of the WT_SKIP_MAXDEPTH skiplist levels.
+ */
+struct __wt_insert_head {
+ WT_INSERT *head[WT_SKIP_MAXDEPTH]; /* first item on skiplists */
+ WT_INSERT *tail[WT_SKIP_MAXDEPTH]; /* last item on skiplists */
+};
+
+/*
+ * The row-store leaf page insert lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL.
+ *
+ * WT_ROW_INSERT_SLOT takes a slot number; WT_ROW_INSERT and WT_ROW_UPDATE
+ * take a WT_ROW reference and convert it to a slot with WT_ROW_SLOT.
+ */
+#define WT_ROW_INSERT_SLOT(page, slot) \
+ ((page)->pg_row_ins == NULL ? NULL : (page)->pg_row_ins[slot])
+#define WT_ROW_INSERT(page, ip) \
+ WT_ROW_INSERT_SLOT(page, WT_ROW_SLOT(page, ip))
+#define WT_ROW_UPDATE(page, ip) \
+ ((page)->pg_row_upd == NULL ? \
+ NULL : (page)->pg_row_upd[WT_ROW_SLOT(page, ip)])
+/*
+ * WT_ROW_INSERT_SMALLEST references an additional slot past the end of the
+ * "one per WT_ROW slot" insert array (index pg_row_entries). That's because
+ * the insert array requires an extra slot to hold keys that sort before any
+ * key found on the original page.
+ */
+#define WT_ROW_INSERT_SMALLEST(page) \
+ ((page)->pg_row_ins == NULL ? \
+ NULL : (page)->pg_row_ins[(page)->pg_row_entries])
+
+/*
+ * The column-store leaf page update lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL. The arrays hang
+ * off the page's modify structure, so a page with no modify structure has no
+ * update lists.
+ */
+#define WT_COL_UPDATE_SLOT(page, slot) \
+ ((page)->modify == NULL || (page)->modify->mod_update == NULL ? \
+ NULL : (page)->modify->mod_update[slot])
+#define WT_COL_UPDATE(page, ip) \
+ WT_COL_UPDATE_SLOT(page, WT_COL_SLOT(page, ip))
+
+/*
+ * WT_COL_UPDATE_SINGLE is a single WT_INSERT list, used for any fixed-length
+ * column-store updates for a page. It is slot 0 of the update array.
+ */
+#define WT_COL_UPDATE_SINGLE(page) \
+ WT_COL_UPDATE_SLOT(page, 0)
+
+/*
+ * WT_COL_APPEND is a WT_INSERT list, used for fixed- and variable-length
+ * appends. It is element 0 of the modify structure's append array, or NULL
+ * if the page has no modify structure or no append array.
+ */
+#define WT_COL_APPEND(page) \
+ ((page)->modify != NULL && (page)->modify->mod_append != NULL ? \
+ (page)->modify->mod_append[0] : NULL)
+
+/*
+ * WT_FIX_FOREACH walks fixed-length bit-fields on a disk page: i runs from 0
+ * to (dsk)->u.entries - 1 and v is set to the value of entry i. The value is
+ * only fetched while i is a valid entry, so the step that ends the loop does
+ * not read the bit-field one past the last entry; v is 0 for an empty page
+ * and once the walk has finished.
+ */
+#define WT_FIX_FOREACH(btree, dsk, v, i) \
+ for ((i) = 0, \
+ (v) = (i) < (dsk)->u.entries ? \
+ __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : 0; \
+ (i) < (dsk)->u.entries; ++(i), \
+ (v) = (i) < (dsk)->u.entries ? __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), (i), (btree)->bitcnt) : 0)
+
+/*
+ * Manage split generation numbers. Splits walk the list of sessions to check
+ * when it is safe to free structures that have been replaced. We also check
+ * that list periodically (e.g., when wrapping up a transaction) to free any
+ * memory we can.
+ *
+ * Before a thread enters code that will examine page indexes (which are
+ * swapped out by splits), it publishes a copy of the current split generation
+ * into its session. Don't assume that threads never re-enter this code: if we
+ * already have a split generation, leave it alone. If our caller is examining
+ * an index, we don't want the oldest split generation to move forward and
+ * potentially free it.
+ *
+ * WT_ENTER_PAGE_INDEX opens a "do {" block and declares __prev_split_gen in
+ * it; WT_LEAVE_PAGE_INDEX closes that block. The two must therefore be used
+ * as a pair in the same scope, and control has to reach WT_LEAVE_PAGE_INDEX
+ * for a generation published by the matching enter to be cleared again.
+ * WT_WITH_PAGE_INDEX wraps a single expression in the pair.
+ */
+#define WT_ENTER_PAGE_INDEX(session) do { \
+ uint64_t __prev_split_gen = (session)->split_gen; \
+ if (__prev_split_gen == 0) \
+ WT_PUBLISH((session)->split_gen, S2C(session)->split_gen)
+
+#define WT_LEAVE_PAGE_INDEX(session) \
+ if (__prev_split_gen == 0) \
+ (session)->split_gen = 0; \
+ } while (0)
+
+#define WT_WITH_PAGE_INDEX(session, e) \
+ WT_ENTER_PAGE_INDEX(session); \
+ (e); \
+ WT_LEAVE_PAGE_INDEX(session)
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
new file mode 100644
index 00000000000..05250951a65
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Supported btree formats: the "current" version is the maximum supported
+ * major/minor versions.
+ */
+#define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */
+#define WT_BTREE_MINOR_VERSION_MIN 1
+
+#define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */
+#define WT_BTREE_MINOR_VERSION_MAX 1
+
+/*
+ * The maximum btree leaf and internal page size is 512MB (2^29). The limit
+ * is enforced in software, it could be larger, specifically, the underlying
+ * default block manager can support 4GB (2^32). Currently, the maximum page
+ * size must accommodate our dependence on the maximum page size fitting into
+ * a number of bits less than 32; see the row-store page key-lookup functions
+ * for the magic.
+ */
+#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE)
+
+/*
+ * The length of variable-length column-store values and row-store keys/values
+ * are stored in a 4B type, so the largest theoretical key/value item is 4GB.
+ * Two things lower that limit: first, in the WT_UPDATE structure we use the
+ * UINT32_MAX size as a "deleted" flag, and second, the size of an overflow
+ * object is constrained by what an
+ * underlying block manager can actually write. (For example, in the default
+ * block manager, writing an overflow item includes the underlying block's page
+ * header and block manager specific structure, aligned to an allocation-sized
+ * unit). The btree engine limits the size of a single object to (4GB - 1KB);
+ * that gives us additional bytes if we ever want to store a structure length
+ * plus the object size in 4B, or if we need additional flag values. Attempts
+ * to store large key/value items in the tree trigger an immediate check to the
+ * block manager, to make sure it can write the item. Storing 4GB objects in a
+ * btree borders on clinical insanity, anyway.
+ *
+ * Record numbers are stored in 64-bit unsigned integers, meaning the largest
+ * record number is "really, really big".
+ */
+#define WT_BTREE_MAX_OBJECT_SIZE (UINT32_MAX - 1024)
+
+/*
+ * A location in a file is a variable-length cookie, but it has a maximum size
+ * so it's easy to create temporary space in which to store them. (Locations
+ * can't be much larger than this anyway, they must fit onto the minimum size
+ * page because a reference to an overflow page is itself a location.)
+ */
+#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */
+
+/*
+ * WT_BTREE --
+ * A btree handle: the tree's type and formats, its page-size and
+ * reconciliation configuration, the root page reference, the block manager
+ * reference and the eviction/checkpoint state kept per tree.
+ */
+struct __wt_btree {
+ WT_DATA_HANDLE *dhandle;
+
+ WT_CKPT *ckpt; /* Checkpoint information */
+
+ enum { BTREE_COL_FIX=1, /* Fixed-length column store */
+ BTREE_COL_VAR=2, /* Variable-length column store */
+ BTREE_ROW=3 /* Row-store */
+ } type; /* Type */
+
+ const char *key_format; /* Key format */
+ const char *value_format; /* Value format */
+ uint8_t bitcnt; /* Fixed-length field size in bits */
+
+ WT_COLLATOR *collator; /* Row-store comparator */
+ int collator_owned; /* The collator needs to be freed */
+
+ uint32_t id; /* File ID, for logging */
+
+ uint32_t key_gap; /* Row-store prefix key gap */
+
+ uint32_t allocsize; /* Allocation size */
+ uint32_t maxintlpage; /* Internal page max size */
+ uint32_t maxintlitem; /* Internal page max item size */
+ uint32_t maxleafpage; /* Leaf page max size */
+ uint32_t maxleafitem; /* Leaf page max item size */
+ uint64_t maxmempage; /* In memory page max size */
+
+ void *huffman_key; /* Key huffman encoding */
+ void *huffman_value; /* Value huffman encoding */
+
+ enum { CKSUM_ON=1, /* On */
+ CKSUM_OFF=2, /* Off */
+ CKSUM_UNCOMPRESSED=3 /* Uncompressed blocks only */
+ } checksum; /* Checksum configuration */
+
+ u_int dictionary; /* Reconcile: dictionary slots */
+ int internal_key_truncate; /* Reconcile: internal key truncate */
+ int maximum_depth; /* Reconcile: maximum tree depth */
+ int prefix_compression; /* Reconcile: prefix compression */
+ u_int prefix_compression_min; /* Reconcile: prefix compression min */
+ int split_pct; /* Reconcile: split page percent */
+ WT_COMPRESSOR *compressor; /* Reconcile: page compressor */
+ WT_RWLOCK *ovfl_lock; /* Reconcile: overflow lock */
+
+ uint64_t last_recno; /* Column-store last record number */
+
+ WT_REF root; /* Root page reference */
+ int modified; /* If the tree was ever modified */
+ int bulk_load_ok; /* Bulk-load is a possibility */
+
+ WT_BM *bm; /* Block manager reference */
+ u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */
+
+ uint64_t write_gen; /* Write generation */
+
+ WT_REF *evict_ref; /* Eviction thread's location */
+ uint64_t evict_priority; /* Relative priority of cached pages */
+ u_int evict_walk_period; /* Skip this many LRU walks */
+ u_int evict_walk_skips; /* Number of walks skipped */
+ volatile uint32_t evict_busy; /* Count of threads in eviction */
+
+ int checkpointing; /* Checkpoint in progress */
+
+ /*
+ * We flush pages from the tree (in order to make checkpoint faster),
+ * without a high-level lock. To avoid multiple threads flushing at
+ * the same time, lock the tree.
+ */
+ WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */
+
+ /* Flag values up to 0xff are reserved for WT_DHANDLE_* */
+#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */
+#define WT_BTREE_NO_EVICTION 0x00200 /* Disable eviction */
+#define WT_BTREE_NO_HAZARD 0x00400 /* Disable hazard pointers */
+#define WT_BTREE_SALVAGE 0x00800 /* Handle is for salvage */
+#define WT_BTREE_UPGRADE 0x01000 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x02000 /* Handle is for verify */
+ uint32_t flags;
+};
+
+/*
+ * Flags that make a btree handle special (not for normal use): the bulk-load,
+ * salvage, upgrade and verify handle flags.
+ */
+#define WT_BTREE_SPECIAL_FLAGS \
+ (WT_BTREE_BULK | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
+
+/*
+ * WT_SALVAGE_COOKIE --
+ * Encapsulation of salvage information for reconciliation: counts of
+ * items to create, skip and take, plus a flag to ignore whatever remains.
+ */
+struct __wt_salvage_cookie {
+ uint64_t missing; /* Initial items to create */
+ uint64_t skip; /* Initial items to skip */
+ uint64_t take; /* Items to take */
+
+ int done; /* Ignore the rest */
+};
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
new file mode 100644
index 00000000000..b7957e6647f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -0,0 +1,1216 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_ref_is_root --
+ * Report whether a page reference is the root's: 1 when the reference
+ * has no home page, 0 otherwise.
+ */
+static inline int
+__wt_ref_is_root(WT_REF *ref)
+{
+ if (ref->home == NULL)
+ return (1);
+ return (0);
+}
+
+/*
+ * __wt_page_is_modified --
+ * Report whether the page is dirty: 1 when the page has a modify
+ * structure with a non-zero write generation, 0 otherwise.
+ */
+static inline int
+__wt_page_is_modified(WT_PAGE *page)
+{
+ /* A page that was never modified has no modify structure. */
+ if (page->modify == NULL)
+ return (0);
+ return (page->modify->write_gen == 0 ? 0 : 1);
+}
+
+/*
+ * Estimate the per-allocation overhead. All implementations of malloc / free
+ * have some kind of header and pad for alignment. We can't know for sure what
+ * that adds up to, but this is an estimate based on some measurements of heap
+ * size versus bytes in use.
+ *
+ * The value is in bytes and is added to the size passed to each call of
+ * __wt_cache_page_inmem_incr and __wt_cache_page_inmem_decr.
+ */
+#define WT_ALLOC_OVERHEAD 32U
+
+/*
+ * __wt_cache_page_inmem_incr --
+ * Increment a page's memory footprint in the cache. The size is padded by
+ * WT_ALLOC_OVERHEAD, then atomically added to the cache-wide and per-page
+ * in-memory byte counts; if the page is dirty, the same amount is also added
+ * to the cache-wide and per-page dirty byte counts.
+ */
+static inline void
+__wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
+{
+ WT_CACHE *cache;
+
+ size += WT_ALLOC_OVERHEAD; /* Charged once per call. */
+
+ cache = S2C(session)->cache;
+ (void)WT_ATOMIC_ADD8(cache->bytes_inmem, size);
+ (void)WT_ATOMIC_ADD8(page->memory_footprint, size);
+ if (__wt_page_is_modified(page)) {
+ (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+ }
+}
+
+/*
+ * __wt_cache_page_inmem_decr --
+ * Decrement a page's memory footprint in the cache. The size is padded by
+ * WT_ALLOC_OVERHEAD, then atomically subtracted from the cache-wide and
+ * per-page in-memory byte counts; if the page is dirty, the same amount is
+ * also subtracted from the cache-wide and per-page dirty byte counts. No
+ * check is made here that the counters are at least as large as the amount
+ * subtracted.
+ */
+static inline void
+__wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
+{
+ WT_CACHE *cache;
+
+ size += WT_ALLOC_OVERHEAD; /* Mirrors the padding added on increment. */
+
+ cache = S2C(session)->cache;
+ (void)WT_ATOMIC_SUB8(cache->bytes_inmem, size);
+ (void)WT_ATOMIC_SUB8(page->memory_footprint, size);
+ if (__wt_page_is_modified(page)) {
+ (void)WT_ATOMIC_SUB8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_SUB8(page->modify->bytes_dirty, size);
+ }
+}
+
+/*
+ * __wt_cache_dirty_incr --
+ * Increment the cache dirty page/byte counts: one more dirty page, and
+ * the page's current memory footprint added to both the cache-wide and the
+ * per-page dirty byte counts. The page's modify structure is dereferenced
+ * without a NULL check, so it must already exist.
+ */
+static inline void
+__wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CACHE *cache;
+ size_t size;
+
+ cache = S2C(session)->cache;
+ (void)WT_ATOMIC_ADD8(cache->pages_dirty, 1);
+
+ /*
+ * Take care to read the memory_footprint once in case we are racing
+ * with updates.
+ */
+ size = page->memory_footprint;
+ (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+}
+
+/*
+ * __wt_cache_dirty_decr --
+ * Decrement the cache dirty page/byte counts. If the dirty page count is
+ * already below one, an error message is logged and the count is reset to
+ * zero instead of being decremented. The byte counts are reduced by the
+ * smaller of the page's memory footprint and the cache-wide dirty byte count.
+ * The page's modify structure is dereferenced without a NULL check.
+ */
+static inline void
+__wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CACHE *cache;
+ size_t size;
+
+ cache = S2C(session)->cache;
+
+ if (cache->pages_dirty < 1) {
+ (void)__wt_errx(session,
+ "cache dirty decrement failed: cache dirty page count went "
+ "negative");
+ cache->pages_dirty = 0;
+ } else
+ (void)WT_ATOMIC_SUB8(cache->pages_dirty, 1);
+
+ /*
+ * It is possible to decrement the footprint of the page without making
+ * the page dirty (for example when freeing an obsolete update list),
+ * so the footprint could change between read and decrement, and we
+ * might attempt to decrement by a different amount than the bytes held
+ * by the page.
+ *
+ * We catch that by maintaining a per-page dirty size, and fixing the
+ * cache stats if that is non-zero when the page is discarded.
+ *
+ * Also take care that the global size doesn't go negative. This may
+ * lead to small accounting errors (particularly on the last page of the
+ * last file in a checkpoint), but that will come out in the wash when
+ * the page is evicted.
+ */
+ size = WT_MIN(page->memory_footprint, cache->bytes_dirty);
+ (void)WT_ATOMIC_SUB8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_SUB8(page->modify->bytes_dirty, size);
+}
+
+/*
+ * __wt_cache_page_evict --
+ * Update the cache accounting for a page being evicted: any dirty bytes
+ * still recorded against the page are removed from the cache-wide dirty byte
+ * count, the page's memory footprint is added to the evicted byte count and
+ * then cleared, and the evicted page count is incremented. The page itself
+ * is not freed here.
+ */
+static inline void
+__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CACHE *cache;
+ WT_PAGE_MODIFY *mod;
+
+ cache = S2C(session)->cache;
+ mod = page->modify;
+
+ /*
+ * In rare cases, we may race tracking a page's dirty footprint.
+ * If so, we will get here with a non-zero bytes_dirty in the page's
+ * modify structure, and we can fix the global stats.
+ */
+ if (mod != NULL && mod->bytes_dirty != 0)
+ (void)WT_ATOMIC_SUB8(cache->bytes_dirty, mod->bytes_dirty);
+
+ WT_ASSERT(session, page->memory_footprint != 0);
+ (void)WT_ATOMIC_ADD8(cache->bytes_evict, page->memory_footprint);
+ page->memory_footprint = 0;
+
+ (void)WT_ATOMIC_ADD8(cache->pages_evict, 1);
+}
+
+/*
+ * __wt_cache_read_gen --
+ * Return the cache's current read generation number.
+ */
+static inline uint64_t
+__wt_cache_read_gen(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+
+ cache = S2C(session)->cache;
+ return (cache->read_gen);
+}
+
+/*
+ * __wt_cache_read_gen_incr --
+ * Advance the cache's current read generation number by one. The update
+ * is a plain (non-atomic) increment.
+ */
+static inline void
+__wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+
+ cache = S2C(session)->cache;
+ cache->read_gen += 1;
+}
+
+/*
+ * __wt_cache_read_gen_set --
+ * Compute the read generation to store in a page: the current global
+ * read generation plus WT_READGEN_STEP.
+ */
+static inline uint64_t
+__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
+{
+ uint64_t current;
+
+ /*
+ * The value handed back is deliberately "from the future", measured
+ * in increments of the global read generation. When a hazard pointer
+ * is acquired for a page, the page's read generation can be checked
+ * and, as long as it isn't less than the current global generation,
+ * the page doesn't need to be updated. The step therefore buys some
+ * number of skipped updates after each update we do have to make.
+ */
+ current = __wt_cache_read_gen(session);
+ return (current + WT_READGEN_STEP);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use: the count of pages brought into
+ * memory minus the count of pages evicted. Both counters are read without
+ * synchronization and the subtraction is unsigned.
+ */
+static inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ return (cache->pages_inmem - cache->pages_evict);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use: the count of bytes brought into
+ * memory minus the count of bytes evicted. Both counters are read without
+ * synchronization and the subtraction is unsigned.
+ */
+static inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ return (cache->bytes_inmem - cache->bytes_evict);
+}
+
+/*
+ * __wt_page_refp --
+ * Return the page's index and slot for a reference: on return *pindexp is
+ * the copy of the home page's index that was searched and *slotp is the
+ * position in it whose page matches ref->page. The reference's hint is
+ * updated to the slot found. The function does not return until the slot is
+ * found; it yields and retries with a fresh copy of the index otherwise.
+ */
+static inline void
+__wt_page_refp(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
+{
+ WT_PAGE_INDEX *pindex;
+ uint32_t i;
+
+ WT_ASSERT(session,
+ WT_SESSION_TXN_STATE(session)->snap_min != WT_TXN_NONE);
+
+ /*
+ * Copy the parent page's index value: the page can split at any time,
+ * but the index's value is always valid, even if it's not up-to-date.
+ */
+retry: pindex = WT_INTL_INDEX_COPY(ref->home);
+
+ /*
+ * Use the page's reference hint: it should be correct unless the page
+ * split before our slot. If the page splits after our slot, the hint
+ * will point earlier in the array than our actual slot, so the first
+ * loop is from the hint to the end of the list, and the second loop
+ * is from the start of the list to the end of the list. (The second
+ * loop overlaps the first, but that only happens in cases where we've
+ * deepened the tree and aren't going to find our slot at all, that's
+ * not worth optimizing.)
+ *
+ * It's not an error for the reference hint to be wrong, it just means
+ * the first retrieval (which sets the hint for subsequent retrievals),
+ * is slower.
+ */
+ for (i = ref->ref_hint; i < pindex->entries; ++i)
+ if (pindex->index[i]->page == ref->page) {
+ *pindexp = pindex;
+ *slotp = ref->ref_hint = i;
+ return;
+ }
+ for (i = 0; i < pindex->entries; ++i)
+ if (pindex->index[i]->page == ref->page) {
+ *pindexp = pindex;
+ *slotp = ref->ref_hint = i;
+ return;
+ }
+
+ /*
+ * If we don't find our reference, the page split into a new level and
+ * our home pointer references the wrong page. After internal pages
+ * deepen, their reference structures' home values are updated; yield
+ * and wait for that to happen.
+ */
+ __wt_yield();
+ goto retry;
+}
+
+/*
+ * __wt_page_modify_init --
+ * A page is about to be modified, allocate its modify structure if unset.
+ */
+static inline int
+__wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ return (page->modify == NULL ?
+ __wt_page_modify_alloc(session, page) : 0); /* 0: already allocated */
+}
+
+/*
+ * __wt_page_only_modify_set --
+ * Mark the page (but only the page, not the tree) dirty.
+ */
+static inline void
+__wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ uint64_t last_running;
+
+ last_running = 0; /* stays 0 unless the page looked clean below */
+ if (page->modify->write_gen == 0)
+ last_running = S2C(session)->txn_global.last_running;
+
+ /*
+ * We depend on atomic-add being a write barrier, that is, a barrier to
+ * ensure all changes to the page are flushed before updating the page
+ * write generation and/or marking the tree dirty, otherwise checkpoints
+ * and/or page reconciliation might be looking at a clean page/tree.
+ *
+ * Every time the page transitions from clean to dirty, update the cache
+ * and transactional information.
+ */
+ if (WT_ATOMIC_ADD4(page->modify->write_gen, 1) == 1) {
+ __wt_cache_dirty_incr(session, page);
+
+ /*
+ * The page can never end up with changes older than the oldest
+ * running transaction.
+ */
+ if (F_ISSET(&session->txn, TXN_HAS_SNAPSHOT))
+ page->modify->disk_snap_min = session->txn.snap_min;
+
+ /*
+ * We won the race to dirty the page, but another thread could
+ * have committed in the meantime, and the last_running field
+ * been updated past it. That is all very unlikely, but not
+ * impossible, so we take care to read the global state before
+ * the atomic increment. If we raced with reconciliation, just
+ * leave the previous value here: at worst, we will write a
+ * page in a checkpoint when not absolutely necessary.
+ */
+ if (last_running != 0) /* only if read before the increment */
+ page->modify->first_dirty_txn = last_running;
+ }
+
+ /* Check if this is the largest transaction ID to update the page. */
+ if (TXNID_LT(page->modify->update_txn, session->txn.id))
+ page->modify->update_txn = session->txn.id;
+}
+
+/*
+ * __wt_page_modify_set --
+ * Mark the tree dirty, then mark the page dirty.
+ */
+static inline void
+__wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ /*
+ * Mark the tree dirty (even if the page is already marked dirty), newly
+ * created pages to support "empty" files are dirty, but the file isn't
+ * marked dirty until there's a real change needing to be written. Test
+ * before setting the dirty flag, it's a hot cache line.
+ *
+ * The tree's modified flag is cleared by the checkpoint thread: set it
+ * and insert a barrier before dirtying the page. (I don't think it's
+ * a problem if the tree is marked dirty with all the pages clean, it
+ * might result in an extra checkpoint that doesn't do any work but it
+ * shouldn't cause problems; regardless, let's play it safe.)
+ */
+ if (S2BT(session)->modified == 0) {
+ S2BT(session)->modified = 1;
+ WT_FULL_BARRIER(); /* tree flag is set before the page is dirtied */
+ }
+
+ __wt_page_only_modify_set(session, page);
+}
+
+/*
+ * __wt_page_parent_modify_set --
+ * Mark the parent page (and, unless page_only is set, the tree) dirty.
+ */
+static inline int
+__wt_page_parent_modify_set(
+ WT_SESSION_IMPL *session, WT_REF *ref, int page_only)
+{
+ WT_PAGE *parent;
+
+ /*
+ * This function exists as a place to stash this comment. There are a
+ * few places where we need to dirty a page's parent. The trick is the
+ * page's parent might split at any point, and the page parent might be
+ * the wrong parent at any particular time. We ignore this and dirty
+ * whatever page the page's reference structure points to. This is safe
+ * because if we're pointing to the wrong parent, that parent must have
+ * split, deepening the tree, which implies marking the original parent
+ * and all of the newly-created children as dirty. In other words, if
+ * we have the wrong parent page, everything was marked dirty already.
+ */
+ parent = ref->home;
+ WT_RET(__wt_page_modify_init(session, parent)); /* ensure parent->modify */
+ if (page_only)
+ __wt_page_only_modify_set(session, parent);
+ else
+ __wt_page_modify_set(session, parent);
+ return (0);
+}
+
+/*
+ * __wt_off_page --
+ * Return if a pointer references data outside the page's disk image.
+ */
+static inline int
+__wt_off_page(WT_PAGE *page, const void *p)
+{
+ /*
+ * There may be no underlying page, in which case the reference is
+ * off-page by definition. Otherwise, on-page is [dsk, dsk + mem_size).
+ */
+ return (page->dsk == NULL ||
+ p < (void *)page->dsk ||
+ p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size));
+}
+
+/*
+ * __wt_ref_key --
+ * Return a reference to (not a copy of) a row-store internal page key,
+ * and the key's size, as cheaply as possible.
+ */
+static inline void
+__wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep)
+{
+ uintptr_t v;
+
+ /*
+ * An internal page key is in one of two places: if we instantiated the
+ * key (for example, when reading the page), WT_REF.key.ikey references
+ * a WT_IKEY structure, otherwise WT_REF.key.ikey references an on-page
+ * key offset/length pair.
+ *
+ * Now the magic: allocated memory must be aligned to store any standard
+ * type, and we expect some standard type to require at least quad-byte
+ * alignment, so allocated memory should have some clear low-order bits.
+ * On-page objects consist of an offset/length pair: the maximum page
+ * size currently fits into 29 bits, so we use the low-order bits of the
+ * pointer to mark the other bits of the pointer as encoding the key's
+ * location and length. This breaks if allocated memory isn't aligned,
+ * of course.
+ *
+ * In this specific case, we use bit 0x01 to mark an on-page key, else
+ * it's a WT_IKEY reference. The bit pattern for internal row-store
+ * on-page keys is:
+ * 32 bits key length,
+ * 31 bits page offset of the key's bytes,
+ * 1 bit flag
+ */
+#define WT_IK_FLAG 0x01
+#define WT_IK_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32) /* needs a 64-bit uintptr_t */
+#define WT_IK_DECODE_KEY_LEN(v) ((v) >> 32)
+#define WT_IK_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 1)
+#define WT_IK_DECODE_KEY_OFFSET(v) (((v) & 0xFFFFFFFF) >> 1)
+ v = (uintptr_t)ref->key.ikey;
+ if (v & WT_IK_FLAG) { /* on-page key: decode offset and length */
+ *(void **)keyp =
+ WT_PAGE_REF_OFFSET(page, WT_IK_DECODE_KEY_OFFSET(v));
+ *sizep = WT_IK_DECODE_KEY_LEN(v);
+ } else { /* instantiated key: a WT_IKEY pointer */
+ *(void **)keyp = WT_IKEY_DATA(ref->key.ikey);
+ *sizep = ((WT_IKEY *)ref->key.ikey)->size;
+ }
+}
+
+/*
+ * __wt_ref_key_onpage_set --
+ * Set a WT_REF to reference an on-page key (encoded length/offset/flag).
+ */
+static inline void
+__wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_ref_key for an explanation of the magic.
+ */
+ v = WT_IK_ENCODE_KEY_LEN(unpack->size) |
+ WT_IK_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
+ WT_IK_FLAG;
+ ref->key.ikey = (void *)v; /* an encoded value, not a real pointer */
+}
+
+/*
+ * __wt_ref_key_instantiated --
+ * Return the WT_REF's instantiated WT_IKEY, or NULL if the key is on-page.
+ */
+static inline WT_IKEY *
+__wt_ref_key_instantiated(WT_REF *ref)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_ref_key for an explanation of the magic.
+ */
+ v = (uintptr_t)ref->key.ikey;
+ return (v & WT_IK_FLAG ? NULL : ref->key.ikey); /* flag set: on-page */
+}
+
+/*
+ * __wt_ref_key_clear --
+ * Clear a WT_REF key by zeroing the key union.
+ */
+static inline void
+__wt_ref_key_clear(WT_REF *ref)
+{
+ /* The key union has 2 fields, both 8B: zeroing recno clears both. */
+ ref->key.recno = 0;
+}
+
+/*
+ * __wt_row_leaf_key_info --
+ * Return a row-store leaf page key referenced by a WT_ROW if it can be had
+ * without unpacking a cell (returning 1 with *datap and *sizep set), and
+ * information about the cell if the key isn't cheaply available (returning 0).
+ */
+static inline int
+__wt_row_leaf_key_info(WT_PAGE *page, void *copy,
+ WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
+{
+ WT_IKEY *ikey;
+ uintptr_t v;
+
+ v = (uintptr_t)copy;
+
+ /*
+ * A row-store leaf page key is in one of two places: if instantiated,
+ * the WT_ROW pointer references a WT_IKEY structure, otherwise, it
+ * references an on-page offset. Further, on-page keys are in one of
+ * two states: if the key is a simple key (not an overflow key, prefix
+ * compressed or Huffman encoded, all of which are likely), the key's
+ * offset/size is encoded in the pointer. Otherwise, the offset is to
+ * the key's on-page cell.
+ *
+ * Now the magic: allocated memory must be aligned to store any standard
+ * type, and we expect some standard type to require at least quad-byte
+ * alignment, so allocated memory should have some clear low-order bits.
+ * On-page objects consist of an offset/length pair: the maximum page
+ * size currently fits into 29 bits, so we use the low-order bits of the
+ * pointer to mark the other bits of the pointer as encoding the key's
+ * location and length. This breaks if allocated memory isn't aligned,
+ * of course.
+ *
+ * In this specific case, the low two bits are a tag: 0x01 marks an
+ * on-page cell, 0x02 an on-page key, 0x03 an on-page key/value pair,
+ * otherwise it's a WT_IKEY reference. The bit pattern for on-page cells
+ * is:
+ * 29 bits page offset of the key's cell,
+ * 2 bits flags
+ *
+ * The bit pattern for on-page keys is:
+ * 32 bits key length,
+ * 29 bits page offset of the key's bytes,
+ * 2 bits flags
+ *
+ * But, while that allows us to skip decoding simple key cells, we also
+ * want to skip decoding the value cell in the case where the value cell
+ * is also simple/short. We use the value 0x03 to mark an encoded on-page
+ * key and value pair. The bit pattern for on-page key/value pairs is:
+ * 9 bits key length,
+ * 13 bits value length,
+ * 20 bits page offset of the key's bytes,
+ * 20 bits page offset of the value's bytes,
+ * 2 bits flags
+ *
+ * These bit patterns are in-memory only, of course, so can be modified
+ * (we could even tune for specific workloads). Generally, the fields
+ * are larger than the anticipated values being stored (512B keys, 8KB
+ * values, 1MB pages), hopefully that won't be necessary.
+ *
+ * This function returns a list of things about the key (instantiation
+ * reference, cell reference and key/length pair). Our callers know
+ * the order in which we look things up and the information returned;
+ * for example, the cell will never be returned if we are working with
+ * an on-page key.
+ */
+#define WT_CELL_FLAG 0x01
+#define WT_CELL_ENCODE_OFFSET(v) ((uintptr_t)(v) << 2)
+#define WT_CELL_DECODE_OFFSET(v) (((v) & 0xFFFFFFFF) >> 2)
+
+#define WT_K_FLAG 0x02
+#define WT_K_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32) /* needs a 64-bit uintptr_t */
+#define WT_K_DECODE_KEY_LEN(v) ((v) >> 32)
+#define WT_K_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 2)
+#define WT_K_DECODE_KEY_OFFSET(v) (((v) & 0xFFFFFFFF) >> 2)
+
+#define WT_KV_FLAG 0x03
+#define WT_KV_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 55) /* bits 55-63 */
+#define WT_KV_DECODE_KEY_LEN(v) ((v) >> 55)
+#define WT_KV_MAX_KEY_LEN (0x200 - 1)
+#define WT_KV_ENCODE_VALUE_LEN(v) ((uintptr_t)(v) << 42) /* bits 42-54 */
+#define WT_KV_DECODE_VALUE_LEN(v) (((v) & 0x007FFC0000000000) >> 42)
+#define WT_KV_MAX_VALUE_LEN (0x2000 - 1)
+#define WT_KV_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 22) /* bits 22-41 */
+#define WT_KV_DECODE_KEY_OFFSET(v) (((v) & 0x000003FFFFC00000) >> 22)
+#define WT_KV_MAX_KEY_OFFSET (0x100000 - 1)
+#define WT_KV_ENCODE_VALUE_OFFSET(v) ((uintptr_t)(v) << 2) /* bits 2-21 */
+#define WT_KV_DECODE_VALUE_OFFSET(v) (((v) & 0x00000000003FFFFC) >> 2)
+#define WT_KV_MAX_VALUE_OFFSET (0x100000 - 1)
+ switch (v & 0x03) {
+ case WT_CELL_FLAG:
+ /* On-page cell: no instantiated key; datap/sizep are not set. */
+ if (ikeyp != NULL)
+ *ikeyp = NULL;
+ if (cellp != NULL)
+ *cellp =
+ WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
+ return (0);
+ case WT_K_FLAG:
+ /* Encoded key: no instantiated key, no cell. */
+ if (cellp != NULL)
+ *cellp = NULL;
+ if (ikeyp != NULL)
+ *ikeyp = NULL;
+ if (datap != NULL) {
+ *(void **)datap =
+ WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
+ *sizep = WT_K_DECODE_KEY_LEN(v);
+ return (1);
+ }
+ return (0);
+ case WT_KV_FLAG:
+ /* Encoded key/value pair: no instantiated key, no cell. */
+ if (cellp != NULL)
+ *cellp = NULL;
+ if (ikeyp != NULL)
+ *ikeyp = NULL;
+ if (datap != NULL) {
+ *(void **)datap = WT_PAGE_REF_OFFSET(
+ page, WT_KV_DECODE_KEY_OFFSET(v));
+ *sizep = WT_KV_DECODE_KEY_LEN(v);
+ return (1);
+ }
+ return (0);
+
+ }
+
+ /* Instantiated key (tag 0x00): copy is a WT_IKEY pointer. */
+ ikey = copy;
+ if (ikeyp != NULL)
+ *ikeyp = copy;
+ if (cellp != NULL)
+ *cellp = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ if (datap != NULL) {
+ *(void **)datap = WT_IKEY_DATA(ikey);
+ *sizep = ikey->size;
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __wt_row_leaf_key_set_cell --
+ * Set a WT_ROW to reference an on-page row-store leaf cell (by offset).
+ */
+static inline void
+__wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic: the cell's page offset is stored above the two tag bits.
+ */
+ v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) |
+ WT_CELL_FLAG;
+ WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_key_set --
+ * Set a WT_ROW to reference an on-page row-store leaf key (offset/length).
+ */
+static inline void
+__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic: key length, key page offset and the tag share one word.
+ */
+ v = WT_K_ENCODE_KEY_LEN(unpack->size) |
+ WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
+ WT_K_FLAG;
+ WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_value_set --
+ * Upgrade a WT_ROW's encoded on-page key to an encoded key/value pair.
+ */
+static inline void
+__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t key_len, key_offset, value_offset, v;
+
+ v = (uintptr_t)WT_ROW_KEY_COPY(rip);
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic. Compare the whole 2-bit tag: a key/value pair also has 0x02 set.
+ */
+ if ((v & 0x03) != WT_K_FLAG) /* Only an encoded key is upgraded */
+ return;
+
+ key_len = WT_K_DECODE_KEY_LEN(v); /* Key length */
+ if (key_len > WT_KV_MAX_KEY_LEN) /* Doesn't fit: leave as is */
+ return;
+ if (unpack->size > WT_KV_MAX_VALUE_LEN) /* Value length */
+ return;
+
+ key_offset = WT_K_DECODE_KEY_OFFSET(v); /* Page offsets */
+ if (key_offset > WT_KV_MAX_KEY_OFFSET)
+ return;
+ value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data);
+ if (value_offset > WT_KV_MAX_VALUE_OFFSET)
+ return;
+
+ v = WT_KV_ENCODE_KEY_LEN(key_len) |
+ WT_KV_ENCODE_VALUE_LEN(unpack->size) |
+ WT_KV_ENCODE_KEY_OFFSET(key_offset) |
+ WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG;
+ WT_ROW_KEY_SET(rip, v); /* All four fields fit: publish the pair */
+}
+
+/*
+ * __wt_row_leaf_key --
+ * Set a buffer to reference a row-store leaf page key as cheaply as
+ * possible, calling __wt_row_leaf_key_work when there is no fast path.
+ */
+static inline int
+__wt_row_leaf_key(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, int instantiate)
+{
+ void *copy;
+
+ /*
+ * A front-end for __wt_row_leaf_key_work, here to inline fast paths.
+ *
+ * The row-store key can change underfoot; explicitly take a copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * All we handle here are on-page keys (which should be a common case),
+ * and instantiated keys (which start out rare, but become more common
+ * as a leaf page is searched, instantiating prefix-compressed keys).
+ */
+ if (__wt_row_leaf_key_info(
+ page, copy, NULL, NULL, &key->data, &key->size))
+ return (0); /* key->data and key->size were set */
+
+ /*
+ * The alternative is an on-page cell with some kind of compressed or
+ * overflow key that's never been instantiated. Call the underlying
+ * worker function to figure it out.
+ */
+ return (__wt_row_leaf_key_work(session, page, rip, key, instantiate));
+}
+
+/*
+ * __wt_cursor_row_leaf_key --
+ * Set a buffer to reference a cursor-referenced row-store leaf page key.
+ */
+static inline int
+__wt_cursor_row_leaf_key(WT_CURSOR_BTREE *cbt, WT_ITEM *key)
+{
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * If the cursor references a WT_INSERT item, take the key from there,
+ * else take the key from the original page's slot (cbt->slot).
+ */
+ if (cbt->ins == NULL) {
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ rip = &page->u.row.d[cbt->slot];
+ WT_RET(__wt_row_leaf_key(session, page, rip, key, 0)); /* 0: instantiate arg */
+ } else {
+ key->data = WT_INSERT_KEY(cbt->ins);
+ key->size = WT_INSERT_KEY_SIZE(cbt->ins);
+ }
+ return (0);
+}
+
+/*
+ * __wt_row_leaf_value_cell --
+ * Return a pointer to the value cell for a row-store leaf page key, or
+ * NULL if there isn't one (kpack, if non-NULL, is the unpacked key cell).
+ */
+static inline WT_CELL *
+__wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack)
+{
+ WT_CELL *kcell, *vcell;
+ WT_CELL_UNPACK unpack;
+ void *copy, *key;
+ size_t size;
+
+ /* If we already have an unpacked key cell, use it. */
+ if (kpack != NULL)
+ vcell = (WT_CELL *)
+ ((uint8_t *)kpack->cell + __wt_cell_total_len(kpack));
+ else {
+ /*
+ * The row-store key can change underfoot; explicitly take a
+ * copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * Figure out where the key is, step past it to the value cell.
+ * The test for a cell not being set tells us that we have an
+ * on-page key, otherwise we're looking at an instantiated key
+ * or on-page cell, both of which require an unpack of the key's
+ * cell to find the value cell that follows.
+ */
+ if (__wt_row_leaf_key_info(
+ page, copy, NULL, &kcell, &key, &size) && kcell == NULL)
+ vcell = (WT_CELL *)((uint8_t *)key + size); /* just past the key bytes */
+ else {
+ __wt_cell_unpack(kcell, &unpack);
+ vcell = (WT_CELL *)((uint8_t *)
+ unpack.cell + __wt_cell_total_len(&unpack));
+ }
+ }
+
+ return (__wt_cell_leaf_value_parse(page, vcell));
+}
+
+/*
+ * __wt_row_leaf_value --
+ * Set value from an encoded key/value pair and return 1, else return 0.
+ */
+static inline int
+__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
+{
+ uintptr_t v;
+
+ /* The row-store key can change underfoot; explicitly take a copy. */
+ v = (uintptr_t)WT_ROW_KEY_COPY(rip);
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic.
+ */
+ if ((v & 0x03) == WT_KV_FLAG) {
+ value->data =
+ WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
+ value->size = WT_KV_DECODE_VALUE_LEN(v);
+ return (1);
+ }
+ return (0); /* not an encoded pair: value is left untouched */
+}
+
+/*
+ * __wt_ref_info --
+ * Return the addr/size and type triplet for a reference (typep may be NULL).
+ */
+static inline int
+__wt_ref_info(WT_SESSION_IMPL *session,
+ WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
+{
+ WT_ADDR *addr;
+ WT_CELL_UNPACK *unpack, _unpack;
+
+ addr = ref->addr;
+ unpack = &_unpack;
+
+ /*
+ * If NULL, there is no location.
+ * If off-page, the pointer references a WT_ADDR structure.
+ * If on-page, the pointer references a cell.
+ *
+ * The type is of a limited set: internal, leaf or no-overflow leaf.
+ */
+ if (addr == NULL) {
+ *addrp = NULL;
+ *sizep = 0;
+ if (typep != NULL)
+ *typep = 0;
+ } else if (__wt_off_page(ref->home, addr)) {
+ *addrp = addr->addr;
+ *sizep = addr->size;
+ if (typep != NULL)
+ switch (addr->type) { /* map WT_ADDR type to cell type */
+ case WT_ADDR_INT:
+ *typep = WT_CELL_ADDR_INT;
+ break;
+ case WT_ADDR_LEAF:
+ *typep = WT_CELL_ADDR_LEAF;
+ break;
+ case WT_ADDR_LEAF_NO:
+ *typep = WT_CELL_ADDR_LEAF_NO;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ } else {
+ __wt_cell_unpack((WT_CELL *)addr, unpack);
+ *addrp = unpack->data;
+ *sizep = unpack->size;
+ if (typep != NULL)
+ *typep = unpack->type;
+ }
+ return (0);
+}
+
+/*
+ * __wt_page_release --
+ * Release a reference to a page, attempting forced eviction if warranted.
+ */
+static inline int
+__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ int locked;
+
+ btree = S2BT(session);
+
+ /*
+ * Discard our hazard pointer. Ignore pages we don't have and the root
+ * page, which sticks in memory, regardless.
+ */
+ if (ref == NULL || __wt_ref_is_root(ref))
+ return (0);
+ page = ref->page;
+
+ /*
+ * Attempt to evict pages with the special "oldest" read generation.
+ *
+ * This is set for pages that grow larger than the configured
+ * memory_page_max setting, and when we are attempting to scan without
+ * trashing the cache.
+ *
+ * Skip this if eviction is disabled for this operation or this tree,
+ * or if there is no chance of eviction succeeding for dirty pages due
+ * to a checkpoint or because we've already tried writing this page and
+ * it contains an update that isn't stable.
+ */
+ if (LF_ISSET(WT_READ_NO_EVICT) ||
+ page->read_gen != WT_READGEN_OLDEST ||
+ F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
+ (__wt_page_is_modified(page) && (btree->checkpointing ||
+ !__wt_txn_visible_all(session, page->modify->first_dirty_txn))))
+ return (__wt_hazard_clear(session, page));
+
+ /*
+ * Take some care with order of operations: if we release the hazard
+ * reference without first locking the page, it could be evicted in
+ * between.
+ */
+ locked = WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED);
+ WT_TRET(__wt_hazard_clear(session, page));
+ if (!locked) /* didn't get the page lock: no eviction attempt */
+ return (ret);
+
+ (void)WT_ATOMIC_ADD4(btree->evict_busy, 1);
+ if ((ret = __wt_evict_page(session, ref)) == 0)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
+ else {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail);
+ if (ret == EBUSY) /* a busy page isn't an error for the caller */
+ ret = 0;
+ }
+ (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+
+ return (ret);
+}
+
+/*
+ * __wt_page_swap_func --
+ * Swap one page's hazard pointer for another one when hazard pointer
+ * coupling up/down the tree.
+ */
+static inline int
+__wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
+ WT_REF *want, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_DECL_RET;
+ int acquired;
+
+ /*
+ * This function is here to simplify the error handling during hazard
+ * pointer coupling so we never leave a hazard pointer dangling. The
+ * assumption is we're holding a hazard pointer on "held", and want to
+ * acquire a hazard pointer on "want", releasing the hazard pointer on
+ * "held" when we're done.
+ */
+ ret = __wt_page_in_func(session, want, flags
+#ifdef HAVE_DIAGNOSTIC
+ , file, line
+#endif
+ );
+
+ /* An expected failure: WT_NOTFOUND when doing a cache-only read. */
+ if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND)
+ return (WT_NOTFOUND); /* "held" is not released on this path */
+
+ /* An expected failure: WT_RESTART */
+ if (ret == WT_RESTART)
+ return (WT_RESTART); /* "held" is not released on this path */
+
+ /* Discard the original held page. */
+ acquired = ret == 0;
+ WT_TRET(__wt_page_release(session, held, flags));
+
+ /*
+ * If there was an error discarding the original held page, discard
+ * the acquired page too, keeping it is never useful.
+ */
+ if (acquired && ret != 0)
+ WT_TRET(__wt_page_release(session, want, flags));
+ return (ret);
+}
+
+/*
+ * __wt_page_hazard_check --
+ * Return the first hazard pointer found for the page, NULL if there's none.
+ */
+static inline WT_HAZARD *
+__wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_HAZARD *hp;
+ WT_SESSION_IMPL *s;
+ uint32_t i, hazard_size, session_cnt;
+
+ conn = S2C(session);
+
+ /*
+ * No lock is required because the session array is fixed size, but it
+ * may contain inactive entries. We must review any active session
+ * that might contain a hazard pointer, so insert a barrier before
+ * reading the active session count. That way, no matter what sessions
+ * come or go, we'll check the slots for all of the sessions that could
+ * have been active when we started our check.
+ */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) {
+ if (!s->active) /* skip inactive session slots */
+ continue;
+ WT_ORDERED_READ(hazard_size, s->hazard_size);
+ for (hp = s->hazard; hp < s->hazard + hazard_size; ++hp)
+ if (hp->page == page)
+ return (hp);
+ }
+ return (NULL);
+}
+
+/*
+ * __wt_skip_choose_depth --
+ * Randomly choose a skiplist insert depth, from 1 to WT_SKIP_MAXDEPTH.
+ */
+static inline u_int
+__wt_skip_choose_depth(WT_SESSION_IMPL *session)
+{
+ u_int d;
+
+ for (d = 1; d < WT_SKIP_MAXDEPTH &&
+ __wt_random(session->rnd) < WT_SKIP_PROBABILITY; d++)
+ ; /* each draw below the probability threshold adds a level */
+ return (d);
+}
+
+/*
+ * __wt_btree_size_overflow --
+ * Check if the size of an in-memory tree with a single leaf page is over
+ * a specified maximum. If called on anything other than a simple tree with a
+ * single leaf page, returns true so the calling code will switch to a new tree.
+ */
+static inline int
+__wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize)
+{
+ WT_BTREE *btree;
+ WT_PAGE *child, *root;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *first;
+
+ btree = S2BT(session);
+ root = btree->root.page;
+
+ /* Check for a non-existent tree: it is never over the maximum. */
+ if (root == NULL)
+ return (0);
+
+ /* A tree that can be evicted always requires a switch. */
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ return (1);
+
+ /* Check for a tree with a single leaf page. */
+ pindex = WT_INTL_INDEX_COPY(root);
+ if (pindex->entries != 1) /* > 1 child page, switch */
+ return (1);
+
+ first = pindex->index[0];
+ if (first->state != WT_REF_MEM) /* no child page, ignore */
+ return (0);
+
+ /*
+ * We're reaching down into the page without a hazard pointer, but
+ * that's OK because we know that no-eviction is set and so the page
+ * cannot disappear.
+ */
+ child = first->page;
+ if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */
+ return (1);
+
+ return (child->memory_footprint > maxsize); /* strictly over the maximum */
+}
+
+/*
+ * __wt_lex_compare --
+ * Lexicographic (unsigned byte-wise) comparison routine.
+ *
+ * Returns:
+ * < 0 if user_item is lexicographically < tree_item
+ * = 0 if user_item is lexicographically = tree_item
+ * > 0 if user_item is lexicographically > tree_item
+ *
+ * We use the names "user" and "tree" so it's clear in the btree code which
+ * the application is looking at when we call its comparison func.
+ */
+static inline int
+__wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item)
+{
+ const uint8_t *userp, *treep;
+ size_t len, usz, tsz;
+
+ usz = user_item->size;
+ tsz = tree_item->size;
+ len = WT_MIN(usz, tsz); /* only the common prefix is compared */
+
+ for (userp = user_item->data, treep = tree_item->data;
+ len > 0;
+ --len, ++userp, ++treep)
+ if (*userp != *treep)
+ return (*userp < *treep ? -1 : 1);
+
+ /* Contents are equal up to the smallest length: the shorter sorts first. */
+ return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1);
+}
+
+/*
+ * __wt_compare --
+ * The same as __wt_lex_compare, but using the application's collator
+ * function when configured; the comparison result is returned in *cmpp.
+ */
+static inline int
+__wt_compare(WT_SESSION_IMPL *session, WT_COLLATOR *collator,
+ const WT_ITEM *user_item, const WT_ITEM *tree_item, int *cmpp)
+{
+ if (collator == NULL) { /* no collator: default byte ordering */
+ *cmpp = __wt_lex_compare(user_item, tree_item);
+ return (0);
+ }
+ return (collator->compare(
+ collator, &session->iface, user_item, tree_item, cmpp));
+}
+
+/*
+ * __wt_lex_compare_skip --
+ * Lexicographic comparison routine, skipping the leading *matchp bytes;
+ * *matchp is advanced by the number of further bytes found to match.
+ * Returns:
+ * < 0 if user_item is lexicographically < tree_item
+ * = 0 if user_item is lexicographically = tree_item
+ * > 0 if user_item is lexicographically > tree_item
+ *
+ * We use the names "user" and "tree" so it's clear in the btree code which
+ * the application is looking at when we call its comparison func.
+ */
+static inline int
+__wt_lex_compare_skip(
+ const WT_ITEM *user_item, const WT_ITEM *tree_item, size_t *matchp)
+{
+ const uint8_t *userp, *treep;
+ size_t len, usz, tsz;
+
+ usz = user_item->size;
+ tsz = tree_item->size;
+ len = WT_MIN(usz, tsz) - *matchp; /* unsigned: wraps if *matchp is larger */
+
+ for (userp = (uint8_t *)user_item->data + *matchp,
+ treep = (uint8_t *)tree_item->data + *matchp;
+ len > 0;
+ --len, ++userp, ++treep, ++*matchp)
+ if (*userp != *treep)
+ return (*userp < *treep ? -1 : 1);
+
+ /* Contents are equal up to the smallest length. */
+ return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1);
+}
+
+/*
+ * __wt_compare_skip --
+ * The same as __wt_lex_compare_skip, but using the application's collator
+ * function when configured (in which case matchp is neither read nor set).
+ */
+static inline int
+__wt_compare_skip(WT_SESSION_IMPL *session, WT_COLLATOR *collator,
+ const WT_ITEM *user_item, const WT_ITEM *tree_item, int *cmpp,
+ size_t *matchp)
+{
+ if (collator == NULL) { /* no collator: default byte ordering */
+ *cmpp = __wt_lex_compare_skip(user_item, tree_item, matchp);
+ return (0);
+ }
+ return (collator->compare(
+ collator, &session->iface, user_item, tree_item, cmpp));
+}
diff --git a/src/third_party/wiredtiger/src/include/buf.i b/src/third_party/wiredtiger/src/include/buf.i
new file mode 100644
index 00000000000..09bee9ff831
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/buf.i
@@ -0,0 +1,133 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_buf_grow --
+ * Grow a buffer that may be in-use, and ensure that all data is local to
+ * the buffer; a no-op returning 0 if it's big enough and the data is local.
+ */
+static inline int
+__wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ return (size > buf->memsize || !WT_DATA_IN_ITEM(buf) ?
+ __wt_buf_grow_worker(session, buf, size) : 0);
+}
+
+/*
+ * __wt_buf_extend --
+ * Grow a buffer that's currently in-use, at least doubling its memory.
+ */
+static inline int
+__wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ /*
+ * The difference between __wt_buf_grow and __wt_buf_extend is that the
+ * latter is expected to be called repeatedly for the same buffer, and
+ * so grows the buffer exponentially to avoid repeated costly calls to
+ * realloc.
+ */
+ return (size > buf->memsize ?
+ __wt_buf_grow(session, buf, WT_MAX(size, 2 * buf->memsize)) : 0);
+}
+
+/*
+ * __wt_buf_init --
+ * Initialize a buffer at a specific size, leaving its data length zero.
+ */
+static inline int
+__wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ buf->data = buf->mem; /* Reference the buffer's own memory */
+ buf->size = 0; /* Clear existing data length */
+ WT_RET(__wt_buf_grow(session, buf, size));
+
+ return (0);
+}
+
+/*
+ * __wt_buf_initsize --
+ * Initialize a buffer at a specific size, and set the data length to it.
+ */
+static inline int
+__wt_buf_initsize(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ buf->data = buf->mem; /* Reference the buffer's own memory */
+ buf->size = 0; /* Clear existing data length */
+ WT_RET(__wt_buf_grow(session, buf, size));
+ buf->size = size; /* Set the data length. */
+
+ return (0);
+}
+
+/*
+ * __wt_buf_set --
+ * Set the contents of the buffer to a copy of size bytes of data.
+ */
+static inline int
+__wt_buf_set(
+ WT_SESSION_IMPL *session, WT_ITEM *buf, const void *data, size_t size)
+{
+ /* Ensure the buffer is large enough (this also sets the data length). */
+ WT_RET(__wt_buf_initsize(session, buf, size));
+
+ /* Copy the data, allowing for overlapping strings (hence memmove). */
+ memmove(buf->mem, data, size);
+
+ return (0);
+}
+
+/*
+ * __wt_buf_setstr --
+ * Set the contents of the buffer to a NUL-terminated string (NUL included).
+ */
+static inline int
+__wt_buf_setstr(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *s)
+{
+ return (__wt_buf_set(session, buf, s, strlen(s) + 1)); /* + 1: the NUL */
+}
+
+/*
+ * __wt_buf_set_printable --
+ * Set the contents of the buffer to a printable representation of a
+ * byte string; a thin wrapper around __wt_raw_to_esc_hex.
+ */
+static inline int
+__wt_buf_set_printable(
+ WT_SESSION_IMPL *session, WT_ITEM *buf, const void *from_arg, size_t size)
+{
+ return (__wt_raw_to_esc_hex(session, from_arg, size, buf));
+}
+
+/*
+ * __wt_buf_free --
+ * Free a buffer's memory and zero the whole WT_ITEM structure.
+ */
+static inline void
+__wt_buf_free(WT_SESSION_IMPL *session, WT_ITEM *buf)
+{
+ __wt_free(session, buf->mem);
+
+ memset(buf, 0, sizeof(WT_ITEM));
+}
+
+/*
+ * __wt_scr_free --
+ * Release a scratch buffer; a NULL *bufp is ignored, buf->mem is not freed.
+ */
+static inline void
+__wt_scr_free(WT_ITEM **bufp)
+{
+ WT_ITEM *buf;
+
+ if ((buf = *bufp) != NULL) {
+ *bufp = NULL; /* clear the caller's reference */
+
+ buf->data = NULL;
+ buf->size = 0;
+ F_CLR(buf, WT_ITEM_INUSE); /* no longer marked in-use */
+ }
+}
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
new file mode 100644
index 00000000000..b7dbd8401a9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -0,0 +1,139 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Tuning constants: I hesitate to call this tuning, but we want to review some
+ * number of pages from each file's in-memory tree for each page we evict.
+ */
+#define WT_EVICT_INT_SKEW (1<<20) /* Prefer leaf pages over internal
+ pages by this many increments of the
+ read generation. */
+#define WT_EVICT_WALK_PER_FILE 10 /* Pages to visit per file */
+#define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */
+#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
+
+#define WT_EVICT_PASS_AGGRESSIVE 0x01
+#define WT_EVICT_PASS_ALL 0x02
+#define WT_EVICT_PASS_DIRTY 0x04
+
+/*
+ * WT_EVICT_ENTRY --
+ * Encapsulation of an eviction candidate: a page reference paired with
+ * the btree that contains it.
+ */
+struct __wt_evict_entry {
+ WT_BTREE *btree; /* Enclosing btree object */
+ WT_REF *ref; /* Page to flush/evict */
+};
+
+/*
+ * WT_EVICT_WORKER --
+ * Encapsulation of an eviction worker thread.
+ *
+ * NOTE(review): the field descriptions below are inferred from the field
+ * names and types only; confirm against the eviction server code.
+ */
+
+struct __wt_evict_worker {
+ WT_SESSION_IMPL *session; /* presumably the worker's own session */
+ u_int id; /* presumably the worker's index */
+ wt_thread_t tid; /* presumably the worker's thread handle */
+#define WT_EVICT_WORKER_RUN 0x01
+ uint32_t flags; /* WT_EVICT_WORKER_XXX flag bits */
+};
+
+/*
+ * WiredTiger cache structure.
+ *
+ * Groups the cache's byte/page accounting, the eviction server's state and
+ * thresholds, the LRU eviction queue, sync request counters, cache pool
+ * bookkeeping and a set of state flags.
+ */
+struct __wt_cache {
+ /*
+ * Different threads read/write pages to/from the cache and create pages
+ * in the cache, so we cannot know precisely how much memory is in use
+ * at any specific time. However, even though the values don't have to
+ * be exact, they can't be garbage, we track what comes in and what goes
+ * out and calculate the difference as needed.
+ */
+ uint64_t bytes_inmem; /* Bytes/pages in memory */
+ uint64_t pages_inmem;
+ uint64_t bytes_evict; /* Bytes/pages discarded by eviction */
+ uint64_t pages_evict;
+ uint64_t bytes_dirty; /* Bytes/pages currently dirty */
+ uint64_t pages_dirty;
+
+ /*
+ * Read information.
+ */
+ uint64_t read_gen; /* Page read generation (LRU) */
+
+ /*
+ * Eviction thread information.
+ */
+ WT_CONDVAR *evict_cond; /* Eviction server condition */
+ WT_SPINLOCK evict_lock; /* Eviction LRU queue */
+ WT_SPINLOCK evict_walk_lock; /* Eviction walk location */
+ /* Condition signalled when the eviction server populates the queue */
+ WT_CONDVAR *evict_waiter_cond;
+
+ /*
+ * Eviction thresholds, expressed as percentages of the configured
+ * cache size.
+ */
+ u_int eviction_trigger; /* Percent to trigger eviction */
+ u_int eviction_target; /* Percent to end eviction */
+ u_int eviction_dirty_target; /* Percent to allow dirty */
+
+ /*
+ * LRU eviction list information.
+ */
+ WT_EVICT_ENTRY *evict; /* LRU pages being tracked */
+ WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */
+ uint32_t evict_candidates; /* LRU list pages to evict */
+ uint32_t evict_entries; /* LRU entries in the queue */
+ volatile uint32_t evict_max; /* LRU maximum eviction slot used */
+ uint32_t evict_slots; /* LRU list eviction slots */
+ WT_DATA_HANDLE
+ *evict_file_next; /* LRU next file to search */
+
+ /*
+ * Sync/flush request information.
+ */
+ volatile uint64_t sync_request; /* File sync requests */
+ volatile uint64_t sync_complete;/* File sync requests completed */
+
+ /*
+ * Cache pool information.
+ */
+ uint64_t cp_saved_evict; /* Evict count from last pass */
+ uint64_t cp_current_evict; /* Evict count from current pass */
+ uint32_t cp_skip_count; /* Post change stabilization */
+ uint64_t cp_reserved; /* Base size for this cache */
+ WT_SESSION_IMPL *cp_session; /* May be used for cache management */
+ wt_thread_t cp_tid; /* Thread ID for cache pool manager */
+
+ /*
+ * Flags.
+ */
+#define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */
+#define WT_CACHE_POOL_RUN 0x02 /* Cache pool thread running */
+#define WT_EVICT_ACTIVE 0x04 /* Eviction server is active */
+#define WT_EVICT_CLEAR_WALKS 0x08 /* Clear eviction walks */
+#define WT_EVICT_NO_PROGRESS 0x10 /* Check if pages are being evicted */
+#define WT_EVICT_STUCK 0x20 /* Eviction server is stuck */
+ uint32_t flags;
+};
+
+/*
+ * WT_CACHE_POOL --
+ * A structure that represents a shared cache.
+ *
+ * NOTE(review): the uncommented fields below are described from their names
+ * and types only; confirm units and ownership against the cache pool code.
+ */
+struct __wt_cache_pool {
+ WT_SPINLOCK cache_pool_lock; /* presumably guards the connection list */
+ WT_CONDVAR *cache_pool_cond;
+ const char *name; /* presumably the pool's configured name */
+ uint64_t size; /* presumably the pool's total size */
+ uint64_t chunk; /* presumably the allocation granularity */
+ uint64_t currently_used; /* presumably the amount handed out */
+ uint32_t refs; /* Reference count for structure. */
+ /* Locked: List of connections participating in the cache pool. */
+ TAILQ_HEAD(__wt_cache_pool_qh, __wt_connection_impl) cache_pool_qh;
+
+#define WT_CACHE_POOL_MANAGED 0x01 /* Cache pool has a manager thread */
+#define WT_CACHE_POOL_ACTIVE 0x02 /* Cache pool is active */
+ uint8_t flags_atomic;
+};
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
new file mode 100644
index 00000000000..fdb7302f4a8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_eviction_check --
+ * Wake the eviction server if necessary.
+ *
+ * On return, *fullp holds the cache usage as an integer percentage of the
+ * configured cache size. If "wake" is non-zero and either the bytes in use
+ * exceed the eviction trigger or the dirty bytes exceed the dirty target,
+ * the eviction server is woken.
+ */
+static inline int
+__wt_eviction_check(WT_SESSION_IMPL *session, int *fullp, int wake)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ uint64_t bytes_inuse, bytes_max, dirty_inuse;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ /*
+ * If we're over the maximum cache, shut out reads (which include page
+ * allocations) until we evict to back under the maximum cache.
+ * Eviction will keep pushing out pages so we don't run on the edge all
+ * the time. Avoid division by zero if the cache size has not yet been
+ * set in a shared cache: that is why one is added to the cache size.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ dirty_inuse = cache->bytes_dirty;
+ bytes_max = conn->cache_size + 1;
+
+ /* Calculate the cache full percentage. */
+ *fullp = (int)((100 * bytes_inuse) / bytes_max);
+
+ /* Wake eviction when we're over the trigger cache size. */
+ if (wake &&
+ (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100 ||
+ dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100))
+ WT_RET(__wt_evict_server_wake(session));
+ return (0);
+}
+
+/*
+ * __wt_session_can_wait --
+ *	Report whether a session is available for a potentially slow
+ *	operation; returns 1 if so, 0 otherwise.
+ */
+static inline int
+__wt_session_can_wait(WT_SESSION_IMPL *session)
+{
+	/*
+	 * Two conditions must hold. First, the session must be flagged as able
+	 * to wait (used, for example, by the block manager in the case of
+	 * flushing the system cache). Second, the session must hold neither
+	 * the LSM tree lock (LSM sets the no-cache-check flag while holding
+	 * it) nor the schema lock: in those cases we don't want to hijack the
+	 * thread for eviction.
+	 */
+	return (F_ISSET(session, WT_SESSION_CAN_WAIT) &&
+	    !F_ISSET(session,
+	    WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED));
+}
+
+/*
+ * __wt_cache_full_check --
+ * Wait for there to be space in the cache before a read or update.
+ *
+ * Returns 0 without doing any work when the session must not be used for
+ * eviction, when the tree cannot be evicted, or when the cache is less than
+ * 95% full. Otherwise the calling thread evicts pages itself, waiting on the
+ * eviction queue as needed, until the cache drops below 100% full or its
+ * page quota is met. Errors from the helpers are returned to the caller.
+ */
+static inline int
+__wt_cache_full_check(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+ int busy, count, full;
+
+ /*
+ * LSM sets the no-cache-check flag when holding the LSM tree lock,
+ * in that case, or when holding the schema lock, we don't want to
+ * highjack the thread for eviction.
+ */
+ if (F_ISSET(session,
+ WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED))
+ return (0);
+
+ /*
+ * Threads operating on trees that cannot be evicted are ignored,
+ * mostly because they're not contributing to the problem.
+ */
+ if ((btree = S2BT_SAFE(session)) != NULL &&
+ F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ return (0);
+
+ /*
+ * Only wake the eviction server the first time through here (if the
+ * cache is too full).
+ *
+ * If the cache is less than 95% full, no work to be done.
+ */
+ WT_RET(__wt_eviction_check(session, &full, 1));
+ if (full < 95)
+ return (0);
+
+ /*
+ * If we are at the API boundary and the cache is more than 95% full,
+ * try to evict at least one page before we start an operation. This
+ * helps with some eviction-dominated workloads.
+ *
+ * If the current transaction is keeping the oldest ID pinned, it is in
+ * the middle of an operation. This may prevent the oldest ID from
+ * moving forward, leading to deadlock, so only evict what we can.
+ * Otherwise, we are at a transaction boundary and we can work harder
+ * to make sure there is free space in the cache.
+ */
+ txn_global = &S2C(session)->txn_global;
+ txn_state = &txn_global->states[session->id];
+ busy = txn_state->id != WT_TXN_NONE ||
+ session->nhazard > 0 ||
+ (txn_state->snap_min != WT_TXN_NONE &&
+ txn_global->current != txn_global->oldest_id);
+ if (busy && full < 100)
+ return (0);
+ /* Page quota: a busy session evicts one page, an idle one up to ten. */
+ count = busy ? 1 : 10;
+
+ for (;;) {
+ /*
+ * 0: a page was evicted, stop once the quota is met.
+ * EBUSY: retry the eviction immediately.
+ * WT_NOTFOUND: nothing queued, fall through to the re-check.
+ * Anything else is an error returned to the caller.
+ */
+ switch (ret = __wt_evict_lru_page(session, 1)) {
+ case 0:
+ if (--count == 0)
+ return (0);
+ break;
+ case EBUSY:
+ continue;
+ case WT_NOTFOUND:
+ break;
+ default:
+ return (ret);
+ }
+
+ /* Re-measure the cache without waking the eviction server again. */
+ WT_RET(__wt_eviction_check(session, &full, 0));
+ if (full < 100)
+ return (0);
+ else if (ret == 0)
+ continue;
+
+ /*
+ * The cache is still full and no pages were found in the queue
+ * to evict. If this transaction is the one holding back the
+ * oldest ID, we can't wait forever. We'll block next time we
+ * are not busy.
+ */
+ if (busy) {
+ __wt_txn_update_oldest(session);
+ if (txn_state->id == txn_global->oldest_id ||
+ txn_state->snap_min == txn_global->oldest_id)
+ return (0);
+ }
+
+ /* Wait for the queue to re-populate before trying again. */
+ WT_RET(__wt_cond_wait(session,
+ S2C(session)->cache->evict_waiter_cond, 100000));
+
+ /* Check if things have changed so that we are busy. */
+ if (!busy && txn_state->snap_min != WT_TXN_NONE &&
+ txn_global->current != txn_global->oldest_id)
+ busy = count = 1;
+ }
+}
diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i
new file mode 100644
index 00000000000..42c7c07a30c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cell.i
@@ -0,0 +1,816 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_CELL --
+ * Variable-length cell type.
+ *
+ * Pages containing variable-length keys or values data (the WT_PAGE_ROW_INT,
+ * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types), have
+ * cells after the page header.
+ *
+ * There are 4 basic cell types: keys and data (each of which has an overflow
+ * form), deleted cells and off-page references. The cell is usually followed
+ * by additional data, varying by type: a key or data cell is followed by a set
+ * of bytes, an address cookie follows overflow or off-page cells.
+ *
+ * Deleted cells are place-holders for column-store files, where entries cannot
+ * be removed in order to preserve the record count.
+ *
+ * Here's the cell use by page type:
+ *
+ * WT_PAGE_ROW_INT (row-store internal page):
+ * Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL
+ * cell followed by a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_ROW_LEAF (row-store leaf page):
+ * Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell,
+ * normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell).
+ *
+ * WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single
+ * byte count immediately following the cell.
+ *
+ * WT_PAGE_COL_INT (Column-store internal page):
+ * Off-page references (a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells):
+ * Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted
+ * cells (a WT_CELL_DEL cell).
+ *
+ * Each cell starts with a descriptor byte:
+ *
+ * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell
+ * carrying data less than 64B, where we can store the data length in the cell
+ * descriptor byte):
+ * 0x00 Not a short key/data cell
+ * 0x01 Short key cell
+ * 0x02 Short key cell, with a following prefix-compression byte
+ * 0x03 Short value cell
+ * In these cases, the other 6 bits of the descriptor byte are the data length.
+ *
+ * Bit 3 marks an 8B packed, uint64_t value following the cell description byte.
+ * (A run-length counter or a record number for variable-length column store.)
+ *
+ * Bit 4 is unused.
+ *
+ * Bits 5-8 are cell "types".
+ */
+#define WT_CELL_KEY_SHORT 0x01 /* Short key */
+#define WT_CELL_KEY_SHORT_PFX 0x02 /* Short key with prefix byte */
+#define WT_CELL_VALUE_SHORT 0x03 /* Short data */
+#define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U)
+
+#define WT_CELL_SHORT_MAX 63 /* Maximum short key/value */
+#define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */
+
+#define WT_CELL_64V 0x04 /* Associated value */
+
+/*
+ * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a
+ * backward compatible way by adding bit 4 to the type mask and adding new types
+ * that incorporate it.
+ */
+#define WT_CELL_UNUSED_BIT4 0x08 /* Unused */
+
+/*
+ * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf
+ * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the
+ * page has no overflow items. (The goal is to speed up truncation as we don't
+ * have to read pages without overflow items in order to delete them. Note,
+ * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without
+ * overflow items, the only guarantee is that if set, the page has no overflow
+ * items.)
+ *
+ * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting
+ * value dictionaries: if the two values are the same, we only store them once
+ * and have the second and subsequent use reference the original.
+ */
+#define WT_CELL_ADDR_DEL (0) /* Address: deleted */
+#define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */
+#define WT_CELL_ADDR_LEAF (2 << 4) /* Address: leaf */
+#define WT_CELL_ADDR_LEAF_NO (3 << 4) /* Address: leaf no overflow */
+#define WT_CELL_DEL (4 << 4) /* Deleted value */
+#define WT_CELL_KEY (5 << 4) /* Key */
+#define WT_CELL_KEY_OVFL (6 << 4) /* Overflow key */
+#define WT_CELL_KEY_OVFL_RM (12 << 4) /* Overflow key (removed) */
+#define WT_CELL_KEY_PFX (7 << 4) /* Key with prefix byte */
+#define WT_CELL_VALUE (8 << 4) /* Value */
+#define WT_CELL_VALUE_COPY (9 << 4) /* Value copy */
+#define WT_CELL_VALUE_OVFL (10 << 4) /* Overflow value */
+#define WT_CELL_VALUE_OVFL_RM (11 << 4) /* Overflow value (removed) */
+
+#define WT_CELL_TYPE_MASK (0x0fU << 4) /* Maximum 16 cell types */
+#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK)
+
+/*
+ * When we aren't able to create a short key or value (and, in the case of a
+ * value, there's no associated RLE), the key or value is at least 64B, else
+ * we'd have been able to store it as a short cell. Decrement/Increment the
+ * size before storing it, in the hopes that relatively small key/value sizes
+ * will pack into a single byte instead of two bytes.
+ */
+#define WT_CELL_SIZE_ADJUST 64
+
+/*
+ * WT_CELL --
+ * Variable-length, on-page cell header.
+ *
+ * The structure is an opaque byte array sized for the largest possible
+ * header; __chunk[0] is always the cell descriptor byte.
+ */
+struct __wt_cell {
+ /*
+ * Maximum of 16 bytes:
+ * 1: cell descriptor byte
+ * 1: prefix compression count
+ * 9: associated 64-bit value (uint64_t encoding, max 9 bytes)
+ * 5: data length (uint32_t encoding, max 5 bytes)
+ *
+ * This calculation is pessimistic: the prefix compression count and
+ * 64V value overlap, the 64V value and data length are optional.
+ */
+ uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE];
+};
+
+/*
+ * WT_CELL_UNPACK --
+ * Unpacked cell: the decoded form of an on-page WT_CELL.
+ */
+struct __wt_cell_unpack {
+ WT_CELL *cell; /* Cell's disk image address */
+
+ uint64_t v; /* RLE count or recno */
+
+ /*
+ * !!!
+ * The size and __len fields would reasonably be type size_t, but don't
+ * change the type: performance drops significantly if they're type
+ * size_t.
+ */
+ const void *data; /* Data */
+ uint32_t size; /* Data size */
+
+ uint32_t __len; /* Cell + data length (usually) */
+
+ uint8_t prefix; /* Cell prefix length */
+
+ uint8_t raw; /* Raw cell type (include "shorts") */
+ uint8_t type; /* Cell type */
+
+ uint8_t ovfl; /* boolean: cell is an overflow */
+};
+
+/*
+ * WT_CELL_FOREACH --
+ * Walk the cells on a page.
+ *
+ * The loop runs once per entry in the page header. The step expression
+ * advances by (unpack)->__len, so the loop body must unpack the current cell
+ * into "unpack" before the end of each iteration.
+ */
+#define WT_CELL_FOREACH(btree, dsk, cell, unpack, i) \
+ for ((cell) = \
+ WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries; \
+ (i) > 0; \
+ (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->__len), --(i))
+
+/*
+ * __wt_cell_pack_addr --
+ *	Build the header of an address cell and return the header's length in
+ *	bytes. A non-zero record number is stored after the descriptor byte and
+ *	flagged with WT_CELL_64V; the data length always follows.
+ */
+static inline size_t
+__wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size)
+{
+	uint8_t *next;
+	int has_recno;
+
+	has_recno = recno != 0;
+
+	/* Descriptor byte: the type, plus the 64V bit if a recno follows. */
+	cell->__chunk[0] = has_recno ? (cell_type | WT_CELL_64V) : cell_type;
+
+	next = &cell->__chunk[1];
+	if (has_recno)
+		(void)__wt_vpack_uint(&next, 0, recno);	/* Record number */
+	(void)__wt_vpack_uint(&next, 0, (uint64_t)size);	/* Length */
+
+	return (WT_PTRDIFF(next, cell));
+}
+
+/*
+ * __wt_cell_pack_data --
+ *	Build the header of a data (value) cell and return the header's length
+ *	in bytes.
+ */
+static inline size_t
+__wt_cell_pack_data(WT_CELL *cell, uint64_t rle, size_t size)
+{
+	uint8_t *next;
+
+	if (rle < 2) {
+		/*
+		 * Without run-length encoding, short items fit entirely in the
+		 * descriptor byte: 6 bits of length plus the short-value type.
+		 */
+		if (size <= WT_CELL_SHORT_MAX) {
+			cell->__chunk[0] =
+			    ((uint8_t)size << WT_CELL_SHORT_SHIFT) |
+			    WT_CELL_VALUE_SHORT;
+			return (1);
+		}
+
+		/* Not short: store the length less the size adjustment. */
+		size -= WT_CELL_SIZE_ADJUST;
+		cell->__chunk[0] = WT_CELL_VALUE;		/* Type */
+		next = &cell->__chunk[1];
+	} else {
+		cell->__chunk[0] = WT_CELL_VALUE | WT_CELL_64V;	/* Type */
+		next = &cell->__chunk[1];
+		(void)__wt_vpack_uint(&next, 0, rle);		/* RLE */
+	}
+
+	(void)__wt_vpack_uint(&next, 0, (uint64_t)size);	/* Length */
+	return (WT_PTRDIFF(next, cell));
+}
+
+/*
+ * __wt_cell_pack_data_match --
+ * Return if two items would have identical WT_CELLs (except for any RLE).
+ *
+ * The answer is stored in *matchp (1 for a match, 0 otherwise). The function
+ * returns 0 whenever a comparison result was produced, including when either
+ * cell is not a value cell (which is reported as "no match").
+ */
+static inline int
+__wt_cell_pack_data_match(
+ WT_CELL *page_cell, WT_CELL *val_cell, const uint8_t *val_data, int *matchp)
+{
+ const uint8_t *a, *b;
+ uint64_t av, bv;
+ int rle;
+
+ *matchp = 0; /* Default to no-match */
+
+ /*
+ * This is a special-purpose function used by reconciliation to support
+ * dictionary lookups. We're passed an on-page cell and a created cell
+ * plus a chunk of data we're about to write on the page, and we return
+ * if they would match on the page. The column-store comparison ignores
+ * the RLE because the copied cell will have its own RLE.
+ */
+ a = (uint8_t *)page_cell;
+ b = (uint8_t *)val_cell;
+
+ /*
+ * Decode the on-page cell's data length into av, leaving "a" pointing
+ * at the first byte after the cell header.
+ */
+ if (WT_CELL_SHORT_TYPE(a[0]) == WT_CELL_VALUE_SHORT) {
+ av = a[0] >> WT_CELL_SHORT_SHIFT;
+ ++a;
+ } else if (WT_CELL_TYPE(a[0]) == WT_CELL_VALUE) {
+ rle = a[0] & WT_CELL_64V ? 1 : 0; /* Skip any RLE */
+ ++a;
+ if (rle)
+ WT_RET(__wt_vunpack_uint(&a, 0, &av));
+ WT_RET(__wt_vunpack_uint(&a, 0, &av)); /* Length */
+ } else
+ return (0);
+
+ /* Decode the created cell's data length into bv in the same way. */
+ if (WT_CELL_SHORT_TYPE(b[0]) == WT_CELL_VALUE_SHORT) {
+ bv = b[0] >> WT_CELL_SHORT_SHIFT;
+ ++b;
+ } else if (WT_CELL_TYPE(b[0]) == WT_CELL_VALUE) {
+ rle = b[0] & WT_CELL_64V ? 1 : 0; /* Skip any RLE */
+ ++b;
+ if (rle)
+ WT_RET(__wt_vunpack_uint(&b, 0, &bv));
+ WT_RET(__wt_vunpack_uint(&b, 0, &bv)); /* Length */
+ } else
+ return (0);
+
+ /* Same length: compare the on-page bytes with the data to be written. */
+ if (av == bv)
+ *matchp = memcmp(a, val_data, av) == 0 ? 1 : 0;
+ return (0);
+}
+
+/*
+ * __wt_cell_pack_copy --
+ *	Build a copy-value cell (a reference to an earlier cell) and return
+ *	the cell's length in bytes.
+ */
+static inline size_t
+__wt_cell_pack_copy(WT_CELL *cell, uint64_t rle, uint64_t v)
+{
+	uint8_t *next;
+
+	next = &cell->__chunk[1];
+
+	cell->__chunk[0] = WT_CELL_VALUE_COPY;		/* Type */
+	if (rle >= 2) {
+		/* A repeat count follows the descriptor byte. */
+		cell->__chunk[0] |= WT_CELL_64V;
+		(void)__wt_vpack_uint(&next, 0, rle);	/* RLE */
+	}
+	(void)__wt_vpack_uint(&next, 0, v);		/* Copy offset */
+
+	return (WT_PTRDIFF(next, cell));
+}
+
+/*
+ * __wt_cell_pack_del --
+ *	Build a deleted-value cell and return the cell's length in bytes.
+ */
+static inline size_t
+__wt_cell_pack_del(WT_CELL *cell, uint64_t rle)
+{
+	uint8_t *next;
+
+	/* A single deleted value is just the descriptor byte. */
+	if (rle < 2) {
+		cell->__chunk[0] = WT_CELL_DEL;
+		return (1);
+	}
+
+	/* Otherwise the repeat count follows the descriptor byte. */
+	cell->__chunk[0] = WT_CELL_DEL | WT_CELL_64V;
+	next = &cell->__chunk[1];
+	(void)__wt_vpack_uint(&next, 0, rle);		/* RLE */
+
+	return (WT_PTRDIFF(next, cell));
+}
+
+/*
+ * __wt_cell_pack_int_key --
+ *	Build the header of a row-store internal page key cell and return the
+ *	header's length in bytes.
+ */
+static inline size_t
+__wt_cell_pack_int_key(WT_CELL *cell, size_t size)
+{
+	uint8_t *next;
+
+	if (size > WT_CELL_SHORT_MAX) {
+		/*
+		 * Too long for a short cell: write the type, then the length
+		 * less the size adjustment.
+		 */
+		cell->__chunk[0] = WT_CELL_KEY;
+		next = &cell->__chunk[1];
+		(void)__wt_vpack_uint(
+		    &next, 0, (uint64_t)(size - WT_CELL_SIZE_ADJUST));
+		return (WT_PTRDIFF(next, cell));
+	}
+
+	/* Short keys have 6 bits of data length in the descriptor byte. */
+	cell->__chunk[0] =
+	    ((uint8_t)size << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT;
+	return (1);
+}
+
+/*
+ * __wt_cell_pack_leaf_key --
+ *	Build the header of a row-store leaf page key cell, with an optional
+ *	prefix-compression byte, and return the header's length in bytes.
+ */
+static inline size_t
+__wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
+{
+	uint8_t *next;
+	int has_prefix;
+
+	has_prefix = prefix != 0;
+
+	/* Short keys have 6 bits of data length in the descriptor byte. */
+	if (size <= WT_CELL_SHORT_MAX) {
+		cell->__chunk[0] = ((uint8_t)size << WT_CELL_SHORT_SHIFT) |
+		    (has_prefix ? WT_CELL_KEY_SHORT_PFX : WT_CELL_KEY_SHORT);
+		if (!has_prefix)
+			return (1);
+
+		cell->__chunk[1] = prefix;		/* Prefix */
+		return (2);
+	}
+
+	/* Long keys: type, optional prefix byte, then the adjusted length. */
+	cell->__chunk[0] = has_prefix ? WT_CELL_KEY_PFX : WT_CELL_KEY;
+	next = &cell->__chunk[1];
+	if (has_prefix)
+		*next++ = prefix;			/* Prefix */
+
+	(void)__wt_vpack_uint(
+	    &next, 0, (uint64_t)(size - WT_CELL_SIZE_ADJUST));	/* Length */
+
+	return (WT_PTRDIFF(next, cell));
+}
+
+/*
+ * __wt_cell_pack_ovfl --
+ *	Build the header of an overflow cell of the given type and return the
+ *	header's length in bytes.
+ */
+static inline size_t
+__wt_cell_pack_ovfl(WT_CELL *cell, uint8_t type, uint64_t rle, size_t size)
+{
+	uint8_t *next;
+
+	next = &cell->__chunk[1];
+
+	cell->__chunk[0] = type;			/* Type */
+	if (rle >= 2) {
+		/* A repeat count follows the descriptor byte. */
+		cell->__chunk[0] |= WT_CELL_64V;
+		(void)__wt_vpack_uint(&next, 0, rle);	/* RLE */
+	}
+	(void)__wt_vpack_uint(&next, 0, (uint64_t)size);	/* Length */
+
+	return (WT_PTRDIFF(next, cell));
+}
+
+/*
+ * __wt_cell_rle --
+ *	Return the number of occurrences the cell represents.
+ */
+static inline uint64_t
+__wt_cell_rle(WT_CELL_UNPACK *unpack)
+{
+	uint64_t rle;
+
+	/*
+	 * An item that occurs once is stored without any RLE at all, which
+	 * unpacks as a count of 0. Correct that here, in one place, so callers
+	 * always see a count of at least 1.
+	 */
+	rle = unpack->v;
+	if (rle < 2)
+		rle = 1;
+	return (rle);
+}
+
+/*
+ * __wt_cell_total_len --
+ * Return the cell's total length, including data.
+ *
+ * This is simply the unpacked __len field; see the caveat below about
+ * dictionary-copy cells before relying on it.
+ */
+static inline size_t
+__wt_cell_total_len(WT_CELL_UNPACK *unpack)
+{
+ /*
+ * The length field is specially named because it's dangerous to use it:
+ * it represents the length of the current cell (normally used for the
+ * loop that walks through cells on the page), but occasionally we want
+ * to copy a cell directly from the page, and what we need is the cell's
+ * total length. The problem is dictionary-copy cells, because in that
+ * case, the __len field is the length of the current cell, not the cell
+ * for which we're returning data. To use the __len field, you must be
+ * sure you're not looking at a copy cell.
+ */
+ return (unpack->__len);
+}
+
+/*
+ * __wt_cell_type --
+ *	Return the cell's type, mapping the short, prefixed and removed-overflow
+ *	variants onto their base types.
+ */
+static inline u_int
+__wt_cell_type(WT_CELL *cell)
+{
+	u_int short_type, type;
+
+	/* Short cells: both short key forms are keys, short values values. */
+	short_type = WT_CELL_SHORT_TYPE(cell->__chunk[0]);
+	if (short_type == WT_CELL_KEY_SHORT ||
+	    short_type == WT_CELL_KEY_SHORT_PFX)
+		return (WT_CELL_KEY);
+	if (short_type == WT_CELL_VALUE_SHORT)
+		return (WT_CELL_VALUE);
+
+	/* Everything else: collapse the special forms, pass the rest through. */
+	type = WT_CELL_TYPE(cell->__chunk[0]);
+	if (type == WT_CELL_KEY_PFX)
+		return (WT_CELL_KEY);
+	if (type == WT_CELL_KEY_OVFL_RM)
+		return (WT_CELL_KEY_OVFL);
+	if (type == WT_CELL_VALUE_OVFL_RM)
+		return (WT_CELL_VALUE_OVFL);
+	return (type);
+}
+
+/*
+ * __wt_cell_type_raw --
+ *	Return the cell's type exactly as stored: the short type if the
+ *	descriptor byte carries one, otherwise the value of the type bits.
+ */
+static inline u_int
+__wt_cell_type_raw(WT_CELL *cell)
+{
+	u_int short_type;
+
+	short_type = WT_CELL_SHORT_TYPE(cell->__chunk[0]);
+	if (short_type != 0)
+		return (short_type);
+	return (WT_CELL_TYPE(cell->__chunk[0]));
+}
+
+/*
+ * __wt_cell_type_reset --
+ * Reset the cell's type.
+ *
+ * Only the type bits of the descriptor byte are replaced; the remaining bits
+ * are preserved. An old_type of 0 skips the sanity check on the current type.
+ */
+static inline void
+__wt_cell_type_reset(
+ WT_SESSION_IMPL *session, WT_CELL *cell, u_int old_type, u_int new_type)
+{
+ /*
+ * For all current callers of this function, this should happen once
+ * and only once, assert we're setting what we think we're setting.
+ */
+ WT_ASSERT(session, old_type == 0 || old_type == __wt_cell_type(cell));
+ WT_UNUSED(old_type);
+
+ /* Mask out the old type bits, then OR in the new ones. */
+ cell->__chunk[0] =
+ (cell->__chunk[0] & ~WT_CELL_TYPE_MASK) | WT_CELL_TYPE(new_type);
+}
+
+/*
+ * __wt_cell_leaf_value_parse --
+ *	Return the cell if it is a row-store leaf page value; return NULL if
+ *	it is a key or lies at or past the end of the page image.
+ */
+static inline WT_CELL *
+__wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
+{
+	uint8_t *dsk_end;
+
+	/*
+	 * On a row-store leaf page each key is followed either by a single
+	 * data cell or, when the data is empty, directly by the next key.
+	 *
+	 * When the last key on the page has no value, the candidate cell is
+	 * the byte immediately after the disk image (the image size is exact),
+	 * so check for that before looking at the cell.
+	 *
+	 * !!!
+	 * This test stands in for a call to __wt_off_page: the cell is known
+	 * to be either on the page or past its end, so a simple comparison is
+	 * enough, and __wt_off_page can't be called from here anyway (it's in
+	 * btree.i, which requires this file be included first).
+	 */
+	dsk_end = (uint8_t *)page->dsk + page->dsk->mem_size;
+	if ((uint8_t *)cell >= dsk_end)
+		return (NULL);
+
+	/* Any kind of key means there is no value cell here. */
+	switch (__wt_cell_type_raw(cell)) {
+	case WT_CELL_KEY:
+	case WT_CELL_KEY_OVFL:
+	case WT_CELL_KEY_OVFL_RM:
+	case WT_CELL_KEY_PFX:
+	case WT_CELL_KEY_SHORT:
+	case WT_CELL_KEY_SHORT_PFX:
+		cell = NULL;
+		break;
+	default:
+		break;
+	}
+	return (cell);
+}
+
+/*
+ * __wt_cell_unpack_safe --
+ *	Unpack a WT_CELL into a structure during verification.
+ *
+ * If "end" is non-NULL it points one past the end of the page, and every read
+ * is checked against it; a NULL "end" disables the checks. Returns 0 on
+ * success and WT_ERROR if the cell has an unknown type or runs past "end".
+ *
+ * NOTE(review): there is no lower bound available here, so the backward offset
+ * in a WT_CELL_VALUE_COPY cell is not checked against the start of the page.
+ */
+static inline int
+__wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
+{
+	uint64_t saved_v, v;
+	uint32_t saved_len;
+	int copied;
+	const uint8_t *p;
+
+	copied = 0;
+	saved_len = 0;
+	saved_v = 0;
+
+	/*
+	 * The verification code specifies an end argument, a pointer to 1 past
+	 * the end-of-page. In that case, make sure we don't go past the end
+	 * of the page when reading. If an error occurs, we simply return the
+	 * error code, the verification code takes care of complaining (and, in
+	 * the case of salvage, it won't complain at all, it's OK to fail).
+	 *
+	 * The check is "the len bytes starting at p lie inside the page", so
+	 * len must cover every byte about to be read.
+	 */
+#define	WT_CELL_LEN_CHK(p, len) do {					\
+	if (end != NULL && (((uint8_t *)(p)) + (len)) > end)		\
+		return (WT_ERROR);					\
+} while (0)
+
+restart:
+	/*
+	 * This code is performance critical for scans through read-only trees.
+	 * Avoid WT_CLEAR here: it makes this code run significantly slower.
+	 */
+	WT_CLEAR_INLINE(WT_CELL_UNPACK, *unpack);
+
+	/*
+	 * The descriptor byte is read next, so it must be on the page. This
+	 * also guarantees the "end - p" lengths computed below cannot go
+	 * negative.
+	 */
+	WT_CELL_LEN_CHK(cell, 1);
+	unpack->cell = cell;
+	unpack->type = __wt_cell_type(cell);
+	unpack->raw = __wt_cell_type_raw(cell);
+
+	/*
+	 * Handle cells with neither an RLE count nor data length: short
+	 * key/data cells have 6 bits of data length in the descriptor byte.
+	 */
+	switch (unpack->raw) {
+	case WT_CELL_KEY_SHORT_PFX:
+		/* Both the descriptor and the prefix byte are read. */
+		WT_CELL_LEN_CHK(cell, 2);		/* skip prefix */
+		unpack->prefix = cell->__chunk[1];
+
+		unpack->data = cell->__chunk + 2;
+		unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
+		unpack->__len = 2 + unpack->size;
+		goto done;
+	case WT_CELL_KEY_SHORT:
+	case WT_CELL_VALUE_SHORT:
+		unpack->data = cell->__chunk + 1;
+		unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
+		unpack->__len = 1 + unpack->size;
+		goto done;
+	}
+
+	p = (uint8_t *)cell + 1;			/* skip cell */
+
+	/*
+	 * Check for a prefix byte that optionally follows the cell descriptor
+	 * byte on row-store leaf pages.
+	 */
+	if (unpack->raw == WT_CELL_KEY_PFX) {
+		++p;					/* skip prefix */
+		WT_CELL_LEN_CHK(p, 0);
+		unpack->prefix = cell->__chunk[1];
+	}
+
+	/*
+	 * Check for an RLE count or record number that optionally follows the
+	 * cell descriptor byte on column-store variable-length pages.
+	 */
+	if (cell->__chunk[0] & WT_CELL_64V)		/* skip value */
+		WT_RET(__wt_vunpack_uint(
+		    &p, end == NULL ? 0 : (size_t)(end - p), &unpack->v));
+
+	/*
+	 * Handle special actions for a few different cell types and set the
+	 * data length (deleted cells are fixed-size without length bytes,
+	 * almost everything else has data length bytes).
+	 */
+	switch (unpack->raw) {
+	case WT_CELL_VALUE_COPY:
+		/*
+		 * The cell is followed by an offset to a cell written earlier
+		 * in the page. Save/restore the length and RLE of this cell,
+		 * we need the length to step through the set of cells on the
+		 * page and this RLE is probably different from the RLE of the
+		 * earlier cell.
+		 */
+		WT_RET(__wt_vunpack_uint(
+		    &p, end == NULL ? 0 : (size_t)(end - p), &v));
+		saved_len = WT_PTRDIFF32(p, cell);
+		saved_v = unpack->v;
+		cell = (WT_CELL *)((uint8_t *)cell - v);
+		copied = 1;
+		goto restart;
+
+	case WT_CELL_KEY_OVFL:
+	case WT_CELL_KEY_OVFL_RM:
+	case WT_CELL_VALUE_OVFL:
+	case WT_CELL_VALUE_OVFL_RM:
+		/*
+		 * Set overflow flag.
+		 */
+		unpack->ovfl = 1;
+		/* FALLTHROUGH */
+
+	case WT_CELL_ADDR_DEL:
+	case WT_CELL_ADDR_INT:
+	case WT_CELL_ADDR_LEAF:
+	case WT_CELL_ADDR_LEAF_NO:
+	case WT_CELL_KEY:
+	case WT_CELL_KEY_PFX:
+	case WT_CELL_VALUE:
+		/*
+		 * The cell is followed by a packed data length and a chunk of
+		 * data.
+		 */
+		WT_RET(__wt_vunpack_uint(
+		    &p, end == NULL ? 0 : (size_t)(end - p), &v));
+
+		/*
+		 * Keys, and values without an RLE, were stored with their
+		 * length reduced by the size adjustment: undo that.
+		 */
+		if (unpack->raw == WT_CELL_KEY ||
+		    unpack->raw == WT_CELL_KEY_PFX ||
+		    (unpack->raw == WT_CELL_VALUE && unpack->v == 0))
+			v += WT_CELL_SIZE_ADJUST;
+
+		unpack->data = p;
+		unpack->size = (uint32_t)v;
+		unpack->__len = WT_PTRDIFF32(p + unpack->size, cell);
+		break;
+
+	case WT_CELL_DEL:
+		unpack->__len = WT_PTRDIFF32(p, cell);
+		break;
+	default:
+		return (WT_ERROR);			/* Unknown cell type. */
+	}
+
+	/*
+	 * Check the original cell against the full cell length (this is a
+	 * diagnostic as well, we may be copying the cell from the page and
+	 * we need the right length).
+	 */
+done:	WT_CELL_LEN_CHK(cell, unpack->__len);
+	if (copied) {
+		/* Report the copy cell itself, not the cell it referenced. */
+		unpack->raw = WT_CELL_VALUE_COPY;
+		unpack->__len = saved_len;
+		unpack->v = saved_v;
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_cell_unpack --
+ * Unpack a WT_CELL into a structure.
+ *
+ * Calls __wt_cell_unpack_safe with a NULL end-of-page pointer, which turns
+ * off its bounds checks; the return value is discarded.
+ */
+static inline void
+__wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack)
+{
+ (void)__wt_cell_unpack_safe(cell, unpack, NULL);
+}
+
+/*
+ * __cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell.
+ *
+ * On-page keys and values are referenced in place; overflow items are read
+ * with __wt_ovfl_read. If the btree has a Huffman table for the item, the
+ * referenced bytes are then decoded into "store". Keys on row-store internal
+ * pages are returned without decoding. "page" may be NULL (it is passed
+ * through to __wt_ovfl_read only).
+ */
+static inline int
+__cell_data_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ WT_BTREE *btree;
+ void *huffman;
+
+ btree = S2BT(session);
+
+ /* Reference the cell's data, optionally decode it. */
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ store->data = unpack->data;
+ store->size = unpack->size;
+ if (page_type == WT_PAGE_ROW_INT)
+ return (0);
+
+ huffman = btree->huffman_key;
+ break;
+ case WT_CELL_VALUE:
+ store->data = unpack->data;
+ store->size = unpack->size;
+ huffman = btree->huffman_value;
+ break;
+ case WT_CELL_KEY_OVFL:
+ WT_RET(__wt_ovfl_read(session, page, unpack, store));
+ if (page_type == WT_PAGE_ROW_INT)
+ return (0);
+
+ huffman = btree->huffman_key;
+ break;
+ case WT_CELL_VALUE_OVFL:
+ WT_RET(__wt_ovfl_read(session, page, unpack, store));
+ huffman = btree->huffman_value;
+ break;
+ /* NOTE(review): presumably supplies the default case; confirm. */
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* No Huffman table: the reference set above is the final answer. */
+ return (huffman == NULL ? 0 :
+ __wt_huffman_decode(
+ session, huffman, store->data, store->size, store));
+}
+
+/*
+ * __wt_dsk_cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell.
+ *
+ * There are two versions because of WT_CELL_VALUE_OVFL_RM type cells. When an
+ * overflow item is deleted, its backing blocks are removed; if there are still
+ * running transactions that might need to see the overflow item, we cache a
+ * copy of the item and reset the item's cell to WT_CELL_VALUE_OVFL_RM. If we
+ * find a WT_CELL_VALUE_OVFL_RM cell when reading an overflow item, we use the
+ * page reference to look aside into the cache. So, calling the "dsk" version
+ * of the function declares the cell cannot be of type WT_CELL_VALUE_OVFL_RM,
+ * and calling the "page" version means it might be.
+ *
+ * This version has no page to offer, so it passes a NULL page reference.
+ */
+static inline int
+__wt_dsk_cell_data_ref(WT_SESSION_IMPL *session,
+ int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ WT_ASSERT(session,
+ __wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM);
+ return (__cell_data_ref(session, NULL, page_type, unpack, store));
+}
+
+/*
+ * __wt_page_cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell.
+ *
+ * The "page" version: the page reference and the page's own type are handed
+ * to __cell_data_ref.
+ */
+static inline int
+__wt_page_cell_data_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ return (__cell_data_ref(session, page, page->type, unpack, store));
+}
+
+/*
+ * __wt_cell_data_copy --
+ * Copy the data from an unpacked cell into a buffer.
+ */
+static inline int
+__wt_cell_data_copy(WT_SESSION_IMPL *session,
+ int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ /*
+ * We have routines to both copy and reference a cell's information. In
+ * most cases, all we need is a reference and we prefer that, especially
+ * when returning key/value items. In a few we need a real copy: call
+ * the standard reference function and get a reference. In some cases,
+ * a copy will be made (for example, when reading an overflow item from
+ * the underlying object). If that happens, we're done, otherwise make
+ * a copy.
+ *
+ * We don't require two versions of this function, no callers need to
+ * handle WT_CELL_VALUE_OVFL_RM cells.
+ */
+ WT_RET(__wt_dsk_cell_data_ref(session, page_type, unpack, store));
+ /* The data isn't in the item's own memory yet: copy it in. */
+ if (!WT_DATA_IN_ITEM(store))
+ WT_RET(__wt_buf_set(session, store, store->data, store->size));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i
new file mode 100644
index 00000000000..42c3664323d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/column.i
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __col_insert_search_match --
+ * Search a column-store insert list for an exact match.
+ */
+static inline WT_INSERT *
+__col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno)
+{
+ WT_INSERT **insp, *ret_ins;
+ uint64_t ins_recno;
+ int cmp, i;
+
+ /* If there's no insert chain to search, we're done. */
+ if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL)
+ return (NULL);
+
+ /* Fast path the check for values at the end of the skiplist. */
+ if (recno > WT_INSERT_RECNO(ret_ins))
+ return (NULL);
+ else if (recno == WT_INSERT_RECNO(ret_ins))
+ return (ret_ins);
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) {
+ if ((ret_ins = *insp) == NULL) { /* Read the slot once */
+ --i;
+ --insp;
+ continue;
+ }
+
+ ins_recno = WT_INSERT_RECNO(ret_ins);
+ cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
+
+ if (cmp == 0) /* Exact match: return */
+ return (ret_ins);
+ else if (cmp > 0) /* Keep going at this level */
+ insp = &ret_ins->next[i];
+ else { /* Drop down a level */
+ --i;
+ --insp;
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * __col_insert_search --
+ * Search a column-store insert list, creating a skiplist stack as we go.
+ */
+static inline WT_INSERT *
+__col_insert_search(WT_INSERT_HEAD *inshead,
+ WT_INSERT ***ins_stack, WT_INSERT **next_stack, uint64_t recno)
+{
+ WT_INSERT **insp, *ret_ins;
+ uint64_t ins_recno;
+ int cmp, i;
+
+ /* If there's no insert chain to search, we're done. */
+ if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL)
+ return (NULL);
+
+ /* Fast path appends. */
+ if (recno >= WT_INSERT_RECNO(ret_ins)) {
+ for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
+ ins_stack[i] = (i == 0) ? &ret_ins->next[0] :
+ (inshead->tail[i] != NULL) ?
+ &inshead->tail[i]->next[i] : &inshead->head[i];
+ next_stack[i] = NULL;
+ }
+ return (ret_ins);
+ }
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) {
+ if ((ret_ins = *insp) == NULL) {
+ next_stack[i] = NULL;
+ ins_stack[i--] = insp--;
+ continue;
+ }
+
+ ins_recno = WT_INSERT_RECNO(ret_ins);
+ cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
+
+ if (cmp > 0) /* Keep going at this level */
+ insp = &ret_ins->next[i];
+ else if (cmp == 0) /* Exact match: return */
+ for (; i >= 0; i--) {
+ next_stack[i] = ret_ins->next[i];
+ ins_stack[i] = &ret_ins->next[i];
+ }
+ else { /* Drop down a level */
+ next_stack[i] = ret_ins;
+ ins_stack[i--] = insp--;
+ }
+ }
+ return (ret_ins);
+}
+
+/*
+ * __col_var_last_recno --
+ * Return the last record number for a variable-length column-store page.
+ */
+static inline uint64_t
+__col_var_last_recno(WT_PAGE *page)
+{
+ WT_COL_RLE *repeat;
+
+ /*
+ * If there's an append list (the last page), then there may be more
+ * records on the page. This function ignores those records, so our
+ * callers have to handle that explicitly, if they care.
+ */
+ if (page->pg_var_nrepeats == 0)
+ return (page->pg_var_entries == 0 ? 0 :
+ page->pg_var_recno + (page->pg_var_entries - 1));
+
+ repeat = &page->pg_var_repeats[page->pg_var_nrepeats - 1];
+ return ((repeat->recno + repeat->rle) - 1 +
+ (page->pg_var_entries - (repeat->indx + 1)));
+}
+
+/*
+ * __col_fix_last_recno --
+ * Return the last record number for a fixed-length column-store page.
+ */
+static inline uint64_t
+__col_fix_last_recno(WT_PAGE *page)
+{
+ /*
+ * If there's an append list (the last page), then there may be more
+ * records on the page. This function ignores those records, so our
+ * callers have to handle that explicitly, if they care.
+ */
+ return (page->pg_fix_entries == 0 ? 0 :
+ page->pg_fix_recno + (page->pg_fix_entries - 1));
+}
+
+/*
+ * __col_var_search --
+ * Search a variable-length column-store page for a record.
+ */
+static inline WT_COL *
+__col_var_search(WT_PAGE *page, uint64_t recno)
+{
+ WT_COL_RLE *repeat;
+ uint64_t start_recno;
+ uint32_t base, indx, limit, start_indx;
+
+ /*
+ * Find the matching slot.
+ *
+ * This is done in two stages: first, we do a binary search among any
+ * repeating records to find the largest repeat less than the search
+ * key. Once there, we can do a simple offset calculation to find the
+ * correct slot for this record number, because we know any intervening
+ * records have repeat counts of 1.
+ */
+ for (base = 0, limit = page->pg_var_nrepeats; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+
+ repeat = page->pg_var_repeats + indx;
+ if (recno >= repeat->recno &&
+ recno < repeat->recno + repeat->rle)
+ return (page->pg_var_d + repeat->indx);
+ if (recno < repeat->recno)
+ continue;
+ base = indx + 1;
+ --limit;
+ }
+
+ /*
+ * We didn't find an exact match, move forward from the largest repeat
+ * less than the search key.
+ */
+ if (base == 0) {
+ start_indx = 0;
+ start_recno = page->pg_var_recno;
+ } else {
+ repeat = page->pg_var_repeats + (base - 1);
+ start_indx = repeat->indx + 1;
+ start_recno = repeat->recno + repeat->rle;
+ }
+
+ if (recno >= start_recno + (page->pg_var_entries - start_indx))
+ return (NULL);
+
+ return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno));
+}
diff --git a/src/third_party/wiredtiger/src/include/compact.h b/src/third_party/wiredtiger/src/include/compact.h
new file mode 100644
index 00000000000..aa34eab4d24
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/compact.h
@@ -0,0 +1,12 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_compact {
+ uint32_t lsm_count; /* Number of LSM trees seen */
+ uint32_t file_count; /* Number of files seen */
+ uint64_t max_time; /* Configured timeout */
+};
diff --git a/src/third_party/wiredtiger/src/include/config.h b/src/third_party/wiredtiger/src/include/config.h
new file mode 100644
index 00000000000..b9c4c97fa00
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/config.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_config {
+ WT_SESSION_IMPL *session;
+ const char *orig;
+ const char *end;
+ const char *cur;
+
+ int depth, top;
+ const int8_t *go;
+};
+
+struct __wt_config_check {
+ const char *name;
+ const char *type;
+ const char *checks;
+ const WT_CONFIG_CHECK *subconfigs;
+};
+
+#define WT_CONFIG_REF(session, n) \
+ (S2C(session)->config_entries[WT_CONFIG_ENTRY_##n])
+struct __wt_config_entry {
+ const char *method; /* method name */
+
+#define WT_CONFIG_BASE(session, n) (WT_CONFIG_REF(session, n)->base)
+ const char *base; /* configuration base */
+
+ const WT_CONFIG_CHECK *checks; /* check array */
+};
+
+struct __wt_config_parser_impl {
+ WT_CONFIG_PARSER iface;
+
+ WT_SESSION_IMPL *session;
+ WT_CONFIG config;
+ WT_CONFIG_ITEM config_item;
+};
+
+/*
+ * DO NOT EDIT: automatically built by dist/api_config.py.
+ * configuration section: BEGIN
+ */
+#define WT_CONFIG_ENTRY_colgroup_meta 0
+#define WT_CONFIG_ENTRY_connection_add_collator 1
+#define WT_CONFIG_ENTRY_connection_add_compressor 2
+#define WT_CONFIG_ENTRY_connection_add_data_source 3
+#define WT_CONFIG_ENTRY_connection_add_extractor 4
+#define WT_CONFIG_ENTRY_connection_async_new_op 5
+#define WT_CONFIG_ENTRY_connection_close 6
+#define WT_CONFIG_ENTRY_connection_load_extension 7
+#define WT_CONFIG_ENTRY_connection_open_session 8
+#define WT_CONFIG_ENTRY_connection_reconfigure 9
+#define WT_CONFIG_ENTRY_cursor_close 10
+#define WT_CONFIG_ENTRY_file_meta 11
+#define WT_CONFIG_ENTRY_index_meta 12
+#define WT_CONFIG_ENTRY_session_begin_transaction 13
+#define WT_CONFIG_ENTRY_session_checkpoint 14
+#define WT_CONFIG_ENTRY_session_close 15
+#define WT_CONFIG_ENTRY_session_commit_transaction 16
+#define WT_CONFIG_ENTRY_session_compact 17
+#define WT_CONFIG_ENTRY_session_create 18
+#define WT_CONFIG_ENTRY_session_drop 19
+#define WT_CONFIG_ENTRY_session_log_printf 20
+#define WT_CONFIG_ENTRY_session_open_cursor 21
+#define WT_CONFIG_ENTRY_session_reconfigure 22
+#define WT_CONFIG_ENTRY_session_rename 23
+#define WT_CONFIG_ENTRY_session_rollback_transaction 24
+#define WT_CONFIG_ENTRY_session_salvage 25
+#define WT_CONFIG_ENTRY_session_truncate 26
+#define WT_CONFIG_ENTRY_session_upgrade 27
+#define WT_CONFIG_ENTRY_session_verify 28
+#define WT_CONFIG_ENTRY_table_meta 29
+#define WT_CONFIG_ENTRY_wiredtiger_open 30
+#define WT_CONFIG_ENTRY_wiredtiger_open_all 31
+#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 32
+#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 33
+/*
+ * configuration section: END
+ * DO NOT EDIT: automatically built by dist/flags.py.
+ */
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
new file mode 100644
index 00000000000..81866e39df9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*******************************************
+ * Global per-process structure.
+ *******************************************/
+/*
+ * WT_PROCESS --
+ * Per-process information for the library.
+ */
+struct __wt_process {
+ WT_SPINLOCK spinlock; /* Per-process spinlock */
+
+ /* Locked: connection queue */
+ TAILQ_HEAD(__wt_connection_impl_qh, __wt_connection_impl) connqh;
+ WT_CACHE_POOL *cache_pool;
+};
+extern WT_PROCESS __wt_process;
+
+/*
+ * WT_NAMED_COLLATOR --
+ * A collator list entry
+ */
+struct __wt_named_collator {
+ const char *name; /* Name of collator */
+ WT_COLLATOR *collator; /* User supplied object */
+ TAILQ_ENTRY(__wt_named_collator) q; /* Linked list of collators */
+};
+
+/*
+ * WT_NAMED_COMPRESSOR --
+ * A compressor list entry
+ */
+struct __wt_named_compressor {
+ const char *name; /* Name of compressor */
+ WT_COMPRESSOR *compressor; /* User supplied callbacks */
+ /* Linked list of compressors */
+ TAILQ_ENTRY(__wt_named_compressor) q;
+};
+
+/*
+ * WT_NAMED_DATA_SOURCE --
+ * A data source list entry
+ */
+struct __wt_named_data_source {
+ const char *prefix; /* Name of data source */
+ WT_DATA_SOURCE *dsrc; /* User supplied callbacks */
+ /* Linked list of data sources */
+ TAILQ_ENTRY(__wt_named_data_source) q;
+};
+
+/*
+ * Allocate some additional slots for internal sessions. There is a default
+ * session for each connection, plus a session for each server thread.
+ */
+#define WT_NUM_INTERNAL_SESSIONS 10
+
+/*
+ * WT_CONNECTION_IMPL --
+ * Implementation of WT_CONNECTION
+ */
+struct __wt_connection_impl {
+ WT_CONNECTION iface;
+
+ /* For operations without an application-supplied session */
+ WT_SESSION_IMPL *default_session;
+ WT_SESSION_IMPL dummy_session;
+
+ const char *cfg; /* Connection configuration */
+
+ WT_SPINLOCK api_lock; /* Connection API spinlock */
+ WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */
+ WT_SPINLOCK fh_lock; /* File handle queue spinlock */
+ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */
+ WT_SPINLOCK schema_lock; /* Schema operation spinlock */
+
+ /*
+ * We distribute the btree page locks across a set of spin locks; it
+ * can't be an array, we impose cache-line alignment and gcc doesn't
+ * support that for arrays. Don't use too many: they are only held for
+ * very short operations, each one is 64 bytes, so 256 will fill the L1
+ * cache on most CPUs.
+ */
+#define WT_PAGE_LOCKS(conn) 16
+ WT_SPINLOCK *page_lock; /* Btree page spinlocks */
+ u_int page_lock_cnt; /* Next spinlock to use */
+
+ /* Connection queue */
+ TAILQ_ENTRY(__wt_connection_impl) q;
+ /* Cache pool queue */
+ TAILQ_ENTRY(__wt_connection_impl) cpq;
+
+ const char *home; /* Database home */
+ const char *error_prefix; /* Database error prefix */
+ int is_new; /* Connection created database */
+
+ WT_EXTENSION_API extension_api; /* Extension API */
+
+ /* Configuration */
+ const WT_CONFIG_ENTRY **config_entries;
+
+ void **foc; /* Free-on-close array */
+ size_t foc_cnt; /* Array entries */
+ size_t foc_size; /* Array size */
+
+ WT_FH *lock_fh; /* Lock file handle */
+
+ uint64_t split_gen; /* Generation number for splits */
+
+ WT_SPINLOCK dhandle_lock; /* Locked: dhandle sweep */
+ /* Locked: data handle list */
+ SLIST_HEAD(__wt_dhandle_lh, __wt_data_handle) dhlh;
+ /* Locked: LSM handle list. */
+ TAILQ_HEAD(__wt_lsm_qh, __wt_lsm_tree) lsmqh;
+ /* Locked: file list */
+ TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh;
+ /* Locked: library list */
+ TAILQ_HEAD(__wt_dlh_qh, __wt_dlh) dlhqh;
+
+ WT_SPINLOCK block_lock; /* Locked: block manager list */
+ TAILQ_HEAD(__wt_block_qh, __wt_block) blockqh;
+
+ u_int open_btree_count; /* Locked: open writable btree count */
+ uint32_t next_file_id; /* Locked: file ID counter */
+
+ /*
+ * WiredTiger allocates space for 50 simultaneous sessions (threads of
+ * control) by default. Growing the number of threads dynamically is
+ * possible, but tricky since server threads are walking the array
+ * without locking it.
+ *
+ * There's an array of WT_SESSION_IMPL pointers that reference the
+ * allocated array; we do it that way because we want an easy way for
+ * the server thread code to avoid walking the entire array when only a
+ * few threads are running.
+ */
+ WT_SESSION_IMPL *sessions; /* Session reference */
+ uint32_t session_size; /* Session array size */
+ uint32_t session_cnt; /* Session count */
+
+ /*
+ * WiredTiger allocates space for a fixed number of hazard pointers
+ * in each thread of control.
+ */
+ uint32_t hazard_max; /* Hazard array size */
+
+ WT_CACHE *cache; /* Page cache */
+ uint64_t cache_size;
+
+ WT_TXN_GLOBAL txn_global; /* Global transaction state */
+
+ WT_SPINLOCK hot_backup_lock; /* Hot backup serialization */
+ int hot_backup;
+
+ WT_SESSION_IMPL *ckpt_session; /* Checkpoint thread session */
+ wt_thread_t ckpt_tid; /* Checkpoint thread */
+ int ckpt_tid_set; /* Checkpoint thread set */
+ WT_CONDVAR *ckpt_cond; /* Checkpoint wait mutex */
+ const char *ckpt_config; /* Checkpoint configuration */
+#define WT_CKPT_LOGSIZE(conn) ((conn)->ckpt_logsize != 0)
+ wt_off_t ckpt_logsize; /* Checkpoint log size period */
+ uint32_t ckpt_signalled; /* Checkpoint signalled */
+ long ckpt_usecs; /* Checkpoint period */
+
+ int compact_in_memory_pass; /* Compaction serialization */
+
+#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */
+#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */
+#define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */
+#define WT_CONN_STAT_NONE 0x08 /* don't gather statistics */
+#define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */
+ uint32_t stat_flags;
+
+ WT_CONNECTION_STATS stats; /* Connection statistics */
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ /*
+ * Spinlock registration, so we can track which spinlocks are heavily
+ * used, which are blocking and where.
+ *
+ * There's an array of spinlocks, and an array of blocking IDs.
+ */
+#define WT_SPINLOCK_MAX 1024
+#define WT_SPINLOCK_MAX_LOCATION_ID 60
+ WT_SPINLOCK *spinlock_list[WT_SPINLOCK_MAX];
+
+ /* Spinlock blocking matrix */
+ struct __wt_connection_stats_spinlock {
+ const char *name; /* Mutex name */
+
+ const char *file; /* Caller's file/line, ID location */
+ int line;
+
+ u_int total; /* Count of total, blocked calls */
+ u_int blocked[WT_SPINLOCK_MAX_LOCATION_ID];
+ } spinlock_block[WT_SPINLOCK_MAX_LOCATION_ID];
+#endif
+
+ WT_ASYNC *async; /* Async structure */
+ int async_cfg; /* Global async configuration */
+ uint32_t async_size; /* Async op array size */
+ uint32_t async_workers; /* Number of async workers */
+
+ WT_LSM_MANAGER lsm_manager; /* LSM worker thread information */
+
+ WT_SESSION_IMPL *evict_session; /* Eviction server sessions */
+ wt_thread_t evict_tid; /* Eviction server thread ID */
+ int evict_tid_set; /* Eviction server thread ID set */
+
+ uint32_t evict_workers_max;/* Max eviction workers */
+ uint32_t evict_workers_min;/* Min eviction workers */
+ uint32_t evict_workers; /* Number of eviction workers */
+ WT_EVICT_WORKER *evict_workctx; /* Eviction worker context */
+
+ WT_SESSION_IMPL *stat_session; /* Statistics log session */
+ wt_thread_t stat_tid; /* Statistics log thread */
+ int stat_tid_set; /* Statistics log thread set */
+ WT_CONDVAR *stat_cond; /* Statistics log wait mutex */
+ const char *stat_format; /* Statistics log timestamp format */
+ FILE *stat_fp; /* Statistics log file handle */
+ char *stat_path; /* Statistics log path format */
+ char **stat_sources; /* Statistics log list of objects */
+ const char *stat_stamp; /* Statistics log entry timestamp */
+ long stat_usecs; /* Statistics log period */
+
+ int logging; /* Global logging configuration */
+ int archive; /* Global archive configuration */
+ WT_CONDVAR *arch_cond; /* Log archive wait mutex */
+ WT_SESSION_IMPL *arch_session; /* Log archive session */
+ wt_thread_t arch_tid; /* Log archive thread */
+ int arch_tid_set; /* Log archive thread set */
+ WT_LOG *log; /* Logging structure */
+ wt_off_t log_file_max; /* Log file max size */
+ const char *log_path; /* Logging path format */
+ uint32_t txn_logsync; /* Log sync configuration */
+
+ WT_SESSION_IMPL *sweep_session; /* Handle sweep session */
+ wt_thread_t sweep_tid; /* Handle sweep thread */
+ int sweep_tid_set; /* Handle sweep thread set */
+ WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */
+
+ /* Locked: collator list */
+ TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh;
+
+ /* Locked: compressor list */
+ TAILQ_HEAD(__wt_comp_qh, __wt_named_compressor) compqh;
+
+ /* Locked: data source list */
+ TAILQ_HEAD(__wt_dsrc_qh, __wt_named_data_source) dsrcqh;
+
+ void *lang_private; /* Language specific private storage */
+
+ /* If non-zero, all buffers used for I/O will be aligned to this. */
+ size_t buffer_alignment;
+
+ uint32_t schema_gen; /* Schema generation number */
+
+ wt_off_t data_extend_len; /* file_extend data length */
+ wt_off_t log_extend_len; /* file_extend log length */
+
+ uint32_t direct_io; /* O_DIRECT file type flags */
+ int mmap; /* mmap configuration */
+ uint32_t verbose;
+
+ uint32_t flags;
+};
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
new file mode 100644
index 00000000000..17185499b88
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Initialize a static WT_CURSOR structure.
+ */
+#define WT_CURSOR_STATIC_INIT(n, \
+ get_key, \
+ get_value, \
+ set_key, \
+ set_value, \
+ compare, \
+ next, \
+ prev, \
+ reset, \
+ search, \
+ search_near, \
+ insert, \
+ update, \
+ remove, \
+ close) \
+ static const WT_CURSOR n = { \
+ NULL, /* session */ \
+ NULL, /* uri */ \
+ NULL, /* key_format */ \
+ NULL, /* value_format */ \
+ (int (*)(WT_CURSOR *, ...))(get_key), \
+ (int (*)(WT_CURSOR *, ...))(get_value), \
+ (void (*)(WT_CURSOR *, ...))(set_key), \
+ (void (*)(WT_CURSOR *, ...))(set_value), \
+ (int (*)(WT_CURSOR *, WT_CURSOR *, int *))(compare), \
+ next, \
+ prev, \
+ reset, \
+ search, \
+ (int (*)(WT_CURSOR *, int *))(search_near), \
+ insert, \
+ update, \
+ remove, \
+ close, \
+ { NULL, NULL }, /* TAILQ_ENTRY q */ \
+ 0, /* recno key */ \
+ { 0 }, /* recno raw buffer */ \
+ NULL, /* json_private */ \
+ NULL, /* lang_private */ \
+ { NULL, 0, 0, NULL, 0 }, /* WT_ITEM key */ \
+ { NULL, 0, 0, NULL, 0 }, /* WT_ITEM value */ \
+ 0, /* int saved_err */ \
+ NULL, /* internal_uri */ \
+ 0 /* uint32_t flags */ \
+}
+
+struct __wt_cursor_backup_entry {
+ char *name; /* File name */
+ WT_DATA_HANDLE *handle; /* Handle */
+};
+struct __wt_cursor_backup {
+ WT_CURSOR iface;
+
+ size_t next; /* Cursor position */
+ FILE *bfp; /* Backup file */
+
+ WT_CURSOR_BACKUP_ENTRY *list; /* List of files to be copied. */
+ size_t list_allocated;
+ size_t list_next;
+};
+
+struct __wt_cursor_btree {
+ WT_CURSOR iface;
+
+ WT_BTREE *btree; /* Enclosing btree */
+
+ /*
+ * The following fields are set by the search functions as a precursor
+ * to page modification: we have a page, a WT_COL/WT_ROW slot on the
+ * page, an insert head, insert list and a skiplist stack (the stack of
+ * skiplist entries leading to the insert point). The search functions
+ * also return the relationship of the search key to the found key.
+ */
+ WT_REF *ref; /* Current page */
+ uint32_t slot; /* WT_COL/WT_ROW 0-based slot */
+
+ WT_INSERT_HEAD *ins_head; /* Insert chain head */
+ WT_INSERT *ins; /* Current insert node */
+ /* Search stack */
+ WT_INSERT **ins_stack[WT_SKIP_MAXDEPTH];
+
+ /* Next item(s) found during search */
+ WT_INSERT *next_stack[WT_SKIP_MAXDEPTH];
+
+ uint64_t recno; /* Record number */
+
+ /*
+ * The search function sets compare to:
+ * < 0 if the found key is less than the specified key
+ * 0 if the found key matches the specified key
+ * > 0 if the found key is larger than the specified key
+ */
+ int compare;
+
+ /*
+ * The key value from a binary search of a row-store file; we keep a
+ * copy of the last key we retrieved in the search, it avoids doing
+ * the additional work of getting the key again for return to the
+ * application.
+ */
+ WT_ITEM search_key;
+
+ /*
+ * It's relatively expensive to calculate the last record on a variable-
+ * length column-store page because of the repeat values. Calculate it
+ * once per page and cache it. This value doesn't include the skiplist
+ * of appended entries on the last page.
+ */
+ uint64_t last_standard_recno;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part of
+ * the page we're walking (otherwise switching from next to prev and
+ * vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+ * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
+ * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
+ */
+ uint32_t row_iteration_slot; /* Row-store iteration slot */
+
+ /*
+ * Variable-length column-store values are run-length encoded and may
+ * be overflow values or Huffman encoded. To avoid repeatedly reading
+ * overflow values or decompressing encoded values, process it once and
+ * store the result in a temporary buffer. The cip_saved field is used
+ * to determine if we've switched columns since our last cursor call.
+ */
+ WT_COL *cip_saved; /* Last iteration reference */
+
+ /*
+ * We don't instantiate prefix-compressed keys on pages where there's no
+ * Huffman encoding because we don't want to waste memory if only moving
+ * a cursor through the page, and it's faster to build keys while moving
+ * through the page than to roll-forward from a previously instantiated
+ * key (we don't instantiate all of the keys, just the ones at binary
+ * search points). We can't use the application's WT_CURSOR key field
+ * as a copy of the last-returned key because it may have been altered
+ * by the API layer, for example, dump cursors. Instead we store the
+ * last-returned key in a temporary buffer. The rip_saved field is used
+ * to determine if the key in the temporary buffer has the prefix needed
+ * for building the current key.
+ */
+ WT_ROW *rip_saved; /* Last-returned key reference */
+
+ /*
+ * A temporary buffer for caching RLE values for column-store files.
+ */
+ WT_ITEM tmp;
+
+ /*
+ * The update structure allocated by the row- and column-store modify
+ * functions, used to avoid a data copy in the WT_CURSOR.update call.
+ */
+ WT_UPDATE *modify_update;
+
+ /*
+ * Fixed-length column-store items are a single byte, and it's simpler
+ * and cheaper to allocate the space for it now than keep checking to
+ * see if we need to grow the buffer.
+ */
+ uint8_t v; /* Fixed-length return value */
+
+ uint8_t append_tree; /* Cursor appended to the tree */
+
+#define WT_CBT_ACTIVE 0x01 /* Active in the tree */
+#define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */
+#define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */
+#define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */
+#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */
+#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */
+ uint8_t flags;
+};
+
+struct __wt_cursor_bulk {
+ WT_CURSOR_BTREE cbt;
+
+ WT_REF *ref; /* The leaf page */
+ WT_PAGE *leaf;
+
+ /*
+ * Variable-length column store compares values during bulk load as
+ * part of RLE compression, row-store compares keys during bulk load
+ * to avoid corruption.
+ */
+ WT_ITEM last; /* Last key/value seen */
+
+ /*
+ * Variable-length column-store RLE counter (also overloaded to mean
+ * the first time through the bulk-load insert routine, when set to 0).
+ */
+ uint64_t rle;
+
+ /*
+ * Fixed-length column-store current entry in memory chunk count, and
+ * the maximum number of records per chunk.
+ */
+ uint32_t entry; /* Entry count */
+ uint32_t nrecs; /* Max records per chunk */
+
+ /* Special bitmap bulk load for fixed-length column stores. */
+ int bitmap;
+
+ void *reconcile; /* Reconciliation information */
+};
+
+struct __wt_cursor_config {
+ WT_CURSOR iface;
+};
+
+struct __wt_cursor_data_source {
+ WT_CURSOR iface;
+
+ WT_COLLATOR *collator; /* Configured collator */
+ int collator_owned; /* Collator needs to be terminated */
+
+ WT_CURSOR *source; /* Application-owned cursor */
+};
+
+struct __wt_cursor_dump {
+ WT_CURSOR iface;
+
+ WT_CURSOR *child;
+};
+
+struct __wt_cursor_index {
+ WT_CURSOR iface;
+
+ WT_TABLE *table;
+ WT_INDEX *index;
+ const char *key_plan, *value_plan;
+
+ WT_CURSOR *child;
+ WT_CURSOR **cg_cursors;
+};
+
+struct __wt_cursor_json {
+ char *key_buf; /* JSON formatted string */
+ char *value_buf; /* JSON formatted string */
+ WT_CONFIG_ITEM key_names; /* Names of key columns */
+ WT_CONFIG_ITEM value_names; /* Names of value columns */
+};
+
+struct __wt_cursor_log {
+ WT_CURSOR iface;
+
+ WT_LSN *cur_lsn; /* LSN of current record */
+ WT_LSN *next_lsn; /* LSN of next record */
+ WT_ITEM *logrec; /* Copy of record for cursor */
+ WT_ITEM *opkey, *opvalue; /* Op key/value copy */
+ const uint8_t *stepp, *stepp_end; /* Pointer within record */
+ uint32_t step_count; /* Intra-record count */
+ uint32_t rectype; /* Record type */
+ uint64_t txnid; /* Record txnid */
+ uint32_t flags;
+};
+
+struct __wt_cursor_metadata {
+ WT_CURSOR iface;
+
+ WT_CURSOR *file_cursor; /* Queries of regular metadata */
+
+#define WT_MDC_POSITIONED 0x01
+#define WT_MDC_ONMETADATA 0x02
+ uint32_t flags;
+};
+
+struct __wt_cursor_stat {
+ WT_CURSOR iface;
+
+ int notpositioned; /* Cursor not positioned */
+
+ WT_STATS *stats; /* Stats owned by the cursor */
+ WT_STATS *stats_first; /* First stats reference */
+ int stats_base; /* Base statistics value */
+ int stats_count; /* Count of stats elements */
+
+ union { /* Copies of the statistics */
+ WT_DSRC_STATS dsrc_stats;
+ WT_CONNECTION_STATS conn_stats;
+ } u;
+
+ int key; /* Current stats key */
+ uint64_t v; /* Current stats value */
+ WT_ITEM pv; /* Current stats value (string) */
+
+ /* Uses the same values as WT_CONNECTION::stat_flags field */
+ uint32_t flags;
+};
+
+/*
+ * WT_CURSOR_STATS --
+ * Return a reference to a statistic cursor's stats structures; use
+ * WT_CURSOR_STAT.stats_first instead of WT_CURSOR_STAT.stats because the
+ * latter is NULL when non-cursor memory is used to hold the statistics.
+ */
+#define WT_CURSOR_STATS(cursor) \
+ (((WT_CURSOR_STAT *)(cursor))->stats_first)
+
+struct __wt_cursor_table {
+ WT_CURSOR iface;
+
+ WT_TABLE *table;
+ const char *plan;
+
+ const char **cfg; /* Saved configuration string */
+
+ WT_CURSOR **cg_cursors;
+ WT_CURSOR **idx_cursors;
+};
+
+#define WT_CURSOR_PRIMARY(cursor) \
+ (((WT_CURSOR_TABLE *)(cursor))->cg_cursors[0])
+
+#define WT_CURSOR_RECNO(cursor) WT_STREQ((cursor)->key_format, "r")
+
+/*
+ * WT_CURSOR_NEEDKEY, WT_CURSOR_NEEDVALUE --
+ * Check if we have a key/value set. There's an additional semantic
+ * implemented here: if we're pointing into the tree, and about to perform
+ * a cursor operation, get a local copy of whatever we're referencing in
+ * the tree, there's an obvious race with the cursor moving and the key or
+ * value reference, and it's better to solve it here than in the underlying
+ * data-source layers.
+ *
+ * WT_CURSOR_CHECKKEY, WT_CURSOR_CHECKVALUE --
+ * Check if a key/value is set without making a copy.
+ *
+ * WT_CURSOR_NOVALUE --
+ * Release any cached value before an operation that could update the
+ * transaction context and free data a value is pointing to.
+ */
+#define WT_CURSOR_CHECKKEY(cursor) do { \
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) \
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 1)); \
+} while (0)
+#define WT_CURSOR_CHECKVALUE(cursor) do { \
+ if (!F_ISSET(cursor, WT_CURSTD_VALUE_SET)) \
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 0)); \
+} while (0)
+#define WT_CURSOR_NEEDKEY(cursor) do { \
+ if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) { \
+ if (!WT_DATA_IN_ITEM(&(cursor)->key)) \
+ WT_ERR(__wt_buf_set( \
+ (WT_SESSION_IMPL *)(cursor)->session, \
+ &(cursor)->key, \
+ (cursor)->key.data, (cursor)->key.size)); \
+ F_CLR(cursor, WT_CURSTD_KEY_INT); \
+ F_SET(cursor, WT_CURSTD_KEY_EXT); \
+ } \
+ WT_CURSOR_CHECKKEY(cursor); \
+} while (0)
+#define WT_CURSOR_NEEDVALUE(cursor) do { \
+ if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) { \
+ if (!WT_DATA_IN_ITEM(&(cursor)->value)) \
+ WT_ERR(__wt_buf_set( \
+ (WT_SESSION_IMPL *)(cursor)->session, \
+ &(cursor)->value, \
+ (cursor)->value.data, (cursor)->value.size));\
+ F_CLR(cursor, WT_CURSTD_VALUE_INT); \
+ F_SET(cursor, WT_CURSTD_VALUE_EXT); \
+ } \
+ WT_CURSOR_CHECKVALUE(cursor); \
+} while (0)
+#define WT_CURSOR_NOVALUE(cursor) do { \
+ F_CLR(cursor, WT_CURSTD_VALUE_INT); \
+} while (0)
+
+#define WT_CURSOR_RAW_OK \
+ (WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW)
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
new file mode 100644
index 00000000000..7f8e83643c5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __cursor_set_recno --
+ *	Store a record number in the btree cursor and mirror it into the
+ *	cursor's public interface so the two copies stay in step.
+ */
+static inline void
+__cursor_set_recno(WT_CURSOR_BTREE *cbt, uint64_t v)
+{
+	cbt->recno = v;
+	cbt->iface.recno = cbt->recno;
+}
+
+/*
+ * __cursor_pos_clear --
+ *	Forget where the cursor was positioned.
+ */
+static inline void
+__cursor_pos_clear(WT_CURSOR_BTREE *cbt)
+{
+	/*
+	 * This function is called a lot, so only reset the minimal set of
+	 * fields a successful operation doesn't set for itself (the
+	 * row-store search always sets the compare return value, for
+	 * example, so it isn't initialized here). Clearing everything would
+	 * be simpler, but costs more.
+	 */
+	cbt->rip_saved = NULL;
+	cbt->cip_saved = NULL;
+
+	cbt->ins_stack[0] = NULL;
+	cbt->ins_head = NULL;
+	cbt->ins = NULL;
+
+	cbt->recno = 0;
+
+	/*
+	 * Drop every flag except the active flag: that one is owned by the
+	 * cursor enter/leave functions.
+	 */
+	F_CLR(cbt, ~WT_CBT_ACTIVE);
+}
+
+/*
+ * __cursor_enter --
+ *	Count a newly activated cursor in the session.
+ */
+static inline int
+__cursor_enter(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+
+	/*
+	 * The cache-full check is only made when this is the first cursor
+	 * positioned in the session; on failure the count is left untouched.
+	 */
+	if (session->ncursors == 0 &&
+	    (ret = __wt_cache_full_check(session)) != 0)
+		return (ret);
+
+	session->ncursors += 1;
+	return (0);
+}
+
+/*
+ * __cursor_leave --
+ *	Remove a deactivated cursor from the session's count.
+ */
+static inline int
+__cursor_leave(WT_SESSION_IMPL *session)
+{
+	WT_ASSERT(session, session->ncursors > 0);
+
+	/* One fewer active cursor in this session. */
+	session->ncursors -= 1;
+	if (session->ncursors != 0)
+		return (0);
+
+	/*
+	 * That was the last active cursor: any snapshot held for read
+	 * committed isolation can be released.
+	 */
+	__wt_txn_read_last(session);
+	return (0);
+}
+
+/*
+ * __curfile_enter --
+ *	Activate a file cursor: enter the session, then mark the cursor
+ *	active.
+ */
+static inline int
+__curfile_enter(WT_CURSOR_BTREE *cbt)
+{
+	/* The active flag is only set once entering the session succeeded. */
+	WT_RET(__cursor_enter((WT_SESSION_IMPL *)cbt->iface.session));
+
+	F_SET(cbt, WT_CBT_ACTIVE);
+	return (0);
+}
+
+/*
+ * __curfile_leave --
+ *	Deactivate a file cursor (if it was active) and release the page
+ *	reference it holds.
+ */
+static inline int
+__curfile_leave(WT_CURSOR_BTREE *cbt)
+{
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+	/* Only an active cursor has to leave the session. */
+	if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
+		if ((ret = __cursor_leave(session)) != 0)
+			return (ret);
+		F_CLR(cbt, WT_CBT_ACTIVE);
+	}
+
+	/*
+	 * Releasing the page reference can trigger eviction (forced
+	 * eviction of big pages, for example), so it has to come after the
+	 * snapshot was released above. The reference is only forgotten if
+	 * the release succeeded.
+	 */
+	if ((ret = __wt_page_release(session, cbt->ref, 0)) != 0)
+		return (ret);
+	cbt->ref = NULL;
+
+	return (0);
+}
+
+/*
+ * __cursor_func_init --
+ *	Set up a cursor at the start of a cursor call.
+ */
+static inline int
+__cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
+{
+	WT_DECL_RET;
+
+	/* A re-entering caller gives up its current position first. */
+	if (reenter && (ret = __curfile_leave(cbt)) != 0)
+		return (ret);
+
+	/* Activate the cursor unless it is already active. */
+	if (!F_ISSET(cbt, WT_CBT_ACTIVE) && (ret = __curfile_enter(cbt)) != 0)
+		return (ret);
+
+	__wt_txn_cursor_op((WT_SESSION_IMPL *)cbt->iface.session);
+	return (0);
+}
+
+/*
+ * __cursor_reset --
+ *	Reset the cursor so it no longer holds a position.
+ */
+static inline int
+__cursor_reset(WT_CURSOR_BTREE *cbt)
+{
+	int leave_ret;
+
+	/*
+	 * Generally called to clean up after an error as the cursor leaves
+	 * the API. The position is cleared whether or not leaving worked;
+	 * the result of leaving is what the caller sees.
+	 */
+	leave_ret = __curfile_leave(cbt);
+	__cursor_pos_clear(cbt);
+
+	return (leave_ret);
+}
+
+/*
+ * __cursor_row_slot_return --
+ * Return a row-store leaf page slot's K/V pair.
+ *
+ * cbt: the btree cursor; its iface.key and iface.value items are pointed at
+ * the slot's key and value, cbt->tmp is the scratch buffer for keys that
+ * have to be built, and cbt->rip_saved is set to rip whenever a key is
+ * built so a following call can reuse it for prefix compression.
+ * rip: the row slot on cbt->ref->page to return.
+ * upd: if not NULL, the update supplying the value instead of the page.
+ *
+ * Returns 0 on success, otherwise the error from growing the buffer,
+ * building the key or referencing the value cell.
+ */
+static inline int
+__cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
+{
+ WT_BTREE *btree;
+ WT_ITEM *kb, *vb;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ void *copy;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = S2BT(session);
+ page = cbt->ref->page;
+
+ unpack = NULL;
+
+ kb = &cbt->iface.key;
+ vb = &cbt->iface.value;
+
+ /*
+ * The row-store key can change underfoot; explicitly take a copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * Get a key: we could just call __wt_row_leaf_key, but as a cursor
+ * is running through the tree, we may have additional information
+ * here (we may have the fully-built key that's immediately before
+ * the prefix-compressed key we want, so it's a faster construction).
+ *
+ * First, check for an immediately available key.
+ *
+ * On this path the key is stored straight into kb and neither cbt->tmp
+ * nor cbt->rip_saved is touched.
+ */
+ if (__wt_row_leaf_key_info(
+ page, copy, NULL, &cell, &kb->data, &kb->size))
+ goto value;
+
+ /* Huffman encoded keys are a slow path in all cases. */
+ if (btree->huffman_key != NULL)
+ goto slow;
+
+ /*
+ * Unpack the cell and deal with overflow and prefix-compressed keys.
+ * Inline building simple prefix-compressed keys from a previous key,
+ * otherwise build from scratch.
+ *
+ * The inline build is only attempted when the key built by the last
+ * call (cbt->rip_saved) belongs to the slot immediately before rip.
+ */
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->type == WT_CELL_KEY &&
+ cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) {
+ WT_ASSERT(session, cbt->tmp.size >= unpack->prefix);
+
+ /*
+ * Grow the buffer as necessary as well as ensure data has been
+ * copied into local buffer space, then append the suffix to the
+ * prefix already in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy data we don't
+ * need, truncate the item's data length to the prefix bytes.
+ */
+ cbt->tmp.size = unpack->prefix;
+ WT_RET(__wt_buf_grow(
+ session, &cbt->tmp, cbt->tmp.size + unpack->size));
+ memcpy((uint8_t *)cbt->tmp.data + cbt->tmp.size,
+ unpack->data, unpack->size);
+ cbt->tmp.size += unpack->size;
+ } else {
+ /*
+ * Call __wt_row_leaf_key_work instead of __wt_row_leaf_key: we
+ * already did __wt_row_leaf_key's fast-path checks inline.
+ *
+ * The "slow" label is also the target of the Huffman check
+ * above, which jumps here before any cell has been unpacked.
+ */
+slow: WT_RET(
+ __wt_row_leaf_key_work(session, page, rip, &cbt->tmp, 0));
+ }
+ kb->data = cbt->tmp.data;
+ kb->size = cbt->tmp.size;
+ cbt->rip_saved = rip;
+
+value:
+ /*
+ * If the item was ever modified, use the WT_UPDATE data. Note the
+ * caller passes us the update: it has already resolved which one
+ * (if any) is visible.
+ */
+ if (upd != NULL) {
+ vb->data = WT_UPDATE_DATA(upd);
+ vb->size = upd->size;
+ return (0);
+ }
+
+ /* Else, simple values have their location encoded in the WT_ROW. */
+ if (__wt_row_leaf_value(page, rip, vb))
+ return (0);
+
+ /*
+ * Else, take the value from the original page cell (which may be
+ * empty).
+ *
+ * unpack is still NULL at this point unless the key cell was unpacked
+ * above; a missing value cell is returned as a zero-length value.
+ */
+ if ((cell = __wt_row_leaf_value_cell(page, rip, unpack)) == NULL) {
+ vb->data = "";
+ vb->size = 0;
+ return (0);
+ }
+
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+ return (__wt_page_cell_data_ref(session, cbt->ref->page, unpack, vb));
+}
diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h
new file mode 100644
index 00000000000..5556627c74d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/dhandle.h
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * XXX
+ * The server threads use their own WT_SESSION_IMPL handles because they may
+ * want to block (for example, the eviction server calls reconciliation, and
+ * some of the reconciliation diagnostic code reads pages), and the user's
+ * session handle is already blocking on a server thread. The problem is the
+ * server thread needs to reference the correct btree handle, and that's
+ * hanging off the application's thread of control. For now, I'm just making
+ * it obvious where that's getting done.
+ *
+ * WT_SET_BTREE_IN_SESSION points the session at the btree's data handle,
+ * WT_CLEAR_BTREE_IN_SESSION sets the session's data handle to NULL. Both
+ * macro arguments are parenthesized so any pointer expression can be
+ * passed.
+ */
+#define WT_SET_BTREE_IN_SESSION(s, b) ((s)->dhandle = (b)->dhandle)
+#define WT_CLEAR_BTREE_IN_SESSION(s) ((s)->dhandle = NULL)
+
+/*
+ * WT_WITH_DHANDLE --
+ * Run the statement or expression "e" with the session's data handle
+ * temporarily set to "d": the current handle is saved, replaced, and put
+ * back after "e" has run. The restore is skipped if "e" leaves the block
+ * early (return, goto, or a break, which exits the enclosing do/while).
+ *
+ * WT_WITH_BTREE --
+ * The same, using the data handle of btree "b".
+ */
+#define WT_WITH_DHANDLE(s, d, e) do { \
+ WT_DATA_HANDLE *__saved_dhandle = (s)->dhandle; \
+ (s)->dhandle = (d); \
+ e; \
+ (s)->dhandle = __saved_dhandle; \
+} while (0)
+
+#define WT_WITH_BTREE(s, b, e) WT_WITH_DHANDLE(s, (b)->dhandle, e)
+
+/*
+ * WT_DATA_HANDLE --
+ * A handle for a generic named data source.
+ *
+ * The structure carries the handle's lock and list linkage, the session
+ * reference and in-use counts, its identity (name hash, name, checkpoint
+ * and configuration), the underlying data source and generic handle
+ * pointer, a lock taken when closing, statistics, and a flags word.
+ */
+struct __wt_data_handle {
+ WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */
+ SLIST_ENTRY(__wt_data_handle) l;/* Linked list of handles */
+
+ /*
+ * Sessions caching a connection's data handle will have a non-zero
+ * reference count; sessions using a connection's data handle will
+ * have a non-zero in-use count.
+ */
+ uint32_t session_ref; /* Sessions referencing this handle */
+ int32_t session_inuse; /* Sessions using this handle */
+ time_t timeofdeath; /* Use count went to 0 */
+
+ uint64_t name_hash; /* Hash of name */
+ const char *name; /* Object name as a URI */
+ const char *checkpoint; /* Checkpoint name (or NULL) */
+ const char **cfg; /* Configuration information */
+
+ WT_DATA_SOURCE *dsrc; /* Data source for this handle */
+ void *handle; /* Generic handle */
+
+ /*
+ * Data handles can be closed without holding the schema lock; threads
+ * walk the list of open handles, operating on them (checkpoint is the
+ * best example). To avoid sources disappearing underneath checkpoint,
+ * lock the data handle when closing it.
+ */
+ WT_SPINLOCK close_lock; /* Lock to close the handle */
+
+ WT_DSRC_STATS stats; /* Data-source statistics */
+
+ /* Flags values over 0xff are reserved for WT_BTREE_* */
+#define WT_DHANDLE_DISCARD 0x01 /* Discard on release */
+#define WT_DHANDLE_DISCARD_CLOSE 0x02 /* Close on release */
+#define WT_DHANDLE_EXCLUSIVE 0x04 /* Need exclusive access */
+#define WT_DHANDLE_HAVE_REF 0x08 /* Already have ref */
+#define WT_DHANDLE_LOCK_ONLY 0x10 /* Handle only used as a lock */
+#define WT_DHANDLE_OPEN 0x20 /* Handle is open */
+ uint32_t flags; /* WT_DHANDLE_* values above */
+};
diff --git a/src/third_party/wiredtiger/src/include/dlh.h b/src/third_party/wiredtiger/src/include/dlh.h
new file mode 100644
index 00000000000..3974ae2792c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/dlh.h
@@ -0,0 +1,15 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * struct __wt_dlh --
+ * An entry on the list of open libraries: the handle returned by dlopen,
+ * a name, and a terminate function that takes the connection.
+ */
+struct __wt_dlh {
+ TAILQ_ENTRY(__wt_dlh) q; /* List of open libraries. */
+
+ void *handle; /* Handle returned by dlopen. */
+ char *name; /* NOTE(review): presumably the library's name; confirm. */
+
+ int (*terminate)(WT_CONNECTION *); /* Terminate function. */
+};
diff --git a/src/third_party/wiredtiger/src/include/error.h b/src/third_party/wiredtiger/src/include/error.h
new file mode 100644
index 00000000000..9bccc80faec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/error.h
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Recognizable debugging constants: a pointer value and a byte value.
+ * NOTE(review): presumably written over pointers and memory so stale
+ * references stand out; confirm at the use sites.
+ */
+#define WT_DEBUG_POINT ((void *)0xdeadbeef)
+#define WT_DEBUG_BYTE (0xab)
+
+/*
+ * In DIAGNOSTIC mode, yield in places where we want to encourage races.
+ * With HAVE_DIAGNOSTIC defined the macro calls __wt_yield; otherwise it
+ * expands to nothing.
+ */
+#ifdef HAVE_DIAGNOSTIC
+#define WT_HAVE_DIAGNOSTIC_YIELD do { \
+ __wt_yield(); \
+} while (0)
+#else
+#define WT_HAVE_DIAGNOSTIC_YIELD
+#endif
+
+/*
+ * Set "ret" and branch-to-err-label tests. Every macro in this group
+ * assigns to a variable named "ret" and jumps to a label named "err", so
+ * both must exist in the calling function.
+ *
+ * WT_ERR: evaluate the call, jump if the result is non-zero.
+ * WT_ERR_MSG: set ret, report it through __wt_err, always jump.
+ * WT_ERR_BUSY_OK: as WT_ERR, but an EBUSY result is reset to 0 and
+ * execution continues.
+ * WT_ERR_NOTFOUND_OK: as WT_ERR, but a WT_NOTFOUND result is reset to 0
+ * and execution continues.
+ * WT_ERR_TEST: if the test expression is true, set ret to the supplied
+ * value and jump.
+ */
+#define WT_ERR(a) do { \
+ if ((ret = (a)) != 0) \
+ goto err; \
+} while (0)
+#define WT_ERR_MSG(session, v, ...) do { \
+ ret = (v); \
+ __wt_err(session, ret, __VA_ARGS__); \
+ goto err; \
+} while (0)
+#define WT_ERR_BUSY_OK(a) do { \
+ if ((ret = (a)) != 0) { \
+ if (ret == EBUSY) \
+ ret = 0; \
+ else \
+ goto err; \
+ } \
+} while (0)
+#define WT_ERR_NOTFOUND_OK(a) do { \
+ if ((ret = (a)) != 0) { \
+ if (ret == WT_NOTFOUND) \
+ ret = 0; \
+ else \
+ goto err; \
+ } \
+} while (0)
+#define WT_ERR_TEST(a, v) do { \
+ if (a) { \
+ ret = (v); \
+ goto err; \
+ } \
+} while (0)
+
+/*
+ * Return tests. These return from the calling function directly; where a
+ * result has to be held it lives in a block-local "__ret", so no "ret"
+ * variable or "err" label is required.
+ *
+ * WT_RET: return the call's result if it is non-zero.
+ * WT_RET_TEST: return the supplied value if the test expression is true.
+ * WT_RET_MSG: report the value through __wt_err, then always return it.
+ * WT_RET_BUSY_OK: as WT_RET, but an EBUSY result is ignored.
+ * WT_RET_NOTFOUND_OK: as WT_RET, but a WT_NOTFOUND result is ignored.
+ */
+#define WT_RET(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0) \
+ return (__ret); \
+} while (0)
+#define WT_RET_TEST(a, v) do { \
+ if (a) \
+ return (v); \
+} while (0)
+#define WT_RET_MSG(session, v, ...) do { \
+ int __ret = (v); \
+ __wt_err(session, __ret, __VA_ARGS__); \
+ return (__ret); \
+} while (0)
+#define WT_RET_BUSY_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != EBUSY) \
+ return (__ret); \
+} while (0)
+#define WT_RET_NOTFOUND_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND) \
+ return (__ret); \
+} while (0)
+/*
+ * Set "ret" if not already set. A non-zero result from the call replaces
+ * the current "ret" when the new result is WT_PANIC, or when "ret" is 0,
+ * WT_DUPLICATE_KEY or WT_NOTFOUND; otherwise the earlier error is kept.
+ * Execution always continues (no return, no jump), and a variable named
+ * "ret" must exist in the calling function. The BUSY_OK and NOTFOUND_OK
+ * variants additionally ignore EBUSY and WT_NOTFOUND results.
+ */
+#define WT_TRET(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && \
+ (__ret == WT_PANIC || \
+ ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
+ ret = __ret; \
+} while (0)
+#define WT_TRET_BUSY_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != EBUSY && \
+ (__ret == WT_PANIC || \
+ ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
+ ret = __ret; \
+} while (0)
+#define WT_TRET_NOTFOUND_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND && \
+ (__ret == WT_PANIC || \
+ ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
+ ret = __ret; \
+} while (0)
+
+/*
+ * Return and branch-to-err-label cases for switch statements. Each macro
+ * supplies the switch's "default:" case and reports the problem with
+ * __wt_illegal_value: WT_ILLEGAL_VALUE returns the result,
+ * WT_ILLEGAL_VALUE_ERR hands it to WT_ERR (so it needs "ret" and "err"),
+ * and WT_ILLEGAL_VALUE_SET stores it in "ret" and breaks out of the
+ * switch. None of them ends in a semicolon; the caller supplies it.
+ */
+#define WT_ILLEGAL_VALUE(session) \
+ default: \
+ return (__wt_illegal_value(session, NULL))
+#define WT_ILLEGAL_VALUE_ERR(session) \
+ default: \
+ WT_ERR(__wt_illegal_value(session, NULL))
+#define WT_ILLEGAL_VALUE_SET(session) \
+ default: \
+ ret = __wt_illegal_value(session, NULL); \
+ break
+
+/*
+ * Panic helpers.
+ *
+ * WT_PANIC_MSG: report error "v" through __wt_err, then call __wt_panic,
+ * discarding its return value.
+ * WT_PANIC_ERR: WT_PANIC_MSG, then pass WT_PANIC to WT_ERR (the calling
+ * function needs a "ret" variable and an "err" label).
+ * WT_PANIC_RET: WT_PANIC_MSG, then return WT_PANIC from the calling
+ * function.
+ */
+#define WT_PANIC_MSG(session, v, ...) do { \
+ __wt_err(session, v, __VA_ARGS__); \
+ (void)__wt_panic(session); \
+} while (0)
+#define WT_PANIC_ERR(session, v, ...) do { \
+ WT_PANIC_MSG(session, v, __VA_ARGS__); \
+ WT_ERR(WT_PANIC); \
+} while (0)
+#define WT_PANIC_RET(session, v, ...) do { \
+ WT_PANIC_MSG(session, v, __VA_ARGS__); \
+ /* Return WT_PANIC regardless of earlier return codes. */ \
+ return (WT_PANIC); \
+} while (0)
+
+/*
+ * WT_ASSERT
+ * Assert an expression, aborting in diagnostic mode. Otherwise,
+ * "use" the session to keep the compiler quiet and don't evaluate the
+ * expression.
+ *
+ * In diagnostic mode a false expression is reported through __wt_assert
+ * together with the file, line and the expression's text. Because the
+ * expression is not evaluated in non-diagnostic builds, it must not have
+ * side effects the surrounding code relies on.
+ */
+#ifdef HAVE_DIAGNOSTIC
+#define WT_ASSERT(session, exp) do { \
+ if (!(exp)) \
+ __wt_assert(session, 0, __FILE__, __LINE__, "%s", #exp);\
+} while (0)
+#else
+#define WT_ASSERT(session, exp) \
+ WT_UNUSED(session)
+#endif
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
new file mode 100644
index 00000000000..2ab964475d8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -0,0 +1,650 @@
+/* DO NOT EDIT: automatically built by dist/s_prototypes. */
+
+extern void __wt_async_stats_update(WT_SESSION_IMPL *session);
+extern int __wt_async_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_async_destroy(WT_SESSION_IMPL *session);
+extern int __wt_async_flush(WT_SESSION_IMPL *session);
+extern int __wt_async_new_op(WT_SESSION_IMPL *session, const char *uri, const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP_IMPL **opp);
+extern int __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op);
+extern int __wt_async_op_init(WT_SESSION_IMPL *session);
+extern void *__wt_async_worker(void *arg);
+extern int __wt_block_addr_to_buffer(WT_BLOCK *block, uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t cksum);
+extern int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump);
+extern int __wt_block_addr_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int live);
+extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
+extern int __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci);
+extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci);
+extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name);
+extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint);
+extern int __wt_block_checkpoint_unload( WT_SESSION_IMPL *session, WT_BLOCK *block, int checkpoint);
+extern void __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci);
+extern int __wt_block_checkpoint(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum);
+extern int __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp);
+extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp);
+extern int __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live);
+extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size);
+extern int __wt_block_alloc( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size);
+extern int __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size);
+extern int __wt_block_off_free( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size);
+extern int __wt_block_extlist_check( WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl);
+extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci);
+extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b);
+extern int __wt_block_insert_ext( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size);
+extern int __wt_block_extlist_read_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size);
+extern int __wt_block_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size);
+extern int __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional);
+extern int __wt_block_extlist_truncate( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el);
+extern int __wt_block_extlist_init(WT_SESSION_IMPL *session, WT_EXTLIST *el, const char *name, const char *extname, int track_size);
+extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
+extern int __wt_block_map( WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp, void **mappingcookie);
+extern int __wt_block_unmap( WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen, void **mappingcookie);
+extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp);
+extern int __wt_block_manager_truncate( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
+extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
+extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, int readonly, uint32_t allocsize, WT_BLOCK **blockp);
+extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize);
+extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats);
+extern int __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
+extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset);
+extern int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum);
+extern int __wt_block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp);
+extern void __wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext);
+extern int __wt_block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp);
+extern void __wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz);
+extern int __wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max);
+extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max);
+extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
+extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp);
+extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid);
+extern int __wt_block_verify_start( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase);
+extern int __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_verify_ckpt_load( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci);
+extern int __wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_verify_addr(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size);
+extern u_int __wt_block_header(WT_BLOCK *block);
+extern int __wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep);
+extern int __wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum);
+extern int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int data_cksum, int caller_locked);
+extern int __wt_bloom_create( WT_SESSION_IMPL *session, const char *uri, const char *config, uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp);
+extern int __wt_bloom_open(WT_SESSION_IMPL *session, const char *uri, uint32_t factor, uint32_t k, WT_CURSOR *owner, WT_BLOOM **bloomp);
+extern int __wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key);
+extern int __wt_bloom_finalize(WT_BLOOM *bloom);
+extern int __wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash);
+extern int __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash);
+extern int __wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key);
+extern int __wt_bloom_close(WT_BLOOM *bloom);
+extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config);
+extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp);
+extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next);
+extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating);
+extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating);
+extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp);
+extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp);
+extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
+extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt);
+extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v);
+extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile);
+extern int __wt_debug_offset_blind( WT_SESSION_IMPL *session, wt_off_t offset, const char *ofile);
+extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile);
+extern int __wt_debug_disk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile);
+extern int __wt_debug_tree_shape( WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp);
+extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
+extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
+extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
+extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages);
+extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages);
+extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_evict_server_wake(WT_SESSION_IMPL *session);
+extern int __wt_evict_create(WT_SESSION_IMPL *session);
+extern int __wt_evict_destroy(WT_SESSION_IMPL *session);
+extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session);
+extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session);
+extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app);
+extern void __wt_cache_dump(WT_SESSION_IMPL *session);
+extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);
+extern int __wt_btree_close(WT_SESSION_IMPL *session);
+extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno);
+extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep);
+extern void __wt_btree_evictable(WT_SESSION_IMPL *session, int on);
+extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize);
+extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session);
+extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
+extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
+extern int __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed);
+extern const char *__wt_page_type_string(u_int type);
+extern const char *__wt_cell_type_string(uint8_t type);
+extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf);
+extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf);
+extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store);
+extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack);
+extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell);
+extern int
+__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
+extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep);
+extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep);
+extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd);
+extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]);
+extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
+extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op);
+extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk, size_t size);
+extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf);
+extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags);
+extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
+extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
+extern int __wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive);
+extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
+extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
+extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
+extern int __wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive);
+extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell);
+extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size);
+extern int __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size);
+extern void __wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_txnc_search( WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store);
+extern int __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size);
+extern void __wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_rec_write(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags);
+extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
+extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate);
+extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, void *ikeyp);
+extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, void *ikeyp);
+extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
+extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep);
+extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep);
+extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd);
+extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
+extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key);
+extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert);
+extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
+extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str);
+extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item);
+extern int __wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_get(WT_SESSION_IMPL *session, const char **cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_gets(WT_SESSION_IMPL *session, const char **cfg, const char *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_getone(WT_SESSION_IMPL *session, const char *config, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_getones(WT_SESSION_IMPL *session, const char *config, const char *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_gets_def(WT_SESSION_IMPL *session, const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value);
+extern int __wt_config_subgetraw(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_subgets(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value);
+extern void __wt_conn_foc_discard(WT_SESSION_IMPL *session);
+extern int __wt_configure_method(WT_SESSION_IMPL *session, const char *method, const char *uri, const char *config, const char *type, const char *check);
+extern int __wt_config_check(WT_SESSION_IMPL *session, const WT_CONFIG_ENTRY *entry, const char *config, size_t config_len);
+extern int __wt_config_collapse( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret);
+extern int __wt_config_merge( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret);
+extern int __wt_config_concat( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret);
+extern int __wt_conn_config_init(WT_SESSION_IMPL *session);
+extern void __wt_conn_config_discard(WT_SESSION_IMPL *session);
+extern int __wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, WT_SESSION *wt_session, const char *config, size_t len, WT_CONFIG_PARSER **config_parserp);
+extern int __wt_ext_config_get(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_CONFIG_ARG *cfg_arg, const char *key, WT_CONFIG_ITEM *cval);
+extern int __wt_config_upgrade(WT_SESSION_IMPL *session, WT_ITEM *buf);
+extern int __wt_collator_config(WT_SESSION_IMPL *session, const char **cfg, WT_COLLATOR **collatorp, int *ownp);
+extern int __wt_conn_remove_collator(WT_SESSION_IMPL *session);
+extern int __wt_conn_remove_compressor(WT_SESSION_IMPL *session);
+extern int __wt_conn_remove_data_source(WT_SESSION_IMPL *session);
+extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_cache_stats_update(WT_SESSION_IMPL *session);
+extern int __wt_cache_destroy(WT_SESSION_IMPL *session);
+extern int __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg);
+extern int __wt_conn_cache_pool_open(WT_SESSION_IMPL *session);
+extern int __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session);
+extern void *__wt_cache_pool_server(void *arg);
+extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session);
+extern int __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize);
+extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force);
+extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags);
+extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, int apply_checkpoints, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern void __wt_conn_btree_close(WT_SESSION_IMPL *session);
+extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *name, int force);
+extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, int final);
+extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session);
+extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
+extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
+extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session);
+extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]);
+extern int __wt_connection_close(WT_CONNECTION_IMPL *conn);
+extern int __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_conn_stat_init(WT_SESSION_IMPL *session);
+extern int __wt_statlog_log_one(WT_SESSION_IMPL *session);
+extern int __wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close);
+extern int __wt_sweep_create(WT_SESSION_IMPL *session);
+extern int __wt_sweep_destroy(WT_SESSION_IMPL *session);
+extern int __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_backup_list_uri_append( WT_SESSION_IMPL *session, const char *name, int *skip);
+extern int __wt_curbulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, int bitmap, int skip_sort_check);
+extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp);
+extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp);
+extern int __wt_curfile_update_check(WT_CURSOR *cursor);
+extern int __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap, WT_CURSOR **cursorp);
+extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, int iskey, va_list ap);
+extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor);
+extern size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, int force_unicode);
+extern int __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf);
+extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, const char **tokstart, size_t *toklen);
+extern const char *__wt_json_tokname(int toktype);
+extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item);
+extern ssize_t __wt_json_strlen(const char *src, size_t srclen);
+extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen);
+extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst);
+extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_cursor_notsup(WT_CURSOR *cursor);
+extern int __wt_cursor_noop(WT_CURSOR *cursor);
+extern void __wt_cursor_set_notsup(WT_CURSOR *cursor);
+extern int __wt_cursor_config_readonly(WT_CURSOR *cursor, const char *cfg[], int def);
+extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, int key);
+extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...);
+extern void __wt_cursor_set_key(WT_CURSOR *cursor, ...);
+extern int __wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key);
+extern void __wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key);
+extern int __wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value);
+extern void __wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value);
+extern int __wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap);
+extern void __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap);
+extern int __wt_cursor_get_value(WT_CURSOR *cursor, ...);
+extern int __wt_cursor_get_valuev(WT_CURSOR *cursor, va_list ap);
+extern void __wt_cursor_set_value(WT_CURSOR *cursor, ...);
+extern void __wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap);
+extern int __wt_cursor_close(WT_CURSOR *cursor);
+extern int __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor);
+extern int __wt_cursor_init(WT_CURSOR *cursor, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curtable_get_key(WT_CURSOR *cursor, ...);
+extern int __wt_curtable_get_value(WT_CURSOR *cursor, ...);
+extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...);
+extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...);
+extern int __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop);
+extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
+extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
+extern int __wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp);
+extern int __wt_log_get_active_files( WT_SESSION_IMPL *session, char ***filesp, u_int *countp);
+extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count);
+extern int __wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf);
+extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id);
+extern int __wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum);
+extern int __wt_log_open(WT_SESSION_IMPL *session);
+extern int __wt_log_close(WT_SESSION_IMPL *session);
+extern int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create);
+extern int __wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
+extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie);
+extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
+extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
+extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp);
+extern void __wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp);
+extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *rectypep);
+extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep);
+extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value);
+extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep);
+extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno);
+extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop);
+extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop);
+extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp);
+extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value);
+extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep);
+extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key);
+extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp);
+extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode);
+extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep);
+extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_log_slot_init(WT_SESSION_IMPL *session);
+extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session);
+extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp);
+extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
+extern int __wt_log_slot_free(WT_LOGSLOT *slot);
+extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize);
+extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks);
+extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg);
+extern int __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg);
+extern int __wt_lsm_manager_start(WT_SESSION_IMPL *session);
+extern void __wt_lsm_manager_free_work_unit( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry);
+extern int __wt_lsm_manager_destroy(WT_SESSION_IMPL *session);
+extern int __wt_lsm_manager_clear_tree( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_manager_pop_entry( WT_SESSION_IMPL *session, uint32_t type, WT_LSM_WORK_UNIT **entryp);
+extern int __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id);
+extern int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_curstat_lsm_init( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst);
+extern int __wt_lsm_tree_close_all(WT_SESSION_IMPL *session);
+extern int __wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp);
+extern int __wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp);
+extern int __wt_lsm_tree_set_chunk_size( WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_tree_setup_chunk( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config);
+extern int __wt_lsm_tree_get(WT_SESSION_IMPL *session, const char *uri, int exclusive, WT_LSM_TREE **treep);
+extern void __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern void __wt_lsm_tree_throttle( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only);
+extern int __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_drop( WT_SESSION_IMPL *session, const char *name, const char *cfg[]);
+extern int __wt_lsm_tree_rename(WT_SESSION_IMPL *session, const char *olduri, const char *newuri, const char *cfg[]);
+extern int __wt_lsm_tree_truncate( WT_SESSION_IMPL *session, const char *name, const char *cfg[]);
+extern int __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip);
+extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, int *), const char *cfg[], uint32_t open_flags);
+extern int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp);
+extern int __wt_lsm_work_switch( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, int *ran);
+extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args);
+extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, WT_CKPT *ckpt);
+extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char *fname, const char **namep);
+extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname);
+extern int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep);
+extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn);
+extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase);
+extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt);
+extern int __wt_ext_metadata_insert(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value);
+extern int __wt_ext_metadata_remove( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key);
+extern int __wt_ext_metadata_search(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char **valuep);
+extern int __wt_ext_metadata_update(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value);
+extern int __wt_metadata_get_ckptlist( WT_SESSION *session, const char *name, WT_CKPT **ckptbasep);
+extern void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase);
+extern int __wt_metadata_open(WT_SESSION_IMPL *session);
+extern int __wt_metadata_cursor( WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp);
+extern int __wt_metadata_insert( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_metadata_search( WT_SESSION_IMPL *session, const char *key, const char **valuep);
+extern void __wt_meta_track_discard(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_on(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_off(WT_SESSION_IMPL *session, int unroll);
+extern int __wt_meta_track_sub_on(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_sub_off(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_checkpoint(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_meta_track_fileop( WT_SESSION_IMPL *session, const char *olduri, const char *newuri);
+extern int __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created);
+extern int __wt_turtle_init(WT_SESSION_IMPL *session);
+extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, const char **valuep);
+extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_ATTRIBUTE((noreturn));
+extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp);
+extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
+extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
+extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp);
+extern int __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp);
+extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg);
+extern int __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp);
+extern int __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp);
+extern int __wt_dlsym(WT_SESSION_IMPL *session, WT_DLH *dlh, const char *name, int fail, void *sym_ret);
+extern int __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh);
+extern int __wt_errno(void);
+extern int __wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp);
+extern void __wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_fallocate( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len);
+extern int __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep);
+extern int __wt_filesize_name( WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep);
+extern int __wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock);
+extern int __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len);
+extern int __wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, FILE *fp);
+extern int __wt_getopt( const char *progname, int nargc, char *const *nargv, const char *ostr);
+extern int __wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie);
+extern int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size);
+extern int __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size);
+extern int __wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie);
+extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_signalled, WT_CONDVAR **condp);
+extern int __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs);
+extern int __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
+extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
+extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name);
+extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
+extern int __wt_once(void (*init_routine)(void));
+extern int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp);
+extern int __wt_close(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_absolute_path(const char *path);
+extern const char *__wt_path_separator(void);
+extern int __wt_has_priv(void);
+extern int __wt_remove(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to);
+extern int __wt_read( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf);
+extern int __wt_write(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, const void *buf);
+extern void __wt_sleep(long seconds, long micro_seconds);
+extern uint64_t __wt_strtouq(const char *nptr, char **endptr, int base);
+extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, void *(*func)(void *), void *arg);
+extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid);
+extern void __wt_thread_id(char *buf, size_t buflen);
+extern int __wt_seconds(WT_SESSION_IMPL *session, time_t *timep);
+extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
+extern void __wt_yield(void);
+extern int __wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_ext_struct_size(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t *sizep, const char *fmt, ...);
+extern int __wt_ext_struct_unpack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_struct_check(WT_SESSION_IMPL *session, const char *fmt, size_t len, int *fixedp, uint32_t *fixed_lenp);
+extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...);
+extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep);
+extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf);
+extern int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf);
+extern int __wt_schema_create( WT_SESSION_IMPL *session, const char *uri, const char *config);
+extern int __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
+extern int __wt_schema_get_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, int ok_incomplete, WT_TABLE **tablep);
+extern void __wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern void __wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP *colgroup);
+extern void __wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx);
+extern void __wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern void __wt_schema_remove_table( WT_SESSION_IMPL *session, WT_TABLE *table);
+extern void __wt_schema_close_tables(WT_SESSION_IMPL *session);
+extern int __wt_schema_colgroup_name(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, size_t len, WT_ITEM *buf);
+extern int __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern int __wt_schema_open_index(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp);
+extern int __wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern int __wt_schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, WT_TABLE **tablep);
+extern int __wt_schema_get_colgroup(WT_SESSION_IMPL *session, const char *uri, WT_TABLE **tablep, WT_COLGROUP **colgroupp);
+extern int __wt_schema_get_index(WT_SESSION_IMPL *session, const char *uri, WT_TABLE **tablep, WT_INDEX **indexp);
+extern int __wt_schema_colcheck(WT_SESSION_IMPL *session, const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf, u_int *kcolsp, u_int *vcolsp);
+extern int __wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern int __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, int value_only, WT_ITEM *plan);
+extern int __wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, const char *extra_cols, int value_only, WT_ITEM *format);
+extern int __wt_struct_truncate(WT_SESSION_IMPL *session, const char *input_fmt, u_int ncols, WT_ITEM *format);
+extern int __wt_schema_project_in(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap);
+extern int __wt_schema_project_out(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap);
+extern int __wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value);
+extern int __wt_schema_project_merge(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value);
+extern int __wt_schema_rename(WT_SESSION_IMPL *session, const char *uri, const char *newuri, const char *cfg[]);
+extern int __wt_curstat_colgroup_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_index_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_table_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_schema_truncate( WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
+extern int __wt_range_truncate(WT_CURSOR *start, WT_CURSOR *stop);
+extern int __wt_schema_range_truncate( WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop);
+extern WT_DATA_SOURCE *__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_str_name_check(WT_SESSION_IMPL *session, const char *str);
+extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len);
+extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, int *), const char *cfg[], uint32_t open_flags);
+extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session);
+extern int __wt_session_copy_values(WT_SESSION_IMPL *session);
+extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_session_create_strip(WT_SESSION *wt_session, const char *v1, const char *v2, const char **value_ret);
+extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp);
+extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, WT_SESSION_IMPL **sessionp);
+extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip);
+extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config);
+extern void __wt_session_dhandle_incr_use(WT_SESSION_IMPL *session);
+extern int __wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags);
+extern int __wt_session_release_btree(WT_SESSION_IMPL *session);
+extern int __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags);
+extern void __wt_session_close_cache(WT_SESSION_IMPL *session);
+extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags);
+extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint);
+extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]);
+extern uint32_t __wt_cksum(const void *chunk, size_t len);
+extern void __wt_cksum_init(void);
+extern void __wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler);
+extern int __wt_eventv(WT_SESSION_IMPL *session, int msg_event, int error, const char *file_name, int line_number, const char *fmt, va_list ap);
+extern void __wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern void __wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
+extern int __wt_ext_err_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
+extern int __wt_ext_msg_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v);
+extern void __wt_assert(WT_SESSION_IMPL *session, int error, const char *file_name, int line_number, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 5, 6)));
+extern int __wt_panic(WT_SESSION_IMPL *session);
+extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri);
+extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri);
+extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path);
+extern int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path);
+extern int __wt_library_init(void);
+extern int __wt_breakpoint(void);
+extern void __wt_attach(WT_SESSION_IMPL *session);
+extern uint64_t __wt_hash_city64(const void *s, size_t len);
+extern uint64_t __wt_hash_fnv64(const void *string, size_t len);
+extern int
+__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
+extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern void __wt_hazard_close(WT_SESSION_IMPL *session);
+extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
+extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
+extern int __wt_hex2byte(const u_char *from, u_char *to);
+extern int __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to);
+extern int __wt_nhex_to_raw( WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to);
+extern int __wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to);
+extern int __wt_huffman_open(WT_SESSION_IMPL *session, void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp);
+extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg);
+extern int __wt_print_huffman_code(void *huffman_arg, uint16_t symbol);
+extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf);
+extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf);
+extern int __wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t);
+extern void __wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t);
+extern int __wt_spin_lock_register_caller(WT_SESSION_IMPL *session, const char *name, const char *file, int line, int *idp);
+extern int __wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag);
+extern uint32_t __wt_nlpo2_round(uint32_t v);
+extern uint32_t __wt_nlpo2(uint32_t v);
+extern uint32_t __wt_log2_int(uint32_t n);
+extern int __wt_ispo2(uint32_t v);
+extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
+extern void __wt_random_init(uint32_t *rnd);
+extern uint32_t __wt_random(uint32_t *rnd);
+extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
+extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int
+__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
+extern void __wt_scr_discard(WT_SESSION_IMPL *session);
+extern void *__wt_ext_scr_alloc( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size);
+extern void __wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p);
+extern void __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats);
+extern void __wt_stat_refresh_dsrc_stats(void *stats_arg);
+extern void __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent);
+extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats);
+extern void __wt_stat_refresh_connection_stats(void *stats_arg);
+extern int __wt_txnid_cmp(const void *v1, const void *v2);
+extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
+extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session);
+extern void __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot);
+extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_txn_release(WT_SESSION_IMPL *session);
+extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_init(WT_SESSION_IMPL *session);
+extern void __wt_txn_stats_update(WT_SESSION_IMPL *session);
+extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
+extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session);
+extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len);
+extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_close(WT_SESSION_IMPL *session, int force);
+extern uint64_t __wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session);
+extern int __wt_ext_transaction_isolation_level( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session);
+extern int __wt_ext_transaction_notify( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify);
+extern uint64_t __wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api);
+extern int __wt_ext_transaction_visible( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id);
+extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op);
+extern int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn);
+extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp);
+extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
+extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session);
+extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out);
+extern int __wt_txn_recover(WT_CONNECTION_IMPL *conn);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
new file mode 100644
index 00000000000..3aac7193407
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -0,0 +1,88 @@
+/*
+ * DO NOT EDIT: automatically built by dist/flags.py.
+ * flags section: BEGIN
+ */
+#define WT_CONN_CACHE_POOL 0x00001000
+#define WT_CONN_CKPT_SYNC 0x00000800
+#define WT_CONN_EVICTION_RUN 0x00000400
+#define WT_CONN_LEAK_MEMORY 0x00000200
+#define WT_CONN_LSM_MERGE 0x00000100
+#define WT_CONN_PANIC 0x00000080
+#define WT_CONN_SERVER_ASYNC 0x00000040
+#define WT_CONN_SERVER_CHECKPOINT 0x00000020
+#define WT_CONN_SERVER_LSM 0x00000010
+#define WT_CONN_SERVER_RUN 0x00000008
+#define WT_CONN_SERVER_STATISTICS 0x00000004
+#define WT_CONN_SERVER_SWEEP 0x00000002
+#define WT_CONN_WAS_BACKUP 0x00000001
+#define WT_EVICTING 0x00000004
+#define WT_FILE_TYPE_CHECKPOINT 0x00000004
+#define WT_FILE_TYPE_DATA 0x00000002
+#define WT_FILE_TYPE_LOG 0x00000001
+#define WT_LOGSCAN_FIRST 0x00000008
+#define WT_LOGSCAN_FROM_CKP 0x00000004
+#define WT_LOGSCAN_ONE 0x00000002
+#define WT_LOGSCAN_RECOVER 0x00000001
+#define WT_LOG_DSYNC 0x00000004
+#define WT_LOG_FLUSH 0x00000002
+#define WT_LOG_FSYNC 0x00000001
+#define WT_READ_CACHE 0x00000200
+#define WT_READ_COMPACT 0x00000100
+#define WT_READ_NO_EVICT 0x00000080
+#define WT_READ_NO_GEN 0x00000040
+#define WT_READ_NO_WAIT 0x00000020
+#define WT_READ_PREV 0x00000010
+#define WT_READ_SKIP_INTL 0x00000008
+#define WT_READ_SKIP_LEAF 0x00000004
+#define WT_READ_TRUNCATE 0x00000002
+#define WT_READ_WONT_NEED 0x00000001
+#define WT_SESSION_CAN_WAIT 0x00000800
+#define WT_SESSION_DISCARD_FORCE 0x00000400
+#define WT_SESSION_INTERNAL 0x00000200
+#define WT_SESSION_LOGGING_INMEM 0x00000100
+#define WT_SESSION_NO_CACHE 0x00000080
+#define WT_SESSION_NO_CACHE_CHECK 0x00000040
+#define WT_SESSION_NO_DATA_HANDLES 0x00000020
+#define WT_SESSION_NO_LOGGING 0x00000010
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00000008
+#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00000004
+#define WT_SESSION_SCHEMA_LOCKED 0x00000002
+#define WT_SESSION_SERVER_ASYNC 0x00000001
+#define WT_SKIP_UPDATE_ERR 0x00000002
+#define WT_SKIP_UPDATE_RESTORE 0x00000001
+#define WT_SYNC_CHECKPOINT 0x00000010
+#define WT_SYNC_CLOSE 0x00000008
+#define WT_SYNC_DISCARD 0x00000004
+#define WT_SYNC_DISCARD_FORCE 0x00000002
+#define WT_SYNC_WRITE_LEAVES 0x00000001
+#define WT_TXN_LOG_CKPT_FAIL 0x00000008
+#define WT_TXN_LOG_CKPT_PREPARE 0x00000004
+#define WT_TXN_LOG_CKPT_START 0x00000002
+#define WT_TXN_LOG_CKPT_STOP 0x00000001
+#define WT_VERB_API 0x00400000
+#define WT_VERB_BLOCK 0x00200000
+#define WT_VERB_CHECKPOINT 0x00100000
+#define WT_VERB_COMPACT 0x00080000
+#define WT_VERB_EVICT 0x00040000
+#define WT_VERB_EVICTSERVER 0x00020000
+#define WT_VERB_FILEOPS 0x00010000
+#define WT_VERB_LOG 0x00008000
+#define WT_VERB_LSM 0x00004000
+#define WT_VERB_METADATA 0x00002000
+#define WT_VERB_MUTEX 0x00001000
+#define WT_VERB_OVERFLOW 0x00000800
+#define WT_VERB_READ 0x00000400
+#define WT_VERB_RECONCILE 0x00000200
+#define WT_VERB_RECOVERY 0x00000100
+#define WT_VERB_SALVAGE 0x00000080
+#define WT_VERB_SHARED_CACHE 0x00000040
+#define WT_VERB_SPLIT 0x00000020
+#define WT_VERB_TEMPORARY 0x00000010
+#define WT_VERB_TRANSACTION 0x00000008
+#define WT_VERB_VERIFY 0x00000004
+#define WT_VERB_VERSION 0x00000002
+#define WT_VERB_WRITE 0x00000001
+/*
+ * flags section: END
+ * DO NOT EDIT: automatically built by dist/flags.py.
+ */
diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h
new file mode 100644
index 00000000000..50e237a1fed
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/gcc.h
@@ -0,0 +1,152 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Add GCC-specific attributes to types and function declarations. */
+#define WT_GCC_ATTRIBUTE(x) __attribute__(x)
+
+/*
+ * Attributes are only permitted on function declarations, not definitions.
+ * This macro is a marker for function definitions that is rewritten by
+ * dist/s_prototypes to create extern.h.
+ */
+#define WT_GCC_FUNC_ATTRIBUTE(x)
+
+/*
+ * Atomic writes:
+ *
+ * WiredTiger requires pointers (void *) and some variables to be read/written
+ * atomically, that is, in a single cycle. This is not write ordering -- to be
+ * clear, the requirement is that no partial value can ever be read or written.
+ * For example, if 8-bits of a 32-bit quantity were written, then the rest of
+ * the 32-bits were written, and another thread of control was able to read the
+ * memory location after the first 8-bits were written and before the subsequent
+ * 24-bits were written, WiredTiger would break. Or, if two threads of control
+ * attempt to write the same location simultaneously, the result must be one or
+ * the other of the two values, not some combination of both.
+ *
+ * To reduce memory requirements, we use a 32-bit type on 64-bit machines, which
+ * is OK if the compiler doesn't accumulate two adjacent 32-bit variables into a
+ * single 64-bit write, that is, there needs to be a single load/store of the 32
+ * bits, not a load/store of 64 bits, where the 64 bits is comprised of two
+ * adjacent 32-bit locations. The problem is when two threads are cooperating
+ * (thread X finds 32-bits set to 0, writes in a new value, flushes memory;
+ * thread Y reads 32-bits that are non-zero, does some operation, resets the
+ * memory location to 0 and flushes). If thread X were to read the 32 bits
+ * adjacent to a different 32 bits, and write them both, the two threads could
+ * race. If that can happen, you must increase the size of the memory type to
+ * a type guaranteed to be written atomically in a single cycle, without writing
+ * an adjacent memory location.
+ *
+ * WiredTiger additionally requires atomic writes for 64-bit memory locations,
+ * and so cannot run on machines with a 32-bit memory bus.
+ *
+ * We don't depend on writes across cache lines being atomic, and to make sure
+ * that never happens, we check address alignment: we know of no architectures
+ * with cache lines other than a multiple of 4 bytes in size, so aligned 4-byte
+ * accesses will always be in a single cache line.
+ *
+ * Atomic writes are often associated with memory barriers, implemented by the
+ * WT_READ_BARRIER and WT_WRITE_BARRIER macros. WiredTiger's requirement as
+ * described by the Solaris membar_enter description:
+ *
+ * No stores from after the memory barrier will reach visibility and
+ * no loads from after the barrier will be resolved before the lock
+ * acquisition reaches global visibility
+ *
+ * In other words, the WT_WRITE_BARRIER macro must ensure that memory stores by
+ * the processor, made before the WT_WRITE_BARRIER call, be visible to all
+ * processors in the system before any memory stores by the processor, made
+ * after the WT_WRITE_BARRIER call, are visible to any processor. The
+ * WT_READ_BARRIER macro ensures that all loads before the barrier are complete
+ * before any loads after the barrier. The compiler cannot reorder or cache
+ * values across a barrier.
+ *
+ * Lock and unlock operations imply both read and write barriers. In other
+ * words, barriers are not required for values protected by locking.
+ *
+ * Data locations may also be marked volatile, forcing the compiler to re-load
+ * the data on each access. This is a weaker semantic than barriers provide,
+ * only ensuring that the compiler will not cache values. It makes no ordering
+ * guarantees and may have no effect on systems with weaker cache guarantees.
+ *
+ * In summary, locking > barriers > volatile.
+ *
+ * To avoid locking shared data structures such as statistics and to permit
+ * atomic state changes, we rely on the WT_ATOMIC_ADD and WT_ATOMIC_CAS
+ * (compare and swap) operations.
+ */
+#define __WT_ATOMIC_ADD(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_add_and_fetch(&(v), val))
+#define __WT_ATOMIC_FETCH_ADD(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_fetch_and_add(&(v), val))
+#define __WT_ATOMIC_CAS(v, old, new, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ __sync_bool_compare_and_swap(&(v), old, new))
+#define __WT_ATOMIC_CAS_VAL(v, old, new, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ __sync_val_compare_and_swap(&(v), old, new))
+#define __WT_ATOMIC_STORE(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ __sync_lock_test_and_set(&(v), val))
+#define __WT_ATOMIC_SUB(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_sub_and_fetch(&(v), val))
+
+#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1)
+#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 1)
+#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1)
+#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 1)
+#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1)
+#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1)
+
+#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2)
+#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 2)
+#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new, 2)
+#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 2)
+#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2)
+#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2)
+
+#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4)
+#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4)
+#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4)
+#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 4)
+#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4)
+#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4)
+
+#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8)
+#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 8)
+#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new, 8)
+#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 8)
+#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val, 8)
+#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8)
+
+/* Compiler read-write barrier */
+#define WT_BARRIER() __asm__ volatile("" ::: "memory")
+
+/* Pause instruction to prevent excess processor bus usage */
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
+
+#if defined(x86_64) || defined(__x86_64__)
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("mfence" ::: "memory"); \
+} while (0)
+#define WT_READ_BARRIER() do { \
+ __asm__ volatile ("lfence" ::: "memory"); \
+} while (0)
+#define WT_WRITE_BARRIER() do { \
+ __asm__ volatile ("sfence" ::: "memory"); \
+} while (0)
+
+#elif defined(i386) || defined(__i386__)
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \
+} while (0)
+#define WT_READ_BARRIER() WT_FULL_BARRIER()
+#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+
+#else
+#error "No write barrier implementation for this hardware"
+#endif
diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h
new file mode 100644
index 00000000000..720f512cf2d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/hardware.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Publish a value to a shared location. All previous stores must complete
+ * before the value is made public.
+ */
+#define WT_PUBLISH(v, val) do { \
+ WT_WRITE_BARRIER(); \
+ (v) = (val); \
+} while (0)
+
+/*
+ * Read a shared location and guarantee that subsequent reads do not see any
+ * earlier state.
+ */
+#define WT_ORDERED_READ(v, val) do { \
+ (v) = (val); \
+ WT_READ_BARRIER(); \
+} while (0)
+
+/*
+ * Atomic versions of the flag set/clear macros.
+ */
+#define F_ISSET_ATOMIC(p, mask) ((p)->flags_atomic & (uint8_t)(mask))
+
+#define F_SET_ATOMIC(p, mask) do { \
+ uint8_t __orig; \
+ do { \
+ __orig = (p)->flags_atomic; \
+ } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
+ __orig, __orig | (uint8_t)(mask))); \
+} while (0)
+
+#define F_CAS_ATOMIC(p, mask, ret) do { \
+ uint8_t __orig; \
+ ret = 0; \
+ do { \
+ __orig = (p)->flags_atomic; \
+ if ((__orig & (uint8_t)(mask)) != 0) { \
+ ret = EBUSY; \
+ break; \
+ } \
+ } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
+ __orig, __orig | (uint8_t)(mask))); \
+} while (0)
+
+#define F_CLR_ATOMIC(p, mask) do { \
+ uint8_t __orig; \
+ do { \
+ __orig = (p)->flags_atomic; \
+ } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
+ __orig, __orig & ~(uint8_t)(mask))); \
+} while (0)
+
+#define WT_CACHE_LINE_ALIGNMENT 64 /* Cache line alignment */
diff --git a/src/third_party/wiredtiger/src/include/intpack.i b/src/third_party/wiredtiger/src/include/intpack.i
new file mode 100644
index 00000000000..01559657acd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/intpack.i
@@ -0,0 +1,371 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Variable-length integer encoding.
+ * We need up to 64 bits, signed and unsigned. Further, we want the packed
+ * representation to have the same lexicographic ordering as the integer
+ * values. This avoids the need for special-purpose comparison code.
+ *
+ * Try hard to keep small values small (up to ~2 bytes): that gives the biggest
+ * benefit for common cases storing small values. After that, just encode the
+ * length in the first byte: we could squeeze in a couple of extra bits, but
+ * the marginal benefit is small, and we want this code to be relatively
+ * easy to implement in client code or scripting APIs.
+ *
+ * First byte | Next | |
+ * byte | bytes| Min Value | Max Value
+ * ------------+------+------------------------+--------------------------------
+ * [00 00xxxx] | free | N/A | N/A
+ * [00 01llll] | llll | -2^63 | -2^13 - 2^6 - 1
+ * [00 1xxxxx] | 1 | -2^13 - 2^6 | -2^6 - 1
+ * [01 xxxxxx] | 0 | -2^6 | -1
+ * [10 xxxxxx] | 0 | 0 | 2^6 - 1
+ * [11 0xxxxx] | 1 | 2^6 | 2^13 + 2^6 - 1
+ * [11 10llll] | llll | 2^13 + 2^6 | 2^64 - 1
+ * [11 11xxxx] | free | N/A | N/A
+ */
+
+#define NEG_MULTI_MARKER (uint8_t)0x10
+#define NEG_2BYTE_MARKER (uint8_t)0x20
+#define NEG_1BYTE_MARKER (uint8_t)0x40
+#define POS_1BYTE_MARKER (uint8_t)0x80
+#define POS_2BYTE_MARKER (uint8_t)0xc0
+#define POS_MULTI_MARKER (uint8_t)0xe0
+
+#define NEG_1BYTE_MIN (-(1 << 6)) /* -64; never left-shifts a negative */
+#define NEG_2BYTE_MIN (-(1 << 13) + NEG_1BYTE_MIN) /* -8256 */
+#define POS_1BYTE_MAX ((1 << 6) - 1) /* 63 */
+#define POS_2BYTE_MAX ((1 << 13) + POS_1BYTE_MAX) /* 8255 */
+
+/* Extract bits <start> to <end> from a value (counting from LSB == 0). */
+#define GET_BITS(x, start, end) \
+ (((uint64_t)(x) & ((1U << (start)) - 1U)) >> (end))
+
+#define WT_SIZE_CHECK(l, maxl) \
+ WT_RET_TEST((maxl) != 0 && (size_t)(l) > (maxl), ENOMEM)
+
+/* Count the leading zero bytes. */
+#if defined(__GNUC__)
+#define WT_LEADING_ZEROS(x, i) \
+ (i = (x == 0) ? (int)sizeof (x) : __builtin_clzll(x) >> 3)
+#elif defined(_MSC_VER)
+#define WT_LEADING_ZEROS(x, i) do { \
+ if (x == 0) i = (int)sizeof(x); \
+ else { \
+ unsigned long __index; \
+ _BitScanReverse64(&__index, x); \
+ __index = 63 ^ __index; \
+ i = (int)(__index >> 3); } \
+ } while (0)
+#else
+#define WT_LEADING_ZEROS(x, i) do { \
+ uint64_t __x = (x); \
+ uint64_t __m = (uint64_t)0xff << 56; \
+ for (i = 0; !(__x & __m) && i != 8; i++) \
+ __m >>= 8; \
+} while (0)
+#endif
+
+/*
+ * __wt_vpack_posint --
+ *	Packs a positive variable-length integer in the specified location.
+ */
+static inline int
+__wt_vpack_posint(uint8_t **pp, size_t maxlen, uint64_t x)
+{
+	uint8_t *p;
+	int len, lz;
+
+	WT_LEADING_ZEROS(x, lz);
+	len = (int)sizeof (x) - lz;
+	WT_SIZE_CHECK(len + 1, maxlen);
+	p = *pp;
+
+	/* OR the byte count into the four low bits of the first byte. */
+	*p++ |= (len & 0xf);
+
+	/* Most significant byte first; len may be 0, so never shift len - 1. */
+	while (len-- != 0)
+		*p++ = (uint8_t)(x >> (len << 3));
+	*pp = p;
+	return (0);
+}
+
+/*
+ * __wt_vpack_negint --
+ *	Packs a negative variable-length integer in the specified location.
+ */
+static inline int
+__wt_vpack_negint(uint8_t **pp, size_t maxlen, uint64_t x)
+{
+	uint8_t *p;
+	int nbytes, nlead;
+
+	WT_LEADING_ZEROS(~x, nlead);
+	nbytes = (int)sizeof (x) - nlead;
+	WT_SIZE_CHECK(nbytes + 1, maxlen);
+	p = *pp;
+
+	/*
+	 * The first byte has four bits available for a size.  Negative
+	 * values record the count of leading 0xff bytes rather than the
+	 * length, which maintains ordering (-1, the largest negative
+	 * number, has the most leading 0xff bytes).
+	 */
+	*p++ |= (nlead & 0xf);
+
+	/* Write the remaining bytes, most significant first. */
+	while (nbytes-- != 0)
+		*p++ = (uint8_t)(x >> (nbytes << 3));
+	*pp = p;
+	return (0);
+}
+
+/*
+ * __wt_vunpack_posint --
+ *	Reads a variable-length positive integer from the specified location.
+ */
+static inline int
+__wt_vunpack_posint(const uint8_t **pp, size_t maxlen, uint64_t *retp)
+{
+	uint64_t value;
+	const uint8_t *p;
+	uint8_t remain;
+
+	/* The low four bits of the first byte give the byte count. */
+	p = *pp;
+	remain = (*p++ & 0xf);
+	WT_SIZE_CHECK(remain + 1, maxlen);
+
+	value = 0;
+	while (remain-- != 0)
+		value = (value << 8) | *p++;
+	*retp = value;
+	*pp = p;
+	return (0);
+}
+
+/*
+ * __wt_vunpack_negint --
+ *	Reads a variable-length negative integer from the specified location.
+ */
+static inline int
+__wt_vunpack_negint(const uint8_t **pp, size_t maxlen, uint64_t *retp)
+{
+	uint64_t value;
+	const uint8_t *p;
+	uint8_t remain;
+
+	/* The first byte's low four bits count the omitted 0xff bytes. */
+	p = *pp;
+	remain = (int)sizeof (value) - (*p++ & 0xf);
+	WT_SIZE_CHECK(remain + 1, maxlen);
+
+	value = UINT64_MAX;	/* Bytes not shifted out stay 0xff. */
+	while (remain-- != 0)
+		value = (value << 8) | *p++;
+	*retp = value;
+	*pp = p;
+	return (0);
+}
+
+/*
+ * __wt_vpack_uint --
+ * Pack an unsigned integer at *pp in variable-sized form and advance *pp.
+ */
+static inline int
+__wt_vpack_uint(uint8_t **pp, size_t maxlen, uint64_t x)
+{
+ uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ if (x <= POS_1BYTE_MAX) /* Value fits in the marker byte. */
+ *p++ = POS_1BYTE_MARKER | GET_BITS(x, 6, 0);
+ else if (x <= POS_2BYTE_MAX) {
+ WT_SIZE_CHECK(2, maxlen);
+ x -= POS_1BYTE_MAX + 1; /* Offset from the start of the range. */
+ *p++ = POS_2BYTE_MARKER | GET_BITS(x, 13, 8);
+ *p++ = GET_BITS(x, 8, 0);
+ } else {
+ x -= POS_2BYTE_MAX + 1; /* Offset from the start of the range. */
+ *p = POS_MULTI_MARKER; /* __wt_vpack_posint ORs in the length. */
+ return (__wt_vpack_posint(pp, maxlen, x)); /* Advances *pp itself. */
+ }
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vpack_int --
+ * Pack a signed integer at *pp in variable-sized form and advance *pp.
+ */
+static inline int
+__wt_vpack_int(uint8_t **pp, size_t maxlen, int64_t x)
+{
+ uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ if (x < NEG_2BYTE_MIN) {
+ *p = NEG_MULTI_MARKER; /* __wt_vpack_negint ORs in the size bits. */
+ return (__wt_vpack_negint(pp, maxlen, (uint64_t)x));
+ } else if (x < NEG_1BYTE_MIN) {
+ WT_SIZE_CHECK(2, maxlen);
+ x -= NEG_2BYTE_MIN; /* Non-negative offset from the range minimum. */
+ *p++ = NEG_2BYTE_MARKER | GET_BITS(x, 13, 8);
+ *p++ = GET_BITS(x, 8, 0);
+ } else if (x < 0) {
+ x -= NEG_1BYTE_MIN; /* Non-negative offset from the range minimum. */
+ *p++ = NEG_1BYTE_MARKER | GET_BITS(x, 6, 0);
+ } else
+ /* For non-negative values, use the unsigned code above. */
+ return (__wt_vpack_uint(pp, maxlen, (uint64_t)x));
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_uint --
+ * Unpack a variable-sized unsigned integer from *pp into *xp, advancing *pp.
+ */
+static inline int
+__wt_vunpack_uint(const uint8_t **pp, size_t maxlen, uint64_t *xp)
+{
+ const uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ switch (*p & 0xf0) { /* Dispatch on the high four bits. */
+ case POS_1BYTE_MARKER:
+ case POS_1BYTE_MARKER | 0x10:
+ case POS_1BYTE_MARKER | 0x20:
+ case POS_1BYTE_MARKER | 0x30:
+ *xp = GET_BITS(*p, 6, 0); /* Value is the low six bits. */
+ p += 1;
+ break;
+ case POS_2BYTE_MARKER:
+ case POS_2BYTE_MARKER | 0x10:
+ WT_SIZE_CHECK(2, maxlen);
+ *xp = GET_BITS(*p++, 5, 0) << 8; /* High five bits of the offset. */
+ *xp |= *p++;
+ *xp += POS_1BYTE_MAX + 1; /* Undo the bias applied when packing. */
+ break;
+ case POS_MULTI_MARKER:
+ WT_RET(__wt_vunpack_posint(pp, maxlen, xp)); /* Advances *pp itself. */
+ *xp += POS_2BYTE_MAX + 1; /* Undo the bias applied when packing. */
+ return (0);
+ default: /* Not one of the unsigned markers. */
+ return (EINVAL);
+ }
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_int --
+ *	Variable-sized unpacking for signed integers, advancing *pp.
+ */
+static inline int
+__wt_vunpack_int(const uint8_t **pp, size_t maxlen, int64_t *xp)
+{
+	const uint8_t *p;
+
+	WT_SIZE_CHECK(1, maxlen);
+	p = *pp;
+	switch (*p & 0xf0) {
+	case NEG_MULTI_MARKER:
+		WT_RET(__wt_vunpack_negint(pp, maxlen, (uint64_t *)xp));
+		return (0);
+	case NEG_2BYTE_MARKER:
+	case NEG_2BYTE_MARKER | 0x10:
+		WT_SIZE_CHECK(2, maxlen);
+		/* The two post-increments below consume both packed bytes. */
+		*xp = (int64_t)(GET_BITS(*p++, 5, 0) << 8);
+		*xp |= *p++;
+		*xp += NEG_2BYTE_MIN;
+		break;
+	case NEG_1BYTE_MARKER:
+	case NEG_1BYTE_MARKER | 0x10:
+	case NEG_1BYTE_MARKER | 0x20:
+	case NEG_1BYTE_MARKER | 0x30:
+		*xp = NEG_1BYTE_MIN + (int64_t)GET_BITS(*p, 6, 0);
+		p += 1;
+		break;
+	default:
+		/* Identical to the unsigned case. */
+		return (__wt_vunpack_uint(pp, maxlen, (uint64_t *)xp));
+	}
+
+	*pp = p;
+	return (0);
+}
+
+/*
+ * __wt_vsize_posint --
+ *	Compute the packed size of a positive variable-length integer.
+ */
+static inline size_t
+__wt_vsize_posint(uint64_t x)
+{
+	int nzero;		/* Leading zero bytes in x */
+
+	WT_LEADING_ZEROS(x, nzero);
+	return ((size_t)(WT_INTPACK64_MAXSIZE - nzero));
+}
+
+/*
+ * __wt_vsize_negint --
+ *	Compute the packed size of a negative variable-length integer.
+ */
+static inline size_t
+__wt_vsize_negint(uint64_t x)
+{
+	int nones;		/* Leading 0xff bytes in x */
+
+	WT_LEADING_ZEROS(~x, nones);
+	return ((size_t)(WT_INTPACK64_MAXSIZE - nones));
+}
+
+/*
+ * __wt_vsize_uint --
+ *	Compute how many bytes an unsigned integer occupies once packed.
+ */
+static inline size_t
+__wt_vsize_uint(uint64_t x)
+{
+	/* Values up to POS_1BYTE_MAX take a single byte. */
+	if (x <= POS_1BYTE_MAX)
+		return (1);
+	/* Values up to POS_2BYTE_MAX take two bytes. */
+	if (x <= POS_2BYTE_MAX)
+		return (2);
+	/* Anything larger is sized as an offset past POS_2BYTE_MAX. */
+	return (__wt_vsize_posint(x - (POS_2BYTE_MAX + 1)));
+}
+
+/*
+ * __wt_vsize_int --
+ *	Compute how many bytes a signed integer occupies once packed.
+ */
+static inline size_t
+__wt_vsize_int(int64_t x)
+{
+	/* Non-negative values are sized by the unsigned code above. */
+	if (x >= 0)
+		return (__wt_vsize_uint((uint64_t)x));
+	/* Small negative values use the one- and two-byte forms. */
+	if (x >= NEG_1BYTE_MIN)
+		return (1);
+	if (x >= NEG_2BYTE_MIN)
+		return (2);
+	return (__wt_vsize_negint((uint64_t)x));
+}
diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h
new file mode 100644
index 00000000000..7c0a103a8ee
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/lint.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_GCC_ATTRIBUTE(x)
+#define WT_GCC_FUNC_ATTRIBUTE(x)
+
+#define __WT_ATOMIC_ADD(v, val) \
+ ((v) += (val))
+#define __WT_ATOMIC_FETCH_ADD(v, val) \
+ ((v) += (val), (v))
+#define __WT_ATOMIC_CAS(v, old, new) \
+ ((v) = ((v) == (old) ? (new) : (old)), (v) == (old))
+#define __WT_ATOMIC_CAS_VAL(v, old, new) \
+ ((v) = ((v) == (old) ? (new) : (old)), (v) == (old))
+#define __WT_ATOMIC_STORE(v, val) \
+ ((v) = (val))
+#define __WT_ATOMIC_SUB(v, val) \
+ ((v) -= (val), (v))
+
+#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val)
+
+#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val)
+
+#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val)
+
+#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val)
+
+static inline void WT_BARRIER(void) { return; }
+static inline void WT_FULL_BARRIER(void) { return; }
+static inline void WT_PAUSE(void) { return; }
+static inline void WT_READ_BARRIER(void) { return; }
+static inline void WT_WRITE_BARRIER(void) { return; }
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
new file mode 100644
index 00000000000..15054e34906
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -0,0 +1,177 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_LOG_FILENAME "WiredTigerLog" /* Log file name */
+
+/* Logging subsystem declarations. */
+#define LOG_ALIGN 128
+#define WT_LOG_SLOT_BUF_INIT_SIZE (64 * 1024) /* Parenthesized: safe in any expression */
+
+#define INIT_LSN(l) do { \
+ (l)->file = 1; \
+ (l)->offset = 0; \
+} while (0)
+
+#define IS_INIT_LSN(l) ((l)->file == 1 && (l)->offset == 0)
+
+/*
+ * Both of the macros below need to change if the content of __wt_lsn
+ * ever changes. The value is the following:
+ * txnid, record type, operation type, file id, operation key, operation value
+ */
+#define LOGC_KEY_FORMAT WT_UNCHECKED_STRING(IqI)
+#define LOGC_VALUE_FORMAT WT_UNCHECKED_STRING(qIIIuu)
+
+#define LOG_SKIP_HEADER(data) \
+ ((const uint8_t *)(data) + offsetof(WT_LOG_RECORD, record))
+#define LOG_REC_SIZE(size) \
+ ((size) - offsetof(WT_LOG_RECORD, record))
+
+#define MAX_LSN(l) do { \
+ (l)->file = UINT32_MAX; \
+ (l)->offset = INT64_MAX; \
+} while (0)
+
+/*
+ * Compare 2 LSNs by file, then by offset: return -1 if lsn1 < lsn2,
+ * 0 if lsn1 == lsn2 and 1 if lsn1 > lsn2.
+ */
+#define LOG_CMP(lsn1, lsn2) \
+ ((lsn1)->file != (lsn2)->file ? \
+ ((lsn1)->file < (lsn2)->file ? -1 : 1) : \
+ ((lsn1)->offset != (lsn2)->offset ? \
+ ((lsn1)->offset < (lsn2)->offset ? -1 : 1) : 0))
+
+/*
+ * Possible values for the consolidation array slot states:
+ * < WT_LOG_SLOT_DONE - threads are actively writing to the log.
+ * WT_LOG_SLOT_DONE - all activity on this slot is complete.
+ * WT_LOG_SLOT_FREE - slot is available for allocation.
+ * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active.
+ * WT_LOG_SLOT_READY - slot is ready for threads to join.
+ * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot.
+ */
+#define WT_LOG_SLOT_DONE 0
+#define WT_LOG_SLOT_FREE 1
+#define WT_LOG_SLOT_PENDING 2
+#define WT_LOG_SLOT_READY 3
+typedef struct {
+ int64_t slot_state; /* Slot state */
+ uint64_t slot_group_size; /* Group size */
+ int32_t slot_error; /* Error value */
+#define SLOT_INVALID_INDEX 0xffffffff
+ uint32_t slot_index; /* Active slot index */
+ wt_off_t slot_start_offset; /* Starting file offset */
+ WT_LSN slot_release_lsn; /* Slot release LSN */
+ WT_LSN slot_start_lsn; /* Slot starting LSN */
+ WT_LSN slot_end_lsn; /* Slot ending LSN */
+ WT_FH *slot_fh; /* File handle for this group */
+ WT_ITEM slot_buf; /* Buffer for grouped writes */
+ int32_t slot_churn; /* Active slots are scarce. */
+
+#define SLOT_BUF_GROW 0x01 /* Grow buffer on release */
+#define SLOT_BUFFERED 0x02 /* Buffer writes */
+#define SLOT_CLOSEFH 0x04 /* Close old fh on release */
+#define SLOT_SYNC 0x08 /* Needs sync on release */
+ uint32_t flags; /* Flags */
+} WT_LOGSLOT WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+typedef struct {
+ WT_LOGSLOT *slot;
+ wt_off_t offset;
+} WT_MYSLOT;
+
+ /* Offset of first record */
+#define LOG_FIRST_RECORD log->allocsize
+
+typedef struct {
+ uint32_t allocsize; /* Allocation alignment size */
+ wt_off_t log_written; /* Amount of log written this period */
+ /*
+ * Log file information
+ */
+ uint32_t fileid; /* Current log file number */
+ WT_FH *log_fh; /* Logging file handle */
+ WT_FH *log_close_fh; /* Logging file handle to close */
+
+ /*
+ * System LSNs
+ */
+ WT_LSN alloc_lsn; /* Next LSN for allocation */
+ WT_LSN ckpt_lsn; /* Last checkpoint LSN */
+ WT_LSN first_lsn; /* First LSN */
+ WT_LSN sync_lsn; /* LSN of the last sync */
+ WT_LSN trunc_lsn; /* End LSN for recovery truncation */
+ WT_LSN write_lsn; /* Last LSN written to log file */
+
+ /*
+ * Synchronization resources
+ */
+ WT_SPINLOCK log_lock; /* Locked: Logging fields */
+ WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */
+ WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */
+
+ WT_RWLOCK *log_archive_lock; /* Archive and log cursors */
+
+ /* Notify any waiting threads when sync_lsn is updated. */
+ WT_CONDVAR *log_sync_cond;
+
+ /*
+ * Consolidation array information
+ * SLOT_ACTIVE must be less than SLOT_POOL.
+ * Our testing shows that the more consolidation we generate the
+ * better the performance we see, which equates to an active
+ * slot count of one.
+ */
+#define SLOT_ACTIVE 1
+#define SLOT_POOL 16
+ uint32_t pool_index; /* Global pool index */
+ WT_LOGSLOT *slot_array[SLOT_ACTIVE]; /* Active slots */
+ WT_LOGSLOT slot_pool[SLOT_POOL]; /* Pool of all slots */
+
+#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */
+ uint32_t flags;
+} WT_LOG;
+
+typedef struct {
+ uint32_t len; /* 00-03: Record length including hdr */
+ uint32_t checksum; /* 04-07: Checksum of the record */
+ uint8_t unused[8]; /* 08-15: Padding */
+ uint8_t record[0]; /* Beginning of actual data */
+} WT_LOG_RECORD;
+
+/*
+ * WT_LOG_DESC --
+ * The log file's description.
+ */
+struct __wt_log_desc {
+#define WT_LOG_MAGIC 0x101064
+ uint32_t log_magic; /* 00-03: Magic number */
+#define WT_LOG_MAJOR_VERSION 1
+ uint16_t majorv; /* 04-05: Major version */
+#define WT_LOG_MINOR_VERSION 0
+ uint16_t minorv; /* 06-07: Minor version */
+ uint64_t log_size; /* 08-15: Log file size */
+};
+
+/*
+ * WT_LOG_REC_DESC --
+ * A descriptor for a log record type.
+ */
+struct __wt_log_rec_desc {
+ const char *fmt;
+ int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
+};
+
+/*
+ * WT_LOG_OP_DESC --
+ * A descriptor for a log operation type.
+ */
+struct __wt_log_op_desc {
+ const char *fmt;
+ int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
+};
diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h
new file mode 100644
index 00000000000..99532b97850
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/lsm.h
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_LSM_WORKER_COOKIE --
+ * State for an LSM worker thread.
+ */
+struct __wt_lsm_worker_cookie {
+ WT_LSM_CHUNK **chunk_array;
+ size_t chunk_alloc;
+ u_int nchunks;
+};
+
+/*
+ * WT_LSM_WORKER_ARGS --
+ * State for an LSM worker thread.
+ */
+struct __wt_lsm_worker_args {
+ WT_SESSION_IMPL *session; /* Session */
+ WT_CONDVAR *work_cond; /* Owned by the manager */
+ wt_thread_t tid; /* Thread id */
+ u_int id; /* My manager slot id */
+ uint32_t type; /* Types of operations handled */
+#define WT_LSM_WORKER_RUN 0x01
+ uint32_t flags; /* Worker flags */
+};
+
+/*
+ * WT_CURSOR_LSM --
+ * An LSM cursor.
+ */
+struct __wt_cursor_lsm {
+ WT_CURSOR iface;
+
+ WT_LSM_TREE *lsm_tree;
+ uint64_t dsk_gen;
+
+ u_int nchunks; /* Number of chunks in the cursor */
+ u_int nupdates; /* Updates needed (including
+ snapshot isolation checks). */
+ WT_BLOOM **blooms; /* Bloom filter handles. */
+ size_t bloom_alloc;
+
+ WT_CURSOR **cursors; /* Cursor handles. */
+ size_t cursor_alloc;
+
+ WT_CURSOR *current; /* The current cursor for iteration */
+ WT_LSM_CHUNK *primary_chunk; /* The current primary chunk */
+
+ uint64_t *switch_txn; /* Switch txn for each chunk */
+ size_t txnid_alloc;
+
+ u_int update_count; /* Updates performed. */
+
+#define WT_CLSM_ACTIVE 0x01 /* Incremented the session count */
+#define WT_CLSM_ITERATE_NEXT 0x02 /* Forward iteration */
+#define WT_CLSM_ITERATE_PREV 0x04 /* Backward iteration */
+#define WT_CLSM_MERGE 0x08 /* Merge cursor, don't update */
+#define WT_CLSM_MINOR_MERGE 0x10 /* Minor merge, include tombstones */
+#define WT_CLSM_MULTIPLE 0x20 /* Multiple cursors have values for the
+ current key */
+#define WT_CLSM_OPEN_READ 0x40 /* Open for reads */
+#define WT_CLSM_OPEN_SNAPSHOT 0x80 /* Open for snapshot isolation */
+ uint32_t flags;
+};
+
+/*
+ * WT_LSM_CHUNK --
+ * A single chunk (file) in an LSM tree.
+ */
+struct __wt_lsm_chunk {
+ const char *uri; /* Data source for this chunk */
+ const char *bloom_uri; /* URI of Bloom filter, if any */
+ struct timespec create_ts; /* Creation time (for rate limiting) */
+ uint64_t count; /* Approximate count of records */
+ uint64_t size; /* Final chunk size */
+
+ uint64_t switch_txn; /*
+ * Largest transaction that can write
+ * to this chunk, set by a worker
+ * thread when the chunk is switched
+ * out, or by compact to get the most
+ * recent chunk flushed.
+ */
+
+ uint32_t id; /* ID used to generate URIs */
+ uint32_t generation; /* Merge generation */
+ uint32_t refcnt; /* Number of worker thread references */
+ uint32_t bloom_busy; /* Number of worker thread references */
+
+ int8_t empty; /* 1/0: checkpoint missing */
+ int8_t evicted; /* 1/0: in-memory chunk was evicted */
+
+#define WT_LSM_CHUNK_BLOOM 0x01
+#define WT_LSM_CHUNK_MERGING 0x02
+#define WT_LSM_CHUNK_ONDISK 0x04
+#define WT_LSM_CHUNK_STABLE 0x08
+ uint32_t flags;
+} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+/*
+ * Different types of work units. Used by LSM worker threads to choose which
+ * type of work they will execute, and by work units to define which action
+ * is required.
+ */
+#define WT_LSM_WORK_BLOOM 0x01 /* Create a bloom filter */
+#define WT_LSM_WORK_DROP 0x02 /* Drop unused chunks */
+#define WT_LSM_WORK_FLUSH 0x04 /* Flush a chunk to disk */
+#define WT_LSM_WORK_MERGE 0x08 /* Look for a tree merge */
+#define WT_LSM_WORK_SWITCH 0x10 /* Switch to new in-memory chunk */
+
+/*
+ * WT_LSM_WORK_UNIT --
+ * A definition of maintenance that an LSM tree needs done.
+ */
+struct __wt_lsm_work_unit {
+ TAILQ_ENTRY(__wt_lsm_work_unit) q; /* Worker unit queue */
+ uint32_t type; /* Type of operation */
+#define WT_LSM_WORK_FORCE 0x0001 /* Force operation */
+ uint32_t flags; /* Flags for operation */
+ WT_LSM_TREE *lsm_tree;
+};
+
+/*
+ * WT_LSM_MANAGER --
+ * A structure that holds resources used to manage any LSM trees in a
+ * database.
+ */
+struct __wt_lsm_manager {
+ /*
+ * Queues of work units for LSM worker threads. We maintain three
+ * queues, to allow us to keep each queue FIFO, rather than needing
+ * to manage the order of work by shuffling the queue order.
+ * One queue for switches - since switches should never wait for other
+ * work to be done.
+ * One queue for application requested work. For example flushing
+ * and creating bloom filters.
+ * One queue that is for longer running operations such as merges.
+ */
+ TAILQ_HEAD(__wt_lsm_work_switch_qh, __wt_lsm_work_unit) switchqh;
+ TAILQ_HEAD(__wt_lsm_work_app_qh, __wt_lsm_work_unit) appqh;
+ TAILQ_HEAD(__wt_lsm_work_manager_qh, __wt_lsm_work_unit) managerqh;
+ WT_SPINLOCK switch_lock; /* Lock for switch queue */
+ WT_SPINLOCK app_lock; /* Lock for application queue */
+ WT_SPINLOCK manager_lock; /* Lock for manager queue */
+ WT_CONDVAR *work_cond; /* Used to notify worker of activity */
+ uint32_t lsm_workers; /* Current number of LSM workers */
+ uint32_t lsm_workers_max;
+#define WT_LSM_MAX_WORKERS 20
+ WT_LSM_WORKER_ARGS lsm_worker_cookies[WT_LSM_MAX_WORKERS];
+};
+
+/*
+ * WT_LSM_TREE --
+ * An LSM tree.
+ */
+struct __wt_lsm_tree {
+ const char *name, *config, *filename;
+ const char *key_format, *value_format;
+ const char *bloom_config, *file_config;
+
+ WT_COLLATOR *collator;
+ const char *collator_name;
+
+ int refcnt; /* Number of users of the tree */
+#define LSM_TREE_MAX_QUEUE 100
+ int queue_ref;
+ WT_RWLOCK *rwlock;
+ TAILQ_ENTRY(__wt_lsm_tree) q;
+
+ WT_DSRC_STATS stats; /* LSM-level statistics */
+
+ uint64_t dsk_gen;
+
+ long ckpt_throttle; /* Rate limiting due to checkpoints */
+ long merge_throttle; /* Rate limiting due to merges */
+ uint64_t chunk_fill_ms; /* Estimate of time to fill a chunk */
+ struct timespec last_flush_ts; /* Timestamp last flush finished */
+ struct timespec work_push_ts; /* Timestamp last work unit added */
+ uint64_t merge_progressing; /* Bumped when merges are active */
+ uint32_t merge_syncing; /* Bumped when merges are syncing */
+
+ /* Configuration parameters */
+ uint32_t bloom_bit_count;
+ uint32_t bloom_hash_count;
+ uint64_t chunk_size;
+ uint64_t chunk_max;
+ u_int merge_min, merge_max;
+
+ u_int merge_idle; /* Count of idle merge threads */
+
+#define WT_LSM_BLOOM_MERGED 0x00000001
+#define WT_LSM_BLOOM_OFF 0x00000002
+#define WT_LSM_BLOOM_OLDEST 0x00000004
+ uint32_t bloom; /* Bloom creation policy */
+
+ WT_LSM_CHUNK **chunk; /* Array of active LSM chunks */
+ size_t chunk_alloc; /* Space allocated for chunks */
+ u_int nchunks; /* Number of active chunks */
+ uint32_t last; /* Last allocated ID */
+ int modified; /* Have there been updates? */
+
+ WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */
+ size_t old_alloc; /* Space allocated for old chunks */
+ u_int nold_chunks; /* Number of old chunks */
+ int freeing_old_chunks; /* Whether chunks are being freed */
+ uint32_t merge_aggressiveness; /* Increase amount of work per merge */
+
+#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */
+#define WT_LSM_TREE_COMPACTING 0x02 /* Tree being compacted */
+#define WT_LSM_TREE_NEED_SWITCH 0x04 /* New chunk needs creating */
+#define WT_LSM_TREE_OPEN 0x08 /* The tree is open */
+#define WT_LSM_TREE_THROTTLE 0x10 /* Throttle updates */
+ uint32_t flags;
+
+#define WT_LSM_TREE_EXCLUSIVE 0x01 /* Tree is opened exclusively */
+ uint8_t flags_atomic;
+};
+
+/*
+ * WT_LSM_DATA_SOURCE --
+ * Implementation of the WT_DATA_SOURCE interface for LSM.
+ */
+struct __wt_lsm_data_source {
+ WT_DATA_SOURCE iface;
+
+ WT_RWLOCK *rwlock;
+};
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
new file mode 100644
index 00000000000..e4d7fd64f94
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_WIREDTIGER "WiredTiger" /* Version file */
+#define WT_SINGLETHREAD "WiredTiger.lock" /* Locking file */
+
+#define WT_BASECONFIG "WiredTiger.basecfg" /* Configuration */
+#define WT_USERCONFIG "WiredTiger.config" /* Configuration */
+
+#define WT_METADATA_BACKUP "WiredTiger.backup" /* Hot backup file */
+
+#define WT_METADATA_TURTLE "WiredTiger.turtle" /* Metadata metadata */
+#define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */
+
+#define WT_METADATA_URI "metadata:" /* Metadata alias */
+#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */
+#define WT_IS_METADATA(dh) \
+ (strcmp((dh)->name, WT_METAFILE_URI) == 0)
+#define WT_METAFILE_ID 0 /* Metadata file ID */
+
+#define WT_METADATA_VERSION "WiredTiger version" /* Version keys */
+#define WT_METADATA_VERSION_STR "WiredTiger version string"
+
+/*
+ * WT_CKPT --
+ * Encapsulation of checkpoint information, shared by the metadata, the
+ * btree engine, and the block manager.
+ */
+#define WT_CHECKPOINT "WiredTigerCheckpoint"
+/*
+ * WT_CKPT_FOREACH --
+ * Walk an array of WT_CKPT structures starting at ckptbase, leaving ckpt
+ * pointing at each entry in turn. The walk stops at the first entry whose
+ * name field is NULL, so the array must end with such an entry.
+ */
+#define WT_CKPT_FOREACH(ckptbase, ckpt) \
+ for ((ckpt) = (ckptbase); (ckpt)->name != NULL; ++(ckpt))
+
+struct __wt_ckpt {
+ char *name; /* Name or NULL */
+
+ WT_ITEM addr; /* Checkpoint cookie string */
+ WT_ITEM raw; /* Checkpoint cookie raw */
+
+ int64_t order; /* Checkpoint order */
+
+ uintmax_t sec; /* Timestamp */
+
+ uint64_t ckpt_size; /* Checkpoint size */
+
+ uint64_t write_gen; /* Write generation */
+
+ void *bpriv; /* Block manager private */
+
+#define WT_CKPT_ADD 0x01 /* Checkpoint to be added */
+#define WT_CKPT_DELETE 0x02 /* Checkpoint to be deleted */
+#define WT_CKPT_FAKE 0x04 /* Checkpoint is a fake */
+#define WT_CKPT_UPDATE 0x08 /* Checkpoint requires update */
+ uint32_t flags;
+};
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
new file mode 100644
index 00000000000..bf2c4ccb8cf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -0,0 +1,221 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Quiet compiler warnings about unused function parameters and variables,
+ * and unused function return values.
+ */
+#define WT_UNUSED(var) (void)(var)
+
+/* Basic constants. */
+#define WT_MILLION (1000000)
+#define WT_BILLION (1000000000)
+
+#define WT_KILOBYTE (1024)
+#define WT_MEGABYTE (1048576)
+#define WT_GIGABYTE (1073741824)
+#define WT_TERABYTE ((uint64_t)1099511627776)
+#define WT_PETABYTE ((uint64_t)1125899906842624)
+
+/*
+ * Number of directory entries can grow dynamically.
+ */
+#define WT_DIR_ENTRY 32
+
+#define WT_DIRLIST_EXCLUDE 0x1 /* Exclude files matching prefix */
+#define WT_DIRLIST_INCLUDE 0x2 /* Include files matching prefix */
+
+/*
+ * Sizes that cannot be larger than 2**32 are stored in uint32_t fields in
+ * common structures to save space. To minimize conversions from size_t to
+ * uint32_t through the code, we use the following macros.
+ */
+#define WT_STORE_SIZE(s) ((uint32_t)(s))
+#define WT_PTRDIFF(end, begin) \
+ ((size_t)((uint8_t *)(end) - (uint8_t *)(begin)))
+#define WT_PTRDIFF32(end, begin) \
+ WT_STORE_SIZE(WT_PTRDIFF((end), (begin)))
+#define WT_BLOCK_FITS(p, len, begin, maxlen) \
+ ((uint8_t *)(p) >= (uint8_t *)(begin) && \
+ ((uint8_t *)(p) + (len) <= (uint8_t *)(begin) + (maxlen)))
+#define WT_PTR_IN_RANGE(p, begin, maxlen) \
+ WT_BLOCK_FITS((p), 1, (begin), (maxlen))
+
+/*
+ * Align an unsigned value of any type to a specified power-of-2, including the
+ * offset result of a pointer subtraction; do the calculation using the largest
+ * unsigned integer type available.
+ */
+#define WT_ALIGN(n, v) \
+ ((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1))
+
+/* Min, max. */
+#define WT_MIN(a, b) ((a) < (b) ? (a) : (b))
+#define WT_MAX(a, b) ((a) < (b) ? (b) : (a))
+
+/*
+ * Elements in an array.  The argument must be a real array (not a pointer);
+ * it is parenthesized before being indexed so that an argument which is
+ * itself an expression is indexed as a whole.
+ */
+#define	WT_ELEMENTS(a)	(sizeof(a) / sizeof((a)[0]))
+
+/* 10 level skip lists, 1/4 have a link to the next element. */
+#define WT_SKIP_MAXDEPTH 10
+#define WT_SKIP_PROBABILITY (UINT32_MAX >> 2)
+
+/*
+ * __wt_calloc_def --
+ * Simple calls don't need separate sizeof arguments.
+ */
+#define __wt_calloc_def(session, number, addr) \
+ __wt_calloc(session, (size_t)(number), sizeof(**(addr)), addr)
+
+/*
+ * __wt_realloc_def --
+ * Common case allocate-and-grow function.
+ * Starts by allocating the requested number of items (at least 10), then
+ * doubles each time the list needs to grow.
+ */
+#define __wt_realloc_def(session, sizep, number, addr) \
+ (((number) * sizeof(**(addr)) <= *(sizep)) ? 0 : \
+ __wt_realloc(session, sizep, WT_MAX(*(sizep) * 2, \
+ WT_MAX(10, (number)) * sizeof(**(addr))), addr))
+/*
+ * Our internal free function clears the underlying address atomically so there
+ * is a smaller chance of racing threads seeing intermediate results while a
+ * structure is being free'd. (That would be a bug, of course, but I'd rather
+ * not drop core, just the same.) That's a non-standard "free" API, and the
+ * resulting bug is a mother to find -- make sure we get it right, don't make
+ * the caller remember to put the & operator on the pointer.
+ */
+#define __wt_free(session, p) do { \
+ if ((p) != NULL) \
+ __wt_free_int(session, (void *)&(p)); \
+} while (0)
+#ifdef HAVE_DIAGNOSTIC
+#define __wt_overwrite_and_free(session, p) do { \
+ memset(p, WT_DEBUG_BYTE, sizeof(*(p))); \
+ __wt_free(session, p); \
+} while (0)
+#define __wt_overwrite_and_free_len(session, p, len) do { \
+ memset(p, WT_DEBUG_BYTE, len); \
+ __wt_free(session, p); \
+} while (0)
+#else
+#define __wt_overwrite_and_free(session, p) __wt_free(session, p)
+#define __wt_overwrite_and_free_len(session, p, len) __wt_free(session, p)
+#endif
+
+/*
+ * Flag set, clear and test.
+ *
+ * They come in 3 flavors: F_XXX (handles a field named "flags" in the structure
+ * referenced by its argument), LF_XXX (handles a local variable named "flags"),
+ * and FLD_XXX (handles any variable, anywhere).
+ *
+ * Flags are unsigned 32-bit values -- we cast to keep the compiler quiet (the
+ * hex constant might be a negative integer), and to ensure the hex constant is
+ * the correct size before applying the bitwise not operator.
+ */
+#define F_CLR(p, mask) ((p)->flags &= ~((uint32_t)(mask)))
+#define F_ISSET(p, mask) ((p)->flags & ((uint32_t)(mask)))
+#define F_SET(p, mask) ((p)->flags |= ((uint32_t)(mask)))
+
+#define LF_CLR(mask) ((flags) &= ~((uint32_t)(mask)))
+#define LF_ISSET(mask) ((flags) & ((uint32_t)(mask)))
+#define LF_SET(mask) ((flags) |= ((uint32_t)(mask)))
+
+#define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask)))
+#define FLD_ISSET(field, mask) ((field) & ((uint32_t)(mask)))
+#define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask)))
+
+/* Verbose messages. */
+#ifdef HAVE_VERBOSE
+#define WT_VERBOSE_ISSET(session, f) \
+ (FLD_ISSET(S2C(session)->verbose, f))
+#else
+#define WT_VERBOSE_ISSET(session, f) 0
+#endif
+
+/*
+ * Clear a structure, two flavors: inline when we want to guarantee there's
+ * no function call or setup/tear-down of a loop, and the default where the
+ * compiler presumably chooses. Gcc 4.3 is supposed to get this right, but
+ * we've seen problems when calling memset to clear structures in performance
+ * critical paths.
+ */
+#define WT_CLEAR_INLINE(type, s) do { \
+ static const type __clear; \
+ s = __clear; \
+} while (0)
+#define WT_CLEAR(s) \
+ memset(&(s), 0, sizeof(s))
+
+/*
+ * Check if a string matches a prefix.  The first bytes are compared inline
+ * before falling back to strncmp.  Both arguments are evaluated more than
+ * once, so they must not have side effects; they are parenthesized before
+ * being cast so the cast applies to the whole argument expression.
+ */
+#define	WT_PREFIX_MATCH(str, pfx) \
+	(((const char *)(str))[0] == ((const char *)(pfx))[0] && \
+	    strncmp((str), (pfx), strlen(pfx)) == 0)
+
+/* Check if a non-nul-terminated string matches a prefix. */
+#define WT_PREFIX_MATCH_LEN(str, len, pfx) \
+ ((len) >= strlen(pfx) && WT_PREFIX_MATCH(str, pfx))
+
+/* Check if a string matches a prefix, and move past it. */
+#define WT_PREFIX_SKIP(str, pfx) \
+ (WT_PREFIX_MATCH(str, pfx) ? ((str) += strlen(pfx), 1) : 0)
+
+/*
+ * Check if a variable string equals a constant string. Inline the common
+ * case for WiredTiger of a single byte string. This is required because not
+ * all compilers optimize this case in strcmp (e.g., clang).
+ */
+#define WT_STREQ(s, cs) \
+ (sizeof(cs) == 2 ? (s)[0] == (cs)[0] && (s)[1] == '\0' : \
+ strcmp(s, cs) == 0)
+
+/*
+ * Check if a string matches a byte string of len bytes: the nul-terminated
+ * str must be exactly len bytes long and equal to the first len bytes of
+ * bytes.  The arguments are evaluated more than once and are parenthesized
+ * before being cast so the cast applies to the whole argument expression.
+ */
+#define	WT_STRING_MATCH(str, bytes, len) \
+	(((const char *)(str))[0] == ((const char *)(bytes))[0] && \
+	    strncmp((str), (bytes), (len)) == 0 && (str)[(len)] == '\0')
+
+/*
+ * Macro that produces a string literal that isn't wrapped in quotes, to avoid
+ * tripping up spell checkers.
+ */
+#define WT_UNCHECKED_STRING(str) #str
+
+/* Function return value and scratch buffer declaration and initialization. */
+#define WT_DECL_ITEM(i) WT_ITEM *i = NULL
+#define WT_DECL_RET int ret = 0
+
+/* If a WT_ITEM data field points somewhere in its allocated memory. */
+#define WT_DATA_IN_ITEM(i) \
+ ((i)->mem != NULL && (i)->data >= (i)->mem && \
+ WT_PTRDIFF((i)->data, (i)->mem) < (i)->memsize)
+
+/* Copy the data and size fields of an item. */
+#define WT_ITEM_SET(dst, src) do { \
+ (dst).data = (src).data; \
+ (dst).size = (src).size; \
+} while (0)
+
+/*
+ * In diagnostic mode we track the locations from which hazard pointers and
+ * scratch buffers were acquired.
+ */
+#ifdef HAVE_DIAGNOSTIC
+#define __wt_scr_alloc(session, size, scratchp) \
+ __wt_scr_alloc_func(session, size, scratchp, __FILE__, __LINE__)
+#define __wt_page_in(session, ref, flags) \
+ __wt_page_in_func(session, ref, flags, __FILE__, __LINE__)
+#define __wt_page_swap(session, held, want, flags) \
+ __wt_page_swap_func(session, held, want, flags, __FILE__, __LINE__)
+#else
+#define __wt_scr_alloc(session, size, scratchp) \
+ __wt_scr_alloc_func(session, size, scratchp)
+#define __wt_page_in(session, ref, flags) \
+ __wt_page_in_func(session, ref, flags)
+#define __wt_page_swap(session, held, want, flags) \
+ __wt_page_swap_func(session, held, want, flags)
+#endif
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
new file mode 100644
index 00000000000..73caed09c8c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_verbose --
+ *	Verbose message.
+ *
+ *	When verbose support is compiled in (HAVE_VERBOSE) and the flag is set
+ *	in S2C(session)->verbose, format the message and hand it to
+ *	__wt_eventv, returning that function's result; otherwise do nothing
+ *	and return 0.
+ *
+ *	The format string is the third parameter and the variable arguments
+ *	start at the fourth, which is what the printf format attribute must
+ *	describe for the compiler to check callers correctly.
+ */
+static inline int
+__wt_verbose(WT_SESSION_IMPL *session, int flag, const char *fmt, ...)
+    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+#ifdef HAVE_VERBOSE
+	WT_DECL_RET;
+	va_list ap;
+
+	if (WT_VERBOSE_ISSET(session, flag)) {
+		va_start(ap, fmt);
+		ret = __wt_eventv(session, 1, 0, NULL, 0, fmt, ap);
+		va_end(ap);
+	}
+	return (ret);
+#else
+	/* Verbose messages are compiled out: quiet unused-parameter warnings. */
+	WT_UNUSED(session);
+	WT_UNUSED(fmt);
+	WT_UNUSED(flag);
+	return (0);
+#endif
+}
diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h
new file mode 100644
index 00000000000..8f44a329940
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/msvc.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+#include <intrin.h>
+
+#ifndef _M_AMD64
+#error "Only x64 is supported with MSVC"
+#endif
+
+#define inline __inline
+
+#define WT_GCC_ATTRIBUTE(x)
+#define WT_GCC_FUNC_ATTRIBUTE(x)
+
+#define __WT_ATOMIC_ADD(v, val, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedExchangeAdd ## s((t*)&(v), (t)(val)) + (val))
+#define __WT_ATOMIC_CAS(v, old, new, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedCompareExchange ## s \
+ ((t*)&(v), (t)(new), (t)(old)) == (t)(old))
+#define __WT_ATOMIC_CAS_VAL(v, old, new, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedCompareExchange ## s((t*)&(v), (t)(new), (t)(old)))
+#define __WT_ATOMIC_STORE(v, val, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedExchange ## s((t*)&(v), (t)(val)))
+/*
+ * __WT_ATOMIC_SUB --
+ *	Atomically subtract val from v by adding its negation, then adjust the
+ *	intrinsic's return value by val as well.  The whole val argument is
+ *	parenthesized before it is cast and negated, so an expression argument
+ *	(for example "a + b") is negated as a unit.
+ */
+#define	__WT_ATOMIC_SUB(v, val, n, s, t) \
+	(WT_STATIC_ASSERT(sizeof(v) == (n)), \
+	_InterlockedExchangeAdd ## s((t*)&(v), -(t)(val)) - (val))
+
+#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1, 8, char)
+#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1, 8, char)
+#define WT_ATOMIC_CAS_VAL1(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 1, 8, char)
+#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1, 8, char)
+#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1, 8, char)
+
+#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2, 16, short)
+#define WT_ATOMIC_CAS2(v, old, new) \
+ __WT_ATOMIC_CAS(v, old, new, 2, 16, short)
+#define WT_ATOMIC_CAS_VAL2(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 2, 16, short)
+#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2, 16, short)
+#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2, 16, short)
+
+#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4, , long)
+#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4, , long)
+#define WT_ATOMIC_CAS_VAL4(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 4, , long)
+#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4, , long)
+#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4, , long)
+
+#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8, 64, __int64)
+#define WT_ATOMIC_CAS8(v, old, new) \
+ __WT_ATOMIC_CAS(v, old, new, 8, 64, __int64)
+#define WT_ATOMIC_CAS_VAL8(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 8, 64, __int64)
+#define WT_ATOMIC_STORE8(v, val) \
+ __WT_ATOMIC_STORE(v, val, 8, 64, __int64)
+#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8, 64, __int64)
+
+/*
+ * Barrier and pause primitives, each a thin wrapper around one MSVC/x64
+ * intrinsic.
+ */
+/* WT_BARRIER: _ReadWriteBarrier (documented by MSVC as a compiler barrier). */
+static inline void WT_BARRIER(void) { _ReadWriteBarrier(); }
+/* WT_FULL_BARRIER: _mm_mfence, the x86 MFENCE instruction. */
+static inline void WT_FULL_BARRIER(void) { _mm_mfence(); }
+/* WT_PAUSE: _mm_pause, the x86 PAUSE instruction, for use in spin loops. */
+static inline void WT_PAUSE(void) { _mm_pause(); }
+/* WT_READ_BARRIER: _mm_lfence, the x86 LFENCE instruction. */
+static inline void WT_READ_BARRIER(void) { _mm_lfence(); }
+/* WT_WRITE_BARRIER: _mm_sfence, the x86 SFENCE instruction. */
+static inline void WT_WRITE_BARRIER(void) { _mm_sfence(); }
diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h
new file mode 100644
index 00000000000..b71496dd595
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/mutex.h
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Condition variables:
+ *
+ * WiredTiger uses condition variables to signal between threads, and for
+ * locking operations that are expected to block.
+ */
+struct __wt_condvar {
+ const char *name; /* Mutex name for debugging */
+
+ wt_mutex_t mtx; /* Mutex */
+ wt_cond_t cond; /* Condition variable */
+
+ int waiters; /* Number of waiters, or
+ -1 if signalled with no waiters. */
+};
+
+/*
+ * Read/write locks:
+ *
+ * WiredTiger uses read/write locks for shared/exclusive access to resources.
+ */
+struct __wt_rwlock {
+ const char *name; /* Lock name for debugging */
+
+ wt_rwlock_t rwlock; /* Read/write lock */
+};
+
+/*
+ * Spin locks:
+ *
+ * WiredTiger uses spinlocks for fast mutual exclusion (where operations done
+ * while holding the spin lock are expected to complete in a small number of
+ * instructions).
+ */
+#define SPINLOCK_GCC 0
+#define SPINLOCK_PTHREAD_MUTEX 1
+#define SPINLOCK_PTHREAD_MUTEX_ADAPTIVE 2
+#define SPINLOCK_PTHREAD_MUTEX_LOGGING 3
+#define SPINLOCK_MSVC 4
+
+#if SPINLOCK_TYPE == SPINLOCK_GCC
+
+typedef volatile int
+ WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
+ SPINLOCK_TYPE == SPINLOCK_MSVC ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+typedef struct {
+ wt_mutex_t lock;
+
+ uint64_t counter; /* Statistics: counter */
+
+ const char *name; /* Statistics: mutex name */
+ int8_t id; /* Statistics: current holder ID */
+
+ int8_t initialized; /* Lock initialized, for cleanup */
+} WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+#else
+
+#error Unknown spinlock type
+
+#endif
diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i
new file mode 100644
index 00000000000..0d5a8586051
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/mutex.i
@@ -0,0 +1,368 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Spin locks:
+ *
+ * These are used for cases where fast mutual exclusion is needed (where
+ * operations done while holding the spin lock are expected to complete in a
+ * small number of instructions).
+ */
+
+#if SPINLOCK_TYPE == SPINLOCK_GCC
+
+#define WT_DECL_SPINLOCK_ID(i)
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock)
+
+/* Default to spinning 1000 times before yielding. */
+#ifndef WT_SPIN_COUNT
+#define WT_SPIN_COUNT 1000
+#endif
+
+/*
+ * __wt_spin_init --
+ *	Put a spinlock into its unlocked (zero) state.  The session and name
+ *	arguments are not used by this implementation; always returns 0.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+	WT_UNUSED(name);
+	WT_UNUSED(session);
+
+	*t = 0;
+
+	return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ *	Discard a spinlock: there is nothing to release in this
+ *	implementation, the lock word is simply reset to zero.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_UNUSED(session);
+
+	*t = 0;
+}
+
+/*
+ * __wt_spin_trylock_func --
+ *	Make a single attempt to take a spinlock: returns 0 if the lock was
+ *	acquired and EBUSY if it was already held.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_UNUSED(session);
+
+	/* A non-zero previous value means the lock was already taken. */
+	if (__sync_lock_test_and_set(t, 1) != 0)
+		return (EBUSY);
+	return (0);
+}
+
+/*
+ * __wt_spin_lock --
+ * Spin until the lock is acquired.
+ *
+ * Each failed atomic test-and-set is followed by up to WT_SPIN_COUNT
+ * WT_PAUSE calls while the lock word still reads non-zero; if it is still
+ * non-zero after that, __wt_yield is called before the test-and-set is
+ * retried.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ int i;
+
+ WT_UNUSED(session);
+
+ /* A zero return from the test-and-set means we now hold the lock. */
+ while (__sync_lock_test_and_set(t, 1)) {
+ /* Wait using plain reads of the lock word, not atomic operations. */
+ for (i = 0; *t && i < WT_SPIN_COUNT; i++)
+ WT_PAUSE();
+ /* Still held after the spin budget: yield before trying again. */
+ if (*t)
+ __wt_yield();
+ }
+}
+
+/*
+ * __wt_spin_unlock --
+ *	Give up a spinlock held by the caller.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	/* The session isn't needed to release the lock. */
+	WT_UNUSED(session);
+
+	__sync_lock_release(t);
+}
+
+#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * __wt_spin_init --
+ *	Initialize a spinlock.
+ *
+ *	Creates the underlying pthread mutex (as an adaptive mutex when so
+ *	configured), records the lock's name and marks it initialized; when
+ *	lock logging is configured the lock is also registered.  Failures are
+ *	passed back to the caller through WT_RET; returns 0 on success.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE
+	pthread_mutexattr_t attr;
+	WT_DECL_RET;
+
+	WT_RET(pthread_mutexattr_init(&attr));
+	if ((ret = pthread_mutexattr_settype(
+	    &attr, PTHREAD_MUTEX_ADAPTIVE_NP)) == 0)
+		ret = pthread_mutex_init(&t->lock, &attr);
+
+	/*
+	 * The attributes object is only needed while the mutex is created:
+	 * destroy it on the success and failure paths alike so it isn't
+	 * leaked.
+	 */
+	(void)pthread_mutexattr_destroy(&attr);
+	WT_RET(ret);
+#else
+	WT_RET(pthread_mutex_init(&t->lock, NULL));
+#endif
+
+	t->name = name;
+	t->initialized = 1;
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+	WT_RET(__wt_spin_lock_register_lock(session, t));
+#endif
+
+	/* The session is only used when lock logging is configured. */
+	WT_UNUSED(session);
+	return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ *	Tear down a spinlock: unregister it when lock logging is configured,
+ *	then destroy the underlying mutex if (and only if) it was initialized.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_UNUSED(session);
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+	__wt_spin_lock_unregister_lock(session, t);
+#endif
+	/* Nothing more to do for a lock that was never initialized. */
+	if (!t->initialized)
+		return;
+
+	(void)pthread_mutex_destroy(&t->lock);
+	t->initialized = 0;
+}
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE
+
+#define WT_DECL_SPINLOCK_ID(i)
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock)
+
+/*
+ * __wt_spin_trylock_func --
+ *	Make a single attempt to take a spinlock, returning whatever
+ *	pthread_mutex_trylock returns for the underlying mutex.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	int ret;
+
+	WT_UNUSED(session);
+
+	ret = pthread_mutex_trylock(&t->lock);
+	return (ret);
+}
+
+/*
+ * __wt_spin_lock --
+ *	Acquire a spinlock, blocking in pthread_mutex_lock until the
+ *	underlying mutex is available.  The call's return value is ignored.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_UNUSED(session);
+
+	(void)pthread_mutex_lock(&t->lock);
+}
+
+#endif
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * When logging statistics, we track which spinlocks block and why.
+ */
+#define WT_DECL_SPINLOCK_ID(i) \
+ static int i = WT_SPINLOCK_REGISTER
+#define WT_SPINLOCK_REGISTER -1
+#define WT_SPINLOCK_REGISTER_FAILED -2
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock, idp, __FILE__, __LINE__)
+#define __wt_spin_lock(session, lock) do { \
+ WT_DECL_SPINLOCK_ID(__id); \
+ __wt_spin_lock_func(session, lock, &__id, __FILE__, __LINE__); \
+} while (0)
+
+/*
+ * __wt_spin_trylock_func --
+ *	Try to lock a spinlock or fail immediately if it is busy.
+ *
+ *	Returns the result of pthread_mutex_trylock, or an error from caller
+ *	registration passed back through WT_RET.  When statistics are being
+ *	maintained, blocking counts are updated on failure and the caller's
+ *	ID is recorded as the lock holder on success.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session,
+    WT_SPINLOCK *t, int *idp, const char *file, int line)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	conn = S2C_SAFE(session);
+	/* If we're not maintaining statistics, it's simple. */
+	if (session == NULL || !FLD_ISSET(conn->stat_flags, WT_STAT_CONN_FAST))
+		return (pthread_mutex_trylock(&t->lock));
+
+	/*
+	 * If this caller hasn't yet registered, do so.  The caller's location
+	 * ID is a static offset into a per-connection structure, and that has
+	 * problems: first, if there are multiple connections, we'll need to
+	 * hold some kind of lock to avoid racing when setting that value, and
+	 * second, if/when there are multiple connections and/or a single
+	 * connection is closed and re-opened, the variable may be initialized
+	 * and the underlying connection information may not.  Check both.
+	 *
+	 * Only index the per-connection array with a non-negative ID: the ID
+	 * can hold a negative marker value other than WT_SPINLOCK_REGISTER
+	 * (see WT_SPINLOCK_REGISTER_FAILED, and the test for a non-negative
+	 * ID below), which must never be used as an array offset.
+	 */
+	if (*idp == WT_SPINLOCK_REGISTER ||
+	    (*idp >= 0 && conn->spinlock_block[*idp].name == NULL))
+		WT_RET(__wt_spin_lock_register_caller(
+		    session, t->name, file, line, idp));
+
+	/*
+	 * Try to acquire the mutex: on failure, update blocking statistics, on
+	 * success, set our ID as the mutex holder.
+	 *
+	 * Note the race between acquiring the lock and setting our ID as the
+	 * holder, this can appear in the output as mutexes blocking in ways
+	 * that can't actually happen (although still an indicator of a mutex
+	 * that's busier than we'd like).
+	 */
+	if ((ret = pthread_mutex_trylock(&t->lock)) == 0)
+		t->id = *idp;
+	else
+		if (*idp >= 0) {
+			++conn->spinlock_block[*idp].total;
+			if (t->id >= 0)
+				++conn->spinlock_block[*idp].blocked[t->id];
+		}
+
+	/* Update the mutex counter and flush to minimize the windows. */
+	++t->counter;
+	WT_FULL_BARRIER();
+	return (ret);
+}
+
+/*
+ * __wt_spin_lock_func --
+ *	Spin until the lock is acquired.
+ *
+ *	Without a session, or when fast connection statistics are off, this
+ *	is a plain pthread_mutex_lock.  Otherwise a try-lock is made first so
+ *	that blocking statistics are recorded, and only if that fails does
+ *	the caller wait on the mutex.
+ */
+static inline void
+__wt_spin_lock_func(WT_SESSION_IMPL *session,
+    WT_SPINLOCK *t, int *idp, const char *file, int line)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	/*
+	 * Look up the connection the same way the try-lock function does; it
+	 * is only dereferenced after the session is known to be non-NULL.
+	 */
+	conn = S2C_SAFE(session);
+
+	/* If we're not maintaining statistics, it's simple. */
+	if (session == NULL ||
+	    !FLD_ISSET(conn->stat_flags, WT_STAT_CONN_FAST)) {
+		pthread_mutex_lock(&t->lock);
+		return;
+	}
+
+	/* Try to acquire the mutex. */
+	if (__wt_spin_trylock_func(session, t, idp, file, line) == 0)
+		return;
+
+	/*
+	 * On failure, wait on the mutex; once acquired, set our ID as the
+	 * holder and flush to minimize the windows.
+	 */
+	pthread_mutex_lock(&t->lock);
+	t->id = *idp;
+	WT_FULL_BARRIER();
+}
+
+#endif
+
+/*
+ * __wt_spin_unlock --
+ *	Give up a spinlock by unlocking the underlying pthread mutex.  The
+ *	call's return value is ignored.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_UNUSED(session);
+
+	(void)pthread_mutex_unlock(&t->lock);
+}
+
+#elif SPINLOCK_TYPE == SPINLOCK_MSVC
+
+#define WT_DECL_SPINLOCK_ID(i)
+#define WT_SPINLOCK_REGISTER -1
+#define WT_SPINLOCK_REGISTER_FAILED -2
+
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock)
+
+/*
+ * __wt_spin_init --
+ *	Set up a spinlock as a Windows critical section with a spin count of
+ *	4000.  The session and name arguments are not used; always returns 0.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+	WT_UNUSED(name);
+	WT_UNUSED(session);
+
+	/* The call's return value is not checked. */
+	(void)InitializeCriticalSectionAndSpinCount(&t->lock, 4000);
+
+	return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ *	Tear down a spinlock by deleting its critical section.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	/* The session isn't needed to delete the critical section. */
+	WT_UNUSED(session);
+
+	DeleteCriticalSection(&t->lock);
+}
+
+/*
+ * __wt_spin_trylock_func --
+ *	Try to lock a spinlock or fail immediately if it is busy: returns 0
+ *	if the critical section was entered and EBUSY otherwise.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	/*
+	 * Declare locals before the first statement: MSVC releases that
+	 * compile C as C89 reject declarations that follow statements.
+	 */
+	BOOL b;
+
+	WT_UNUSED(session);
+
+	b = TryEnterCriticalSection(&t->lock);
+	return (b == 0 ? EBUSY : 0);
+}
+
+/*
+ * __wt_spin_lock --
+ *	Acquire a spinlock by entering its critical section, waiting until it
+ *	is available.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	/* The session isn't needed to enter the critical section. */
+	WT_UNUSED(session);
+
+	EnterCriticalSection(&t->lock);
+}
+
+/*
+ * __wt_spin_unlock --
+ *	Give up a spinlock by leaving its critical section.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	/* The session isn't needed to leave the critical section. */
+	WT_UNUSED(session);
+
+	LeaveCriticalSection(&t->lock);
+}
+
+#else
+
+#error Unknown spinlock type
+
+#endif
diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h
new file mode 100644
index 00000000000..846249294fe
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/os.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_SYSCALL_RETRY --
+ * Evaluate "call" up to 10 times, storing the outcome in "ret".
+ *
+ * If call evaluates to 0, ret is set to 0 and the loop ends. Otherwise ret
+ * is set from __wt_errno(): a value of 0 is replaced by WT_ERROR and ends
+ * the loop; EAGAIN, EBUSY, EINTR, EIO, EMFILE, ENFILE and ENOSPC cause a
+ * call to __wt_sleep(0L, 500000L) followed by another attempt; any other
+ * value ends the loop. If every attempt fails with one of the retried
+ * values, ret is left holding the last of them.
+ *
+ * The call expression is evaluated once per attempt.
+ */
+#define WT_SYSCALL_RETRY(call, ret) do { \
+ int __retry; \
+ for (__retry = 0; __retry < 10; ++__retry) { \
+ if ((call) == 0) { \
+ (ret) = 0; \
+ break; \
+ } \
+ switch ((ret) = __wt_errno()) { \
+ case 0: \
+ /* The call failed but didn't set errno. */ \
+ (ret) = WT_ERROR; \
+ break; \
+ case EAGAIN: \
+ case EBUSY: \
+ case EINTR: \
+ case EIO: \
+ case EMFILE: \
+ case ENFILE: \
+ case ENOSPC: \
+ __wt_sleep(0L, 500000L); \
+ continue; \
+ default: \
+ break; \
+ } \
+ break; \
+ } \
+} while (0)
+
+#define WT_TIMEDIFF(end, begin) \
+ (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) + \
+ (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec)
+/*
+ * WT_TIMECMP --
+ *	Compare two struct timespec values: evaluates to -1 if t1 is earlier
+ *	than t2, 0 if they are equal and 1 if t1 is later.  Seconds are
+ *	compared first, nanoseconds only break ties.  Both arguments are
+ *	parenthesized before their fields are accessed, and both are
+ *	evaluated more than once.
+ */
+#define	WT_TIMECMP(t1, t2) \
+	((t1).tv_sec < (t2).tv_sec ? -1 : \
+	    (t1).tv_sec == (t2).tv_sec ? \
+	    (t1).tv_nsec < (t2).tv_nsec ? -1 : \
+	    (t1).tv_nsec == (t2).tv_nsec ? 0 : 1 : 1)
+
+struct __wt_fh {
+ char *name; /* File name */
+ TAILQ_ENTRY(__wt_fh) q; /* List of open handles */
+
+ u_int ref; /* Reference count */
+
+#ifndef _WIN32
+ int fd; /* POSIX file handle */
+#else
+ HANDLE filehandle; /* Windows file handle */
+ HANDLE filehandle_secondary; /* Windows file handle
+ for file size changes */
+#endif
+ wt_off_t size; /* File size */
+ wt_off_t extend_size; /* File extended size */
+ wt_off_t extend_len; /* File extend chunk size */
+
+ int direct_io; /* O_DIRECT configured */
+
+ int fallocate_available; /* fallocate/posix_fallocate */
+ int fallocate_requires_locking;
+};
+
+#ifndef _WIN32
+#define WT_SIZET_FMT "zu" /* size_t format string */
+#else
+#define WT_SIZET_FMT "Iu" /* size_t format string */
+#endif
diff --git a/src/third_party/wiredtiger/src/include/os_windows.h b/src/third_party/wiredtiger/src/include/os_windows.h
new file mode 100644
index 00000000000..fcae531184f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/os_windows.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Define WT threading and concurrency primitives
+ * Assumes Windows 7+/2008 R2+
+ */
+typedef CONDITION_VARIABLE wt_cond_t;
+typedef CRITICAL_SECTION wt_mutex_t;
+typedef HANDLE wt_thread_t;
+typedef SRWLOCK wt_rwlock_t;
+
+/* Timespec is a POSIX structure not defined in Windows */
+struct timespec {
+ time_t tv_sec; /* seconds */
+ long tv_nsec; /* nanoseconds */
+};
+
+#define strncasecmp _strnicmp
+
+/*
+ * Windows Portability stuff
+ * These are POSIX types which Windows lacks
+ * Eventually WiredTiger will migrate away from these types
+ */
+typedef uint32_t u_int;
+typedef unsigned char u_char;
+typedef unsigned long u_long;
+
+/* snprintf is not available before VS 2015 (_MSC_VER 1900): use _snprintf. */
+#if _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+/*
+ * Windows does not have ssize_t.
+ * The Python headers also declare it, though, so we need to guard it.
+ */
+#ifndef HAVE_SSIZE_T
+typedef int ssize_t;
+#endif
+
+/*
+ * Provide a custom version of vsnprintf that returns the
+ * needed buffer length instead of -1 on truncation
+ */
+#define vsnprintf _wt_vsnprintf
+
+_Check_return_opt_ int __cdecl _wt_vsnprintf(
+ _Out_writes_(_MaxCount) char * _DstBuf,
+ _In_ size_t _MaxCount,
+ _In_z_ _Printf_format_string_ const char * _Format,
+ va_list _ArgList);
+
+/* Provide a custom version of localtime_r */
+struct tm *localtime_r(const time_t* timer, struct tm* result);
diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i
new file mode 100644
index 00000000000..6e0e7be13eb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/packing.i
@@ -0,0 +1,685 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Throughout this code we have to be aware of default argument conversion.
+ *
+ * Refer to Chapter 8 of "Expert C Programming" by Peter van der Linden for the
+ * gory details. The short version is that we have fewer cases to deal with
+ * because the compiler promotes shorter types to int or unsigned int.
+ */
+/*
+ * WT_PACK_VALUE --
+ * A single value being packed or unpacked, tagged with its format type.
+ */
+typedef struct {
+ union {
+ int64_t i; /* signed integer types */
+ uint64_t u; /* unsigned integer types */
+ const char *s; /* string types */
+ WT_ITEM item; /* raw byte items (also used for JSON) */
+ } u;
+ uint32_t size; /* size from the format, set by __pack_next */
+ int8_t havesize; /* non-zero if the format gave an explicit size */
+ char type; /* format type character */
+} WT_PACK_VALUE;
+
+/* Zero initializer for a WT_PACK_VALUE and a declaration helper using it. */
+#define WT_PACK_VALUE_INIT { { 0 }, 0, 0, 0 }
+#define WT_DECL_PACK_VALUE(pv) WT_PACK_VALUE pv = WT_PACK_VALUE_INIT
+
+/*
+ * WT_PACK --
+ * Iterator state for walking a pack format string.
+ */
+typedef struct {
+ WT_SESSION_IMPL *session; /* session used when reporting errors */
+ const char *cur, *end, *orig; /* current position, end, start of format */
+ unsigned long repeats; /* remaining repeats of lastv */
+ WT_PACK_VALUE lastv; /* last integral value, replayed on repeat */
+} WT_PACK;
+
+/* Initializer for a WT_PACK and a declaration helper using it. */
+#define WT_PACK_INIT { NULL, NULL, NULL, NULL, 0, WT_PACK_VALUE_INIT }
+#define WT_DECL_PACK(pack) WT_PACK pack = WT_PACK_INIT
+
+/*
+ * WT_PACK_NAME --
+ * Iterator state for walking (or generating) the field names that go
+ * with a pack format.
+ */
+typedef struct {
+ WT_CONFIG config; /* parser over the configured name list */
+ char buf[20]; /* storage for a generated name */
+ int count; /* index used in the next generated name */
+ int iskey; /* generate "key%d" names rather than "value%d" */
+ int genname; /* non-zero: generate names, don't read config */
+} WT_PACK_NAME;
+
+/*
+ * __pack_initn --
+ * Initialize a pack iterator with the specified string and length.
+ *
+ * Formats starting with '@', '<' or '>' are rejected with EINVAL. A
+ * leading '.' is skipped and is not part of the iterated format.
+ */
+static inline int
+__pack_initn(
+ WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt, size_t len)
+{
+ if (*fmt == '@' || *fmt == '<' || *fmt == '>')
+ return (EINVAL);
+
+ /*
+ * The length covers the whole string, including any '.' prefix: shrink
+ * it when skipping the prefix so the end pointer doesn't overshoot the
+ * format by one byte.
+ */
+ if (len > 0 && *fmt == '.') {
+ ++fmt;
+ --len;
+ }
+
+ pack->session = session;
+ pack->cur = pack->orig = fmt;
+ pack->end = fmt + len;
+ pack->repeats = 0;
+ return (0);
+}
+
+/*
+ * __pack_init --
+ * Initialize a pack iterator from a NUL-terminated format string.
+ */
+static inline int
+__pack_init(WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt)
+{
+ size_t fmt_len;
+
+ /* The whole string, up to the terminating NUL, is the format. */
+ fmt_len = strlen(fmt);
+ return (__pack_initn(session, pack, fmt, fmt_len));
+}
+
+/*
+ * __pack_name_init --
+ * Set up a field-name iterator: names are read from the supplied
+ * configuration item if it has a string, otherwise they are generated.
+ */
+static inline int
+__pack_name_init(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *names,
+ int iskey, WT_PACK_NAME *pn)
+{
+ WT_CLEAR(*pn);
+ pn->iskey = iskey;
+
+ /* No name list: fall back to generated "key<N>"/"value<N>" names. */
+ if (names->str == NULL) {
+ pn->genname = 1;
+ return (0);
+ }
+
+ return (__wt_config_subinit(session, &pn->config, names));
+}
+
+/*
+ * __pack_name_next --
+ * Return the next field name, either generated or read from the
+ * configured name list.
+ */
+static inline int
+__pack_name_next(WT_PACK_NAME *pn, WT_CONFIG_ITEM *name)
+{
+ WT_CONFIG_ITEM ignore;
+
+ /* Names come from the configuration unless they are generated. */
+ if (!pn->genname)
+ return (__wt_config_next(&pn->config, name, &ignore));
+
+ /* Build "key<N>" or "value<N>" in the iterator's own buffer. */
+ (void)snprintf(pn->buf, sizeof(pn->buf),
+ (pn->iskey ? "key%d" : "value%d"), pn->count);
+ ++pn->count;
+
+ WT_CLEAR(*name);
+ name->type = WT_CONFIG_ITEM_STRING;
+ name->str = pn->buf;
+ name->len = strlen(pn->buf);
+ return (0);
+}
+
+/*
+ * __pack_next --
+ * Advance the pack iterator, filling in the type and size of the next
+ * value. Returns WT_NOTFOUND at the end of the format and EINVAL for an
+ * invalid type or bitfield size.
+ */
+static inline int
+__pack_next(WT_PACK *pack, WT_PACK_VALUE *pv)
+{
+ char *endsize;
+
+ /* Replay the previous integral value while its repeat count lasts. */
+ if (pack->repeats > 0) {
+ *pv = pack->lastv;
+ --pack->repeats;
+ return (0);
+ }
+
+next: if (pack->cur == pack->end)
+ return (WT_NOTFOUND);
+
+ /*
+ * Parse an optional decimal size prefix. The character is converted to
+ * unsigned char first: passing a negative char value to isdigit is
+ * undefined behavior.
+ */
+ if (isdigit((unsigned char)*pack->cur)) {
+ pv->havesize = 1;
+ pv->size = WT_STORE_SIZE(strtoul(pack->cur, &endsize, 10));
+ pack->cur = endsize;
+ } else {
+ pv->havesize = 0;
+ pv->size = 1;
+ }
+
+ pv->type = *pack->cur++;
+ pack->repeats = 0;
+
+ switch (pv->type) {
+ case 'S':
+ case 's':
+ case 'x':
+ return (0);
+ case 't':
+ if (pv->size < 1 || pv->size > 8)
+ WT_RET_MSG(pack->session, EINVAL,
+ "Bitfield sizes must be between 1 and 8 bits "
+ "in format '%.*s'",
+ (int)(pack->end - pack->orig), pack->orig);
+ return (0);
+ case 'u':
+ case 'U':
+ /*
+ * A raw item with no explicit size that is not the last field
+ * becomes 'U' (written with a length prefix); otherwise it is
+ * 'u'.
+ */
+ pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u';
+ return (0);
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'l':
+ case 'L':
+ case 'q':
+ case 'Q':
+ case 'r':
+ case 'R':
+ /* Integral types repeat <size> times; a zero count is skipped. */
+ if (pv->size == 0)
+ goto next;
+ pack->repeats = pv->size - 1;
+ pack->lastv = *pv;
+ return (0);
+ default:
+ WT_RET_MSG(pack->session, EINVAL,
+ "Invalid type '%c' found in format '%.*s'",
+ pv->type, (int)(pack->end - pack->orig), pack->orig);
+ }
+
+}
+
+/*
+ * WT_PACK_GET --
+ * Fetch the next argument from a va_list into a pack value, using the
+ * argument type that matches the value's format type. 'x' consumes no
+ * argument; the types narrower than int are read as int/unsigned int.
+ * NOTE(review): any other type falls to WT_ILLEGAL_VALUE, which is
+ * defined elsewhere -- confirm its behavior there.
+ */
+#define WT_PACK_GET(session, pv, ap) do { \
+ WT_ITEM *__item; \
+ switch (pv.type) { \
+ case 'x': \
+ break; \
+ case 's': \
+ case 'S': \
+ pv.u.s = va_arg(ap, const char *); \
+ break; \
+ case 'U': \
+ case 'u': \
+ __item = va_arg(ap, WT_ITEM *); \
+ pv.u.item.data = __item->data; \
+ pv.u.item.size = __item->size; \
+ break; \
+ case 'b': \
+ case 'h': \
+ case 'i': \
+ pv.u.i = va_arg(ap, int); \
+ break; \
+ case 'B': \
+ case 'H': \
+ case 'I': \
+ case 't': \
+ pv.u.u = va_arg(ap, unsigned int); \
+ break; \
+ case 'l': \
+ pv.u.i = va_arg(ap, long); \
+ break; \
+ case 'L': \
+ pv.u.u = va_arg(ap, unsigned long); \
+ break; \
+ case 'q': \
+ pv.u.i = va_arg(ap, int64_t); \
+ break; \
+ case 'Q': \
+ case 'r': \
+ case 'R': \
+ pv.u.u = va_arg(ap, uint64_t); \
+ break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
+ } \
+} while (0)
+
+/*
+ * __pack_size --
+ * Get the size of a packed value.
+ *
+ * Returns the number of bytes the value occupies when packed, or
+ * (size_t)-1 after reporting an error for an unknown type.
+ */
+static inline size_t
+__pack_size(WT_SESSION_IMPL *session, WT_PACK_VALUE *pv)
+{
+ size_t s, pad;
+
+ switch (pv->type) {
+ case 'x':
+ /* Padding: exactly the size given in the format. */
+ return (pv->size);
+ case 'j':
+ case 'J':
+ /* 'j' and explicitly sized 'J' use the format size as-is. */
+ if (pv->type == 'j' || pv->havesize)
+ s = pv->size;
+ else {
+ ssize_t len;
+
+ /* The string was previously validated. */
+ len = __wt_json_strlen(pv->u.item.data,
+ pv->u.item.size);
+ WT_ASSERT(session, len >= 0);
+ /* One extra byte, matching the pad __pack_write adds. */
+ s = (size_t)len + 1;
+ }
+ return (s);
+ case 's':
+ case 'S':
+ /* Fixed-length strings use the format size, others add a NUL. */
+ if (pv->type == 's' || pv->havesize)
+ s = pv->size;
+ else
+ s = strlen(pv->u.s) + 1;
+ return (s);
+ case 'U':
+ case 'u':
+ /*
+ * With an explicit size the item is truncated or zero-padded
+ * to that size; 'U' additionally counts its length prefix.
+ */
+ s = pv->u.item.size;
+ pad = 0;
+ if (pv->havesize && pv->size < s)
+ s = pv->size;
+ else if (pv->havesize)
+ pad = pv->size - s;
+ if (pv->type == 'U')
+ s += __wt_vsize_uint(s + pad);
+ return (s + pad);
+ case 'b':
+ case 'B':
+ case 't':
+ /* Always a single byte. */
+ return (1);
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ /* Signed integers: size depends on the value. */
+ return (__wt_vsize_int(pv->u.i));
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ /* Unsigned integers: size depends on the value. */
+ return (__wt_vsize_uint(pv->u.u));
+ case 'R':
+ /* Stored as a fixed-size uint64_t. */
+ return (sizeof(uint64_t));
+ }
+
+ __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type);
+ return ((size_t)-1);
+}
+
+/*
+ * __pack_write --
+ * Pack a value into a buffer.
+ *
+ * On entry *pp is where to write and maxlen is the space available there;
+ * *pp is advanced past the bytes written. Returns zero on success and
+ * EINVAL for an unknown type.
+ * NOTE(review): WT_SIZE_CHECK is defined elsewhere; it presumably returns
+ * an error when the requested size exceeds maxlen -- confirm.
+ */
+static inline int
+__pack_write(
+ WT_SESSION_IMPL *session, WT_PACK_VALUE *pv, uint8_t **pp, size_t maxlen)
+{
+ uint8_t *oldp;
+ size_t s, pad;
+
+ switch (pv->type) {
+ case 'x':
+ /* Padding is written as zero bytes. */
+ WT_SIZE_CHECK(pv->size, maxlen);
+ memset(*pp, 0, pv->size);
+ *pp += pv->size;
+ break;
+ case 's':
+ case 'S':
+ /*
+ * XXX if pv->havesize, only want to know if there is a
+ * '\0' in the first pv->size characters.
+ */
+ s = strlen(pv->u.s);
+ /*
+ * Fixed-size strings are truncated or zero-padded to the
+ * format size; unsized 'S' strings get a one-byte terminator.
+ */
+ if ((pv->type == 's' || pv->havesize) && pv->size < s) {
+ s = pv->size;
+ pad = 0;
+ } else if (pv->havesize)
+ pad = pv->size - s;
+ else
+ pad = 1;
+ WT_SIZE_CHECK(s + pad, maxlen);
+ if (s > 0)
+ memcpy(*pp, pv->u.s, s);
+ *pp += s;
+ if (pad > 0) {
+ memset(*pp, 0, pad);
+ *pp += pad;
+ }
+ break;
+ case 'j':
+ case 'J':
+ /* Same sizing rules as strings, copied by the JSON helper. */
+ s = pv->u.item.size;
+ if ((pv->type == 'j' || pv->havesize) && pv->size < s) {
+ s = pv->size;
+ pad = 0;
+ } else if (pv->havesize)
+ pad = pv->size - s;
+ else
+ pad = 1;
+ if (s > 0) {
+ oldp = *pp;
+ WT_RET(__wt_json_strncpy((char **)pp, maxlen,
+ pv->u.item.data, s));
+ /* Account for however many bytes the copy produced. */
+ maxlen -= (size_t)(*pp - oldp);
+ }
+ if (pad > 0) {
+ WT_SIZE_CHECK(pad, maxlen);
+ memset(*pp, 0, pad);
+ *pp += pad;
+ }
+ break;
+ case 'U':
+ case 'u':
+ /* Truncate or zero-pad to an explicit size, if there is one. */
+ s = pv->u.item.size;
+ pad = 0;
+ if (pv->havesize && pv->size < s)
+ s = pv->size;
+ else if (pv->havesize)
+ pad = pv->size - s;
+ if (pv->type == 'U') {
+ /* 'U' items are preceded by their packed length. */
+ oldp = *pp;
+ WT_RET(__wt_vpack_uint(pp, maxlen, s + pad));
+ maxlen -= (size_t)(*pp - oldp);
+ }
+ WT_SIZE_CHECK(s + pad, maxlen);
+ if (s > 0)
+ memcpy(*pp, pv->u.item.data, s);
+ *pp += s;
+ if (pad > 0) {
+ memset(*pp, 0, pad);
+ *pp += pad;
+ }
+ break;
+ case 'b':
+ /* Translate to maintain ordering with the sign bit. */
+ WT_SIZE_CHECK(1, maxlen);
+ **pp = (uint8_t)(pv->u.i + 0x80);
+ *pp += 1;
+ break;
+ case 'B':
+ case 't':
+ WT_SIZE_CHECK(1, maxlen);
+ **pp = (uint8_t)pv->u.u;
+ *pp += 1;
+ break;
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ WT_RET(__wt_vpack_int(pp, maxlen, pv->u.i));
+ break;
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ WT_RET(__wt_vpack_uint(pp, maxlen, pv->u.u));
+ break;
+ case 'R':
+ /*
+ * Stored directly as a uint64_t in host representation.
+ * NOTE(review): the store goes through a uint64_t pointer at
+ * an arbitrary buffer offset -- presumably relies on the
+ * platform tolerating unaligned access; confirm.
+ */
+ WT_SIZE_CHECK(sizeof(uint64_t), maxlen);
+ *(uint64_t *)*pp = pv->u.u;
+ *pp += sizeof(uint64_t);
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unknown pack-value type: %c", (int)pv->type);
+ }
+
+ return (0);
+}
+
+/*
+ * __unpack_read --
+ * Read a packed value from a buffer.
+ *
+ * On entry *pp is where to read and maxlen is the number of bytes
+ * available there; *pp is advanced past the bytes consumed. String and
+ * item values point into the buffer rather than being copied. Returns
+ * zero on success and EINVAL for an unknown type.
+ */
+static inline int
+__unpack_read(WT_SESSION_IMPL *session,
+ WT_PACK_VALUE *pv, const uint8_t **pp, size_t maxlen)
+{
+ size_t s;
+
+ switch (pv->type) {
+ case 'x':
+ /* Padding is skipped. */
+ WT_SIZE_CHECK(pv->size, maxlen);
+ *pp += pv->size;
+ break;
+ case 's':
+ case 'S':
+ /*
+ * Fixed-size strings take the format size, others run to and
+ * include the NUL terminator.
+ * NOTE(review): strlen runs before the size check below, so an
+ * unterminated buffer would be read past maxlen -- confirm the
+ * callers guarantee termination.
+ */
+ if (pv->type == 's' || pv->havesize)
+ s = pv->size;
+ else
+ s = strlen((const char *)*pp) + 1;
+ if (s > 0)
+ pv->u.s = (const char *)*pp;
+ WT_SIZE_CHECK(s, maxlen);
+ *pp += s;
+ break;
+ case 'U':
+ /* Read the length prefix, then handle the bytes as for 'u'. */
+ WT_RET(__wt_vunpack_uint(pp, maxlen, &pv->u.u));
+ /* FALLTHROUGH */
+ case 'u':
+ /*
+ * Size comes from the format, else from the 'U' length prefix,
+ * else the item is everything that remains.
+ */
+ if (pv->havesize)
+ s = pv->size;
+ else if (pv->type == 'U')
+ s = (size_t)pv->u.u;
+ else
+ s = maxlen;
+ WT_SIZE_CHECK(s, maxlen);
+ pv->u.item.data = *pp;
+ pv->u.item.size = s;
+ *pp += s;
+ break;
+ case 'b':
+ /* Translate to maintain ordering with the sign bit. */
+ WT_SIZE_CHECK(1, maxlen);
+ pv->u.i = (int8_t)(*(*pp)++ - 0x80);
+ break;
+ case 'B':
+ case 't':
+ WT_SIZE_CHECK(1, maxlen);
+ pv->u.u = *(*pp)++;
+ break;
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ WT_RET(__wt_vunpack_int(pp, maxlen, &pv->u.i));
+ break;
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ WT_RET(__wt_vunpack_uint(pp, maxlen, &pv->u.u));
+ break;
+ case 'R':
+ /* Read back directly as a uint64_t in host representation. */
+ WT_SIZE_CHECK(sizeof(uint64_t), maxlen);
+ pv->u.u = *(uint64_t *)*pp;
+ *pp += sizeof(uint64_t);
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unknown pack-value type: %c", (int)pv->type);
+ }
+
+ return (0);
+}
+
+/*
+ * WT_UNPACK_PUT --
+ * Store an unpacked value through the next pointer argument in a
+ * va_list, using the pointer type that matches the value's format type.
+ * 'x' consumes no argument. Note 'i' and 'l' both store through an
+ * int32_t pointer, and 'I' and 'L' through a uint32_t pointer.
+ * NOTE(review): any other type falls to WT_ILLEGAL_VALUE, which is
+ * defined elsewhere -- confirm its behavior there.
+ */
+#define WT_UNPACK_PUT(session, pv, ap) do { \
+ WT_ITEM *__item; \
+ switch (pv.type) { \
+ case 'x': \
+ break; \
+ case 's': \
+ case 'S': \
+ *va_arg(ap, const char **) = pv.u.s; \
+ break; \
+ case 'U': \
+ case 'u': \
+ __item = va_arg(ap, WT_ITEM *); \
+ __item->data = pv.u.item.data; \
+ __item->size = pv.u.item.size; \
+ break; \
+ case 'b': \
+ *va_arg(ap, int8_t *) = (int8_t)pv.u.i; \
+ break; \
+ case 'h': \
+ *va_arg(ap, int16_t *) = (short)pv.u.i; \
+ break; \
+ case 'i': \
+ case 'l': \
+ *va_arg(ap, int32_t *) = (int32_t)pv.u.i; \
+ break; \
+ case 'q': \
+ *va_arg(ap, int64_t *) = pv.u.i; \
+ break; \
+ case 'B': \
+ case 't': \
+ *va_arg(ap, uint8_t *) = (uint8_t)pv.u.u; \
+ break; \
+ case 'H': \
+ *va_arg(ap, uint16_t *) = (uint16_t)pv.u.u; \
+ break; \
+ case 'I': \
+ case 'L': \
+ *va_arg(ap, uint32_t *) = (uint32_t)pv.u.u; \
+ break; \
+ case 'Q': \
+ case 'r': \
+ case 'R': \
+ *va_arg(ap, uint64_t *) = pv.u.u; \
+ break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
+ } \
+} while (0)
+
+/*
+ * __wt_struct_packv --
+ * Pack the arguments in a va_list into a buffer according to a format
+ * string.
+ */
+static inline int
+__wt_struct_packv(WT_SESSION_IMPL *session,
+ void *buffer, size_t size, const char *fmt, va_list ap)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+
+ /* Fast path: a single-character format needs no iterator. */
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ pv.type = fmt[0];
+ WT_PACK_GET(session, pv, ap);
+ return (__pack_write(session, &pv, &p, size));
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ for (;;) {
+ if ((ret = __pack_next(&pack, &pv)) != 0)
+ break;
+ WT_PACK_GET(session, pv, ap);
+ WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p)));
+ }
+
+ /* Be paranoid - __pack_write should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ /* Running off the end of the format is the normal way to finish. */
+ return (ret == WT_NOTFOUND ? 0 : ret);
+}
+
+/*
+ * __wt_struct_sizev --
+ * Calculate the size of a packed byte string (va_list version).
+ *
+ * The size is stored in *sizep. Returns zero on success or the error
+ * from initializing or walking the format; when walking fails, *sizep
+ * holds the size of the fields processed before the error.
+ */
+static inline int
+__wt_struct_sizev(
+ WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, va_list ap)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ size_t total;
+
+ /* Fast path: a single-character format needs no iterator. */
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ pv.type = fmt[0];
+ WT_PACK_GET(session, pv, ap);
+ *sizep = __pack_size(session, &pv);
+ return (0);
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ total = 0;
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ WT_PACK_GET(session, pv, ap);
+ total += __pack_size(session, &pv);
+ }
+ *sizep = total;
+
+ /*
+ * Running off the end of the format is the normal way to finish; any
+ * other failure (for example, an invalid type) is reported to the
+ * caller instead of being silently treated as success.
+ */
+ return (ret == WT_NOTFOUND ? 0 : ret);
+}
+
+/*
+ * __wt_struct_unpackv --
+ * Unpack a byte string (va_list version).
+ *
+ * Values are read from the buffer according to the format and stored
+ * through the pointer arguments in the va_list. Returns zero on success
+ * or the error from reading a value or walking the format.
+ */
+static inline int
+__wt_struct_unpackv(WT_SESSION_IMPL *session,
+ const void *buffer, size_t size, const char *fmt, va_list ap)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ const uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+
+ /*
+ * Fast path: a single-character format needs no iterator. A failed
+ * read must be returned to the caller, the output argument has not
+ * been set in that case.
+ */
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ pv.type = fmt[0];
+ if ((ret = __unpack_read(session, &pv, &p, size)) == 0)
+ WT_UNPACK_PUT(session, pv, ap);
+ return (ret);
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+ WT_UNPACK_PUT(session, pv, ap);
+ }
+
+ /* Be paranoid - __unpack_read should never read past the end. */
+ WT_ASSERT(session, p <= end);
+
+ /* Running off the end of the format is the normal way to finish. */
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_size_adjust --
+ * Adjust the size field for a packed structure.
+ *
+ * A structure that stores its own size as a field is first sized with a
+ * placeholder size of zero, then packed again with the final size. This
+ * function converts the first result into the final one by replacing
+ * the placeholder's length with the length of the real size field.
+ */
+static inline void
+__wt_struct_size_adjust(WT_SESSION_IMPL *session, size_t *sizep)
+{
+ size_t base, first_guess, placeholder, size_len;
+
+ /* Length the placeholder size field was counted as. */
+ placeholder = 1;
+ base = *sizep;
+
+ /*
+ * Size the field for the unadjusted total, then again for the total
+ * with that first estimate swapped in for the placeholder.
+ */
+ first_guess = __wt_vsize_uint(base);
+ size_len = __wt_vsize_uint(base + first_guess - placeholder);
+ *sizep = base + (size_len - placeholder);
+
+ /*
+ * Make sure the field size we calculated matches the adjusted size.
+ * This relies on the fact that we are only adjusting by a small number
+ * of bytes, so we won't cross multiple boundaries in the packing
+ * routine. If that were not true, we would need to iterate here until
+ * the field size stops growing.
+ */
+ WT_ASSERT(session, size_len == __wt_vsize_uint(*sizep));
+}
diff --git a/src/third_party/wiredtiger/src/include/posix.h b/src/third_party/wiredtiger/src/include/posix.h
new file mode 100644
index 00000000000..e3b43ea38ab
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/posix.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Some systems don't configure 64-bit MIN/MAX by default. */
+#ifndef ULLONG_MAX
+#define ULLONG_MAX 0xffffffffffffffffULL
+#endif
+#ifndef LLONG_MAX
+#define LLONG_MAX 0x7fffffffffffffffLL
+#endif
+#ifndef LLONG_MIN
+#define LLONG_MIN (-0x7fffffffffffffffLL - 1)
+#endif
+
+/* Define O_BINARY for Posix systems */
+#define O_BINARY 0
+
+/*
+ * Define WT threading and concurrency primitives
+ */
+typedef pthread_cond_t wt_cond_t;
+typedef pthread_mutex_t wt_mutex_t;
+typedef pthread_t wt_thread_t;
+
+/*
+ * !!!
+ * Don't touch this structure without understanding the read/write
+ * locking functions.
+ *
+ * The union gives three views of the same storage: one 64-bit value, one
+ * 32-bit value and a structure of four 16-bit fields.
+ */
+typedef union { /* Read/write lock */
+#ifdef WORDS_BIGENDIAN
+ /* Deliberately not valid C: forces a compile error on big-endian builds. */
+ WiredTiger read/write locks require modification for big-endian systems.
+#else
+ uint64_t u; /* the whole lock as one value */
+ uint32_t us; /* 32-bit view of the start of the lock */
+ struct {
+ uint16_t writers;
+ uint16_t readers;
+ uint16_t users;
+ uint16_t pad;
+ } s;
+#endif
+} wt_rwlock_t;
diff --git a/src/third_party/wiredtiger/src/include/queue.h b/src/third_party/wiredtiger/src/include/queue.h
new file mode 100644
index 00000000000..42e736e7b09
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/queue.h
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h 8.5 (Berkeley) 8/20/94
+ * $FreeBSD: src/sys/sys/queue.h,v 1.54 2002/08/05 05:18:43 alfred Exp $
+ */
+
+#ifndef _DB_QUEUE_H_
+#define _DB_QUEUE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This file defines four types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists and tail queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction. Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ *
+ * SLIST LIST STAILQ TAILQ
+ * _HEAD + + + +
+ * _HEAD_INITIALIZER + + + +
+ * _ENTRY + + + +
+ * _INIT + + + +
+ * _EMPTY + + + +
+ * _FIRST + + + +
+ * _NEXT + + + +
+ * _PREV - - - +
+ * _LAST - - + +
+ * _FOREACH + + + +
+ * _FOREACH_REVERSE - - - +
+ * _INSERT_HEAD + + + +
+ * _INSERT_BEFORE - + - +
+ * _INSERT_AFTER + + + +
+ * _INSERT_TAIL - - + +
+ * _CONCAT - - + +
+ * _REMOVE_HEAD + - + -
+ * _REMOVE + + + +
+ *
+ */
+
+/*
+ * XXX
+ * We #undef all of the macros because there are incompatible versions of this
+ * file and these macros on various systems. What makes the problem worse is
+ * they are included and/or defined by system include files which we may have
+ * already loaded into Berkeley DB before getting here. For example, FreeBSD's
+ * <rpc/rpc.h> includes its system <sys/queue.h>, and VxWorks UnixLib.h defines
+ * several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these
+ * same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours.
+ */
+#undef LIST_EMPTY
+#undef LIST_ENTRY
+#undef LIST_FIRST
+#undef LIST_FOREACH
+#undef LIST_HEAD
+#undef LIST_HEAD_INITIALIZER
+#undef LIST_INIT
+#undef LIST_INSERT_AFTER
+#undef LIST_INSERT_BEFORE
+#undef LIST_INSERT_HEAD
+#undef LIST_NEXT
+#undef LIST_REMOVE
+#undef QMD_TRACE_ELEM
+#undef QMD_TRACE_HEAD
+#undef QUEUE_MACRO_DEBUG
+#undef SLIST_EMPTY
+#undef SLIST_ENTRY
+#undef SLIST_FIRST
+#undef SLIST_FOREACH
+#undef SLIST_FOREACH_PREVPTR
+#undef SLIST_HEAD
+#undef SLIST_HEAD_INITIALIZER
+#undef SLIST_INIT
+#undef SLIST_INSERT_AFTER
+#undef SLIST_INSERT_HEAD
+#undef SLIST_NEXT
+#undef SLIST_REMOVE
+#undef SLIST_REMOVE_HEAD
+#undef STAILQ_CONCAT
+#undef STAILQ_EMPTY
+#undef STAILQ_ENTRY
+#undef STAILQ_FIRST
+#undef STAILQ_FOREACH
+#undef STAILQ_HEAD
+#undef STAILQ_HEAD_INITIALIZER
+#undef STAILQ_INIT
+#undef STAILQ_INSERT_AFTER
+#undef STAILQ_INSERT_HEAD
+#undef STAILQ_INSERT_TAIL
+#undef STAILQ_LAST
+#undef STAILQ_NEXT
+#undef STAILQ_REMOVE
+#undef STAILQ_REMOVE_HEAD
+#undef STAILQ_REMOVE_HEAD_UNTIL
+#undef TAILQ_CONCAT
+#undef TAILQ_EMPTY
+#undef TAILQ_ENTRY
+#undef TAILQ_FIRST
+#undef TAILQ_FOREACH
+#undef TAILQ_FOREACH_REVERSE
+#undef TAILQ_HEAD
+#undef TAILQ_HEAD_INITIALIZER
+#undef TAILQ_INIT
+#undef TAILQ_INSERT_AFTER
+#undef TAILQ_INSERT_BEFORE
+#undef TAILQ_INSERT_HEAD
+#undef TAILQ_INSERT_TAIL
+#undef TAILQ_LAST
+#undef TAILQ_NEXT
+#undef TAILQ_PREV
+#undef TAILQ_REMOVE
+#undef TRACEBUF
+#undef TRASHIT
+
+#define QUEUE_MACRO_DEBUG 0
+#if QUEUE_MACRO_DEBUG
+/* Store the last 2 places the queue element or head was altered */
+struct qm_trace {
+ char * lastfile;
+ int lastline;
+ char * prevfile;
+ int prevline;
+};
+
+#define TRACEBUF struct qm_trace trace;
+#define TRASHIT(x) do {(x) = (void *)-1;} while (0)
+
+#define QMD_TRACE_HEAD(head) do { \
+ (head)->trace.prevline = (head)->trace.lastline; \
+ (head)->trace.prevfile = (head)->trace.lastfile; \
+ (head)->trace.lastline = __LINE__; \
+ (head)->trace.lastfile = __FILE__; \
+} while (0)
+
+#define QMD_TRACE_ELEM(elem) do { \
+ (elem)->trace.prevline = (elem)->trace.lastline; \
+ (elem)->trace.prevfile = (elem)->trace.lastfile; \
+ (elem)->trace.lastline = __LINE__; \
+ (elem)->trace.lastfile = __FILE__; \
+} while (0)
+
+#else
+#define QMD_TRACE_ELEM(elem)
+#define QMD_TRACE_HEAD(head)
+#define TRACEBUF
+#define TRASHIT(x)
+#endif /* QUEUE_MACRO_DEBUG */
+
+/*
+ * Singly-linked List declarations.
+ */
+#define SLIST_HEAD(name, type) \
+struct name { \
+ struct type *slh_first; /* first element */ \
+}
+
+#define SLIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define SLIST_ENTRY(type) \
+struct { \
+ struct type *sle_next; /* next element */ \
+}
+
+/*
+ * Singly-linked List functions.
+ */
+#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
+
+#define SLIST_FIRST(head) ((head)->slh_first)
+
+#define SLIST_FOREACH(var, head, field) \
+ for ((var) = SLIST_FIRST((head)); \
+ (var); \
+ (var) = SLIST_NEXT((var), field))
+
+#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \
+ for ((varp) = &SLIST_FIRST((head)); \
+ ((var) = *(varp)) != NULL; \
+ (varp) = &SLIST_NEXT((var), field))
+
+#define SLIST_INIT(head) do { \
+ SLIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \
+ SLIST_NEXT((slistelm), field) = (elm); \
+} while (0)
+
+#define SLIST_INSERT_HEAD(head, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \
+ SLIST_FIRST((head)) = (elm); \
+} while (0)
+
+#define SLIST_NEXT(elm, field) ((elm)->field.sle_next)
+
+#define SLIST_REMOVE(head, elm, type, field) do { \
+ if (SLIST_FIRST((head)) == (elm)) { \
+ SLIST_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = SLIST_FIRST((head)); \
+ while (SLIST_NEXT(curelm, field) != (elm)) \
+ curelm = SLIST_NEXT(curelm, field); \
+ SLIST_NEXT(curelm, field) = \
+ SLIST_NEXT(SLIST_NEXT(curelm, field), field); \
+ } \
+} while (0)
+
+#define SLIST_REMOVE_HEAD(head, field) do { \
+ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \
+} while (0)
+
+/*
+ * Singly-linked Tail queue declarations.
+ */
+#define STAILQ_HEAD(name, type) \
+struct name { \
+ struct type *stqh_first;/* first element */ \
+ struct type **stqh_last;/* addr of last next element */ \
+}
+
+#define STAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).stqh_first }
+
+#define STAILQ_ENTRY(type) \
+struct { \
+ struct type *stqe_next; /* next element */ \
+}
+
+/*
+ * Singly-linked Tail queue functions.
+ */
+#define STAILQ_CONCAT(head1, head2) do { \
+ if (!STAILQ_EMPTY((head2))) { \
+ *(head1)->stqh_last = (head2)->stqh_first; \
+ (head1)->stqh_last = (head2)->stqh_last; \
+ STAILQ_INIT((head2)); \
+ } \
+} while (0)
+
+#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL)
+
+#define STAILQ_FIRST(head) ((head)->stqh_first)
+
+#define STAILQ_FOREACH(var, head, field) \
+ for ((var) = STAILQ_FIRST((head)); \
+ (var); \
+ (var) = STAILQ_NEXT((var), field))
+
+#define STAILQ_INIT(head) do { \
+ STAILQ_FIRST((head)) = NULL; \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_NEXT((tqelm), field) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_FIRST((head)) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_TAIL(head, elm, field) do { \
+ STAILQ_NEXT((elm), field) = NULL; \
+ *(head)->stqh_last = (elm); \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+} while (0)
+
+#define STAILQ_LAST(head, type, field) \
+ (STAILQ_EMPTY((head)) ? \
+ NULL : \
+ ((struct type *) \
+ ((char *)((head)->stqh_last) - __offsetof(struct type, field))))
+
+#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
+
+#define STAILQ_REMOVE(head, elm, type, field) do { \
+ if (STAILQ_FIRST((head)) == (elm)) { \
+ STAILQ_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = STAILQ_FIRST((head)); \
+ while (STAILQ_NEXT(curelm, field) != (elm)) \
+ curelm = STAILQ_NEXT(curelm, field); \
+ if ((STAILQ_NEXT(curelm, field) = \
+ STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((curelm), field);\
+ } \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD(head, field) do { \
+ if ((STAILQ_FIRST((head)) = \
+ STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \
+ if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+/*
+ * List declarations.
+ */
+#define LIST_HEAD(name, type) \
+struct name { \
+ struct type *lh_first; /* first element */ \
+}
+
+#define LIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define LIST_ENTRY(type) \
+struct { \
+ struct type *le_next; /* next element */ \
+ struct type **le_prev; /* address of previous next element */ \
+}
+
+/*
+ * List functions.
+ */
+
+#define LIST_EMPTY(head) ((head)->lh_first == NULL)
+
+#define LIST_FIRST(head) ((head)->lh_first)
+
+#define LIST_FOREACH(var, head, field) \
+ for ((var) = LIST_FIRST((head)); \
+ (var); \
+ (var) = LIST_NEXT((var), field))
+
+#define LIST_INIT(head) do { \
+ LIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define LIST_INSERT_AFTER(listelm, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
+ LIST_NEXT((listelm), field)->field.le_prev = \
+ &LIST_NEXT((elm), field); \
+ LIST_NEXT((listelm), field) = (elm); \
+ (elm)->field.le_prev = &LIST_NEXT((listelm), field); \
+} while (0)
+
+#define LIST_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.le_prev = (listelm)->field.le_prev; \
+ LIST_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.le_prev = (elm); \
+ (listelm)->field.le_prev = &LIST_NEXT((elm), field); \
+} while (0)
+
+#define LIST_INSERT_HEAD(head, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \
+ LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
+ LIST_FIRST((head)) = (elm); \
+ (elm)->field.le_prev = &LIST_FIRST((head)); \
+} while (0)
+
+#define LIST_NEXT(elm, field) ((elm)->field.le_next)
+
+#define LIST_REMOVE(elm, field) do { \
+ if (LIST_NEXT((elm), field) != NULL) \
+ LIST_NEXT((elm), field)->field.le_prev = \
+ (elm)->field.le_prev; \
+ *(elm)->field.le_prev = LIST_NEXT((elm), field); \
+} while (0)
+
+/*
+ * Tail queue declarations.
+ */
+#define TAILQ_HEAD(name, type) \
+struct name { \
+ struct type *tqh_first; /* first element */ \
+ struct type **tqh_last; /* addr of last next element */ \
+ TRACEBUF \
+}
+
+#define TAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).tqh_first }
+
+#define TAILQ_ENTRY(type) \
+struct { \
+ struct type *tqe_next; /* next element */ \
+ struct type **tqe_prev; /* address of previous next element */ \
+ TRACEBUF \
+}
+
+/*
+ * Tail queue functions.
+ */
+/*
+ * TAILQ_CONCAT --
+ * Append every element of head2 to the end of head1, leaving head2
+ * empty. Does nothing if head2 is already empty.
+ */
+#define TAILQ_CONCAT(head1, head2, field) do { \
+ if (!TAILQ_EMPTY(head2)) { \
+ *(head1)->tqh_last = (head2)->tqh_first; \
+ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
+ (head1)->tqh_last = (head2)->tqh_last; \
+ TAILQ_INIT((head2)); \
+ QMD_TRACE_HEAD(head1); \
+ QMD_TRACE_HEAD(head2); \
+ } \
+} while (0)
+
+#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
+
+#define TAILQ_FIRST(head) ((head)->tqh_first)
+
+#define TAILQ_FOREACH(var, head, field) \
+ for ((var) = TAILQ_FIRST((head)); \
+ (var); \
+ (var) = TAILQ_NEXT((var), field))
+
+#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
+ for ((var) = TAILQ_LAST((head), headname); \
+ (var); \
+ (var) = TAILQ_PREV((var), headname, field))
+
+#define TAILQ_INIT(head) do { \
+ TAILQ_FIRST((head)) = NULL; \
+ (head)->tqh_last = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+} while (0)
+
+/*
+ * TAILQ_INSERT_AFTER --
+ * Insert elm into the tail queue immediately after listelm, updating
+ * the head's tail pointer when listelm was the last element.
+ */
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else { \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ } \
+ TAILQ_NEXT((listelm), field) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+ QMD_TRACE_ELEM(&(listelm)->field); \
+} while (0)
+
+/*
+ * TAILQ_INSERT_BEFORE --
+ * Insert elm into the tail queue immediately before listelm.
+ */
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
+ TAILQ_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.tqe_prev = (elm); \
+ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+ QMD_TRACE_ELEM(&(listelm)->field); \
+} while (0)
+
+#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \
+ TAILQ_FIRST((head))->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ TAILQ_FIRST((head)) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ TAILQ_NEXT((elm), field) = NULL; \
+ (elm)->field.tqe_prev = (head)->tqh_last; \
+ *(head)->tqh_last = (elm); \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_LAST(head, headname) \
+ (*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+#define TAILQ_PREV(elm, headname, field) \
+ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+#define TAILQ_REMOVE(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field)) != NULL) \
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ (elm)->field.tqe_prev; \
+ else { \
+ (head)->tqh_last = (elm)->field.tqe_prev; \
+ QMD_TRACE_HEAD(head); \
+ } \
+ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \
+ TRASHIT((elm)->field.tqe_next); \
+ TRASHIT((elm)->field.tqe_prev); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_QUEUE_H_ */
diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h
new file mode 100644
index 00000000000..e24a19b03ca
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/schema.h
@@ -0,0 +1,101 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Character constants for projection plans */
+#define WT_PROJ_KEY 'k' /* Go to key in cursor <arg> */
+#define WT_PROJ_NEXT 'n' /* Process the next item (<arg> repeats) */
+#define WT_PROJ_REUSE 'r' /* Reuse the previous item (<arg> repeats) */
+#define WT_PROJ_SKIP 's' /* Skip a column in the cursor (<arg> repeats) */
+#define WT_PROJ_VALUE 'v' /* Go to the value in cursor <arg> */
+
+/*
+ * WT_COLGROUP --
+ * Description of one column group: its logical name, the underlying data
+ * source, its configuration string and the column list taken from that
+ * configuration.
+ */
+struct __wt_colgroup {
+ const char *name; /* Logical name */
+ const char *source; /* Underlying data source */
+ const char *config; /* Configuration string */
+
+ WT_CONFIG_ITEM colconf; /* List of columns from config */
+};
+
+/*
+ * WT_INDEX --
+ * Description of one index: the same naming/source/configuration fields
+ * as a column group, plus the key formats and the projection plans for the
+ * index's key and value.
+ */
+struct __wt_index {
+ const char *name; /* Logical name */
+ const char *source; /* Underlying data source */
+ const char *config; /* Configuration string */
+
+ WT_CONFIG_ITEM colconf; /* List of columns from config */
+
+ const char *idxkey_format; /* Index key format (hides primary) */
+ const char *key_format; /* Key format */
+ const char *key_plan; /* Key projection plan */
+ const char *value_plan; /* Value projection plan */
+};
+
+/*
+ * WT_TABLE --
+ * Handle for a logical table. A table consists of one or more column
+ * groups, each of which holds some set of columns all sharing a primary
+ * key; and zero or more indices, each of which holds some set of columns
+ * in an index key that can be used to reconstruct the primary key.
+ */
+struct __wt_table {
+ const char *name, *config, *plan;
+ const char *key_format, *value_format;
+
+ WT_CONFIG_ITEM cgconf, colconf;
+
+ WT_COLGROUP **cgroups; /* presumably ncolgroups entries -- confirm */
+ WT_INDEX **indices; /* presumably nindices entries -- confirm */
+ size_t idx_alloc; /* NOTE(review): looks like the allocation
+ size backing indices -- confirm */
+
+ TAILQ_ENTRY(__wt_table) q; /* Linkage for a TAILQ of tables */
+
+ int cg_complete, idx_complete, is_simple;
+ u_int ncolgroups, nindices, nkey_columns;
+
+ uint32_t refcnt; /* Number of open cursors */
+ uint32_t schema_gen; /* Cached schema generation number */
+};
+
+/*
+ * Tables without explicit column groups have a single default column group
+ * containing all of the columns.
+ */
+#define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1)
+
+/*
+ * WT_WITH_SCHEMA_LOCK --
+ * Acquire the schema lock, perform an operation, drop the lock.
+ *
+ * If the session is already flagged WT_SESSION_SCHEMA_LOCKED the operation
+ * runs directly and the lock is left untouched; otherwise the connection's
+ * schema_lock is taken and the flag is set only for the duration of the
+ * operation. The assertion requires that a session flagged
+ * WT_SESSION_NO_SCHEMA_LOCK already has WT_SESSION_SCHEMA_LOCKED set.
+ * "op" is expanded in both branches but only one of them executes.
+ */
+#define WT_WITH_SCHEMA_LOCK(session, op) do { \
+ WT_ASSERT(session, \
+ F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) || \
+ !F_ISSET(session, WT_SESSION_NO_SCHEMA_LOCK)); \
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \
+ (op); \
+ } else { \
+ __wt_spin_lock(session, &S2C(session)->schema_lock); \
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED); \
+ (op); \
+ __wt_spin_unlock(session, &S2C(session)->schema_lock); \
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \
+ } \
+} while (0)
+
+/*
+ * WT_WITHOUT_SCHEMA_LOCK --
+ * Drop the schema lock, perform an operation, re-acquire the lock.
+ *
+ * The lock is only released (and the WT_SESSION_SCHEMA_LOCKED flag only
+ * cleared) when the session is flagged as holding it; in that case both are
+ * restored after the operation. A session that does not hold the lock simply
+ * runs the operation. "op" is expanded in both branches but only one of
+ * them executes.
+ */
+#define WT_WITHOUT_SCHEMA_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \
+ __wt_spin_unlock(session, &S2C(session)->schema_lock); \
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \
+ (op); \
+ __wt_spin_lock(session, &S2C(session)->schema_lock); \
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED); \
+ } else { \
+ (op); \
+ } \
+} while (0)
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
new file mode 100644
index 00000000000..70dc6b8764d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -0,0 +1,329 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __page_write_gen_wrapped_check --
+ *	Report WT_RESTART once the page's write generation has climbed to
+ * within WT_MILLION of UINT32_MAX, and 0 while it is still below that
+ * threshold.
+ */
+static inline int
+__page_write_gen_wrapped_check(WT_PAGE *page)
+{
+	if (page->modify->write_gen > UINT32_MAX - WT_MILLION)
+		return (WT_RESTART);
+
+	return (0);
+}
+
+/*
+ * __insert_serial_func --
+ * Worker function to add a WT_INSERT entry to a skiplist.
+ *
+ * For each of the skipdepth levels, ins_stack[i] is the address of the
+ * pointer that will be made to reference new_ins, and new_ins->next[i] must
+ * already hold the successor that pointer is expected to contain. All
+ * levels are validated before any is modified: WT_RESTART is returned with
+ * the list untouched if a level no longer matches, 0 once new_ins has been
+ * linked in at every level.
+ */
+static inline int
+__insert_serial_func(WT_SESSION_IMPL *session,
+ WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT *new_ins,
+ u_int skipdepth)
+{
+ u_int i;
+
+ WT_UNUSED(session);
+
+ /*
+ * Confirm we are still in the expected position, and no item has been
+ * added where our insert belongs. Take extra care at the beginning
+ * and end of the list (at each level): retry if we race there.
+ *
+ * !!!
+ * Note the test for ins_stack[0] == NULL: that's the test for an
+ * uninitialized cursor, ins_stack[0] is cleared as part of
+ * initializing a cursor for a search.
+ */
+ for (i = 0; i < skipdepth; i++) {
+ if (ins_stack[i] == NULL ||
+ *ins_stack[i] != new_ins->next[i])
+ return (WT_RESTART);
+ if (new_ins->next[i] == NULL &&
+ ins_head->tail[i] != NULL &&
+ ins_stack[i] != &ins_head->tail[i]->next[i])
+ return (WT_RESTART);
+ }
+
+ /*
+ * Update the skiplist elements referencing the new WT_INSERT item.
+ * The tail pointer for a level is replaced when that level was empty
+ * or when the insert position is the current tail's next pointer.
+ */
+ for (i = 0; i < skipdepth; i++) {
+ if (ins_head->tail[i] == NULL ||
+ ins_stack[i] == &ins_head->tail[i]->next[i])
+ ins_head->tail[i] = new_ins;
+ *ins_stack[i] = new_ins;
+ }
+
+ return (0);
+}
+
+/*
+ * __col_append_serial_func --
+ * Worker function to allocate a record number as necessary, then add a
+ * WT_INSERT entry to a skiplist.
+ *
+ * A record number of 0 in new_ins requests allocation: the new record
+ * number is btree->last_recno + 1 and ins_stack is rebuilt to address the
+ * end of the list at every level. On success *recnop receives the record
+ * number used and btree->last_recno is raised if that number exceeds it.
+ * An error from __insert_serial_func (for example WT_RESTART) is passed
+ * back through WT_RET before either of those is updated.
+ */
+static inline int
+__col_append_serial_func(WT_SESSION_IMPL *session,
+ WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT *new_ins,
+ uint64_t *recnop, u_int skipdepth)
+{
+ WT_BTREE *btree;
+ uint64_t recno;
+ u_int i;
+
+ btree = S2BT(session);
+
+ /*
+ * If the application didn't specify a record number, allocate a new one
+ * and set up for an append.
+ */
+ if ((recno = WT_INSERT_RECNO(new_ins)) == 0) {
+ recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1;
+ WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL ||
+ recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head)));
+ /* Point each level at the head slot (empty) or the tail's next. */
+ for (i = 0; i < skipdepth; i++)
+ ins_stack[i] = ins_head->tail[i] == NULL ?
+ &ins_head->head[i] : &ins_head->tail[i]->next[i];
+ }
+
+ /* Confirm position and insert the new WT_INSERT item. */
+ WT_RET(__insert_serial_func(
+ session, ins_head, ins_stack, new_ins, skipdepth));
+
+ /*
+ * Set the calling cursor's record number.
+ * If we extended the file, update the last record number.
+ */
+ *recnop = recno;
+ if (recno > btree->last_recno)
+ btree->last_recno = recno;
+
+ return (0);
+}
+
+/*
+ * __update_serial_func --
+ * Worker function to add a WT_UPDATE entry in the page array.
+ *
+ * upd is pushed onto the front of the list at *upd_entry with a
+ * compare-and-swap that expects the list head to still equal upd->next.
+ * Each time the swap fails, upd->next is reloaded from *upd_entry and
+ * re-validated with __wt_txn_update_check; an error from that check is
+ * passed back through WT_RET. Otherwise the function returns 0, whether or
+ * not the obsolete-update scan was performed.
+ */
+static inline int
+__update_serial_func(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_UPDATE **upd_entry, WT_UPDATE *upd)
+{
+ WT_DECL_RET;
+ WT_UPDATE *obsolete;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ /*
+ * Swap the update into place. If that fails, a new update was added
+ * after our search, we raced. Check if our update is still permitted,
+ * and if it is, do a full-barrier to ensure the update's next pointer
+ * is set before we update the linked list and try again.
+ */
+ while (!WT_ATOMIC_CAS8(*upd_entry, upd->next, upd)) {
+ WT_RET(__wt_txn_update_check(session, upd->next = *upd_entry));
+ WT_WRITE_BARRIER();
+ }
+
+ /*
+ * If there are subsequent WT_UPDATE structures, we're evicting pages
+ * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE
+ * structures. Serialization is needed so only one thread does the
+ * obsolete check at a time, and to protect updates from disappearing
+ * under reconciliation.
+ */
+ if (upd->next != NULL &&
+ F_ISSET(S2C(session)->cache, WT_EVICT_ACTIVE)) {
+ F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+ /* If we can't lock it, don't scan, that's okay. */
+ if (ret != 0)
+ return (0);
+ obsolete = __wt_update_obsolete_check(session, upd->next);
+ /* The flag is released before the obsolete chain is freed. */
+ F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
+ if (obsolete != NULL)
+ __wt_update_obsolete_free(session, page, obsolete);
+ }
+ return (0);
+}
+
+/*
+ * DO NOT EDIT: automatically built by dist/serial.py.
+ * Serialization function section: BEGIN
+ */
+
+/*
+ * __wt_col_append_serial --
+ * Take ownership of *new_insp (the caller's pointer is cleared), run
+ * __col_append_serial_func between WT_PAGE_LOCK and WT_PAGE_UNLOCK, then
+ * add new_ins_size to the page's in-memory footprint and mark the page
+ * modified. If the worker fails, the WT_INSERT is freed and the error is
+ * returned. This function is generated: lasting changes belong in
+ * dist/serial.py.
+ */
+static inline int
+__wt_col_append_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head,
+ WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size,
+ uint64_t *recnop, u_int skipdepth)
+{
+ WT_INSERT *new_ins = *new_insp;
+ WT_DECL_RET;
+ size_t incr_mem;
+
+ /* Clear references to memory we now own. */
+ *new_insp = NULL;
+
+ /*
+ * Check to see if the page's write generation is about to wrap (wildly
+ * unlikely as it implies 4B updates between clean page reconciliations,
+ * but technically possible), and fail the update.
+ *
+ * The check is outside of the serialization mutex because the page's
+ * write generation is going to be a hot cache line, so technically it's
+ * possible for the page's write generation to wrap between the test and
+ * our subsequent modification of it. However, the test is (4B-1M), and
+ * there cannot be a million threads that have done the test but not yet
+ * completed their modification.
+ */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
+ /* Acquire the page's spinlock, call the worker function. */
+ WT_PAGE_LOCK(session, page);
+ ret = __col_append_serial_func(
+ session, ins_head, ins_stack, new_ins, recnop, skipdepth);
+ WT_PAGE_UNLOCK(session, page);
+
+ /* Free unused memory on error. */
+ if (ret != 0) {
+ __wt_free(session, new_ins);
+
+ return (ret);
+ }
+
+ /*
+ * Increment in-memory footprint after releasing the mutex: that's safe
+ * because the structures we added cannot be discarded while visible to
+ * any running transaction, and we're a running transaction, which means
+ * there can be no corresponding delete until we complete.
+ */
+ incr_mem = 0;
+ WT_ASSERT(session, new_ins_size != 0);
+ incr_mem += new_ins_size;
+ if (incr_mem != 0)
+ __wt_cache_page_inmem_incr(session, page, incr_mem);
+
+ /* Mark the page dirty after updating the footprint. */
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+/*
+ * __wt_insert_serial --
+ * Take ownership of *new_insp (the caller's pointer is cleared), run
+ * __insert_serial_func between WT_PAGE_LOCK and WT_PAGE_UNLOCK, then add
+ * new_ins_size to the page's in-memory footprint and mark the page
+ * modified. If the worker fails (for example with WT_RESTART), the
+ * WT_INSERT is freed and the error is returned. This function is
+ * generated: lasting changes belong in dist/serial.py.
+ */
+static inline int
+__wt_insert_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head,
+ WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size,
+ u_int skipdepth)
+{
+ WT_INSERT *new_ins = *new_insp;
+ WT_DECL_RET;
+ size_t incr_mem;
+
+ /* Clear references to memory we now own. */
+ *new_insp = NULL;
+
+ /*
+ * Check to see if the page's write generation is about to wrap (wildly
+ * unlikely as it implies 4B updates between clean page reconciliations,
+ * but technically possible), and fail the update.
+ *
+ * The check is outside of the serialization mutex because the page's
+ * write generation is going to be a hot cache line, so technically it's
+ * possible for the page's write generation to wrap between the test and
+ * our subsequent modification of it. However, the test is (4B-1M), and
+ * there cannot be a million threads that have done the test but not yet
+ * completed their modification.
+ */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
+ /* Acquire the page's spinlock, call the worker function. */
+ WT_PAGE_LOCK(session, page);
+ ret = __insert_serial_func(
+ session, ins_head, ins_stack, new_ins, skipdepth);
+ WT_PAGE_UNLOCK(session, page);
+
+ /* Free unused memory on error. */
+ if (ret != 0) {
+ __wt_free(session, new_ins);
+
+ return (ret);
+ }
+
+ /*
+ * Increment in-memory footprint after releasing the mutex: that's safe
+ * because the structures we added cannot be discarded while visible to
+ * any running transaction, and we're a running transaction, which means
+ * there can be no corresponding delete until we complete.
+ */
+ incr_mem = 0;
+ WT_ASSERT(session, new_ins_size != 0);
+ incr_mem += new_ins_size;
+ if (incr_mem != 0)
+ __wt_cache_page_inmem_incr(session, page, incr_mem);
+
+ /* Mark the page dirty after updating the footprint. */
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+/*
+ * __wt_update_serial --
+ * Take ownership of *updp (the caller's pointer is cleared), call
+ * __update_serial_func to link the update in, then add upd_size to the
+ * page's in-memory footprint and mark the page modified. If the worker
+ * fails, the WT_UPDATE is freed and the error is returned. Unlike the
+ * insert variants, no WT_PAGE_LOCK/WT_PAGE_UNLOCK pair appears in this
+ * function. This function is generated: lasting changes belong in
+ * dist/serial.py.
+ */
+static inline int
+__wt_update_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE **srch_upd,
+ WT_UPDATE **updp, size_t upd_size)
+{
+ WT_UPDATE *upd = *updp;
+ WT_DECL_RET;
+ size_t incr_mem;
+
+ /* Clear references to memory we now own. */
+ *updp = NULL;
+
+ /*
+ * Check to see if the page's write generation is about to wrap (wildly
+ * unlikely as it implies 4B updates between clean page reconciliations,
+ * but technically possible), and fail the update.
+ *
+ * The check is outside of the serialization mutex because the page's
+ * write generation is going to be a hot cache line, so technically it's
+ * possible for the page's write generation to wrap between the test and
+ * our subsequent modification of it. However, the test is (4B-1M), and
+ * there cannot be a million threads that have done the test but not yet
+ * completed their modification.
+ */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
+ ret = __update_serial_func(
+ session, page, srch_upd, upd);
+
+ /* Free unused memory on error. */
+ if (ret != 0) {
+ __wt_free(session, upd);
+
+ return (ret);
+ }
+
+ /*
+ * Increment in-memory footprint after releasing the mutex: that's safe
+ * because the structures we added cannot be discarded while visible to
+ * any running transaction, and we're a running transaction, which means
+ * there can be no corresponding delete until we complete.
+ *
+ * NOTE(review): this generated comment mentions a mutex, but this
+ * function does not take the page lock itself -- confirm the wording
+ * in the generator.
+ */
+ incr_mem = 0;
+ WT_ASSERT(session, upd_size != 0);
+ incr_mem += upd_size;
+ if (incr_mem != 0)
+ __wt_cache_page_inmem_incr(session, page, incr_mem);
+
+ /* Mark the page dirty after updating the footprint. */
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+/*
+ * Serialization function section: END
+ * DO NOT EDIT: automatically built by dist/serial.py.
+ */
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
new file mode 100644
index 00000000000..788ffe5eb45
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -0,0 +1,156 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_DATA_HANDLE_CACHE --
+ * Per-session cache of handles to avoid synchronization when opening
+ * cursors. Each entry references one data handle and is chained on a
+ * singly-linked list.
+ */
+struct __wt_data_handle_cache {
+ WT_DATA_HANDLE *dhandle; /* Referenced data handle */
+
+ SLIST_ENTRY(__wt_data_handle_cache) l; /* List linkage */
+};
+
+/*
+ * WT_HAZARD --
+ * A hazard pointer.
+ */
+struct __wt_hazard {
+ WT_PAGE *page; /* Page address */
+#ifdef HAVE_DIAGNOSTIC
+ const char *file; /* File/line where hazard acquired */
+ int line;
+#endif
+};
+
+/*
+ * Get the connection implementation for a session. S2C_SAFE yields NULL
+ * for a NULL session instead of dereferencing it.
+ */
+#define S2C(session) ((WT_CONNECTION_IMPL *)(session)->iface.connection)
+#define S2C_SAFE(session) ((session) == NULL ? NULL : S2C(session))
+
+/*
+ * Get the btree for a session. S2BT_SAFE yields NULL when the session has
+ * no current data handle; it does not check the session pointer itself.
+ */
+#define S2BT(session) ((WT_BTREE *)(session)->dhandle->handle)
+#define S2BT_SAFE(session) ((session)->dhandle == NULL ? NULL : S2BT(session))
+
+/*
+ * WT_SESSION_IMPL --
+ * Implementation of WT_SESSION.
+ *
+ * The public WT_SESSION interface is the first member. Field order is
+ * significant: WT_SESSION_CLEAR_SIZE measures everything up to and
+ * including "flags", and the members declared after "flags" are the ones
+ * documented below as persisting past session close.
+ */
+struct __wt_session_impl {
+ WT_SESSION iface;
+
+ void *lang_private; /* Language specific private storage */
+
+ u_int active; /* Non-zero if the session is in-use */
+
+ const char *name; /* Name */
+ const char *lastop; /* Last operation */
+ uint32_t id; /* UID, offset in session array */
+
+ WT_CONDVAR *cond; /* Condition variable */
+
+ uint32_t rnd[2]; /* Random number generation state */
+
+ WT_EVENT_HANDLER *event_handler;/* Application's event handlers */
+
+ WT_DATA_HANDLE *dhandle; /* Current data handle */
+
+ /* Session handle reference list */
+ SLIST_HEAD(__dhandles, __wt_data_handle_cache) dhandles;
+#define WT_DHANDLE_SWEEP_WAIT 60 /* Wait before discarding */
+#define WT_DHANDLE_SWEEP_PERIOD 20 /* Only sweep every 20 seconds */
+ time_t last_sweep; /* Last sweep for dead handles */
+
+ WT_CURSOR *cursor; /* Current cursor */
+ /* Cursors closed with the session */
+ TAILQ_HEAD(__cursors, __wt_cursor) cursors;
+
+ WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */
+ WT_COMPACT *compact; /* Compact state */
+
+ WT_BTREE *metafile; /* Metadata file */
+ void *meta_track; /* Metadata operation tracking */
+ void *meta_track_next; /* Current position */
+ void *meta_track_sub; /* Child transaction / save point */
+ size_t meta_track_alloc; /* Currently allocated */
+ int meta_track_nest; /* Nesting level of meta transaction */
+ /* True while meta_track_next is set; argument is parenthesized. */
+#define WT_META_TRACKING(session) ((session)->meta_track_next != NULL)
+
+ /* Queue of table handles */
+ TAILQ_HEAD(__tables, __wt_table) tables;
+
+ WT_ITEM **scratch; /* Temporary memory for any function */
+ u_int scratch_alloc; /* Currently allocated */
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * It's hard to figure out from where a buffer was allocated after it's
+ * leaked, so in diagnostic mode we track them; DIAGNOSTIC can't simply
+ * add additional fields to WT_ITEM structures because they are visible
+ * to applications, create a parallel structure instead.
+ */
+ struct __wt_scratch_track {
+ const char *file; /* Allocating file, line */
+ int line;
+ } *scratch_track;
+#endif
+
+ WT_TXN_ISOLATION isolation; /* Isolation level */
+ WT_TXN txn; /* Transaction state */
+ u_int ncursors; /* Count of active file cursors. */
+
+ WT_REF **excl; /* Eviction exclusive list */
+ u_int excl_next; /* Next empty slot */
+ size_t excl_allocated; /* Bytes allocated */
+
+ void *block_manager; /* Block-manager support */
+ int (*block_manager_cleanup)(WT_SESSION_IMPL *);
+
+ WT_DATA_HANDLE **ckpt_handle; /* Checkpoint support */
+ u_int ckpt_handle_next; /* Next empty slot */
+ size_t ckpt_handle_allocated; /* Bytes allocated */
+
+ void *reconcile; /* Reconciliation support */
+ int (*reconcile_cleanup)(WT_SESSION_IMPL *);
+
+ int compaction; /* Compaction did some work */
+
+ /*
+ * The split stash memory and hazard information persist past session
+ * close, because they are accessed by threads of control other than
+ * the thread owning the session. They live at the end of the
+ * structure so it's somewhat easier to clear everything but the fields
+ * that persist.
+ */
+#define WT_SESSION_CLEAR_SIZE(s) \
+ (WT_PTRDIFF(&(s)->flags, s) + sizeof((s)->flags))
+ uint32_t flags;
+
+ /*
+ * Splits can "free" memory that may still be in use, and we use a
+ * split generation number to track it, that is, the session stores a
+ * reference to the memory and allocates a split generation; when no
+ * session is reading from that split generation, the memory can be
+ * freed for real.
+ */
+ struct __wt_split_stash {
+ uint64_t split_gen; /* Split generation */
+ void *p; /* Memory, length */
+ size_t len;
+ } *split_stash; /* Split stash array */
+ size_t split_stash_cnt; /* Array entries */
+ size_t split_stash_alloc; /* Allocated bytes */
+
+ uint64_t split_gen; /* Reading split generation */
+
+ /*
+ * Hazard pointers.
+ * The number of hazard pointers that can be in use grows dynamically.
+ */
+#define WT_HAZARD_INCR 10
+ uint32_t hazard_size; /* Allocated slots in hazard array. */
+ uint32_t nhazard; /* Count of active hazard pointers */
+ WT_HAZARD *hazard; /* Hazard pointer array */
+} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
new file mode 100644
index 00000000000..11f42ac5500
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -0,0 +1,332 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_STATS --
+ * A single statistic: a text description paired with a 64-bit value.
+ * The WT_STAT* macros read and write the "v" member.
+ */
+struct __wt_stats {
+ const char *desc; /* text description */
+ uint64_t v; /* 64-bit value */
+};
+
+/*
+ * Read/write statistics without any test for statistics configuration.
+ */
+#define WT_STAT(stats, fld) \
+ ((stats)->fld.v)
+#define WT_STAT_ATOMIC_DECRV(stats, fld, value) do { \
+ (void)WT_ATOMIC_SUB8(WT_STAT(stats, fld), (value)); \
+} while (0)
+#define WT_STAT_ATOMIC_DECR(stats, fld) WT_STAT_ATOMIC_DECRV(stats, fld, 1)
+#define WT_STAT_ATOMIC_INCRV(stats, fld, value) do { \
+ (void)WT_ATOMIC_ADD8(WT_STAT(stats, fld), (value)); \
+} while (0)
+/* Atomic increment by one; delegates to the "V" form as WT_STAT_ATOMIC_DECR does. */
+#define WT_STAT_ATOMIC_INCR(stats, fld) WT_STAT_ATOMIC_INCRV(stats, fld, 1)
+#define WT_STAT_DECRV(stats, fld, value) do { \
+ (stats)->fld.v -= (value); \
+} while (0)
+#define WT_STAT_DECR(stats, fld) WT_STAT_DECRV(stats, fld, 1)
+#define WT_STAT_INCRV(stats, fld, value) do { \
+ (stats)->fld.v += (value); \
+} while (0)
+#define WT_STAT_INCR(stats, fld) WT_STAT_INCRV(stats, fld, 1)
+#define WT_STAT_SET(stats, fld, value) do { \
+ (stats)->fld.v = (uint64_t)(value); \
+} while (0)
+
+/*
+ * Read/write statistics if "fast" statistics are configured.
+ */
+#define WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_ATOMIC_DECRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_ATOMIC_DECR(session, stats, fld) \
+ WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, 1)
+#define WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_ATOMIC_INCRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_ATOMIC_INCR(session, stats, fld) \
+ WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, 1)
+#define WT_STAT_FAST_DECRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_DECRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_DECR(session, stats, fld) \
+ WT_STAT_FAST_DECRV(session, stats, fld, 1)
+#define WT_STAT_FAST_INCRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_INCRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_INCR(session, stats, fld) \
+ WT_STAT_FAST_INCRV(session, stats, fld, 1)
+#define WT_STAT_FAST_SET(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_SET(stats, fld, value); \
+} while (0)
+
+/*
+ * Read/write connection handle statistics if "fast" statistics are configured.
+ */
+#define WT_STAT_FAST_CONN_ATOMIC_DECRV(session, fld, value) \
+ WT_STAT_FAST_ATOMIC_DECRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_ATOMIC_DECR(session, fld) \
+ WT_STAT_FAST_ATOMIC_DECR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_ATOMIC_INCRV(session, fld, value) \
+ WT_STAT_FAST_ATOMIC_INCRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_ATOMIC_INCR(session, fld) \
+ WT_STAT_FAST_ATOMIC_INCR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_DECR(session, fld) \
+ WT_STAT_FAST_DECR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_DECRV(session, fld, value) \
+ WT_STAT_FAST_DECRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_INCR(session, fld) \
+ WT_STAT_FAST_INCR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_INCRV(session, fld, value) \
+ WT_STAT_FAST_INCRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_SET(session, fld, value) \
+ WT_STAT_FAST_SET(session, &S2C(session)->stats, fld, value)
+
+/*
+ * Read/write data-source handle statistics if the data-source handle is set
+ * and "fast" statistics are configured.
+ *
+ * XXX
+ * We shouldn't have to check if the data-source handle is NULL, but it's
+ * useful until everything is converted to using data-source handles.
+ */
+#define WT_STAT_FAST_DATA_DECRV(session, fld, value) do { \
+ if ((session)->dhandle != NULL) \
+ WT_STAT_FAST_DECRV( \
+ session, &(session)->dhandle->stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_DATA_DECR(session, fld) \
+ WT_STAT_FAST_DATA_DECRV(session, fld, 1)
+#define WT_STAT_FAST_DATA_INCRV(session, fld, value) do { \
+ if ((session)->dhandle != NULL) \
+ WT_STAT_FAST_INCRV( \
+ session, &(session)->dhandle->stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_DATA_INCR(session, fld) \
+ WT_STAT_FAST_DATA_INCRV(session, fld, 1)
+#define WT_STAT_FAST_DATA_SET(session, fld, value) do { \
+ if ((session)->dhandle != NULL) \
+ WT_STAT_FAST_SET( \
+ session, &(session)->dhandle->stats, fld, value); \
+} while (0)
+
+/*
+ * DO NOT EDIT: automatically built by dist/stat.py.
+ */
+/* Statistics section: BEGIN */
+
+/*
+ * Statistics entries for connections.
+ */
+#define WT_CONNECTION_STATS_BASE 1000
+struct __wt_connection_stats {
+ WT_STATS async_alloc_race;
+ WT_STATS async_alloc_view;
+ WT_STATS async_cur_queue;
+ WT_STATS async_flush;
+ WT_STATS async_full;
+ WT_STATS async_max_queue;
+ WT_STATS async_nowork;
+ WT_STATS async_op_alloc;
+ WT_STATS async_op_compact;
+ WT_STATS async_op_insert;
+ WT_STATS async_op_remove;
+ WT_STATS async_op_search;
+ WT_STATS async_op_update;
+ WT_STATS block_byte_map_read;
+ WT_STATS block_byte_read;
+ WT_STATS block_byte_write;
+ WT_STATS block_map_read;
+ WT_STATS block_preload;
+ WT_STATS block_read;
+ WT_STATS block_write;
+ WT_STATS cache_bytes_dirty;
+ WT_STATS cache_bytes_inuse;
+ WT_STATS cache_bytes_max;
+ WT_STATS cache_bytes_read;
+ WT_STATS cache_bytes_write;
+ WT_STATS cache_eviction_checkpoint;
+ WT_STATS cache_eviction_clean;
+ WT_STATS cache_eviction_deepen;
+ WT_STATS cache_eviction_dirty;
+ WT_STATS cache_eviction_fail;
+ WT_STATS cache_eviction_force;
+ WT_STATS cache_eviction_force_fail;
+ WT_STATS cache_eviction_hazard;
+ WT_STATS cache_eviction_internal;
+ WT_STATS cache_eviction_queue_empty;
+ WT_STATS cache_eviction_queue_not_empty;
+ WT_STATS cache_eviction_server_evicting;
+ WT_STATS cache_eviction_server_not_evicting;
+ WT_STATS cache_eviction_slow;
+ WT_STATS cache_eviction_split;
+ WT_STATS cache_eviction_walk;
+ WT_STATS cache_pages_dirty;
+ WT_STATS cache_pages_inuse;
+ WT_STATS cache_read;
+ WT_STATS cache_write;
+ WT_STATS cond_wait;
+ WT_STATS cursor_create;
+ WT_STATS cursor_insert;
+ WT_STATS cursor_next;
+ WT_STATS cursor_prev;
+ WT_STATS cursor_remove;
+ WT_STATS cursor_reset;
+ WT_STATS cursor_search;
+ WT_STATS cursor_search_near;
+ WT_STATS cursor_update;
+ WT_STATS dh_session_handles;
+ WT_STATS dh_session_sweeps;
+ WT_STATS file_open;
+ WT_STATS log_buffer_grow;
+ WT_STATS log_buffer_size;
+ WT_STATS log_bytes_user;
+ WT_STATS log_bytes_written;
+ WT_STATS log_close_yields;
+ WT_STATS log_max_filesize;
+ WT_STATS log_reads;
+ WT_STATS log_scan_records;
+ WT_STATS log_scan_rereads;
+ WT_STATS log_scans;
+ WT_STATS log_slot_closes;
+ WT_STATS log_slot_consolidated;
+ WT_STATS log_slot_joins;
+ WT_STATS log_slot_races;
+ WT_STATS log_slot_switch_fails;
+ WT_STATS log_slot_toobig;
+ WT_STATS log_slot_toosmall;
+ WT_STATS log_slot_transitions;
+ WT_STATS log_sync;
+ WT_STATS log_writes;
+ WT_STATS lsm_checkpoint_throttle;
+ WT_STATS lsm_merge_throttle;
+ WT_STATS lsm_rows_merged;
+ WT_STATS lsm_work_queue_app;
+ WT_STATS lsm_work_queue_manager;
+ WT_STATS lsm_work_queue_max;
+ WT_STATS lsm_work_queue_switch;
+ WT_STATS lsm_work_units_created;
+ WT_STATS lsm_work_units_discarded;
+ WT_STATS lsm_work_units_done;
+ WT_STATS memory_allocation;
+ WT_STATS memory_free;
+ WT_STATS memory_grow;
+ WT_STATS read_io;
+ WT_STATS rec_pages;
+ WT_STATS rec_pages_eviction;
+ WT_STATS rec_split_stashed_bytes;
+ WT_STATS rec_split_stashed_objects;
+ WT_STATS rwlock_read;
+ WT_STATS rwlock_write;
+ WT_STATS session_cursor_open;
+ WT_STATS session_open;
+ WT_STATS txn_begin;
+ WT_STATS txn_checkpoint;
+ WT_STATS txn_checkpoint_running;
+ WT_STATS txn_commit;
+ WT_STATS txn_fail_cache;
+ WT_STATS txn_pinned_range;
+ WT_STATS txn_rollback;
+ WT_STATS write_io;
+};
+
+/*
+ * Statistics entries for data sources.
+ */
+#define WT_DSRC_STATS_BASE 2000
+struct __wt_dsrc_stats {
+ WT_STATS allocation_size;
+ WT_STATS block_alloc;
+ WT_STATS block_checkpoint_size;
+ WT_STATS block_extension;
+ WT_STATS block_free;
+ WT_STATS block_magic;
+ WT_STATS block_major;
+ WT_STATS block_minor;
+ WT_STATS block_reuse_bytes;
+ WT_STATS block_size;
+ WT_STATS bloom_count;
+ WT_STATS bloom_false_positive;
+ WT_STATS bloom_hit;
+ WT_STATS bloom_miss;
+ WT_STATS bloom_page_evict;
+ WT_STATS bloom_page_read;
+ WT_STATS bloom_size;
+ WT_STATS btree_column_deleted;
+ WT_STATS btree_column_fix;
+ WT_STATS btree_column_internal;
+ WT_STATS btree_column_variable;
+ WT_STATS btree_compact_rewrite;
+ WT_STATS btree_entries;
+ WT_STATS btree_fixed_len;
+ WT_STATS btree_maximum_depth;
+ WT_STATS btree_maxintlitem;
+ WT_STATS btree_maxintlpage;
+ WT_STATS btree_maxleafitem;
+ WT_STATS btree_maxleafpage;
+ WT_STATS btree_overflow;
+ WT_STATS btree_row_internal;
+ WT_STATS btree_row_leaf;
+ WT_STATS cache_bytes_read;
+ WT_STATS cache_bytes_write;
+ WT_STATS cache_eviction_checkpoint;
+ WT_STATS cache_eviction_clean;
+ WT_STATS cache_eviction_dirty;
+ WT_STATS cache_eviction_fail;
+ WT_STATS cache_eviction_hazard;
+ WT_STATS cache_eviction_internal;
+ WT_STATS cache_overflow_value;
+ WT_STATS cache_read;
+ WT_STATS cache_read_overflow;
+ WT_STATS cache_write;
+ WT_STATS compress_raw_fail;
+ WT_STATS compress_raw_fail_temporary;
+ WT_STATS compress_raw_ok;
+ WT_STATS compress_read;
+ WT_STATS compress_write;
+ WT_STATS compress_write_fail;
+ WT_STATS compress_write_too_small;
+ WT_STATS cursor_create;
+ WT_STATS cursor_insert;
+ WT_STATS cursor_insert_bulk;
+ WT_STATS cursor_insert_bytes;
+ WT_STATS cursor_next;
+ WT_STATS cursor_prev;
+ WT_STATS cursor_remove;
+ WT_STATS cursor_remove_bytes;
+ WT_STATS cursor_reset;
+ WT_STATS cursor_search;
+ WT_STATS cursor_search_near;
+ WT_STATS cursor_update;
+ WT_STATS cursor_update_bytes;
+ WT_STATS lsm_checkpoint_throttle;
+ WT_STATS lsm_chunk_count;
+ WT_STATS lsm_generation_max;
+ WT_STATS lsm_lookup_no_bloom;
+ WT_STATS lsm_merge_throttle;
+ WT_STATS rec_dictionary;
+ WT_STATS rec_multiblock_internal;
+ WT_STATS rec_multiblock_leaf;
+ WT_STATS rec_multiblock_max;
+ WT_STATS rec_overflow_key_internal;
+ WT_STATS rec_overflow_key_leaf;
+ WT_STATS rec_overflow_value;
+ WT_STATS rec_page_delete;
+ WT_STATS rec_page_match;
+ WT_STATS rec_pages;
+ WT_STATS rec_pages_eviction;
+ WT_STATS rec_prefix_compression;
+ WT_STATS rec_suffix_compression;
+ WT_STATS session_compact;
+ WT_STATS session_cursor_open;
+ WT_STATS txn_update_conflict;
+};
+
+/* Statistics section: END */
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
new file mode 100644
index 00000000000..c28a9231750
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -0,0 +1,139 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_TXN_NONE 0 /* No txn running in a session. */
+#define WT_TXN_ABORTED UINT64_MAX /* Update rolled back, ignore. */
+
+/*
+ * Transaction ID comparison dealing with edge cases.
+ *
+ * WT_TXN_ABORTED is the largest possible ID (never visible to a running
+ * transaction), WT_TXN_NONE is smaller than any possible ID (visible to all
+ * running transactions).
+ *
+ * TXNID_LT names each argument directly and again through TXNID_LE, so its
+ * arguments are evaluated more than once: pass expressions that are free of
+ * side effects.
+ */
+#define TXNID_LE(t1, t2) \
+ ((t1) <= (t2))
+
+#define TXNID_LT(t1, t2) \
+ ((t1) != (t2) && TXNID_LE(t1, t2))
+
+/* Address of the given session's entry in the global states array. */
+#define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id])
+
+/*
+ * One entry of the txn_global.states array, indexed by session ID.
+ */
+struct __wt_txn_state {
+ /* Set to the session's transaction ID when one is allocated. */
+ volatile uint64_t id;
+ /* ID published in the global table on behalf of a reading session. */
+ volatile uint64_t snap_min;
+} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+/*
+ * Transaction state shared by all sessions; reached through the connection's
+ * txn_global field.
+ */
+struct __wt_txn_global {
+ volatile uint64_t current; /* Current transaction ID. */
+
+ /* The oldest running transaction ID (may race). */
+ uint64_t last_running;
+
+ /*
+ * The oldest transaction ID that is not yet visible to some
+ * transaction in the system.
+ */
+ volatile uint64_t oldest_id;
+
+ /* The oldest session found in the last scan. */
+ uint32_t oldest_session;
+
+ /* Count of scanning threads, or -1 for exclusive access. */
+ volatile int32_t scan_count;
+
+ /* Indexed by session ID. */
+ WT_TXN_STATE *states; /* Per-session transaction states */
+};
+
+/*
+ * Isolation levels, stored in the isolation field of the per-session
+ * transaction and consulted by the visibility checks.
+ */
+typedef enum __wt_txn_isolation {
+ TXN_ISO_EVICTION, /* Internal: eviction context */
+ TXN_ISO_READ_UNCOMMITTED, /* Sees all other changes */
+ TXN_ISO_READ_COMMITTED,
+ TXN_ISO_SNAPSHOT /* Only level checked for update conflicts */
+} WT_TXN_ISOLATION;
+
+/*
+ * WT_TXN_OP --
+ * A transactional operation. Each transaction builds an in-memory array
+ * of these operations as it runs, then uses the array to either write log
+ * records during commit or undo the operations during rollback.
+ */
+struct __wt_txn_op {
+ /* Copied from S2BT(session)->id when the operation is allocated. */
+ uint32_t fileid;
+ /* Selects which member of the union below is in use. */
+ enum {
+ TXN_OP_BASIC,
+ TXN_OP_INMEM,
+ TXN_OP_REF,
+ TXN_OP_TRUNCATE_COL,
+ TXN_OP_TRUNCATE_ROW
+ } type;
+ union {
+ /* TXN_OP_BASIC, TXN_OP_INMEM */
+ WT_UPDATE *upd;
+ /* TXN_OP_REF */
+ WT_REF *ref;
+ /* TXN_OP_TRUNCATE_COL */
+ struct {
+ uint64_t start, stop;
+ } truncate_col;
+ /* TXN_OP_TRUNCATE_ROW */
+ struct {
+ WT_ITEM start, stop;
+ enum {
+ TXN_TRUNC_ALL,
+ TXN_TRUNC_BOTH,
+ TXN_TRUNC_START,
+ TXN_TRUNC_STOP
+ } mode;
+ } truncate_row;
+ } u;
+};
+
+/*
+ * WT_TXN --
+ * Per-session transaction context.
+ */
+struct __wt_txn {
+ /* This transaction's ID; WT_TXN_NONE is treated as "no ID". */
+ uint64_t id;
+
+ WT_TXN_ISOLATION isolation;
+
+ /*
+ * Snapshot data:
+ * ids < snap_min are visible,
+ * ids >= snap_max are invisible,
+ * everything else is visible unless it is in the snapshot.
+ */
+ uint64_t snap_min, snap_max;
+ /* Searched with bsearch, snapshot_count entries. */
+ uint64_t *snapshot;
+ uint32_t snapshot_count;
+ uint32_t txn_logsync; /* Log sync configuration */
+
+ /* Array of modifications by this transaction. */
+ WT_TXN_OP *mod;
+ size_t mod_alloc;
+ /* Number of entries of mod currently in use. */
+ u_int mod_count;
+
+ /* Scratch buffer for in-memory log records. */
+ WT_ITEM *logrec;
+
+ /* Requested notification when transactions are resolved. */
+ WT_TXN_NOTIFY *notify;
+
+ /* Checkpoint status. */
+ WT_LSN ckpt_lsn;
+ int full_ckpt;
+ uint32_t ckpt_nsnapshot;
+ WT_ITEM *ckpt_snapshot;
+
+ /* Bit values for the flags field. */
+#define TXN_AUTOCOMMIT 0x01
+#define TXN_ERROR 0x02
+#define TXN_HAS_ID 0x04
+#define TXN_HAS_SNAPSHOT 0x08
+#define TXN_RUNNING 0x10
+ uint32_t flags;
+};
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
new file mode 100644
index 00000000000..127176c67ea
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -0,0 +1,382 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+static inline int __wt_txn_id_check(WT_SESSION_IMPL *session);
+static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);
+
+/*
+ * __txn_next_op --
+ *	Return the next entry in the current transaction's modification array.
+ *	A transaction ID is allocated first if there isn't one yet, room is made
+ *	for one more entry, and the entry is cleared and tagged with the current
+ *	btree's file ID. On error, *opp is left NULL.
+ */
+static inline int
+__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
+{
+	WT_TXN *txn;
+	WT_TXN_OP *slot;
+
+	*opp = NULL;
+	txn = &session->txn;
+
+	/* An update is about to happen: the transaction needs an ID. */
+	WT_RET(__wt_txn_id_check(session));
+	WT_ASSERT(session, F_ISSET(txn, TXN_HAS_ID));
+
+	/* Make room for one more operation. */
+	WT_RET(__wt_realloc_def(session,
+	    &txn->mod_alloc, txn->mod_count + 1, &txn->mod));
+
+	slot = &txn->mod[txn->mod_count];
+	++txn->mod_count;
+	WT_CLEAR(*slot);
+	slot->fileid = S2BT(session)->id;
+
+	*opp = slot;
+	return (0);
+}
+
+/*
+ * __wt_txn_unmodify --
+ *	Drop the most recently recorded entry from the transaction's
+ *	modification array. Needed when threads race making updates and the
+ *	last referenced WT_UPDATE is discarded while the transaction is still
+ *	active.
+ */
+static inline void
+__wt_txn_unmodify(WT_SESSION_IMPL *session)
+{
+	WT_TXN *current;
+
+	current = &session->txn;
+
+	/* Entries are only recorded once the transaction has an ID. */
+	if (!F_ISSET(current, TXN_HAS_ID))
+		return;
+
+	WT_ASSERT(session, current->mod_count > 0);
+	--current->mod_count;
+}
+
+/*
+ * __wt_txn_modify --
+ *	Record a WT_UPDATE as modified by the current transaction and stamp it
+ *	with the transaction's ID.
+ */
+static inline int
+__wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+	WT_DECL_RET;
+	WT_TXN_OP *slot;
+
+	WT_RET(__txn_next_op(session, &slot));
+
+	/* The operation type depends on the session's in-memory logging flag. */
+	if (F_ISSET(session, WT_SESSION_LOGGING_INMEM))
+		slot->type = TXN_OP_INMEM;
+	else
+		slot->type = TXN_OP_BASIC;
+	slot->u.upd = upd;
+
+	upd->txnid = session->txn.id;
+
+	/* ret is not assigned anywhere after its declaration above. */
+	return (ret);
+}
+
+/*
+ * __wt_txn_modify_ref --
+ *	Record a TXN_OP_REF operation for a WT_REF changed by the current
+ *	transaction, then hand off to __wt_txn_log_op.
+ */
+static inline int
+__wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+	WT_TXN_OP *ref_op;
+
+	WT_RET(__txn_next_op(session, &ref_op));
+
+	ref_op->u.ref = ref;
+	ref_op->type = TXN_OP_REF;
+
+	return (__wt_txn_log_op(session, NULL));
+}
+
+/*
+ * __wt_txn_visible_all --
+ * Check if a given transaction ID is "globally visible". That is, if
+ * all sessions in the system will see the transaction ID.
+ *
+ * Returns non-zero if the ID is strictly less than the global oldest_id.
+ */
+static inline int
+__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id)
+{
+ uint64_t oldest_id;
+
+ /*
+ * Copy the volatile global value into a local first: TXNID_LT evaluates
+ * its arguments more than once.
+ */
+ oldest_id = S2C(session)->txn_global.oldest_id;
+ return (TXNID_LT(id, oldest_id));
+}
+
+/*
+ * __wt_txn_visible --
+ * Can the current transaction see the given ID?
+ *
+ * Returns non-zero if the ID is visible to the session's transaction,
+ * zero otherwise. The checks below are order-dependent.
+ */
+static inline int
+__wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ /*
+ * Eviction only sees globally visible updates, or if there is a
+ * checkpoint transaction running, use its transaction.
+ */
+ if (txn->isolation == TXN_ISO_EVICTION)
+ return (__wt_txn_visible_all(session, id));
+
+ /* Nobody sees the results of aborted transactions. */
+ if (id == WT_TXN_ABORTED)
+ return (0);
+
+ /* Changes with no associated transaction are always visible. */
+ if (id == WT_TXN_NONE)
+ return (1);
+
+ /*
+ * Read-uncommitted transactions see all other changes.
+ *
+ * All metadata reads are at read-uncommitted isolation. That's
+ * because once a schema-level operation completes, subsequent
+ * operations must see the current version of checkpoint metadata, or
+ * they may try to read blocks that may have been freed from a file.
+ * Metadata updates use non-transactional techniques (such as the
+ * schema and metadata locks) to protect access to in-flight updates.
+ */
+ if (txn->isolation == TXN_ISO_READ_UNCOMMITTED ||
+ S2BT_SAFE(session) == session->metafile)
+ return (1);
+
+ /* Transactions see their own changes. */
+ if (id == txn->id)
+ return (1);
+
+ /*
+ * TXN_ISO_SNAPSHOT, TXN_ISO_READ_COMMITTED: the ID is visible if it is
+ * not the result of a concurrent transaction, that is, if it was
+ * committed before the snapshot was taken.
+ *
+ * The order here is important: anything newer than the maximum ID we
+ * saw when taking the snapshot should be invisible, even if the
+ * snapshot is empty.
+ */
+ if (TXNID_LE(txn->snap_max, id))
+ return (0);
+ if (txn->snapshot_count == 0 || TXNID_LT(id, txn->snap_min))
+ return (1);
+
+ /* Otherwise the ID is visible only if it is not in the snapshot. */
+ return (bsearch(&id, txn->snapshot, txn->snapshot_count,
+ sizeof(uint64_t), __wt_txnid_cmp) == NULL);
+}
+
+/*
+ * __wt_txn_read --
+ *	Walk an update list and return the first entry whose transaction ID is
+ *	visible to the session, or NULL if the list has no visible entry.
+ */
+static inline WT_UPDATE *
+__wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+	for (; upd != NULL; upd = upd->next)
+		if (__wt_txn_visible(session, upd->txnid))
+			break;
+
+	return (upd);
+}
+
+/*
+ * __wt_txn_autocommit_check --
+ *	Begin a transaction if the session's transaction is flagged as needing
+ *	an auto-commit one; the flag is cleared before the transaction begins.
+ */
+static inline int
+__wt_txn_autocommit_check(WT_SESSION_IMPL *session)
+{
+	WT_TXN *txn;
+
+	txn = &session->txn;
+
+	/* Nothing to do unless the auto-commit flag is set. */
+	if (!F_ISSET(txn, TXN_AUTOCOMMIT))
+		return (0);
+
+	F_CLR(txn, TXN_AUTOCOMMIT);
+	return (__wt_txn_begin(session, NULL));
+}
+
+/*
+ * __wt_txn_new_id --
+ *	Allocate a new transaction ID from the global counter.
+ */
+static inline uint64_t
+__wt_txn_new_id(WT_SESSION_IMPL *session)
+{
+	uint64_t incremented;
+
+	/*
+	 * The global counter has to stay ahead of every ID handed out so each
+	 * allocated ID eventually becomes globally visible: with no running
+	 * transactions, oldest_id catches up to the global current ID. That
+	 * calls for post-increment semantics, but the atomic add primitive
+	 * returns the incremented value, so step back by one.
+	 */
+	incremented = WT_ATOMIC_ADD8(S2C(session)->txn_global.current, 1);
+	return (incremented - 1);
+}
+
+/*
+ * __wt_txn_id_check --
+ * A transaction is going to do an update: allocate a transaction ID if
+ * it doesn't already have one. The transaction must already be running
+ * (asserted below).
+ *
+ * Returns 0 on success, or ENOMEM if the allocated ID is WT_TXN_ABORTED.
+ */
+static inline int
+__wt_txn_id_check(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+
+ WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
+ if (!F_ISSET(txn, TXN_HAS_ID)) {
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
+
+ /*
+ * Allocate a transaction ID.
+ *
+ * We use an atomic compare and swap to ensure that we get a
+ * unique ID that is published before the global counter is
+ * updated.
+ *
+ * If two threads race to allocate an ID, only the latest ID
+ * will proceed. The winning thread can be sure its snapshot
+ * contains all of the earlier active IDs. Threads that race
+ * and get an earlier ID may not appear in the snapshot, but
+ * they will loop and allocate a new ID before proceeding to
+ * make any updates.
+ *
+ * This potentially wastes transaction IDs when threads race to
+ * begin transactions: that is the price we pay to keep this
+ * path latch free.
+ */
+ do {
+ /* Publish the candidate ID in the session's global slot. */
+ txn_state->id = txn->id = txn_global->current;
+ } while (!WT_ATOMIC_CAS8(
+ txn_global->current, txn->id, txn->id + 1));
+
+ /*
+ * If we have used 64-bits of transaction IDs, there is nothing
+ * more we can do.
+ *
+ * NOTE(review): on this error path the ID has already been stored
+ * in txn_state->id and TXN_HAS_ID is not set -- confirm callers
+ * treat this as fatal.
+ */
+ if (txn->id == WT_TXN_ABORTED)
+ WT_RET_MSG(session, ENOMEM, "Out of transaction IDs");
+ F_SET(txn, TXN_HAS_ID);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_txn_update_check --
+ *	Decide whether the current transaction may update an item. Only
+ *	snapshot isolation is checked: walking the update list from the head,
+ *	any entry that is not visible and was not aborted is a conflict, counted
+ *	in the txn_update_conflict statistic and reported as WT_ROLLBACK. The
+ *	walk stops at the first visible entry.
+ */
+static inline int
+__wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+	/* Other isolation levels never report a conflict here. */
+	if (session->txn.isolation != TXN_ISO_SNAPSHOT)
+		return (0);
+
+	for (; upd != NULL &&
+	    !__wt_txn_visible(session, upd->txnid); upd = upd->next) {
+		/* Aborted updates are skipped, not treated as conflicts. */
+		if (upd->txnid == WT_TXN_ABORTED)
+			continue;
+
+		WT_STAT_FAST_DATA_INCR(session, txn_update_conflict);
+		return (WT_ROLLBACK);
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_txn_read_last --
+ *	Hook invoked when a session releases its last page: release the
+ *	snapshot unless a snapshot-isolation transaction is running.
+ */
+static inline void
+__wt_txn_read_last(WT_SESSION_IMPL *session)
+{
+	WT_TXN *txn;
+
+	txn = &session->txn;
+
+	/* A running snapshot-isolation transaction keeps its snapshot. */
+	if (F_ISSET(txn, TXN_RUNNING) && txn->isolation == TXN_ISO_SNAPSHOT)
+		return;
+
+	/* Release the snap_min ID we put in the global table. */
+	__wt_txn_release_snapshot(session);
+}
+
+/*
+ * __wt_txn_cursor_op --
+ * Called for each cursor operation.
+ *
+ * Read-uncommitted sessions without a transaction ID publish a snap_min
+ * in the global table; every other isolation level calls
+ * __wt_txn_refresh if the transaction has no snapshot yet.
+ */
+static inline void
+__wt_txn_cursor_op(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ /*
+ * If there is no transaction running (so we don't have an ID), and no
+ * snapshot allocated, put an ID in the global table to prevent any
+ * update that we are reading from being trimmed to save memory. Do a
+ * read before the write because this shared data is accessed a lot.
+ *
+ * !!!
+ * Note: We are updating the global table unprotected, so the
+ * oldest_id may move past this ID if a scan races with this
+ * value being published. That said, read-uncommitted operations
+ * always take the most recent version of a value, so for that version
+ * to be freed, two newer versions would have to be committed. Putting
+ * this snap_min ID in the table prevents the oldest ID from moving
+ * further forward, so that once a read-uncommitted cursor is
+ * positioned on a value, it can't be freed.
+ */
+ if (txn->isolation == TXN_ISO_READ_UNCOMMITTED &&
+ !F_ISSET(txn, TXN_HAS_ID) &&
+ TXNID_LT(txn_state->snap_min, txn_global->last_running))
+ txn_state->snap_min = txn_global->last_running;
+
+ /* Presumably obtains a snapshot for the transaction -- TODO confirm. */
+ if (txn->isolation != TXN_ISO_READ_UNCOMMITTED &&
+ !F_ISSET(txn, TXN_HAS_SNAPSHOT))
+ __wt_txn_refresh(session, 1);
+}
+
+/*
+ * __wt_txn_am_oldest --
+ *	Report whether the session's transaction is the oldest in the system:
+ *	returns 1 if no other session has published a smaller transaction ID,
+ *	0 otherwise (including when this session has no transaction ID).
+ */
+static inline int
+__wt_txn_am_oldest(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_TXN *txn;
+	WT_TXN_GLOBAL *txn_global;
+	uint64_t id;
+	uint32_t i, session_cnt;
+
+	txn = &session->txn;
+	conn = S2C(session);
+	txn_global = &conn->txn_global;
+
+	/* Without an ID of our own we cannot be the oldest. */
+	if (txn->id == WT_TXN_NONE)
+		return (0);
+
+	WT_ORDERED_READ(session_cnt, conn->session_cnt);
+	for (i = 0; i < session_cnt; i++) {
+		/*
+		 * Read the volatile ID once into a local: TXNID_LT evaluates
+		 * its arguments more than once.
+		 */
+		id = txn_global->states[i].id;
+		if (id == WT_TXN_NONE)
+			continue;
+		if (TXNID_LT(id, txn->id))
+			return (0);
+	}
+
+	return (1);
+}
diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h
new file mode 100644
index 00000000000..5f05db11c4b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/verify_build.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#undef ALIGN_CHECK
+#undef SIZE_CHECK
+
+/*
+ * NOTE: If you see a compile failure in this file, your compiler is laying out
+ * structs in memory in a way WiredTiger does not expect. Please refer to the
+ * build instructions in the documentation (docs/html/install.html) for more
+ * information.
+ */
+
+/*
+ * Compile time assertions.
+ *
+ * If the argument to WT_STATIC_ASSERT is zero, the macro evaluates to:
+ *
+ * (void)sizeof(char[-1])
+ *
+ * which fails to compile (which is what we want, the assertion failed).
+ * If the value of the argument to WT_STATIC_ASSERT is non-zero, then the
+ * macro evaluates to:
+ *
+ * (void)sizeof(char[1]);
+ *
+ * which compiles with no warnings, and produces no code.
+ *
+ * For more details about why this works, see
+ * http://scaryreasoner.wordpress.com/2009/02/28/
+ */
+#define WT_STATIC_ASSERT(cond) (void)sizeof(char[1 - 2 * !(cond)])
+
+/*
+ * SIZE_CHECK declares a char array whose length is 1 when sizeof(type)
+ * equals e and -1 (a compile error) otherwise; the (void) cast references
+ * the otherwise unused array.
+ */
+#define SIZE_CHECK(type, e) do { \
+ char __check_##type[1 - 2 * !(sizeof(type) == (e))]; \
+ (void)__check_##type; \
+} while (0)
+
+/*
+ * ALIGN_CHECK asserts that WT_ALIGN(sizeof(type), a) leaves sizeof(type)
+ * unchanged (presumably, that the size is already a multiple of a --
+ * confirm against WT_ALIGN).
+ */
+#define ALIGN_CHECK(type, a) \
+ WT_STATIC_ASSERT(WT_ALIGN(sizeof(type), (a)) == sizeof(type))
+
+/*
+ * __wt_verify_build --
+ *	A home for build-time checks. Nothing calls this function; it only has
+ *	to compile.
+ */
+static inline void
+__wt_verify_build(void)
+{
+	/*
+	 * A wt_off_t has to fit into an 8B chunk, the largest integral value
+	 * an address cookie can encode. WiredTiger has never been tested on a
+	 * system with 4B file offsets, so those are disallowed for now.
+	 */
+	WT_STATIC_ASSERT(sizeof(wt_off_t) == 8);
+
+	/*
+	 * The btree code encodes key/value pairs in size_t's and needs them
+	 * to be at least 8B.
+	 */
+	WT_STATIC_ASSERT(sizeof(size_t) >= 8);
+
+	/* These structures must have exactly their expected sizes. */
+	SIZE_CHECK(WT_REF, WT_REF_SIZE);
+	SIZE_CHECK(WT_BLOCK_DESC, WT_BLOCK_DESC_SIZE);
+}
+
+#undef ALIGN_CHECK
+#undef SIZE_CHECK
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
new file mode 100644
index 00000000000..09cbca89f17
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -0,0 +1,3463 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#ifndef __WIREDTIGER_H_
+#define __WIREDTIGER_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************
+ * Version information
+ *******************************************/
+#define WIREDTIGER_VERSION_MAJOR @VERSION_MAJOR@
+#define WIREDTIGER_VERSION_MINOR @VERSION_MINOR@
+#define WIREDTIGER_VERSION_PATCH @VERSION_PATCH@
+#define WIREDTIGER_VERSION_STRING @VERSION_STRING@
+
+/*******************************************
+ * Required includes
+ *******************************************/
+@wiredtiger_includes_decl@
+
+/*******************************************
+ * Portable type names
+ *******************************************/
+@off_t_decl@
+@uintmax_t_decl@
+@uintptr_t_decl@
+
+#if defined(DOXYGEN) || defined(SWIG)
+#define __F(func) func
+#else
+#define __F(func) (*func)
+#endif
+
+#ifdef SWIG
+%{
+#include <wiredtiger.h>
+%}
+#endif
+
+/*!
+ * @defgroup wt WiredTiger API
+ * The functions, handles and methods applications use to access and manage
+ * data with WiredTiger.
+ *
+ * @{
+ */
+
+/*******************************************
+ * Public forward structure declarations
+ *******************************************/
+struct __wt_async_callback;
+ typedef struct __wt_async_callback WT_ASYNC_CALLBACK;
+struct __wt_async_op; typedef struct __wt_async_op WT_ASYNC_OP;
+struct __wt_collator; typedef struct __wt_collator WT_COLLATOR;
+struct __wt_compressor; typedef struct __wt_compressor WT_COMPRESSOR;
+struct __wt_config_item; typedef struct __wt_config_item WT_CONFIG_ITEM;
+struct __wt_config_parser;
+ typedef struct __wt_config_parser WT_CONFIG_PARSER;
+struct __wt_connection; typedef struct __wt_connection WT_CONNECTION;
+struct __wt_cursor; typedef struct __wt_cursor WT_CURSOR;
+struct __wt_data_source; typedef struct __wt_data_source WT_DATA_SOURCE;
+struct __wt_event_handler; typedef struct __wt_event_handler WT_EVENT_HANDLER;
+struct __wt_extension_api; typedef struct __wt_extension_api WT_EXTENSION_API;
+struct __wt_extractor; typedef struct __wt_extractor WT_EXTRACTOR;
+struct __wt_item; typedef struct __wt_item WT_ITEM;
+struct __wt_lsn; typedef struct __wt_lsn WT_LSN;
+struct __wt_session; typedef struct __wt_session WT_SESSION;
+
+#if defined(SWIGJAVA)
+#define WT_HANDLE_NULLABLE(typename) typename##_NULLABLE
+#define WT_HANDLE_CLOSED(typename) typename##_CLOSED
+typedef WT_CURSOR WT_CURSOR_NULLABLE;
+typedef WT_CURSOR WT_CURSOR_CLOSED;
+typedef WT_SESSION WT_SESSION_CLOSED;
+typedef WT_CONNECTION WT_CONNECTION_CLOSED;
+#elif !defined(DOXYGEN)
+#define WT_HANDLE_NULLABLE(typename) typename
+#define WT_HANDLE_CLOSED(typename) typename
+#endif
+
+/*!
+ * A raw item of data to be managed, including a pointer to the data and a
+ * length.
+ *
+ * WT_ITEM structures do not need to be cleared before use.
+ */
+struct __wt_item {
+ /*!
+ * The memory reference of the data item.
+ *
+ * For items returned by a WT_CURSOR, the pointer is only valid until
+ * the next operation on that cursor. Applications that need to keep
+ * an item across multiple cursor operations must make a copy.
+ */
+ const void *data;
+
+ /*!
+ * The number of bytes in the data item.
+ *
+ * The maximum length of a single column stored in a table is not fixed
+ * (as it partially depends on the underlying file configuration), but
+ * is always a small number of bytes less than 4GB.
+ */
+ size_t size;
+
+#ifndef DOXYGEN
+ /* Bit values for the flags field below (internal use). */
+#define WT_ITEM_ALIGNED 0x00000001
+#define WT_ITEM_INUSE 0x00000002
+ /* This appears in the middle of the struct to avoid padding. */
+ /*! Object flags (internal use). */
+ uint32_t flags;
+
+ /*! Managed memory chunk (internal use). */
+ void *mem;
+ /*! Managed memory size (internal use). */
+ size_t memsize;
+#endif
+};
+
+/*
+ * We rely on this structure being aligned at 64 bits by the compiler; if
+ * we were paranoid, we could add an unused field to ensure the padding
+ * is correct.
+ *
+ * NOTE: If you change the contents of this structure you must also update
+ * the macros in log.h.
+ */
+/*!
+ * A log sequence number, representing a position in the transaction log.
+ */
+struct __wt_lsn {
+ uint32_t file; /*!< Log file number */
+ wt_off_t offset; /*!< Log file offset */
+};
+
+/*!
+ * The maximum packed size of a 64-bit integer. The ::wiredtiger_struct_pack
+ * function will pack single long integers into at most this many bytes.
+ */
+#define WT_INTPACK64_MAXSIZE ((int)sizeof (int64_t) + 1)
+
+/*!
+ * The maximum packed size of a 32-bit integer. The ::wiredtiger_struct_pack
+ * function will pack single integers into at most this many bytes.
+ */
+#define WT_INTPACK32_MAXSIZE ((int)sizeof (int32_t) + 1)
+
+/*!
+ * A WT_CURSOR handle is the interface to a cursor.
+ *
+ * Cursors allow data to be searched, iterated and modified, implementing the
+ * CRUD (create, read, update and delete) operations. Cursors are opened in
+ * the context of a session. If a transaction is started, cursors operate in
+ * the context of the transaction until the transaction is resolved.
+ *
+ * Raw data is represented by key/value pairs of WT_ITEM structures, but
+ * cursors can also provide access to fields within the key and value if the
+ * formats are described in the WT_SESSION::create method.
+ *
+ * In the common case, a cursor is used to access records in a table. However,
+ * cursors can be used on subsets of tables (such as a single column or a
+ * projection of multiple columns), as an interface to statistics, configuration
+ * data or application-specific data sources. See WT_SESSION::open_cursor for
+ * more information.
+ *
+ * <b>Thread safety:</b> A WT_CURSOR handle is not usually shared between
+ * threads, see @ref threads for more information.
+ */
+struct __wt_cursor {
+ WT_SESSION *session; /*!< The session handle for this cursor. */
+
+ /*!
+ * The name of the data source for the cursor, matches the \c uri
+ * parameter to WT_SESSION::open_cursor used to open the cursor.
+ */
+ const char *uri;
+
+ /*!
+ * The format of the data packed into key items. See @ref packing for
+ * details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *key_format;
+
+ /*!
+ * The format of the data packed into value items. See @ref packing
+ * for details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *value_format;
+
+ /*!
+ * @name Data access
+ * @{
+ */
+ /*!
+ * Get the key for the current record.
+ *
+ * @snippet ex_all.c Get the cursor's string key
+ *
+ * @snippet ex_all.c Get the cursor's record number key
+ *
+ * @param cursor the cursor handle
+ * @param ... pointers to hold key fields corresponding to
+ * WT_CURSOR::key_format.
+ * @errors
+ */
+ int __F(get_key)(WT_CURSOR *cursor, ...);
+
+ /*!
+ * Get the value for the current record.
+ *
+ * @snippet ex_all.c Get the cursor's string value
+ *
+ * @snippet ex_all.c Get the cursor's raw value
+ *
+ * @param cursor the cursor handle
+ * @param ... pointers to hold value fields corresponding to
+ * WT_CURSOR::value_format.
+ * @errors
+ */
+ int __F(get_value)(WT_CURSOR *cursor, ...);
+
+ /*!
+ * Set the key for the next operation.
+ *
+ * @snippet ex_all.c Set the cursor's string key
+ *
+ * @snippet ex_all.c Set the cursor's record number key
+ *
+ * @param cursor the cursor handle
+ * @param ... key fields corresponding to WT_CURSOR::key_format.
+ *
+ * If an error occurs during this operation, a flag will be set in the
+ * cursor, and the next operation to access the key will fail. This
+ * simplifies error handling in applications.
+ */
+ void __F(set_key)(WT_CURSOR *cursor, ...);
+
+ /*!
+ * Set the value for the next operation.
+ *
+ * @snippet ex_all.c Set the cursor's string value
+ *
+ * @snippet ex_all.c Set the cursor's raw value
+ *
+ * @param cursor the cursor handle
+ * @param ... value fields corresponding to WT_CURSOR::value_format.
+ *
+ * If an error occurs during this operation, a flag will be set in the
+ * cursor, and the next operation to access the value will fail. This
+ * simplifies error handling in applications.
+ */
+ void __F(set_value)(WT_CURSOR *cursor, ...);
+ /*! @} */
+
+ /*!
+ * @name Cursor positioning
+ * @{
+ */
+ /*!
+ * Return the ordering relationship between two cursors: both cursors
+ * must have the same data source and have valid keys.
+ *
+ * @snippet ex_all.c Cursor comparison
+ *
+ * @param cursor the cursor handle
+ * @param other another cursor handle
+ * @param comparep the status of the comparison: < 0 if
+ * <code>cursor</code> refers to a key that appears before
+ * <code>other</code>, 0 if the cursors refer to the same key,
+ * and > 0 if <code>cursor</code> refers to a key that appears after
+ * <code>other</code>.
+ * @errors
+ */
+ int __F(compare)(WT_CURSOR *cursor, WT_CURSOR *other, int *comparep);
+
+ /*!
+ * Return the next record.
+ *
+ * @snippet ex_all.c Return the next record
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(next)(WT_CURSOR *cursor);
+
+ /*!
+ * Return the previous record.
+ *
+ * @snippet ex_all.c Return the previous record
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(prev)(WT_CURSOR *cursor);
+
+ /*!
+ * Reset the position of the cursor. Any resources held by the cursor
+ * are released, and the cursor's key and position are no longer valid.
+ * A subsequent iteration with WT_CURSOR::next will move to the first
+ * record, or with WT_CURSOR::prev will move to the last record.
+ *
+ * @snippet ex_all.c Reset the cursor
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(reset)(WT_CURSOR *cursor);
+
+ /*!
+ * Return the record matching the key. The key must first be set.
+ *
+ * @snippet ex_all.c Search for an exact match
+ *
+ * On success, the cursor ends positioned at the returned record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the record has been retrieved and the cursor no
+ * longer needs that position.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(search)(WT_CURSOR *cursor);
+
+ /*!
+ * Return the record matching the key if it exists, or an adjacent
+ * record. An adjacent record is either the smallest record larger
+ * than the key or the largest record smaller than the key (in other
+ * words, a logically adjacent key).
+ *
+ * The key must first be set.
+ *
+ * An example of a search for an exact or adjacent match:
+ *
+ * @snippet ex_all.c Search for an exact or adjacent match
+ *
+ * An example of a forward scan through the table, where all keys
+ * greater than or equal to a specified prefix are included in the
+ * scan:
+ *
+ * @snippet ex_all.c Forward scan greater than or equal
+ *
+ * An example of a backward scan through the table, where all keys
+ * less than a specified prefix are included in the scan:
+ *
+ * @snippet ex_all.c Backward scan less than
+ *
+ * On success, the cursor ends positioned at the returned record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the record has been retrieved and the cursor no
+ * longer needs that position.
+ *
+ * @param cursor the cursor handle
+ * @param exactp the status of the search: 0 if an exact match is
+ * found, < 0 if a smaller key is returned, > 0 if a larger key is
+ * returned
+ * @errors
+ */
+ int __F(search_near)(WT_CURSOR *cursor, int *exactp);
+ /*! @} */
+
+ /*!
+ * @name Data modification
+ * @{
+ */
+ /*!
+ * Insert a record and optionally update an existing record.
+ *
+ * If the cursor was configured with "overwrite=true" (the default),
+ * both the key and value must be set; if the record already exists,
+ * the key's value will be updated, otherwise, the record will be
+ * inserted.
+ *
+ * @snippet ex_all.c Insert a new record or overwrite an existing record
+ *
+ * If the cursor was not configured with "overwrite=true", both the key
+ * and value must be set and the record must not already exist; the
+ * record will be inserted.
+ *
+ * @snippet ex_all.c Insert a new record and fail if the record exists
+ *
+ * If a cursor with record number keys was configured with
+ * "append=true" (not the default), the value must be set; a new record
+ * will be appended and the record number set as the cursor key value.
+ *
+ * @snippet ex_all.c Insert a new record and assign a record number
+ *
+ * The cursor ends with no position, and a subsequent call to the
+ * WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the
+ * beginning (end) of the table.
+ *
+ * Inserting a new record after the current maximum record in a
+ * fixed-length bit field column-store (that is, a store with an
+ * 'r' type key and 't' type value) may implicitly create the missing
+ * records as records with a value of 0.
+ *
+ * When loading a large amount of data into a new object, using
+ * a cursor with the \c bulk configuration string enabled and
+ * loading the data in sorted order will be much faster than doing
+ * out-of-order inserts. See @ref tune_bulk_load for more information.
+ *
+ * The maximum length of a single column stored in a table is not fixed
+ * (as it partially depends on the underlying file configuration), but
+ * is always a small number of bytes less than 4GB.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ * In particular, if \c overwrite is not configured and a record with
+ * the specified key already exists, ::WT_DUPLICATE_KEY is returned.
+ */
+ int __F(insert)(WT_CURSOR *cursor);
+
+ /*!
+ * Update an existing record and optionally insert a new record.
+ *
+ * If the cursor was configured with "overwrite=true" (the default),
+ * both the key and value must be set; if the record already exists, the
+ * key's value will be updated, otherwise, the record will be inserted.
+ *
+ * @snippet ex_all.c Update an existing record or insert a new record
+ *
+ * If the cursor was not configured with "overwrite=true", both the key
+ * and value must be set and the record must already exist; the
+ * record will be updated.
+ *
+ * @snippet ex_all.c Update an existing record and fail if DNE
+ *
+ * On success, the cursor ends positioned at the modified record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the cursor no longer needs that position.
+ *
+ * The maximum length of a single column stored in a table is not fixed
+ * (as it partially depends on the underlying file configuration), but
+ * is always a small number of bytes less than 4GB.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ * In particular, if \c overwrite is not configured and no record with
+ * the specified key exists, ::WT_NOTFOUND is returned.
+ */
+ int __F(update)(WT_CURSOR *cursor);
+
+ /*!
+ * Remove a record.
+ *
+ * If the cursor was configured with "overwrite=true" (the default),
+ * the key must be set; the key's record will be removed if it exists,
+ * no error will be returned if the record does not exist.
+ *
+ * @snippet ex_all.c Remove a record
+ *
+ * If the cursor was not configured with "overwrite=true", the key must
+ * be set and the key's record must exist; the record will be removed.
+ *
+ * @snippet ex_all.c Remove a record and fail if DNE
+ *
+ * Removing a record in a fixed-length bit field column-store
+ * (that is, a store with an 'r' type key and 't' type value) is
+ * identical to setting the record's value to 0.
+ *
+ * On success, the cursor ends positioned at the removed record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the cursor no longer needs that position.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ * In particular, if \c overwrite is not configured and no record with
+ * the specified key exists, ::WT_NOTFOUND is returned.
+ */
+ int __F(remove)(WT_CURSOR *cursor);
+ /*! @} */
+
+ /*!
+ * Close the cursor.
+ *
+ * This releases the resources associated with the cursor handle.
+ * Cursors are closed implicitly by ending the enclosing connection or
+ * closing the session in which they were opened.
+ *
+ * @snippet ex_all.c Close the cursor
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(close)(WT_HANDLE_CLOSED(WT_CURSOR) *cursor);
+
+ /*
+ * Protected fields, only to be used by cursor implementations.
+ */
+#if !defined(SWIG) && !defined(DOXYGEN)
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(wt_cursor) q;
+ */
+ struct {
+ WT_CURSOR *tqe_next;
+ WT_CURSOR **tqe_prev;
+ } q; /* Linked list of WT_CURSORs. */
+
+ uint64_t recno; /* Record number, normal and raw mode */
+ uint8_t raw_recno_buf[WT_INTPACK64_MAXSIZE];
+
+ void *json_private; /* JSON specific storage */
+ void *lang_private; /* Language specific private storage */
+
+ WT_ITEM key, value;
+ int saved_err; /* Saved error in set_{key,value}. */
+ /*
+ * URI used internally, may differ from the URI provided by the
+ * user on open.
+ */
+ const char *internal_uri;
+
+#define WT_CURSTD_APPEND 0x0001
+#define WT_CURSTD_BULK 0x0002
+#define WT_CURSTD_DATA_SOURCE 0x0004
+#define WT_CURSTD_DUMP_HEX 0x0008
+#define WT_CURSTD_DUMP_JSON 0x0010
+#define WT_CURSTD_DUMP_PRINT 0x0020
+#define WT_CURSTD_KEY_EXT 0x0040 /* Key points out of the tree. */
+#define WT_CURSTD_KEY_INT 0x0080 /* Key points into the tree. */
+#define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT)
+#define WT_CURSTD_OPEN 0x0100
+#define WT_CURSTD_OVERWRITE 0x0200
+#define WT_CURSTD_RAW 0x0400
+#define WT_CURSTD_VALUE_EXT 0x0800 /* Value points out of the tree. */
+#define WT_CURSTD_VALUE_INT 0x1000 /* Value points into the tree. */
+#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)
+ uint32_t flags;
+#endif
+};
+
+/*!
+ * Asynchronous operation types.
+ *
+ * Identifies the kind of operation a WT_ASYNC_OP handle represents; the
+ * value is reported by WT_ASYNC_OP::get_type. Only ::WT_AOP_NONE is given
+ * an explicit value (0); the remaining enumerators take consecutive values
+ * in declaration order.
+ */
+typedef enum {
+ WT_AOP_NONE=0, /*!< No operation type set */
+ WT_AOP_COMPACT, /*!< WT_ASYNC_OP::compact */
+ WT_AOP_INSERT, /*!< WT_ASYNC_OP::insert */
+ WT_AOP_REMOVE, /*!< WT_ASYNC_OP::remove */
+ WT_AOP_SEARCH, /*!< WT_ASYNC_OP::search */
+ WT_AOP_UPDATE /*!< WT_ASYNC_OP::update */
+} WT_ASYNC_OPTYPE;
+
+/*!
+ * A WT_ASYNC_OP handle is the interface to an asynchronous operation.
+ *
+ * An asynchronous operation describes a data manipulation to be performed
+ * asynchronously by a WiredTiger worker thread. These operations implement
+ * the CRUD (create, read, update and delete) operations. Each operation
+ * is a self-contained work unit. The operation will be performed in the
+ * context of the worker thread's session. Each operation is performed
+ * within the context of a transaction. The application is notified of its
+ * completion with a callback. The transaction is resolved once the callback
+ * returns.
+ *
+ * The table referenced in an operation must already exist.
+ *
+ * Raw data is represented by key/value pairs of WT_ITEM structures, but
+ * operations can also provide access to fields within the key and value if
+ * the formats are described in the WT_SESSION::create method.
+ *
+ * <b>Thread safety:</b> A WT_ASYNC_OP handle may not be shared between
+ * threads, see @ref threads for more information.
+ */
+struct __wt_async_op {
+ /*! The connection for this operation. */
+ WT_CONNECTION *connection;
+
+ /*!
+ * The format of the data packed into key items. See @ref packing for
+ * details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *key_format;
+
+ /*!
+ * The format of the data packed into value items. See @ref packing
+ * for details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *value_format;
+
+ /*
+ * Don't expose app_private to non-C language bindings - they have
+ * their own way to attach data to an operation.
+ */
+#if !defined(SWIG)
+ /*!
+ * A location for applications to store information that will be
+ * available in the callback from an async operation.
+ */
+ void *app_private;
+#endif /* !defined(SWIG) */
+
+ /*!
+ * @name Data access
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_CURSOR::get_key method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns as described for WT_CURSOR::get_key
+ */
+ int __F(get_key)(WT_ASYNC_OP *op, ...);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::get_value method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns as described for WT_CURSOR::get_value
+ */
+ int __F(get_value)(WT_ASYNC_OP *op, ...);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::set_key method; see that method
+ * for configuration and error details. This method itself returns no
+ * value.
+ *
+ * @param op the operation handle
+ */
+ void __F(set_key)(WT_ASYNC_OP *op, ...);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::set_value method; see that method
+ * for configuration and error details. This method itself returns no
+ * value.
+ *
+ * @param op the operation handle
+ */
+ void __F(set_value)(WT_ASYNC_OP *op, ...);
+ /*! @} */
+
+ /*!
+ * @name Positioning
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_CURSOR::search method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::search
+ */
+ int __F(search)(WT_ASYNC_OP *op);
+ /*! @} */
+
+ /*!
+ * @name Data modification
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_CURSOR::insert method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::insert
+ */
+ int __F(insert)(WT_ASYNC_OP *op);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::update method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::update
+ */
+ int __F(update)(WT_ASYNC_OP *op);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::remove method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::remove
+ */
+ int __F(remove)(WT_ASYNC_OP *op);
+ /*! @} */
+
+ /*!
+ * @name Table operations
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_SESSION::compact method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_SESSION::compact
+ */
+ int __F(compact)(WT_ASYNC_OP *op);
+ /*! @} */
+
+ /*!
+ * Get the unique identifier for this operation.
+ *
+ * @snippet ex_async.c async get identifier
+ *
+ * @param op the operation handle
+ * @returns the unique identifier of the operation
+ */
+ uint64_t __F(get_id)(WT_ASYNC_OP *op);
+
+ /*!
+ * Get the type for this operation.
+ *
+ * @snippet ex_async.c async get type
+ *
+ * @param op the operation handle
+ * @returns the ::WT_ASYNC_OPTYPE of the operation
+ */
+ WT_ASYNC_OPTYPE __F(get_type)(WT_ASYNC_OP *op);
+
+ /*
+ * Protected fields, only to be used by the internal implementation.
+ * Everything needed to maintain the key/value pair is already part of
+ * a cursor, so one is embedded here and the cursor functions are used
+ * to manage the pair. The guard below keeps the field out of the SWIG
+ * and Doxygen passes.
+ */
+#if !defined(SWIG) && !defined(DOXYGEN)
+ WT_CURSOR c; /* Embedded cursor holding the key/value state */
+#endif /* !defined(SWIG) && !defined(DOXYGEN) */
+};
+
+/*!
+ * All data operations are performed in the context of a WT_SESSION. This
+ * encapsulates the thread and transactional context of the operation.
+ *
+ * <b>Thread safety:</b> A WT_SESSION handle is not usually shared between
+ * threads, see @ref threads for more information.
+ */
+struct __wt_session {
+ /*! The connection for this session. */
+ WT_CONNECTION *connection;
+
+ /*!
+ * Close the session handle.
+ *
+ * This will release the resources associated with the session handle,
+ * including rolling back any active transactions and closing any
+ * cursors that remain open in the session.
+ *
+ * @snippet ex_all.c Close a session
+ *
+ * @param session the session handle
+ * @configempty{session.close, see dist/api_data.py}
+ * @errors
+ */
+ int __F(close)(WT_HANDLE_CLOSED(WT_SESSION) *session,
+ const char *config);
+
+ /*!
+ * Reconfigure a session handle.
+ *
+ * @snippet ex_all.c Reconfigure a session
+ *
+ * WT_SESSION::reconfigure will fail if a transaction is in progress
+ * in the session.
+ *
+ * All cursors are reset.
+ *
+ * @param session the session handle
+ * @configstart{session.reconfigure, see dist/api_data.py}
+ * @config{isolation, the default isolation level for operations in this
+ * session., a string\, chosen from the following options: \c
+ * "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c
+ * read-committed.}
+ * @configend
+ * @errors
+ */
+ int __F(reconfigure)(WT_SESSION *session, const char *config);
+
+ /*!
+ * @name Cursor handles
+ * @{
+ */
+
+ /*!
+ * Open a new cursor on a data source or duplicate an existing cursor.
+ *
+ * @snippet ex_all.c Open a cursor
+ *
+ * An existing cursor can be duplicated by passing it as the \c to_dup
+ * parameter and setting the \c uri parameter to \c NULL:
+ *
+ * @snippet ex_all.c Duplicate a cursor
+ *
+ * Cursors being duplicated must have a key set, and successfully
+ * duplicated cursors are positioned at the same place in the data
+ * source as the original.
+ *
+ * To reconfigure a cursor, duplicate it with a new configuration value:
+ *
+ * @snippet ex_all.c Reconfigure a cursor
+ *
+ * Cursor handles should be discarded by calling WT_CURSOR::close.
+ *
+ * Cursors capable of supporting transactional operations operate in the
+ * context of the current transaction, if any.
+ *
+ * WT_SESSION::rollback_transaction implicitly resets all cursors.
+ *
+ * Cursors are relatively light-weight objects but may hold references
+ * to heavier-weight objects; applications should re-use cursors when
+ * possible, but instantiating new cursors is not so expensive that
+ * applications need to cache cursors at all cost.
+ *
+ * @param session the session handle
+ * @param uri the data source on which the cursor operates; cursors
+ * are usually opened on tables, however, cursors can be opened on
+ * any data source, regardless of whether it is ultimately stored
+ * in a table. Some cursor types may have limited functionality
+ * (for example, they may be read-only or not support transactional
+ * updates). See @ref data_sources for more information.
+ * <br>
+ * @copydoc doc_cursor_types
+ * @param to_dup a cursor to duplicate
+ * @configstart{session.open_cursor, see dist/api_data.py}
+ * @config{append, append the value as a new record\, creating a new
+ * record number key; valid only for cursors with record number keys., a
+ * boolean flag; default \c false.}
+ * @config{bulk, configure the cursor for bulk-loading\, a fast\,
+ * initial load path (see @ref tune_bulk_load for more information).
+ * Bulk-load may only be used for newly created objects and cursors
+ * configured for bulk-load only support the WT_CURSOR::insert and
+ * WT_CURSOR::close methods. When bulk-loading row-store objects\, keys
+ * must be loaded in sorted order. The value is usually a true/false
+ * flag; when bulk-loading fixed-length column store objects\, the
+ * special value \c bitmap allows chunks of a memory resident bitmap to
+ * be loaded directly into a file by passing a \c WT_ITEM to
+ * WT_CURSOR::set_value where the \c size field indicates the number of
+ * records in the bitmap (as specified by the object's \c value_format
+ * configuration). Bulk-loaded bitmap values must end on a byte boundary
+ * relative to the bit count (except for the last set of values
+ * loaded)., a string; default \c false.}
+ * @config{checkpoint, the name of a checkpoint to open (the reserved
+ * name "WiredTigerCheckpoint" opens the most recent internal checkpoint
+ * taken for the object). The cursor does not support data
+ * modification., a string; default empty.}
+ * @config{dump, configure the cursor for dump format inputs and
+ * outputs: "hex" selects a simple hexadecimal format\, "json" selects a
+ * JSON format with each record formatted as fields named by column
+ * names if available\, and "print" selects a format where only
+ * non-printing characters are hexadecimal encoded. These formats are
+ * compatible with the @ref util_dump and @ref util_load commands., a
+ * string\, chosen from the following options: \c "hex"\, \c "json"\, \c
+ * "print"; default empty.}
+ * @config{next_random, configure the cursor to return a pseudo-random
+ * record from the object; valid only for row-store cursors. Cursors
+ * configured with \c next_random=true only support the WT_CURSOR::next
+ * and WT_CURSOR::close methods. See @ref cursor_random for details., a
+ * boolean flag; default \c false.}
+ * @config{overwrite, configures whether the cursor's insert\, update
+ * and remove methods check the existing state of the record. If \c
+ * overwrite is \c false\, WT_CURSOR::insert fails with
+ * ::WT_DUPLICATE_KEY if the record exists\, WT_CURSOR::update and
+ * WT_CURSOR::remove fail with ::WT_NOTFOUND if the record does not
+ * exist., a boolean flag; default \c true.}
+ * @config{raw, ignore the encodings for the key and value\, manage data
+ * as if the formats were \c "u". See @ref cursor_raw for details., a
+ * boolean flag; default \c false.}
+ * @config{readonly, only query operations are supported by this cursor.
+ * An error is returned if a modification is attempted using the cursor.
+ * The default is false for all cursor types except for log and metadata
+ * cursors., a boolean flag; default \c false.}
+ * @config{statistics, Specify the statistics to be gathered. Choosing
+ * "all" gathers statistics regardless of cost and may include
+ * traversing on-disk files; "fast" gathers a subset of relatively
+ * inexpensive statistics. The selection must agree with the database
+ * \c statistics configuration specified to ::wiredtiger_open or
+ * WT_CONNECTION::reconfigure. For example\, "all" or "fast" can be
+ * configured when the database is configured with "all"\, but the
+ * cursor open will fail if "all" is specified when the database is
+ * configured with "fast"\, and the cursor open will fail in all cases
+ * when the database is configured with "none". If \c statistics is not
+ * configured\, the default configuration is the database configuration.
+ * The "clear" configuration resets statistics after gathering them\,
+ * where appropriate (for example\, a cache size statistic is not
+ * cleared\, while the count of cursor insert operations will be
+ * cleared). See @ref statistics for more information., a list\, with
+ * values chosen from the following options: \c "all"\, \c "fast"\, \c
+ * "clear"; default empty.}
+ * @config{target, if non-empty\, backup the list of objects; valid only
+ * for a backup data source., a list of strings; default empty.}
+ * @configend
+ * @param[out] cursorp a pointer to the newly opened cursor
+ * @errors
+ */
+ int __F(open_cursor)(WT_SESSION *session,
+ const char *uri, WT_HANDLE_NULLABLE(WT_CURSOR) *to_dup,
+ const char *config, WT_CURSOR **cursorp);
+ /*! @} */
+
+ /*!
+ * @name Table operations
+ * @{
+ */
+ /*!
+ * Create a table, column group, index or file.
+ *
+ * @snippet ex_all.c Create a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to create, such as
+ * \c "table:stock". For a description of URI formats
+ * see @ref data_sources.
+ * @configstart{session.create, see dist/api_data.py}
+ * @config{allocation_size, the file unit allocation size\, in bytes\,
+ * must be a power-of-two; smaller values decrease the file space required
+ * by overflow items\, and the default value of 4KB is a good choice
+ * absent requirements from the operating system or storage device., an
+ * integer between 512B and 128MB; default \c 4KB.}
+ * @config{app_metadata, application-owned metadata for this object., a
+ * string; default empty.}
+ * @config{block_allocation, configure block allocation. Permitted
+ * values are \c "first" or \c "best"; the \c "first" configuration uses
+ * a first-available algorithm during block allocation\, the \c "best"
+ * configuration uses a best-fit algorithm., a string\, chosen from the
+ * following options: \c "first"\, \c "best"; default \c best.}
+ * @config{block_compressor, configure a compressor for file blocks.
+ * Permitted values are empty (off) or \c "bzip2"\, \c "snappy" or
+ * custom compression engine \c "name" created with
+ * WT_CONNECTION::add_compressor. See @ref compression for more
+ * information., a string; default empty.}
+ * @config{cache_resident, do not ever evict the object's pages; see
+ * @ref tuning_cache_resident for more information., a boolean flag;
+ * default \c false.}
+ * @config{checksum, configure block checksums; permitted values are
+ * <code>on</code> (checksum all blocks)\, <code>off</code> (checksum no
+ * blocks) and <code>uncompressed</code> (checksum only blocks which
+ * are not compressed for any reason). The \c uncompressed setting is
+ * for applications which can rely on decompression to fail if a block
+ * has been corrupted., a string\, chosen from the following options: \c
+ * "on"\, \c "off"\, \c "uncompressed"; default \c uncompressed.}
+ * @config{colgroups, comma-separated list of names of column groups.
+ * Each column group is stored separately\, keyed by the primary key of
+ * the table. If no column groups are specified\, all columns are
+ * stored together in a single file. All value columns in the table
+ * must appear in at least one column group. Each column group must be
+ * created with a separate call to WT_SESSION::create., a list of
+ * strings; default empty.}
+ * @config{collator, configure custom collation for keys. Value must be
+ * a collator name created with WT_CONNECTION::add_collator., a string;
+ * default empty.}
+ * @config{columns, list of the column names. Comma-separated list of
+ * the form <code>(column[\,...])</code>. For tables\, the number of
+ * entries must match the total number of values in \c key_format and \c
+ * value_format. For colgroups and indices\, all column names must
+ * appear in the list of columns for the table., a list of strings;
+ * default empty.}
+ * @config{dictionary, the maximum number of unique values remembered in
+ * the Btree row-store leaf page value dictionary; see @ref
+ * file_formats_compression for more information., an integer greater
+ * than or equal to 0; default \c 0.}
+ * @config{exclusive, fail if the object exists. When false (the
+ * default)\, if the object exists\, check that its settings match the
+ * specified configuration., a boolean flag; default \c false.}
+ * @config{format, the file format., a string\, chosen from the
+ * following options: \c "btree"; default \c btree.}
+ * @config{huffman_key, configure Huffman encoding for keys. Permitted
+ * values are empty (off)\, \c "english"\, \c "utf8<file>" or \c
+ * "utf16<file>". See @ref huffman for more information., a string;
+ * default empty.}
+ * @config{huffman_value, configure Huffman encoding for values.
+ * Permitted values are empty (off)\, \c "english"\, \c "utf8<file>" or
+ * \c "utf16<file>". See @ref huffman for more information., a string;
+ * default empty.}
+ * @config{internal_item_max, the largest key stored within an internal
+ * node\, in bytes. If non-zero\, any key larger than the specified
+ * size will be stored as an overflow item (which may require additional
+ * I/O to access). If zero\, a default size is chosen that permits at
+ * least 8 keys per internal page., an integer greater than or equal to
+ * 0; default \c 0.}
+ * @config{internal_key_truncate, configure internal key truncation\,
+ * discarding unnecessary trailing bytes on internal keys (ignored for
+ * custom collators)., a boolean flag; default \c true.}
+ * @config{internal_page_max, the maximum page size for internal nodes\,
+ * in bytes; the size must be a multiple of the allocation size and is
+ * significant for applications wanting to avoid excessive L2 cache
+ * misses while searching the tree. The page maximum is the bytes of
+ * uncompressed data\, that is\, the limit is applied before any block
+ * compression is done., an integer between 512B and 512MB; default \c
+ * 4KB.}
+ * @config{key_format, the format of the data packed into key items.
+ * See @ref schema_format_types for details. By default\, the
+ * key_format is \c 'u' and applications use WT_ITEM structures to
+ * manipulate raw byte arrays. By default\, records are stored in
+ * row-store files: keys of type \c 'r' are record numbers and records
+ * referenced by record number are stored in column-store files., a
+ * format string; default \c u.}
+ * @config{leaf_item_max, the largest key or value stored within a leaf
+ * node\, in bytes. If non-zero\, any key or value larger than the
+ * specified size will be stored as an overflow item (which may require
+ * additional I/O to access). If zero\, a default size is chosen that
+ * permits at least 4 key and value pairs per leaf page., an integer
+ * greater than or equal to 0; default \c 0.}
+ * @config{leaf_page_max, the maximum page size for leaf nodes\, in
+ * bytes; the size must be a multiple of the allocation size\, and is
+ * significant for applications wanting to maximize sequential data
+ * transfer from a storage device. The page maximum is the bytes of
+ * uncompressed data\, that is\, the limit is applied before any block
+ * compression is done., an integer between 512B and 512MB; default \c
+ * 32KB.}
+ * @config{lsm = (, options only relevant for LSM data sources., a set
+ * of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;auto_throttle, Throttle inserts into
+ * LSM trees if flushing to disk isn't keeping up., a boolean flag;
+ * default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom, create bloom
+ * filters on LSM tree chunks as they are merged., a boolean flag;
+ * default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_bit_count,
+ * the number of bits used per item for LSM bloom filters., an integer
+ * between 2 and 1000; default \c 16.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_config, config string used when
+ * creating Bloom filter files\, passed to WT_SESSION::create., a
+ * string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_hash_count, the number of hash
+ * values per item used for LSM bloom filters., an integer between 2 and
+ * 100; default \c 8.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_oldest,
+ * create a bloom filter on the oldest LSM tree chunk. Only supported
+ * if bloom filters are enabled., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk_max, the maximum size a single
+ * chunk can be. Chunks larger than this size are not considered for
+ * further merges. This is a soft limit\, and chunks larger than this
+ * value can be created. Must be larger than chunk_size., an integer
+ * between 100MB and 10TB; default \c 5GB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk_size, the maximum size of the
+ * in-memory chunk of an LSM tree. This limit is soft - it is possible
+ * for chunks to be temporarily larger than this value. This overrides
+ * the \c memory_page_max setting., an integer between 512K and 500MB;
+ * default \c 10MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge_max, the
+ * maximum number of chunks to include in a merge operation., an integer
+ * between 2 and 100; default \c 15.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge_min, the minimum number of
+ * chunks to include in a merge operation. If set to 0 or 1\, half the
+ * value of merge_max is used., an integer no more than 100; default \c
+ * 0.}
+ * @config{ ),,}
+ * @config{memory_page_max, the maximum size a page can grow to in
+ * memory before being reconciled to disk. The specified size will be
+ * adjusted to a lower bound of <code>50 * leaf_page_max</code>\, and an
+ * upper bound of <code>cache_size / 2</code>. This limit is soft - it
+ * is possible for pages to be temporarily larger than this value. This
+ * setting is ignored for LSM trees\, see \c chunk_size., an integer
+ * between 512B and 10TB; default \c 5MB.}
+ * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\,
+ * in bytes. If non-zero\, schedule writes for dirty blocks belonging
+ * to this object in the system buffer cache after that many bytes from
+ * this object are written into the buffer cache., an integer greater
+ * than or equal to 0; default \c 0.}
+ * @config{os_cache_max, maximum system buffer cache usage\, in bytes.
+ * If non-zero\, evict object blocks from the system buffer cache after
+ * that many bytes from this object are read or written into the buffer
+ * cache., an integer greater than or equal to 0; default \c 0.}
+ * @config{prefix_compression, configure prefix compression on row-store
+ * leaf pages., a boolean flag; default \c false.}
+ * @config{prefix_compression_min, minimum gain before prefix
+ * compression will be used on row-store leaf pages., an integer greater
+ * than or equal to 0; default \c 4.}
+ * @config{split_pct, the Btree page split size as a percentage of the
+ * maximum Btree page size\, that is\, when a Btree page is split\, it
+ * will be split into smaller pages\, where each page is the specified
+ * percentage of the maximum Btree page size., an integer between 25 and
+ * 100; default \c 75.}
+ * @config{type, set the type of data source used to store a column
+ * group\, index or simple table. By default\, a \c "file:" URI is
+ * derived from the object name. The \c type configuration can be used
+ * to switch to a different data source\, such as LSM or an extension
+ * configured by the application., a string; default \c file.}
+ * @config{value_format, the format of the data packed into value items.
+ * See @ref schema_format_types for details. By default\, the
+ * value_format is \c 'u' and applications use a WT_ITEM structure to
+ * manipulate raw byte arrays. Value items of type 't' are bitfields\,
+ * and when configured with record number type keys\, will be stored
+ * using a fixed-length store., a format string; default \c u.}
+ * @configend
+ * @errors
+ */
+ int __F(create)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Compact a live row- or column-store btree or LSM tree.
+ *
+ * @snippet ex_all.c Compact a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to compact, such as
+ * \c "table:stock"
+ * @configstart{session.compact, see dist/api_data.py}
+ * @config{timeout, maximum amount of time to allow for compact in
+ * seconds. The actual amount of time spent in compact may exceed the
+ * configured value. A value of zero disables the timeout., an integer;
+ * default \c 1200.}
+ * @configend
+ * @errors
+ */
+ int __F(compact)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Drop (delete) an object.
+ *
+ * @snippet ex_all.c Drop a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to drop, such as \c "table:stock"
+ * @configstart{session.drop, see dist/api_data.py}
+ * @config{force, return success if the object does not exist., a
+ * boolean flag; default \c false.}
+ * @config{remove_files, whether the underlying files should be removed., a
+ * boolean flag; default \c true.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(drop)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Insert a ::WT_LOGREC_MESSAGE type record in the database log files
+ * (the database must be configured for logging when this method is
+ * called).
+ *
+ * @param session the session handle
+ * @param fmt a printf format specifier
+ * @errors
+ */
+ int __F(log_printf)(WT_SESSION *session, const char *fmt, ...);
+
+ /*!
+ * Rename an object.
+ *
+ * @snippet ex_all.c Rename a table
+ *
+ * @param session the session handle
+ * @param uri the current URI of the object, such as \c "table:old"
+ * @param newuri the new URI of the object, such as \c "table:new"
+ * @configempty{session.rename, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(rename)(WT_SESSION *session,
+ const char *uri, const char *newuri, const char *config);
+
+ /*!
+ * Salvage a file or table.
+ *
+ * Salvage rebuilds the file, or files of which a table is comprised,
+ * discarding any corrupted file blocks.
+ *
+ * Previously deleted records may re-appear, and inserted records may
+ * disappear, when salvage is done, so salvage should not be run
+ * unless it is known to be necessary. Normally, salvage should be
+ * called after a file or table has been corrupted, as reported by the
+ * WT_SESSION::verify method.
+ *
+ * Files are rebuilt in place, the salvage method overwrites the
+ * existing files.
+ *
+ * @snippet ex_all.c Salvage a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to salvage
+ * @configstart{session.salvage, see dist/api_data.py}
+ * @config{force, force salvage even of files that do not appear to be
+ * WiredTiger files., a boolean flag; default \c false.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(salvage)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Truncate a file, table or cursor range.
+ *
+ * Truncate a file or table.
+ * @snippet ex_all.c Truncate a table
+ *
+ * Truncate a cursor range. When truncating based on a cursor position,
+ * it is not required that the cursor reference a record in the object, only
+ * that the key be set. This allows applications to discard portions of
+ * the object name space without knowing exactly what records the object
+ * contains.
+ * @snippet ex_all.c Truncate a range
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to truncate
+ * @param start optional cursor marking the first record discarded;
+ * if <code>NULL</code>, the truncate starts from the beginning of
+ * the object
+ * @param stop optional cursor marking the last record discarded;
+ * if <code>NULL</code>, the truncate continues to the end of the
+ * object
+ * @configempty{session.truncate, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(truncate)(WT_SESSION *session,
+ const char *name,
+ WT_HANDLE_NULLABLE(WT_CURSOR) *start,
+ WT_HANDLE_NULLABLE(WT_CURSOR) *stop,
+ const char *config);
+
+ /*!
+ * Upgrade a file or table.
+ *
+ * Upgrade upgrades a file or table, if upgrade is required.
+ *
+ * @snippet ex_all.c Upgrade a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to upgrade
+ * @configempty{session.upgrade, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(upgrade)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Verify a file or table.
+ *
+ * Verify reports if a file, or the files of which a table is
+ * comprised, have been corrupted. The WT_SESSION::salvage method
+ * can be used to repair a corrupted file.
+ *
+ * @snippet ex_all.c Verify a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to verify
+ * @configstart{session.verify, see dist/api_data.py}
+ * @config{dump_address, Display addresses and page types as pages are
+ * verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @config{dump_blocks, Display the contents of on-disk blocks as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @config{dump_offsets, Display the contents of specific on-disk
+ * blocks\, using the application's message handler\, intended for
+ * debugging., a list of strings; default empty.}
+ * @config{dump_pages, Display the contents of in-memory pages as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(verify)(WT_SESSION *session,
+ const char *name, const char *config);
+ /*! @} */
+
+ /*!
+ * @name Transactions
+ * @{
+ */
+ /*!
+ * Start a transaction in this session.
+ *
+ * The transaction remains active until ended by
+ * WT_SESSION::commit_transaction or WT_SESSION::rollback_transaction.
+ * Operations performed on cursors capable of supporting transactional
+ * operations that are already open in this session, or which are opened
+ * before the transaction ends, will operate in the context of the
+ * transaction.
+ *
+ * WT_SESSION::begin_transaction will fail if a transaction is already
+ * in progress in the session.
+ *
+ * @snippet ex_all.c transaction commit/rollback
+ *
+ * @param session the session handle
+ * @configstart{session.begin_transaction, see dist/api_data.py}
+ * @config{isolation, the isolation level for this transaction; defaults
+ * to the session's isolation level., a string\, chosen from the
+ * following options: \c "read-uncommitted"\, \c "read-committed"\, \c
+ * "snapshot"; default empty.}
+ * @config{name, name of the transaction for tracing and debugging., a
+ * string; default empty.}
+ * @config{priority, priority of the transaction for resolving
+ * conflicts. Transactions with higher values are less likely to
+ * abort., an integer between -100 and 100; default \c 0.}
+ * @config{sync, whether to sync log records when the transaction
+ * commits\, inherited from ::wiredtiger_open \c transaction_sync., a
+ * boolean flag; default empty.}
+ * @configend
+ * @errors
+ */
+ int __F(begin_transaction)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Commit the current transaction.
+ *
+ * A transaction must be in progress when this method is called.
+ *
+ * If WT_SESSION::commit_transaction returns an error, the transaction
+ * was rolled back, not committed.
+ *
+ * @snippet ex_all.c transaction commit/rollback
+ *
+ * @param session the session handle
+ * @configempty{session.commit_transaction, see dist/api_data.py}
+ * @errors
+ */
+ int __F(commit_transaction)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Roll back the current transaction.
+ *
+ * A transaction must be in progress when this method is called.
+ *
+ * All cursors are reset.
+ *
+ * @snippet ex_all.c transaction commit/rollback
+ *
+ * @param session the session handle
+ * @configempty{session.rollback_transaction, see dist/api_data.py}
+ * @errors
+ */
+ int __F(rollback_transaction)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Write a transactionally consistent snapshot of a database or set of
+ * objects. The checkpoint includes all transactions committed before
+ * the checkpoint starts. Additionally, checkpoints may optionally be
+ * discarded.
+ *
+ * @snippet ex_all.c Checkpoint examples
+ *
+ * @param session the session handle
+ * @configstart{session.checkpoint, see dist/api_data.py}
+ * @config{drop, specify a list of checkpoints to drop. The list may
+ * additionally contain one of the following keys: \c "from=all" to drop
+ * all checkpoints\, \c "from=<checkpoint>" to drop all checkpoints
+ * after and including the named checkpoint\, or \c "to=<checkpoint>" to
+ * drop all checkpoints before and including the named checkpoint.
+ * Checkpoints cannot be dropped while a hot backup is in progress or if
+ * open in a cursor., a list of strings; default empty.}
+ * @config{force, by default\, checkpoints may be skipped if the
+ * underlying object has not been modified\, this option forces the
+ * checkpoint., a boolean flag; default \c false.}
+ * @config{name, if non-empty\, specify a name for the checkpoint (note
+ * that checkpoints including LSM trees may not be named)., a string;
+ * default empty.}
+ * @config{target, if non-empty\, checkpoint the list of objects., a
+ * list of strings; default empty.}
+ * @configend
+ * @errors
+ */
+ int __F(checkpoint)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Return the transaction ID range pinned by the session handle.
+ *
+ * The ID range is approximate and is calculated based on the oldest
+ * ID needed for the active transaction in this session, compared
+ * to the newest transaction in the system.
+ *
+ * @snippet ex_all.c transaction pinned range
+ *
+ * @param session the session handle
+ * @param[out] range the range of IDs pinned by this session. Zero if
+ * there is no active transaction.
+ * @errors
+ */
+ int __F(transaction_pinned_range)(WT_SESSION* session, uint64_t *range);
+
+ /*! @} */
+};
+
+/*!
+ * A connection to a WiredTiger database. The connection may be opened within
+ * the same address space as the caller or accessed over a socket connection.
+ *
+ * Most applications will open a single connection to a database for each
+ * process. The first process to open a connection to a database will access
+ * the database in its own address space. Subsequent connections (if allowed)
+ * will communicate with the first process over a socket connection to perform
+ * their operations.
+ *
+ * <b>Thread safety:</b> A WT_CONNECTION handle may be shared between threads,
+ * see @ref threads for more information.
+ */
+struct __wt_connection {
+ /*!
+ * @name Async operation handles
+ * @{
+ */
+ /*!
+ * Wait for all outstanding operations to complete.
+ *
+ * @snippet ex_async.c async flush
+ *
+ * @param connection the connection handle
+ * @errors
+ */
+ int __F(async_flush)(WT_CONNECTION *connection);
+
+ /*!
+ * Return an async operation handle.
+ *
+ * @snippet ex_async.c async handle allocation
+ *
+ * @param connection the connection handle
+ * @param uri the URI of the object the async operation will act on
+ * @configstart{connection.async_new_op, see dist/api_data.py}
+ * @config{append, append the value as a new record\, creating a new
+ * record number key; valid only for operations with record number
+ * keys., a boolean flag; default \c false.}
+ * @config{overwrite, configures whether the cursor's insert\, update
+ * and remove methods check the existing state of the record. If \c
+ * overwrite is \c false\, WT_CURSOR::insert fails with
+ * ::WT_DUPLICATE_KEY if the record exists\, WT_CURSOR::update and
+ * WT_CURSOR::remove fail with ::WT_NOTFOUND if the record does not
+ * exist., a boolean flag; default \c true.}
+ * @config{raw, ignore the encodings for the key and value\, manage data
+ * as if the formats were \c "u". See @ref cursor_raw for details., a
+ * boolean flag; default \c false.}
+ * @config{timeout, maximum amount of time to allow for compact in
+ * seconds. The actual amount of time spent in compact may exceed the
+ * configured value. A value of zero disables the timeout., an integer;
+ * default \c 1200.}
+ * @configend
+ * @param callback the operation callback
+ * @param[out] asyncopp the new op handle
+ * @errors
+ * If there are no available handles, \c EBUSY is returned.
+ */
+ int __F(async_new_op)(WT_CONNECTION *connection,
+ const char *uri, const char *config, WT_ASYNC_CALLBACK *callback,
+ WT_ASYNC_OP **asyncopp);
+ /*! @} */
+
+ /*!
+ * Close a connection.
+ *
+ * Any open sessions will be closed.
+ *
+ * @snippet ex_all.c Close a connection
+ *
+ * @param connection the connection handle
+ * @configstart{connection.close, see dist/api_data.py}
+ * @config{leak_memory, don't free memory during close., a boolean flag;
+ * default \c false.}
+ * @configend
+ * @errors
+ */
+ int __F(close)(WT_HANDLE_CLOSED(WT_CONNECTION) *connection,
+ const char *config);
+
+ /*!
+ * Reconfigure a connection handle.
+ *
+ * @snippet ex_all.c Reconfigure a connection
+ *
+ * @param connection the connection handle
+ * @configstart{connection.reconfigure, see dist/api_data.py}
+ * @config{async = (, asynchronous operations configuration options., a
+ * set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable asynchronous
+ * operation., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;ops_max, maximum number of expected
+ * simultaneous asynchronous operations., an integer between 10 and
+ * 4096; default \c 1024.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads, the
+ * number of worker threads to service asynchronous requests., an
+ * integer between 1 and 20; default \c 2.}
+ * @config{ ),,}
+ * @config{cache_size, maximum heap memory to allocate for the cache. A
+ * database should configure either a cache_size or a shared_cache not
+ * both., an integer between 1MB and 10TB; default \c 100MB.}
+ * @config{checkpoint = (, periodically checkpoint the database., a set
+ * of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;log_size, wait for this amount of log
+ * record bytes to be written to the log between each checkpoint. A
+ * database can configure both log_size and wait to set an upper bound
+ * for checkpoints; setting this value above 0 configures periodic
+ * checkpoints., an integer between 0 and 2GB; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the checkpoint name., a string;
+ * default \c "WiredTigerCheckpoint".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait between each
+ * checkpoint; setting this value above 0 configures periodic
+ * checkpoints., an integer between 0 and 100000; default \c 0.}
+ * @config{ ),,}
+ * @config{error_prefix, prefix string for error messages., a string;
+ * default empty.}
+ * @config{eviction = (, eviction configuration options., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_max, maximum number of
+ * threads WiredTiger will start to help evict pages from cache. The
+ * number of threads started will vary depending on the current eviction
+ * load., an integer between 1 and 20; default \c 1.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_min, minimum number of
+ * threads WiredTiger will start to help evict pages from cache. The
+ * number of threads currently running will vary depending on the
+ * current eviction load., an integer between 1 and 20; default \c 1.}
+ * @config{ ),,}
+ * @config{eviction_dirty_target, continue evicting until the cache has
+ * less dirty memory than the value\, as a percentage of the total cache
+ * size. Dirty pages will only be evicted if the cache is full enough
+ * to trigger eviction., an integer between 10 and 99; default \c 80.}
+ * @config{eviction_target, continue evicting until the cache has less
+ * total memory than the value\, as a percentage of the total cache
+ * size. Must be less than \c eviction_trigger., an integer between 10
+ * and 99; default \c 80.}
+ * @config{eviction_trigger, trigger eviction when the cache is using
+ * this much memory\, as a percentage of the total cache size., an
+ * integer between 10 and 99; default \c 95.}
+ * @config{lsm_manager = (, configure database wide options for LSM tree
+ * management., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge, merge LSM chunks where
+ * possible., a boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;worker_thread_max, Configure a set of
+ * threads to manage merging LSM trees in the database., an integer
+ * between 3 and 20; default \c 4.}
+ * @config{ ),,}
+ * @config{shared_cache = (, shared cache configuration options. A
+ * database should configure either a cache_size or a shared_cache not
+ * both., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk, the granularity that a shared
+ * cache is redistributed., an integer between 1MB and 10TB; default \c
+ * 10MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, name of a cache that is
+ * shared between databases., a string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this
+ * database is guaranteed to have available from the shared cache. This
+ * setting is per database. Defaults to the chunk size., an integer;
+ * default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory
+ * to allocate for the shared cache. Setting this will update the value
+ * if one is already set., an integer between 1MB and 10TB; default \c
+ * 500MB.}
+ * @config{ ),,}
+ * @config{statistics, Maintain database statistics\, which may impact
+ * performance. Choosing "all" maintains all statistics regardless of
+ * cost\, "fast" maintains a subset of statistics that are relatively
+ * inexpensive\, "none" turns off all statistics. The "clear"
+ * configuration resets statistics after they are gathered\, where
+ * appropriate (for example\, a cache size statistic is not cleared\,
+ * while the count of cursor insert operations will be cleared). When
+ * "clear" is configured for the database\, gathered statistics are
+ * reset each time a statistics cursor is used to gather statistics\, as
+ * well as each time statistics are logged using the \c statistics_log
+ * configuration. See @ref statistics for more information., a list\,
+ * with values chosen from the following options: \c "all"\, \c "fast"\,
+ * \c "none"\, \c "clear"; default \c none.}
+ * @config{statistics_log = (, log any statistics the database is
+ * configured to maintain\, to a file. See @ref statistics for more
+ * information., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;on_close, log statistics on database
+ * close., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the pathname to a file into
+ * which the log records are written\, may contain ISO C standard
+ * strftime conversion specifications. If the value is not an absolute
+ * path name\, the file is created relative to the database home., a
+ * string; default \c "WiredTigerStat.%d.%H".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;sources, if non-empty\, include
+ * statistics for the list of data source URIs\, if they are open at the
+ * time of the statistics logging. The list may include URIs matching a
+ * single data source ("table:mytable")\, or a URI matching all data
+ * sources of a particular type ("table:")., a list of strings; default
+ * empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;timestamp, a timestamp
+ * prepended to each log record\, may contain strftime conversion
+ * specifications., a string; default \c "%b %d %H:%M:%S".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait between each
+ * write of the log records., an integer between 0 and 100000; default
+ * \c 0.}
+ * @config{ ),,}
+ * @config{verbose, enable messages for various events. Only available
+ * if WiredTiger is configured with --enable-verbose. Options are given
+ * as a list\, such as <code>"verbose=[evictserver\,read]"</code>., a
+ * list\, with values chosen from the following options: \c "api"\, \c
+ * "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c
+ * "evictserver"\, \c "fileops"\, \c "log"\, \c "lsm"\, \c "metadata"\,
+ * \c "mutex"\, \c "overflow"\, \c "read"\, \c "reconcile"\, \c
+ * "recovery"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, \c
+ * "temporary"\, \c "transaction"\, \c "verify"\, \c "version"\, \c
+ * "write"; default empty.}
+ * @configend
+ * @errors
+ */
+ int __F(reconfigure)(WT_CONNECTION *connection, const char *config);
+
+ /*!
+ * The home directory of the connection.
+ *
+ * @snippet ex_all.c Get the database home directory
+ *
+ * @param connection the connection handle
+ * @returns a pointer to a string naming the home directory
+ */
+ const char *__F(get_home)(WT_CONNECTION *connection);
+
+ /*!
+ * Add configuration options for a method. See
+ * @ref custom_ds_config_add for more information.
+ *
+ * @snippet ex_all.c Configure method configuration
+ *
+ * @param connection the connection handle
+ * @param method the name of the method
+ * @param uri the object type or NULL for all object types
+ * @param config the additional configuration's name and default value
+ * @param type the additional configuration's type (must be one of
+ * \c "boolean", \c "int", \c "list" or \c "string")
+ * @param check the additional configuration check string, or NULL if
+ * none
+ * @errors
+ */
+ int __F(configure_method)(WT_CONNECTION *connection,
+ const char *method, const char *uri,
+ const char *config, const char *type, const char *check);
+
+ /*!
+ * Return if opening this handle created the database.
+ *
+ * @snippet ex_all.c Check if the database is newly created
+ *
+ * @param connection the connection handle
+ * @returns false (zero) if the connection existed before the call to
+ * ::wiredtiger_open, true (non-zero) if it was created by opening this
+ * handle.
+ */
+ int __F(is_new)(WT_CONNECTION *connection);
+
+ /*!
+ * @name Session handles
+ * @{
+ */
+ /*!
+ * Open a session.
+ *
+ * @snippet ex_all.c Open a session
+ *
+ * @param connection the connection handle
+ * @param errhandler An error handler. If <code>NULL</code>, the
+ * connection's error handler is used
+ * @configstart{connection.open_session, see dist/api_data.py}
+ * @config{isolation, the default isolation level for operations in this
+ * session., a string\, chosen from the following options: \c
+ * "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c
+ * read-committed.}
+ * @configend
+ * @param[out] sessionp the new session handle
+ * @errors
+ */
+ int __F(open_session)(WT_CONNECTION *connection,
+ WT_EVENT_HANDLER *errhandler, const char *config,
+ WT_SESSION **sessionp);
+ /*! @} */
+
+ /*!
+ * @name Extensions
+ * @{
+ */
+ /*!
+ * Load an extension.
+ *
+ * @snippet ex_all.c Load an extension
+ *
+ * @param connection the connection handle
+ * @param path the filename of the extension module, or \c "local" to
+ * search the current application binary for the initialization
+ * function, see @ref extensions for more details.
+ * @configstart{connection.load_extension, see dist/api_data.py}
+ * @config{config, configuration string passed to the entry point of the
+ * extension as its WT_CONFIG_ARG argument., a string; default empty.}
+ * @config{entry, the entry point of the extension\, called to
+ * initialize the extension when it is loaded. The signature of the
+ * function must match ::wiredtiger_extension_init., a string; default
+ * \c wiredtiger_extension_init.}
+ * @config{terminate, an optional function in the extension that is
+ * called before the extension is unloaded during WT_CONNECTION::close.
+ * The signature of the function must match
+ * ::wiredtiger_extension_terminate., a string; default \c
+ * wiredtiger_extension_terminate.}
+ * @configend
+ * @errors
+ */
+ int __F(load_extension)(WT_CONNECTION *connection,
+ const char *path, const char *config);
+
+ /*!
+ * Add a custom data source. See @ref custom_data_sources for more
+ * information.
+ *
+ * The application must first implement the WT_DATA_SOURCE interface
+ * and then register the implementation with WiredTiger:
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE register
+ *
+ * @param connection the connection handle
+ * @param prefix the URI prefix for this data source, e.g., "file:"
+ * @param data_source the application-supplied implementation of
+ * WT_DATA_SOURCE to manage this data source.
+ * @configempty{connection.add_data_source, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_data_source)(WT_CONNECTION *connection, const char *prefix,
+ WT_DATA_SOURCE *data_source, const char *config);
+
+ /*!
+ * Add a custom collation function.
+ *
+ * The application must first implement the WT_COLLATOR interface and
+ * then register the implementation with WiredTiger:
+ *
+ * @snippet ex_all.c WT_COLLATOR register
+ *
+ * @param connection the connection handle
+ * @param name the name of the collation to be used in calls to
+ * WT_SESSION::create
+ * @param collator the application-supplied collation handler
+ * @configempty{connection.add_collator, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_collator)(WT_CONNECTION *connection,
+ const char *name, WT_COLLATOR *collator, const char *config);
+
+ /*!
+ * Add a compression function.
+ *
+ * The application must first implement the WT_COMPRESSOR interface
+ * and then register the implementation with WiredTiger:
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization structure
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization function
+ *
+ * @param connection the connection handle
+ * @param name the name of the compression function to be used in calls
+ * to WT_SESSION::create
+ * @param compressor the application-supplied compression handler
+ * @configempty{connection.add_compressor, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_compressor)(WT_CONNECTION *connection,
+ const char *name, WT_COMPRESSOR *compressor, const char *config);
+
+ /*!
+ * Add a custom extractor for index keys or column groups.
+ * @notyet{custom extractors}
+ *
+ * The application must first implement the WT_EXTRACTOR interface and
+ * then register the implementation with WiredTiger:
+ *
+ * @snippet ex_all.c WT_EXTRACTOR register
+ *
+ * @param connection the connection handle
+ * @param name the name of the extractor to be used in calls to
+ * WT_SESSION::create
+ * @param extractor the application-supplied extractor
+ * @configempty{connection.add_extractor, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_extractor)(WT_CONNECTION *connection, const char *name,
+ WT_EXTRACTOR *extractor, const char *config);
+
+ /*!
+ * Return a reference to the WiredTiger extension functions.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API declaration
+ *
+ * @param wt_conn the WT_CONNECTION handle
+ * @returns a reference to a WT_EXTENSION_API structure.
+ */
+ WT_EXTENSION_API *__F(get_extension_api)(WT_CONNECTION *wt_conn);
+ /*! @} */
+};
+
+/*!
+ * Open a connection to a database.
+ *
+ * @snippet ex_all.c Open a connection
+ *
+ * @param home The path to the database home directory. See @ref home
+ * for more information.
+ * @param errhandler An error handler. If <code>NULL</code>, a builtin error
+ * handler is installed that writes error messages to stderr
+ * @configstart{wiredtiger_open, see dist/api_data.py}
+ * @config{async = (, asynchronous operations configuration options., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable asynchronous operation., a
+ * boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;ops_max,
+ * maximum number of expected simultaneous asynchronous operations., an integer
+ * between 10 and 4096; default \c 1024.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads, the number of worker threads to
+ * service asynchronous requests., an integer between 1 and 20; default \c 2.}
+ * @config{ ),,}
+ * @config{buffer_alignment, in-memory alignment (in bytes) for buffers used for
+ * I/O. The default value of -1 indicates a platform-specific alignment value
+ * should be used (4KB on Linux systems\, zero elsewhere)., an integer between
+ * -1 and 1MB; default \c -1.}
+ * @config{cache_size, maximum heap memory to allocate for the cache. A
+ * database should configure either a cache_size or a shared_cache not both., an
+ * integer between 1MB and 10TB; default \c 100MB.}
+ * @config{checkpoint = (, periodically checkpoint the database., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;log_size, wait for this amount of log record
+ * bytes to be written to the log between each checkpoint. A database can
+ * configure both log_size and wait to set an upper bound for checkpoints;
+ * setting this value above 0 configures periodic checkpoints., an integer
+ * between 0 and 2GB; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the
+ * checkpoint name., a string; default \c "WiredTigerCheckpoint".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait between each
+ * checkpoint; setting this value above 0 configures periodic checkpoints., an
+ * integer between 0 and 100000; default \c 0.}
+ * @config{ ),,}
+ * @config{checkpoint_sync, flush files to stable storage when closing or
+ * writing checkpoints., a boolean flag; default \c true.}
+ * @config{config_base, write the base configuration file if creating the
+ * database\, see @ref config_base for more information., a boolean flag;
+ * default \c true.}
+ * @config{create, create the database if it does not exist., a boolean flag;
+ * default \c false.}
+ * @config{direct_io, Use \c O_DIRECT to access files. Options are given as a
+ * list\, such as <code>"direct_io=[data]"</code>. Configuring \c direct_io
+ * requires care\, see @ref tuning_system_buffer_cache_direct_io for important
+ * warnings. Including \c "data" will cause WiredTiger data files to use \c
+ * O_DIRECT\, including \c "log" will cause WiredTiger log files to use \c
+ * O_DIRECT\, and including \c "checkpoint" will cause WiredTiger data files
+ * opened at a checkpoint (i.e.\, read only) to use \c O_DIRECT., a list\, with
+ * values chosen from the following options: \c "checkpoint"\, \c "data"\, \c
+ * "log"; default empty.}
+ * @config{error_prefix, prefix string for error messages., a string; default
+ * empty.}
+ * @config{eviction = (, eviction configuration options., a set of related
+ * configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_max, maximum number of threads
+ * WiredTiger will start to help evict pages from cache. The number of threads
+ * started will vary depending on the current eviction load., an integer between
+ * 1 and 20; default \c 1.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_min, minimum
+ * number of threads WiredTiger will start to help evict pages from cache. The
+ * number of threads currently running will vary depending on the current
+ * eviction load., an integer between 1 and 20; default \c 1.}
+ * @config{ ),,}
+ * @config{eviction_dirty_target, continue evicting until the cache has less
+ * dirty memory than the value\, as a percentage of the total cache size. Dirty
+ * pages will only be evicted if the cache is full enough to trigger eviction.,
+ * an integer between 10 and 99; default \c 80.}
+ * @config{eviction_target, continue evicting until the cache has less total
+ * memory than the value\, as a percentage of the total cache size. Must be
+ * less than \c eviction_trigger., an integer between 10 and 99; default \c 80.}
+ * @config{eviction_trigger, trigger eviction when the cache is using this much
+ * memory\, as a percentage of the total cache size., an integer between 10 and
+ * 99; default \c 95.}
+ * @config{exclusive, fail if the database already exists\, generally used with
+ * the \c create option., a boolean flag; default \c false.}
+ * @config{extensions, list of shared library extensions to load (using dlopen).
+ * Any values specified to a library extension are passed to
+ * WT_CONNECTION::load_extension as the \c config parameter (for example\,
+ * <code>extensions=(/path/ext.so={entry=my_entry})</code>)., a list of strings;
+ * default empty.}
+ * @config{file_extend, file extension configuration. If set\, extend files of
+ * the set type in allocations of the set size\, instead of a block at a time as
+ * each new block is written. For example\,
+ * <code>file_extend=(data=16MB)</code>., a list\, with values chosen from the
+ * following options: \c "data"\, \c "log"; default empty.}
+ * @config{hazard_max, maximum number of simultaneous hazard pointers per
+ * session handle., an integer greater than or equal to 15; default \c 1000.}
+ * @config{log = (, enable logging., a set of related configuration options
+ * defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;archive, automatically
+ * archive unneeded log files., a boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable logging subsystem., a boolean
+ * flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;file_max, the
+ * maximum size of log files., an integer between 100KB and 2GB; default \c
+ * 100MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the path to a directory into
+ * which the log files are written. If the value is not an absolute path name\,
+ * the files are created relative to the database home., a string; default \c
+ * "".}
+ * @config{ ),,}
+ * @config{lsm_manager = (, configure database wide options for LSM tree
+ * management., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge, merge LSM chunks where possible., a
+ * boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;worker_thread_max, Configure a set of threads
+ * to manage merging LSM trees in the database., an integer between 3 and 20;
+ * default \c 4.}
+ * @config{ ),,}
+ * @config{mmap, Use memory mapping to access files when possible., a boolean
+ * flag; default \c true.}
+ * @config{multiprocess, permit sharing between processes (will automatically
+ * start an RPC server for primary processes and use RPC for secondary
+ * processes). <b>Not yet supported in WiredTiger</b>., a boolean flag; default
+ * \c false.}
+ * @config{session_max, maximum expected number of sessions (including server
+ * threads)., an integer greater than or equal to 1; default \c 100.}
+ * @config{shared_cache = (, shared cache configuration options. A database
+ * should configure either a cache_size or a shared_cache not both., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk, the granularity that a shared cache is
+ * redistributed., an integer between 1MB and 10TB; default \c 10MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, name of a cache that is shared between
+ * databases., a string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this database is
+ * guaranteed to have available from the shared cache. This setting is per
+ * database. Defaults to the chunk size., an integer; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to allocate for the
+ * shared cache. Setting this will update the value if one is already set., an
+ * integer between 1MB and 10TB; default \c 500MB.}
+ * @config{ ),,}
+ * @config{statistics, Maintain database statistics\, which may impact
+ * performance. Choosing "all" maintains all statistics regardless of cost\,
+ * "fast" maintains a subset of statistics that are relatively inexpensive\,
+ * "none" turns off all statistics. The "clear" configuration resets statistics
+ * after they are gathered\, where appropriate (for example\, a cache size
+ * statistic is not cleared\, while the count of cursor insert operations will
+ * be cleared). When "clear" is configured for the database\, gathered
+ * statistics are reset each time a statistics cursor is used to gather
+ * statistics\, as well as each time statistics are logged using the \c
+ * statistics_log configuration. See @ref statistics for more information., a
+ * list\, with values chosen from the following options: \c "all"\, \c "fast"\,
+ * \c "none"\, \c "clear"; default \c none.}
+ * @config{statistics_log = (, log any statistics the database is configured to
+ * maintain\, to a file. See @ref statistics for more information., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;on_close, log statistics on database close.,
+ * a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the
+ * pathname to a file into which the log records are written\, may contain ISO C
+ * standard strftime conversion specifications. If the value is not an absolute
+ * path name\, the file is created relative to the database home., a string;
+ * default \c "WiredTigerStat.%d.%H".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;sources,
+ * if non-empty\, include statistics for the list of data source URIs\, if they
+ * are open at the time of the statistics logging. The list may include URIs
+ * matching a single data source ("table:mytable")\, or a URI matching all data
+ * sources of a particular type ("table:")., a list of strings; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;timestamp, a timestamp prepended to each log
+ * record\, may contain strftime conversion specifications., a string; default
+ * \c "%b %d %H:%M:%S".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait
+ * between each write of the log records., an integer between 0 and 100000;
+ * default \c 0.}
+ * @config{ ),,}
+ * @config{transaction_sync = (, how to sync log records when the transaction
+ * commits., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, whether to sync the log on every
+ * commit by default\, can be overridden by the \c sync setting to
+ * WT_SESSION::begin_transaction., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;method, the method used to ensure log records
+ * are stable on disk\, see @ref tune_durability for more information., a
+ * string\, chosen from the following options: \c "dsync"\, \c "fsync"\, \c
+ * "none"; default \c fsync.}
+ * @config{ ),,}
+ * @config{use_environment_priv, use the \c WIREDTIGER_CONFIG and \c
+ * WIREDTIGER_HOME environment variables regardless of whether or not the
+ * process is running with special privileges. See @ref home for more
+ * information., a boolean flag; default \c false.}
+ * @config{verbose, enable messages for various events. Only available if
+ * WiredTiger is configured with --enable-verbose. Options are given as a
+ * list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with
+ * values chosen from the following options: \c "api"\, \c "block"\, \c
+ * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\,
+ * \c "log"\, \c "lsm"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
+ * "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c "shared_cache"\,
+ * \c "split"\, \c "temporary"\, \c "transaction"\, \c "verify"\, \c "version"\,
+ * \c "write"; default empty.}
+ * @configend
+ * Additionally, if files named \c WiredTiger.config or \c WiredTiger.basecfg
+ * appear in the WiredTiger home directory, they are read for configuration
+ * values (see @ref config_file and @ref config_base for details).
+ * See @ref config_order for ordering of the configuration mechanisms.
+ * @param[out] connectionp A pointer to the newly opened connection handle
+ * @errors
+ */
+int wiredtiger_open(const char *home,
+ WT_EVENT_HANDLER *errhandler, const char *config,
+ WT_CONNECTION **connectionp);
+
+/*!
+ * Return information about an error as a string; wiredtiger_strerror is a
+ * superset of the ISO C99/POSIX 1003.1-2001 function strerror.
+ *
+ * @snippet ex_all.c Display an error
+ *
+ * @param err a return value from a WiredTiger, C library or POSIX function
+ * @returns a string representation of the error
+ */
+const char *wiredtiger_strerror(int err);
+
+#if !defined(SWIG)
+/*!
+ * The interface implemented by applications to accept notifications
+ * of the completion of asynchronous operations.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::async_new_op.
+ *
+ * @snippet ex_async.c async handle allocation
+ */
+struct __wt_async_callback {
+ /*!
+ * Callback to receive completion notification.
+ *
+ * @param[in] cb the WT_ASYNC_CALLBACK handle
+ * @param[in] op the operation handle
+ * @param[in] op_ret the result of the async operation
+ * @param[in] flags currently unused
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet ex_async.c async example callback implementation
+ */
+ int (*notify)(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op,
+ int op_ret, uint32_t flags);
+};
+#endif
+
+/*!
+ * The interface implemented by applications to handle error, informational and
+ * progress messages. Entries set to NULL are ignored and the default handlers
+ * will continue to be used.
+ */
+struct __wt_event_handler {
+ /*!
+ * Callback to handle error messages; by default, error messages are
+ * written to the stderr stream.
+ *
+ * Error handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param handler the WT_EVENT_HANDLER handle
+ * @param session the WiredTiger session handle in use when the error
+ * was generated. The handle may have been created by the application
+ * or automatically by WiredTiger.
+ * @param error a WiredTiger, C99 or POSIX error code, which can
+ * be converted to a string using ::wiredtiger_strerror
+ * @param message an error string
+ * @returns zero for success; a non-zero return is treated as an error
+ * as described above.
+ */
+ int (*handle_error)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *message);
+
+ /*!
+ * Callback to handle informational messages; by default, informational
+ * messages are written to the stdout stream.
+ *
+ * Message handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param handler the WT_EVENT_HANDLER handle
+ * @param session the WiredTiger session handle in use when the message
+ * was generated. The handle may have been created by the application
+ * or automatically by WiredTiger.
+ * @param message an informational string
+ * @returns zero for success; a non-zero return is treated as an error
+ * as described above.
+ */
+ int (*handle_message)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *message);
+
+ /*!
+ * Callback to handle progress messages; by default, no progress
+ * messages are written.
+ *
+ * Progress handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param handler the WT_EVENT_HANDLER handle
+ * @param session the WiredTiger session handle in use when the
+ * progress message was generated. The handle may have been created by
+ * the application or automatically by WiredTiger.
+ * @param operation a string representation of the operation
+ * @param progress a counter
+ * @returns zero for success; a non-zero return is treated as an error
+ * as described above.
+ */
+ int (*handle_progress)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *operation, uint64_t progress);
+
+ /*!
+ * Callback to handle automatic close of a WiredTiger handle.
+ *
+ * Close handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param handler the WT_EVENT_HANDLER handle
+ * @param session The session handle that is being closed if the
+ * cursor parameter is NULL.
+ * @param cursor The cursor handle that is being closed, or NULL if
+ * it is a session handle being closed.
+ * @returns zero for success; a non-zero return is treated as an error
+ * as described above.
+ */
+ int (*handle_close)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, WT_CURSOR *cursor);
+};
+
+/*!
+ * @name Data packing and unpacking
+ * @{
+ */
+
+/*!
+ * Pack a structure into a buffer.
+ *
+ * See @ref packing for a description of the permitted format strings.
+ *
+ * @section pack_examples Packing Examples
+ *
+ * For example, the string <code>"iSh"</code> will pack a 32-bit integer
+ * followed by a NUL-terminated string, followed by a 16-bit integer. The
+ * default, big-endian encoding will be used, with no alignment. This could be
+ * used in C as follows:
+ *
+ * @snippet ex_all.c Pack fields into a buffer
+ *
+ * Then later, the values can be unpacked as follows:
+ *
+ * @snippet ex_all.c Unpack fields from a buffer
+ *
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+int wiredtiger_struct_pack(WT_SESSION *session,
+ void *buffer, size_t size, const char *format, ...);
+
+/*!
+ * Calculate the size required to pack a structure.
+ *
+ * Note that for variable-sized fields including variable-sized strings and
+ * integers, the calculated size merely reflects the expected sizes specified
+ * in the format string itself.
+ *
+ * @snippet ex_all.c Get the packed size
+ *
+ * @param session the session handle
+ * @param sizep a location where the number of bytes needed for the
+ * matching call to ::wiredtiger_struct_pack is returned
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+int wiredtiger_struct_size(WT_SESSION *session,
+ size_t *sizep, const char *format, ...);
+
+/*!
+ * Unpack a structure from a buffer.
+ *
+ * Reverse of ::wiredtiger_struct_pack: gets values out of a
+ * packed byte string.
+ *
+ * @snippet ex_all.c Unpack fields from a buffer
+ *
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+int wiredtiger_struct_unpack(WT_SESSION *session,
+ const void *buffer, size_t size, const char *format, ...);
+
+#if !defined(SWIG)
+
+/*!
+ * Streaming interface to packing.
+ *
+ * This allows applications to pack or unpack records one field at a time.
+ * This is an opaque handle returned by ::wiredtiger_pack_start or
+ * ::wiredtiger_unpack_start. It must be closed with ::wiredtiger_pack_close.
+ */
+typedef struct __wt_pack_stream WT_PACK_STREAM;
+
+/*!
+ * Start a packing operation into a buffer with the given format string. This
+ * should be followed by a series of calls to ::wiredtiger_pack_item,
+ * ::wiredtiger_pack_int, ::wiredtiger_pack_str or ::wiredtiger_pack_uint
+ * to fill in the values.
+ *
+ * @param session the session handle
+ * @param format the data format, see @ref packing
+ * @param buffer a pointer to memory to hold the packed data
+ * @param size the size of the buffer
+ * @param[out] psp the new packing stream handle
+ * @errors
+ */
+int wiredtiger_pack_start(WT_SESSION *session,
+ const char *format, void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+/*!
+ * Start an unpacking operation from a buffer with the given format string.
+ * This should be followed by a series of calls to ::wiredtiger_unpack_item,
+ * ::wiredtiger_unpack_int, ::wiredtiger_unpack_str or ::wiredtiger_unpack_uint
+ * to retrieve the packed values.
+ *
+ * @param session the session handle
+ * @param format the data format, see @ref packing
+ * @param buffer a pointer to memory holding the packed data
+ * @param size the size of the buffer
+ * @param[out] psp the new packing stream handle
+ * @errors
+ */
+int wiredtiger_unpack_start(WT_SESSION *session,
+ const char *format, const void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+/*!
+ * Close a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] usedp the number of bytes in the buffer used by the stream
+ * @errors
+ */
+int wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp);
+
+/*!
+ * Pack an item into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param item an item to pack
+ * @errors
+ */
+int wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item);
+
+/*!
+ * Pack a signed integer into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param i a signed integer to pack
+ * @errors
+ */
+int wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i);
+
+/*!
+ * Pack a string into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param s a string to pack
+ * @errors
+ */
+int wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s);
+
+/*!
+ * Pack an unsigned integer into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param u an unsigned integer to pack
+ * @errors
+ */
+int wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u);
+
+/*!
+ * Unpack an item from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param item an item to unpack
+ * @errors
+ */
+int wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item);
+
+/*!
+ * Unpack a signed integer from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] ip the unpacked signed integer
+ * @errors
+ */
+int wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip);
+
+/*!
+ * Unpack a string from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] sp the unpacked string
+ * @errors
+ */
+int wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp);
+
+/*!
+ * Unpack an unsigned integer from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] up the unpacked unsigned integer
+ * @errors
+ */
+int wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up);
+/*! @} */
+
+/*!
+ * @name Configuration string parsing
+ * @{
+ */
+
+/*!
+ * The configuration information returned by the WiredTiger configuration
+ * parsing functions in the WT_EXTENSION_API and the public API.
+ */
+struct __wt_config_item {
+ /*!
+ * The value of a configuration string.
+ *
+ * Regardless of the type of the configuration string (boolean, int,
+ * list or string), the \c str field will reference the value of the
+ * configuration string.
+ *
+ * The bytes referenced by \c str are <b>not</b> nul-terminated,
+ * use the \c len field instead of a terminating nul byte.
+ */
+ const char *str;
+
+ /*! The number of bytes in the value referenced by \c str. */
+ size_t len;
+
+ /*!
+ * The value of a configuration boolean or integer.
+ *
+ * If the configuration string's value is "true" or "false", the
+ * \c val field will be set to 1 (true), or 0 (false).
+ *
+ * If the configuration string can be legally interpreted as an integer,
+ * using the strtoll function rules as specified in ISO/IEC 9899:1990
+ * ("ISO C90"), that integer will be stored in the \c val field.
+ *
+ * NOTE(review): the content of \c val for values that are neither
+ * boolean nor integer is not specified here; confirm before relying
+ * on it, and check the \c type field first.
+ */
+ int64_t val;
+
+ /*! Permitted values of the \c type field. */
+ enum {
+ /*! A string value with quotes stripped. */
+ WT_CONFIG_ITEM_STRING,
+ /*! A boolean literal ("true" or "false"). */
+ WT_CONFIG_ITEM_BOOL,
+ /*! An unquoted identifier: a string value without quotes. */
+ WT_CONFIG_ITEM_ID,
+ /*! A numeric value. */
+ WT_CONFIG_ITEM_NUM,
+ /*! A nested structure or list, including brackets. */
+ WT_CONFIG_ITEM_STRUCT
+ }
+ /*!
+ * The type of value determined by the parser. In all cases,
+ * the \c str and \c len fields are set.
+ */
+ type;
+};
+
+/*!
+ * Create a handle that can be used to parse or create configuration strings
+ * compatible with WiredTiger APIs.
+ * This API is outside the scope of a WiredTiger connection handle, since
+ * applications may need to generate configuration strings prior to calling
+ * ::wiredtiger_open.
+ * @param session the session handle to be used for error reporting. If NULL,
+ * error messages will be written to stdout.
+ * @param config the configuration string being parsed. The string must
+ * remain valid for the lifetime of the parser handle.
+ * @param len the number of valid bytes in \c config
+ * @param[out] config_parserp A pointer to the newly opened handle
+ * @errors
+ */
+int wiredtiger_config_parser_open(WT_SESSION *session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp);
+
+/*!
+ * A handle that can be used to search and traverse configuration strings
+ * compatible with WiredTiger APIs.
+ * To parse the contents of a list or nested configuration string use a new
+ * configuration parser handle based on the content of the ::WT_CONFIG_ITEM
+ * retrieved from the parent configuration string.
+ *
+ * @section config_parse_examples Configuration String Parsing examples
+ *
+ * This could be used in C to create a configuration parser as follows:
+ *
+ * @snippet ex_config_parse.c Create a configuration parser
+ *
+ * Once the parser has been created the content can be queried directly:
+ *
+ * @snippet ex_config_parse.c get
+ *
+ * Or the content can be traversed linearly:
+ *
+ * @snippet ex_config_parse.c next
+ *
+ * Nested configuration values can be queried using a shorthand notation:
+ *
+ * @snippet ex_config_parse.c nested get
+ *
+ * Nested configuration values can be traversed using multiple
+ * ::WT_CONFIG_PARSER handles:
+ *
+ * @snippet ex_config_parse.c nested traverse
+ */
+struct __wt_config_parser {
+
+ /*
+ * NOTE(review): __F is a macro defined outside this section of the
+ * header; presumably it expands to a function-pointer declarator for
+ * each method below -- confirm against its definition.
+ */
+
+ /*!
+ * Close the configuration parser, releasing any resources.
+ *
+ * @param[in] config_parser the configuration parser handle
+ * @errors
+ *
+ */
+ int __F(close)(WT_CONFIG_PARSER *config_parser);
+
+ /*!
+ * Return the next key/value pair.
+ *
+ * When iteration would pass the end of the configuration string,
+ * ::WT_NOTFOUND will be returned.
+ *
+ * If an item has no explicitly assigned value, the item will be
+ * returned in \c key and the \c value will be set to the boolean
+ * \c "true" value.
+ *
+ * @param[in] config_parser the configuration parser handle
+ * @param[out] key the returned key
+ * @param[out] value the returned value
+ * @errors
+ *
+ */
+ int __F(next)(WT_CONFIG_PARSER *config_parser,
+ WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+
+ /*!
+ * Return the value of an item in the configuration string.
+ *
+ * @param[in] config_parser the configuration parser handle
+ * @param[in] key configuration key string
+ * @param[out] value the returned value
+ * @errors
+ *
+ */
+ int __F(get)(WT_CONFIG_PARSER *config_parser,
+ const char *key, WT_CONFIG_ITEM *value);
+};
+
+#endif /* !defined(SWIG) */
+/*! @} */
+
+/*!
+ * Get version information.
+ *
+ * @snippet ex_all.c Get the WiredTiger library version #1
+ * @snippet ex_all.c Get the WiredTiger library version #2
+ *
+ * @param majorp a location where the major version number is returned
+ * @param minorp a location where the minor version number is returned
+ * @param patchp a location where the patch version number is returned
+ * @returns a string representation of the version
+ */
+const char *wiredtiger_version(int *majorp, int *minorp, int *patchp);
+
+/*******************************************
+ * Error returns
+ *******************************************/
+/*!
+ * @anchor error_returns
+ * @name Error returns
+ * Most functions and methods in WiredTiger return an integer code indicating
+ * whether the operation succeeded or failed. A return of zero indicates
+ * success, all non-zero return values indicate some kind of failure.
+ *
+ * WiredTiger reserves all values from -31,800 to -31,999 as possible error
+ * return values. WiredTiger may also return C99/POSIX error codes such as
+ * \c ENOMEM, \c EINVAL and \c ENOTSUP, with the usual meanings.
+ *
+ * The following are all of the WiredTiger-specific error returns:
+ * @{
+ */
+/*
+ * DO NOT EDIT: automatically built by dist/api_err.py.
+ * Error return section: BEGIN
+ */
+/*!
+ * Attempt to insert an existing key.
+ * This error is generated when the application attempts to insert a record with
+ * the same key as an existing record without the 'overwrite' configuration to
+ * WT_SESSION::open_cursor.
+ */
+#define WT_DUPLICATE_KEY -31800
+/*!
+ * Non-specific WiredTiger error.
+ * This error is returned when an error is not covered by a specific error
+ * return.
+ */
+#define WT_ERROR -31801
+/*!
+ * Item not found.
+ * This error indicates an operation did not find a value to return. This
+ * includes cursor search and other operations where no record matched the
+ * cursor's search key such as WT_CURSOR::update or WT_CURSOR::remove.
+ */
+#define WT_NOTFOUND -31802
+/*!
+ * WiredTiger library panic.
+ * This error indicates an underlying problem that requires the application exit
+ * and restart.
+ */
+#define WT_PANIC -31803
+/*! @cond internal */
+/*! Restart the operation (internal). */
+#define WT_RESTART -31804
+/*! @endcond */
+/*!
+ * Conflict between concurrent operations.
+ * This error is generated when an operation cannot be completed due to a
+ * conflict with concurrent operations. The operation may be retried; if a
+ * transaction is in progress, it should be rolled back and the operation
+ * retried in a new transaction.
+ */
+#define WT_ROLLBACK -31805
+/*
+ * Error return section: END
+ * DO NOT EDIT: automatically built by dist/api_err.py.
+ */
+/*! @} */
+
+#ifndef DOXYGEN
+#define WT_DEADLOCK WT_ROLLBACK /* Backward compatibility */
+#endif
+
+/*! @} */
+
+/*!
+ * @defgroup wt_ext WiredTiger Extension API
+ * The functions and interfaces applications use to customize and extend the
+ * behavior of WiredTiger.
+ * @{
+ */
+
+/*******************************************
+ * Forward structure declarations for the extension API
+ *******************************************/
+struct __wt_config_arg; typedef struct __wt_config_arg WT_CONFIG_ARG;
+
+/*!
+ * The interface implemented by applications to provide custom ordering of
+ * records.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::add_collator.
+ *
+ * @snippet ex_extending.c add collator nocase
+ *
+ * @snippet ex_extending.c add collator prefix10
+ */
+struct __wt_collator {
+ /*!
+ * Callback to compare keys.
+ *
+ * @param[in] collator the WT_COLLATOR handle
+ * @param[in] session the session handle
+ * @param[in] key1 the first key to compare
+ * @param[in] key2 the second key to compare
+ * @param[out] cmp set to -1 if <code>key1 < key2</code>,
+ * 0 if <code>key1 == key2</code>,
+ * 1 if <code>key1 > key2</code>.
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet ex_all.c Implement WT_COLLATOR
+ *
+ * @snippet ex_extending.c case insensitive comparator
+ *
+ * @snippet ex_extending.c n character comparator
+ */
+ int (*compare)(WT_COLLATOR *collator, WT_SESSION *session,
+ const WT_ITEM *key1, const WT_ITEM *key2, int *cmp);
+
+ /*!
+ * If non-NULL, this callback is called to customize the collator
+ * for each data source. If the callback returns a non-NULL
+ * collator, that instance is used instead of this one for all
+ * comparisons.
+ *
+ * @param[in] collator the WT_COLLATOR handle
+ * @param[in] session the session handle
+ * @param[in] uri presumably the URI of the data source being
+ * customized (confirm against the caller)
+ * @param[in] appcfg presumably application-supplied configuration
+ * for the collator (confirm against the caller)
+ * @param[out] customp the customized collator returned by the
+ * callback, if any
+ */
+ int (*customize)(WT_COLLATOR *collator, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ITEM *appcfg, WT_COLLATOR **customp);
+
+ /*!
+ * If non-NULL, a callback performed when the database is closed.
+ *
+ * The WT_COLLATOR::terminate callback is intended to allow cleanup,
+ * the handle will not be subsequently accessed by WiredTiger.
+ *
+ * @param[in] collator the WT_COLLATOR handle
+ * @param[in] session the session handle
+ */
+ int (*terminate)(WT_COLLATOR *collator, WT_SESSION *session);
+};
+
+/*!
+ * The interface implemented by applications to provide custom compression.
+ *
+ * Compressors must implement the WT_COMPRESSOR interface: the
+ * WT_COMPRESSOR::compress and WT_COMPRESSOR::decompress callbacks must be
+ * specified, and WT_COMPRESSOR::pre_size is optional. To build your own
+ * compressor, use one of the compressors in \c ext/compressors as a template:
+ * \c ext/nop_compress is a simple compressor that passes through data
+ * unchanged, and is a reasonable starting point.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::add_compressor.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization structure
+ * @snippet nop_compress.c WT_COMPRESSOR initialization function
+ */
+struct __wt_compressor {
+ /*!
+ * Callback to compress a chunk of data.
+ *
+ * WT_COMPRESSOR::compress takes a source buffer and a destination
+ * buffer, by default of the same size. If the callback can compress
+ * the buffer to a smaller size in the destination, it does so, sets
+ * the \c compression_failed return to 0 and returns 0. If compression
+ * does not produce a smaller result, the callback sets the
+ * \c compression_failed return to 1 and returns 0. If another
+ * error occurs, it returns an errno or WiredTiger error code.
+ *
+ * On entry, \c src will point to memory, with the length of the memory
+ * in \c src_len. After successful completion, the callback should
+ * return \c 0 and set \c result_lenp to the number of bytes required
+ * for the compressed representation.
+ *
+ * On entry, \c dst points to the destination buffer with a length
+ * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified,
+ * the destination buffer will be at least the size returned by that
+ * method; otherwise, the destination buffer will be at least as large
+ * as \c src_len.
+ *
+ * If compression would not shrink the data or the \c dst buffer is not
+ * large enough to hold the compressed data, the callback should set
+ * \c compression_failed to a non-zero value and return 0.
+ *
+ * @param[in] src the data to compress
+ * @param[in] src_len the length of the data to compress
+ * @param[in] dst the destination buffer
+ * @param[in] dst_len the length of the destination buffer
+ * @param[out] result_lenp the length of the compressed data
+ * @param[out] compression_failed non-zero if compression did not
+ * decrease the length of the data (compression may not have completed)
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR compress
+ */
+ int (*compress)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp, int *compression_failed);
+
+ /*!
+ * Callback to compress a list of byte strings.
+ *
+ * WT_COMPRESSOR::compress_raw gives applications fine-grained control
+ * over disk block size when writing row-store or variable-length
+ * column-store pages. Where this level of control is not required by
+ * the underlying storage device, set the WT_COMPRESSOR::compress_raw
+ * callback to \c NULL and WiredTiger will internally split each page
+ * into blocks, each block then compressed by WT_COMPRESSOR::compress.
+ *
+ * WT_COMPRESSOR::compress_raw takes a source buffer and an array of
+ * 0-based offsets of byte strings in that buffer. The callback then
+ * encodes none, some or all of the byte strings and copies the encoded
+ * representation into a destination buffer. The callback returns the
+ * number of byte strings encoded and the bytes needed for the encoded
+ * representation. The encoded representation has header information
+ * prepended and is written as a block to the underlying file object.
+ *
+ * On entry, \c page_max is the configured maximum size for objects of
+ * this type. (This value is provided for convenience, and will be
+ * either the \c internal_page_max or \c leaf_page_max value specified
+ * to WT_SESSION::create when the object was created.)
+ *
+ * On entry, \c split_pct is the configured Btree page split size for
+ * this object. (This value is provided for convenience, and will be
+ * the \c split_pct value specified to WT_SESSION::create when the
+ * object was created.)
+ *
+ * On entry, \c extra is a count of additional bytes that will be added
+ * to the encoded representation before it is written. In other words,
+ * if the target write size is 8KB, the returned encoded representation
+ * should be less than or equal to (8KB - \c extra). The method does
+ * not need to skip bytes in the destination buffer based on \c extra,
+ * the method should only use \c extra to decide how many bytes to store
+ * into the destination buffer for its ideal block size.
+ *
+ * On entry, \c src points to the source buffer; \c offsets is an array
+ * of \c slots 0-based offsets into \c src, where each offset is the
+ * start of a byte string, except for the last offset, which is the
+ * offset of the first byte past the end of the last byte string. (In
+ * other words, <code>offsets[0]</code> will be 0, the offset of the
+ * first byte of the first byte string in \c src, and
+ * <code>offsets[slots]</code> is the total length of all of the byte
+ * strings in the \c src buffer.)
+ *
+ * On entry, \c dst points to the destination buffer with a length
+ * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified,
+ * the destination buffer will be at least the size returned by that
+ * method; otherwise, the destination buffer will be at least the
+ * maximum size for the page being written (that is, when writing a
+ * row-store leaf page, the destination buffer will be at least as
+ * large as the \c leaf_page_max configuration value).
+ *
+ * After successful completion, the callback should return \c 0, and
+ * set \c result_slotsp to the number of byte strings encoded and
+ * \c result_lenp to the bytes needed for the encoded representation.
+ *
+ * There is no requirement the callback encode any or all of the byte
+ * strings passed by WiredTiger. If the callback does not encode any
+ * of the byte strings and compression should not be retried, the
+ * callback should set \c result_slotsp to 0.
+ *
+ * If the callback does not encode any of the byte strings and
+ * compression should be retried with additional byte strings, the
+ * callback must return \c EAGAIN. In that case, WiredTiger will
+ * accumulate more rows and repeat the call.
+ *
+ * If there are no more rows to accumulate or the callback indicates
+ * that it cannot be retried, WiredTiger writes the remaining rows
+ * using \c WT_COMPRESSOR::compress.
+ *
+ * On entry, \c final is zero if there are more rows to be written as
+ * part of this page (if there will be additional data provided to the
+ * callback), and non-zero if there are no more rows to be written as
+ * part of this page. If \c final is set and the callback fails to
+ * encode any rows, WiredTiger writes the remaining rows without further
+ * calls to the callback. If \c final is set and the callback encodes
+ * any number of rows, WiredTiger continues to call the callback until
+ * all of the rows are encoded or the callback fails to encode any rows.
+ *
+ * The WT_COMPRESSOR::compress_raw callback is intended for applications
+ * wanting to create disk blocks in specific sizes.
+ * WT_COMPRESSOR::compress_raw is not a replacement for
+ * WT_COMPRESSOR::compress: objects which WT_COMPRESSOR::compress_raw
+ * cannot handle (for example, overflow key or value items), or which
+ * WT_COMPRESSOR::compress_raw chooses not to compress for any reason
+ * (for example, if WT_COMPRESSOR::compress_raw callback chooses not to
+ * compress a small number of rows, but the page being written has no
+ * more rows to accumulate), will be passed to WT_COMPRESSOR::compress.
+ *
+ * The WT_COMPRESSOR::compress_raw callback is only called for objects
+ * where it is applicable, that is, for row-store and variable-length
+ * column-store objects, where both row-store key prefix compression
+ * and row-store and variable-length column-store dictionary compression
+ * are \b not configured. When WT_COMPRESSOR::compress_raw is not
+ * applicable, the WT_COMPRESSOR::compress callback is used instead.
+ *
+ * @param[in] page_max the configured maximum page size for this object
+ * @param[in] split_pct the configured page split size for this object
+ * @param[in] extra the count of the additional bytes
+ * @param[in] src the data to compress
+ * @param[in] offsets the byte offsets of the byte strings in src
+ * @param[in] slots the number of entries in offsets
+ * @param[in] dst the destination buffer
+ * @param[in] dst_len the length of the destination buffer
+ * @param[in] final non-zero if there are no more rows to accumulate
+ * @param[out] result_lenp the length of the compressed data
+ * @param[out] result_slotsp the number of byte offsets taken
+ * @returns zero for success, non-zero to indicate an error.
+ */
+ int (*compress_raw)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ size_t page_max, int split_pct, size_t extra,
+ uint8_t *src, uint32_t *offsets, uint32_t slots,
+ uint8_t *dst, size_t dst_len,
+ int final,
+ size_t *result_lenp, uint32_t *result_slotsp);
+
+ /*!
+ * Callback to decompress a chunk of data.
+ *
+ * WT_COMPRESSOR::decompress takes a source buffer and a destination
+ * buffer. The contents are switched from \c compress: the
+ * source buffer is the compressed value, and the destination buffer is
+ * sized to be the original size. If the callback successfully
+ * decompresses the source buffer to the destination buffer, it returns
+ * 0. If an error occurs, it returns an errno or WiredTiger error code.
+ * The source buffer that WT_COMPRESSOR::decompress takes may have a
+ * size that is rounded up from the size originally produced by
+ * WT_COMPRESSOR::compress, with the remainder of the buffer set to
+ * zeroes. Most compressors do not care about this difference if the
+ * size to be decompressed can be implicitly discovered from the
+ * compressed data. If your compressor cares, you may need to allocate
+ * space for, and store, the actual size in the compressed buffer. See
+ * the source code for the included snappy compressor for an example.
+ *
+ * On entry, \c src will point to memory, with the length of the memory
+ * in \c src_len. After successful completion, the callback should
+ * return \c 0 and set \c result_lenp to the number of bytes required
+ * for the decompressed representation.
+ *
+ * If the \c dst buffer is not big enough to hold the decompressed
+ * data, the callback should return an error.
+ *
+ * @param[in] src the data to decompress
+ * @param[in] src_len the length of the data to decompress
+ * @param[in] dst the destination buffer
+ * @param[in] dst_len the length of the destination buffer
+ * @param[out] result_lenp the length of the decompressed data
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR decompress
+ */
+ int (*decompress)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp);
+
+ /*!
+ * Callback to size a destination buffer for compression
+ *
+ * WT_COMPRESSOR::pre_size is an optional callback that, given the
+ * source buffer and size, produces the size of the destination buffer
+ * to be given to WT_COMPRESSOR::compress. This is useful for
+ * compressors that assume that the output buffer is sized for the
+ * worst case and thus no overrun checks are made. If your compressor
+ * works like this, WT_COMPRESSOR::pre_size will need to be defined.
+ * See the source code for the snappy compressor for an example.
+ * However, if your compressor detects and avoids overruns against its
+ * target buffer, you will not need to define WT_COMPRESSOR::pre_size.
+ * When WT_COMPRESSOR::pre_size is set to NULL, the destination buffer
+ * is sized the same as the source buffer. This is always sufficient,
+ * since a compression result that is larger than the source buffer is
+ * discarded by WiredTiger.
+ *
+ * If not NULL, this callback is called before each call to
+ * WT_COMPRESSOR::compress to determine the size of the destination
+ * buffer to provide. If the callback is NULL, the destination
+ * buffer will be the same size as the source buffer.
+ *
+ * The callback should set \c result_lenp to a suitable buffer size
+ * for compression, typically the maximum length required by
+ * WT_COMPRESSOR::compress.
+ *
+ * This callback function is for compressors that require an output
+ * buffer larger than the source buffer (for example, that do not
+ * check for buffer overflow during compression).
+ *
+ * @param[in] src the data to compress
+ * @param[in] src_len the length of the data to compress
+ * @param[out] result_lenp the required destination buffer size
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR presize
+ */
+ int (*pre_size)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len, size_t *result_lenp);
+
+ /*!
+ * If non-NULL, a callback performed when the database is closed.
+ *
+ * The WT_COMPRESSOR::terminate callback is intended to allow cleanup;
+ * the handle will not be subsequently accessed by WiredTiger.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR terminate
+ */
+ int (*terminate)(WT_COMPRESSOR *compressor, WT_SESSION *session);
+};
+
+/*!
+ * Applications can extend WiredTiger by providing new implementations of the
+ * WT_DATA_SOURCE class. Each data source supports a different URI scheme for
+ * data sources passed to WT_SESSION::create, WT_SESSION::open_cursor and
+ * related methods. See @ref custom_data_sources for more information.
+ *
+ * <b>Thread safety:</b> WiredTiger may invoke methods on the WT_DATA_SOURCE
+ * interface from multiple threads concurrently. It is the responsibility of
+ * the implementation to protect any shared data.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::add_data_source.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE register
+ */
+struct __wt_data_source {
+ /*!
+ * Callback to create a new object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE create
+ */
+ int (*create)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to compact an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE compact
+ */
+ int (*compact)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to drop an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE drop
+ */
+ int (*drop)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to initialize a cursor.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE open_cursor
+ */
+ int (*open_cursor)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config, WT_CURSOR **new_cursor);
+
+ /*!
+ * Callback to rename an object from \c uri to \c newuri.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE rename
+ */
+ int (*rename)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, const char *newuri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to salvage an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE salvage
+ */
+ int (*salvage)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to truncate an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE truncate
+ */
+ int (*truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to truncate a range of an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE range truncate
+ */
+ int (*range_truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ WT_CURSOR *start, WT_CURSOR *stop);
+
+ /*!
+ * Callback to verify an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE verify
+ */
+ int (*verify)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to checkpoint the database.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE checkpoint
+ */
+ int (*checkpoint)(
+ WT_DATA_SOURCE *dsrc, WT_SESSION *session, WT_CONFIG_ARG *config);
+
+ /*!
+ * If non-NULL, a callback performed when the database is closed.
+ *
+ * The WT_DATA_SOURCE::terminate callback is intended to allow cleanup;
+ * the handle will not be subsequently accessed by WiredTiger.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE terminate
+ */
+ int (*terminate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session);
+};
+
+/*!
+ * The interface implemented by applications to provide custom extraction of
+ * index keys or column group values.
+ *
+ * Applications register implementations with WiredTiger by calling
+ * WT_CONNECTION::add_extractor.
+ *
+ * @snippet ex_all.c WT_EXTRACTOR register
+ */
+struct __wt_extractor {
+ /*!
+ * Callback to extract an index key or column group value into \c result.
+ *
+ * @errors
+ *
+ * @snippet ex_all.c WT_EXTRACTOR
+ */
+ int (*extract)(WT_EXTRACTOR *extractor, WT_SESSION *session,
+ const WT_ITEM *key, const WT_ITEM *value, WT_ITEM *result);
+};
+
+/*!
+ * Entry point to an extension, called when the extension is loaded.
+ *
+ * @param connection the connection handle
+ * @param config the configuration passed to WT_CONNECTION::load_extension
+ * @errors
+ */
+extern int wiredtiger_extension_init(
+ WT_CONNECTION *connection, WT_CONFIG_ARG *config);
+
+/*!
+ * Optional cleanup function for an extension, called during
+ * WT_CONNECTION::close.
+ *
+ * @param connection the connection handle being closed
+ * @errors
+ */
+extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
+
+/*! @} */
+
+/*******************************************
+ * Statistic reference.
+ *******************************************/
+/*!
+ * @addtogroup wt
+ * @{
+ */
+/*
+ * DO NOT EDIT: automatically built by dist/api_stat.py.
+ * Statistics section: BEGIN
+ */
+
+/*!
+ * @name Connection statistics
+ * @anchor statistics_keys
+ * @anchor statistics_conn
+ * Statistics are accessed through cursors with \c "statistics:" URIs.
+ * Individual statistics can be queried through the cursor using the following
+ * keys. See @ref data_statistics for more information.
+ * @{
+ */
+/*! async: number of allocation state races */
+#define WT_STAT_CONN_ASYNC_ALLOC_RACE 1000
+/*! async: number of op slots viewed for alloc */
+#define WT_STAT_CONN_ASYNC_ALLOC_VIEW 1001
+/*! async: current work queue length */
+#define WT_STAT_CONN_ASYNC_CUR_QUEUE 1002
+/*! async: number of async flush calls */
+#define WT_STAT_CONN_ASYNC_FLUSH 1003
+/*! async: number of times op allocation failed */
+#define WT_STAT_CONN_ASYNC_FULL 1004
+/*! async: maximum work queue length */
+#define WT_STAT_CONN_ASYNC_MAX_QUEUE 1005
+/*! async: number of times worker found no work */
+#define WT_STAT_CONN_ASYNC_NOWORK 1006
+/*! async: op allocations */
+#define WT_STAT_CONN_ASYNC_OP_ALLOC 1007
+/*! async: op compact calls */
+#define WT_STAT_CONN_ASYNC_OP_COMPACT 1008
+/*! async: op insert calls */
+#define WT_STAT_CONN_ASYNC_OP_INSERT 1009
+/*! async: op remove calls */
+#define WT_STAT_CONN_ASYNC_OP_REMOVE 1010
+/*! async: op search calls */
+#define WT_STAT_CONN_ASYNC_OP_SEARCH 1011
+/*! async: op update calls */
+#define WT_STAT_CONN_ASYNC_OP_UPDATE 1012
+/*! block manager: mapped bytes read */
+#define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1013
+/*! block manager: bytes read */
+#define WT_STAT_CONN_BLOCK_BYTE_READ 1014
+/*! block manager: bytes written */
+#define WT_STAT_CONN_BLOCK_BYTE_WRITE 1015
+/*! block manager: mapped blocks read */
+#define WT_STAT_CONN_BLOCK_MAP_READ 1016
+/*! block manager: blocks pre-loaded */
+#define WT_STAT_CONN_BLOCK_PRELOAD 1017
+/*! block manager: blocks read */
+#define WT_STAT_CONN_BLOCK_READ 1018
+/*! block manager: blocks written */
+#define WT_STAT_CONN_BLOCK_WRITE 1019
+/*! cache: tracked dirty bytes in the cache */
+#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1020
+/*! cache: bytes currently in the cache */
+#define WT_STAT_CONN_CACHE_BYTES_INUSE 1021
+/*! cache: maximum bytes configured */
+#define WT_STAT_CONN_CACHE_BYTES_MAX 1022
+/*! cache: bytes read into cache */
+#define WT_STAT_CONN_CACHE_BYTES_READ 1023
+/*! cache: bytes written from cache */
+#define WT_STAT_CONN_CACHE_BYTES_WRITE 1024
+/*! cache: checkpoint blocked page eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1025
+/*! cache: unmodified pages evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1026
+/*! cache: page split during eviction deepened the tree */
+#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1027
+/*! cache: modified pages evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1028
+/*! cache: pages selected for eviction unable to be evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1029
+/*! cache: pages evicted because they exceeded the in-memory maximum */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1030
+/*! cache: failed eviction of pages that exceeded the in-memory maximum */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1031
+/*! cache: hazard pointer blocked page eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1032
+/*! cache: internal pages evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1033
+/*! cache: eviction server candidate queue empty when topping up */
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1034
+/*! cache: eviction server candidate queue not empty when topping up */
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1035
+/*! cache: eviction server evicting pages */
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1036
+/*! cache: eviction server populating queue, but not evicting pages */
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1037
+/*! cache: eviction server unable to reach eviction goal */
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1038
+/*! cache: pages split during eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1039
+/*! cache: pages walked for eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1040
+/*! cache: tracked dirty pages in the cache */
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1041
+/*! cache: pages currently held in the cache */
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1042
+/*! cache: pages read into cache */
+#define WT_STAT_CONN_CACHE_READ 1043
+/*! cache: pages written from cache */
+#define WT_STAT_CONN_CACHE_WRITE 1044
+/*! conn: pthread mutex condition wait calls */
+#define WT_STAT_CONN_COND_WAIT 1045
+/*! Btree: cursor create calls */
+#define WT_STAT_CONN_CURSOR_CREATE 1046
+/*! Btree: cursor insert calls */
+#define WT_STAT_CONN_CURSOR_INSERT 1047
+/*! Btree: cursor next calls */
+#define WT_STAT_CONN_CURSOR_NEXT 1048
+/*! Btree: cursor prev calls */
+#define WT_STAT_CONN_CURSOR_PREV 1049
+/*! Btree: cursor remove calls */
+#define WT_STAT_CONN_CURSOR_REMOVE 1050
+/*! Btree: cursor reset calls */
+#define WT_STAT_CONN_CURSOR_RESET 1051
+/*! Btree: cursor search calls */
+#define WT_STAT_CONN_CURSOR_SEARCH 1052
+/*! Btree: cursor search near calls */
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1053
+/*! Btree: cursor update calls */
+#define WT_STAT_CONN_CURSOR_UPDATE 1054
+/*! dhandle: session dhandles swept */
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1055
+/*! dhandle: session sweep attempts */
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1056
+/*! conn: files currently open */
+#define WT_STAT_CONN_FILE_OPEN 1057
+/*! log: log buffer size increases */
+#define WT_STAT_CONN_LOG_BUFFER_GROW 1058
+/*! log: total log buffer size */
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1059
+/*! log: user provided log bytes written */
+#define WT_STAT_CONN_LOG_BYTES_USER 1060
+/*! log: log bytes written */
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1061
+/*! log: yields waiting for previous log file close */
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1062
+/*! log: maximum log file size */
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1063
+/*! log: log read operations */
+#define WT_STAT_CONN_LOG_READS 1064
+/*! log: records processed by log scan */
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1065
+/*! log: log scan records requiring two reads */
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1066
+/*! log: log scan operations */
+#define WT_STAT_CONN_LOG_SCANS 1067
+/*! log: consolidated slot closures */
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1068
+/*! log: logging bytes consolidated */
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1069
+/*! log: consolidated slot joins */
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1070
+/*! log: consolidated slot join races */
+#define WT_STAT_CONN_LOG_SLOT_RACES 1071
+/*! log: slots selected for switching that were unavailable */
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1072
+/*! log: record size exceeded maximum */
+#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1073
+/*! log: failed to find a slot large enough for record */
+#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1074
+/*! log: consolidated slot join transitions */
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1075
+/*! log: log sync operations */
+#define WT_STAT_CONN_LOG_SYNC 1076
+/*! log: log write operations */
+#define WT_STAT_CONN_LOG_WRITES 1077
+/*! LSM: sleep for LSM checkpoint throttle */
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1078
+/*! LSM: sleep for LSM merge throttle */
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1079
+/*! LSM: rows merged in an LSM tree */
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1080
+/*! LSM: App work units currently queued */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1081
+/*! LSM: Merge work units currently queued */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1082
+/*! LSM: tree queue hit maximum */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1083
+/*! LSM: Switch work units currently queued */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1084
+/*! LSM: tree maintenance operations scheduled */
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1085
+/*! LSM: tree maintenance operations discarded */
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1086
+/*! LSM: tree maintenance operations executed */
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1087
+/*! conn: memory allocations */
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1088
+/*! conn: memory frees */
+#define WT_STAT_CONN_MEMORY_FREE 1089
+/*! conn: memory re-allocations */
+#define WT_STAT_CONN_MEMORY_GROW 1090
+/*! conn: total read I/Os */
+#define WT_STAT_CONN_READ_IO 1091
+/*! reconciliation: page reconciliation calls */
+#define WT_STAT_CONN_REC_PAGES 1092
+/*! reconciliation: page reconciliation calls for eviction */
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1093
+/*! reconciliation: split bytes currently awaiting free */
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1094
+/*! reconciliation: split objects currently awaiting free */
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1095
+/*! conn: pthread mutex shared lock read-lock calls */
+#define WT_STAT_CONN_RWLOCK_READ 1096
+/*! conn: pthread mutex shared lock write-lock calls */
+#define WT_STAT_CONN_RWLOCK_WRITE 1097
+/*! session: open cursor count */
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1098
+/*! session: open session count */
+#define WT_STAT_CONN_SESSION_OPEN 1099
+/*! txn: transaction begins */
+#define WT_STAT_CONN_TXN_BEGIN 1100
+/*! txn: transaction checkpoints */
+#define WT_STAT_CONN_TXN_CHECKPOINT 1101
+/*! txn: transaction checkpoint currently running */
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1102
+/*! txn: transactions committed */
+#define WT_STAT_CONN_TXN_COMMIT 1103
+/*! txn: transaction failures due to cache overflow */
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1104
+/*! txn: transaction range of IDs currently pinned */
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1105
+/*! txn: transactions rolled back */
+#define WT_STAT_CONN_TXN_ROLLBACK 1106
+/*! conn: total write I/Os */
+#define WT_STAT_CONN_WRITE_IO 1107
+
+/*!
+ * @}
+ * @name Statistics for data sources
+ * @anchor statistics_dsrc
+ * @{
+ */
+/*! block manager: file allocation unit size */
+#define WT_STAT_DSRC_ALLOCATION_SIZE 2000
+/*! block manager: blocks allocated */
+#define WT_STAT_DSRC_BLOCK_ALLOC 2001
+/*! block manager: checkpoint size */
+#define WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE 2002
+/*! block manager: allocations requiring file extension */
+#define WT_STAT_DSRC_BLOCK_EXTENSION 2003
+/*! block manager: blocks freed */
+#define WT_STAT_DSRC_BLOCK_FREE 2004
+/*! block manager: file magic number */
+#define WT_STAT_DSRC_BLOCK_MAGIC 2005
+/*! block manager: file major version number */
+#define WT_STAT_DSRC_BLOCK_MAJOR 2006
+/*! block manager: minor version number */
+#define WT_STAT_DSRC_BLOCK_MINOR 2007
+/*! block manager: file bytes available for reuse */
+#define WT_STAT_DSRC_BLOCK_REUSE_BYTES 2008
+/*! block manager: file size in bytes */
+#define WT_STAT_DSRC_BLOCK_SIZE 2009
+/*! LSM: bloom filters in the LSM tree */
+#define WT_STAT_DSRC_BLOOM_COUNT 2010
+/*! LSM: bloom filter false positives */
+#define WT_STAT_DSRC_BLOOM_FALSE_POSITIVE 2011
+/*! LSM: bloom filter hits */
+#define WT_STAT_DSRC_BLOOM_HIT 2012
+/*! LSM: bloom filter misses */
+#define WT_STAT_DSRC_BLOOM_MISS 2013
+/*! LSM: bloom filter pages evicted from cache */
+#define WT_STAT_DSRC_BLOOM_PAGE_EVICT 2014
+/*! LSM: bloom filter pages read into cache */
+#define WT_STAT_DSRC_BLOOM_PAGE_READ 2015
+/*! LSM: total size of bloom filters */
+#define WT_STAT_DSRC_BLOOM_SIZE 2016
+/*! btree: column-store variable-size deleted values */
+#define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2017
+/*! btree: column-store fixed-size leaf pages */
+#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2018
+/*! btree: column-store internal pages */
+#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2019
+/*! btree: column-store variable-size leaf pages */
+#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2020
+/*! btree: pages rewritten by compaction */
+#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2021
+/*! btree: number of key/value pairs */
+#define WT_STAT_DSRC_BTREE_ENTRIES 2022
+/*! btree: fixed-record size */
+#define WT_STAT_DSRC_BTREE_FIXED_LEN 2023
+/*! btree: maximum tree depth */
+#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2024
+/*! btree: maximum internal page item size */
+#define WT_STAT_DSRC_BTREE_MAXINTLITEM 2025
+/*! btree: maximum internal page size */
+#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2026
+/*! btree: maximum leaf page item size */
+#define WT_STAT_DSRC_BTREE_MAXLEAFITEM 2027
+/*! btree: maximum leaf page size */
+#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2028
+/*! btree: overflow pages */
+#define WT_STAT_DSRC_BTREE_OVERFLOW 2029
+/*! btree: row-store internal pages */
+#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2030
+/*! btree: row-store leaf pages */
+#define WT_STAT_DSRC_BTREE_ROW_LEAF 2031
+/*! cache: bytes read into cache */
+#define WT_STAT_DSRC_CACHE_BYTES_READ 2032
+/*! cache: bytes written from cache */
+#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2033
+/*! cache: checkpoint blocked page eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2034
+/*! cache: unmodified pages evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2035
+/*! cache: modified pages evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2036
+/*! cache: data source pages selected for eviction unable to be evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2037
+/*! cache: hazard pointer blocked page eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2038
+/*! cache: internal pages evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2039
+/*! cache: overflow values cached in memory */
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2040
+/*! cache: pages read into cache */
+#define WT_STAT_DSRC_CACHE_READ 2041
+/*! cache: overflow pages read into cache */
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2042
+/*! cache: pages written from cache */
+#define WT_STAT_DSRC_CACHE_WRITE 2043
+/*! compression: raw compression call failed, no additional data available */
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2044
+/*! compression: raw compression call failed, additional data available */
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2045
+/*! compression: raw compression call succeeded */
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2046
+/*! compression: compressed pages read */
+#define WT_STAT_DSRC_COMPRESS_READ 2047
+/*! compression: compressed pages written */
+#define WT_STAT_DSRC_COMPRESS_WRITE 2048
+/*! compression: page written failed to compress */
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2049
+/*! compression: page written was too small to compress */
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2050
+/*! cursor: create calls */
+#define WT_STAT_DSRC_CURSOR_CREATE 2051
+/*! cursor: insert calls */
+#define WT_STAT_DSRC_CURSOR_INSERT 2052
+/*! cursor: bulk-loaded cursor-insert calls */
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2053
+/*! cursor: cursor-insert key and value bytes inserted */
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2054
+/*! cursor: next calls */
+#define WT_STAT_DSRC_CURSOR_NEXT 2055
+/*! cursor: prev calls */
+#define WT_STAT_DSRC_CURSOR_PREV 2056
+/*! cursor: remove calls */
+#define WT_STAT_DSRC_CURSOR_REMOVE 2057
+/*! cursor: cursor-remove key bytes removed */
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2058
+/*! cursor: reset calls */
+#define WT_STAT_DSRC_CURSOR_RESET 2059
+/*! cursor: search calls */
+#define WT_STAT_DSRC_CURSOR_SEARCH 2060
+/*! cursor: search near calls */
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2061
+/*! cursor: update calls */
+#define WT_STAT_DSRC_CURSOR_UPDATE 2062
+/*! cursor: cursor-update value bytes updated */
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2063
+/*! LSM: sleep for LSM checkpoint throttle */
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2064
+/*! LSM: chunks in the LSM tree */
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2065
+/*! LSM: highest merge generation in the LSM tree */
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2066
+/*! LSM: queries that could have benefited from a Bloom filter that did
+ * not exist */
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2067
+/*! LSM: sleep for LSM merge throttle */
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2068
+/*! reconciliation: dictionary matches */
+#define WT_STAT_DSRC_REC_DICTIONARY 2069
+/*! reconciliation: internal page multi-block writes */
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2070
+/*! reconciliation: leaf page multi-block writes */
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2071
+/*! reconciliation: maximum blocks required for a page */
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2072
+/*! reconciliation: internal-page overflow keys */
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2073
+/*! reconciliation: leaf-page overflow keys */
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2074
+/*! reconciliation: overflow values written */
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2075
+/*! reconciliation: pages deleted */
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2076
+/*! reconciliation: page checksum matches */
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2077
+/*! reconciliation: page reconciliation calls */
+#define WT_STAT_DSRC_REC_PAGES 2078
+/*! reconciliation: page reconciliation calls for eviction */
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2079
+/*! reconciliation: leaf page key bytes discarded using prefix compression */
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2080
+/*! reconciliation: internal page key bytes discarded using suffix
+ * compression */
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2081
+/*! session: object compaction */
+#define WT_STAT_DSRC_SESSION_COMPACT 2082
+/*! session: open cursor count */
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2083
+/*! txn: update conflicts */
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2084
+/*! @} */
+/*
+ * Statistics section: END
+ * DO NOT EDIT: automatically built by dist/api_stat.py.
+ */
+/*!
+ * @name Log record and operation types
+ * @anchor log_types
+ * @{
+ */
+/*
+ * DO NOT EDIT: automatically built by dist/log.py.
+ * Log record declarations: BEGIN
+ */
+/*! invalid operation */
+#define WT_LOGOP_INVALID 0
+/*! checkpoint */
+#define WT_LOGREC_CHECKPOINT 0
+/*! transaction commit */
+#define WT_LOGREC_COMMIT 1
+/*! file sync */
+#define WT_LOGREC_FILE_SYNC 2
+/*! message */
+#define WT_LOGREC_MESSAGE 3
+/*! column put */
+#define WT_LOGOP_COL_PUT 1
+/*! column remove */
+#define WT_LOGOP_COL_REMOVE 2
+/*! column truncate */
+#define WT_LOGOP_COL_TRUNCATE 3
+/*! row put */
+#define WT_LOGOP_ROW_PUT 4
+/*! row remove */
+#define WT_LOGOP_ROW_REMOVE 5
+/*! row truncate */
+#define WT_LOGOP_ROW_TRUNCATE 6
+/*
+ * Log record declarations: END
+ * DO NOT EDIT: automatically built by dist/log.py.
+ */
+/*! @} */
+/*! @} */
+
+#undef __F
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* __WIREDTIGER_H_ */
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger_ext.h b/src/third_party/wiredtiger/src/include/wiredtiger_ext.h
new file mode 100644
index 00000000000..fd0282cd50c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wiredtiger_ext.h
@@ -0,0 +1,398 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#ifndef __WIREDTIGER_EXT_H_
+#define __WIREDTIGER_EXT_H_
+
+#include <wiredtiger.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if !defined(SWIG)
+
+/*!
+ * @addtogroup wt_ext
+ * @{
+ */
+
+/*!
+ * Read-committed isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_READ_COMMITTED 1
+/*!
+ * Read-uncommitted isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_READ_UNCOMMITTED 2
+/*!
+ * Snapshot isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_SNAPSHOT 3
+
+typedef struct __wt_txn_notify WT_TXN_NOTIFY;
+/*!
+ * A transaction resolution handler, passed to
+ * WT_EXTENSION_API::transaction_notify.
+ */
+struct __wt_txn_notify {
+ /*!
+ * A method called when the session's current transaction is committed
+ * or rolled back.
+ *
+ * @param notify a pointer to the event handler
+ * @param session the current session handle
+ * @param txnid the transaction ID
+ * @param committed an integer value which is non-zero if the
+ * transaction is being committed.
+ */
+ int (*notify)(WT_TXN_NOTIFY *notify, WT_SESSION *session,
+ uint64_t txnid, int committed);
+};
+
+/*!
+ * Table of WiredTiger extension methods.
+ *
+ * This structure is used to provide a set of WiredTiger methods to extension
+ * modules without needing to link the modules with the WiredTiger library.
+ *
+ * The extension methods may be used both by modules that are linked with
+ * the WiredTiger library (for example, a data source configured using the
+ * WT_CONNECTION::add_data_source method), and by modules not linked with the
+ * WiredTiger library (for example, a compression module configured using the
+ * WT_CONNECTION::add_compressor method).
+ *
+ * To use these functions:
+ * - include the wiredtiger_ext.h header file,
+ * - declare a variable which references a WT_EXTENSION_API structure, and
+ * - initialize the variable using WT_CONNECTION::get_extension_api method.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API declaration
+ *
+ * The following code is from the sample compression module, where compression
+ * extension functions are configured in the extension's entry point:
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization structure
+ * @snippet nop_compress.c WT_COMPRESSOR initialization function
+ */
+struct __wt_extension_api {
+/* !!! To maintain backwards compatibility, this structure is append-only. */
+#if !defined(DOXYGEN)
+ /*
+ * Private fields.
+ */
+ WT_CONNECTION *conn; /* Enclosing connection */
+#endif
+ /*!
+ * Insert an error message into the WiredTiger error stream.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param fmt a printf-like format specification
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API err_printf
+ */
+ int (*err_printf)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *fmt, ...);
+
+ /*!
+ * Insert a message into the WiredTiger message stream.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param fmt a printf-like format specification
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API msg_printf
+ */
+ int (*msg_printf)(
+ WT_EXTENSION_API *, WT_SESSION *session, const char *fmt, ...);
+
+ /*!
+ * Return information about an error as a string; the strerror method
+ * is a superset of the ISO C99/POSIX 1003.1-2001 function strerror.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API strerror
+ *
+ * @param err a return value from a WiredTiger, C library or POSIX
+ * function
+ * @returns a string representation of the error
+ */
+ const char *(*strerror)(int err);
+
+ /*!
+ * Allocate short-term use scratch memory.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param bytes the number of bytes of memory needed
+ * @returns A valid memory reference on success or NULL on error
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API scr_alloc
+ */
+ void *(*scr_alloc)(
+ WT_EXTENSION_API *wt_api, WT_SESSION *session, size_t bytes);
+
+ /*!
+ * Free short-term use scratch memory.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param ref a memory reference returned by WT_EXTENSION_API::scr_alloc
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API scr_free
+ */
+ void (*scr_free)(WT_EXTENSION_API *, WT_SESSION *session, void *ref);
+
+ /*!
+ * Configure the extension collator method.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param config the configuration information passed to an application
+ * @param collatorp the selected collator, if any
+ * @param ownp set if the collator terminate method should be called
+ * when no longer needed
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION collator config
+ */
+ int (*collator_config)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ WT_CONFIG_ARG *config, WT_COLLATOR **collatorp, int *ownp);
+
+ /*!
+ * The extension collator method.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param collator the collator (or NULL if none available)
+ * @param first first item
+ * @param second second item
+ * @param[out] cmp set less than 0 if \c first collates less than
+ * \c second, set equal to 0 if \c first collates equally to \c second,
+ * set greater than 0 if \c first collates greater than \c second
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION collate
+ */
+ int (*collate)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ WT_COLLATOR *collator, WT_ITEM *first, WT_ITEM *second, int *cmp);
+
+ /*!
+ * @copydoc wiredtiger_config_parser_open
+ */
+ int (*config_parser_open)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp);
+
+ /*!
+ * Return the value of a configuration string.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param config the configuration information passed to an application
+ * @param key configuration key string
+ * @param value the returned value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION config_get
+ */
+ int (*config_get)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ WT_CONFIG_ARG *config, const char *key, WT_CONFIG_ITEM *value);
+
+ /*!
+ * Insert a row into the metadata if it does not already exist.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @param value row value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata insert
+ */
+ int (*metadata_insert)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *key, const char *value);
+
+ /*!
+ * Remove a row from the metadata.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata remove
+ */
+ int (*metadata_remove)(
+ WT_EXTENSION_API *wt_api, WT_SESSION *session, const char *key);
+
+ /*!
+ * Return a row from the metadata.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @param [out] valuep the row value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata search
+ */
+ int (*metadata_search)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *key, const char **valuep);
+
+ /*!
+ * Update a row in the metadata by either inserting a new record or
+ * updating an existing record.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @param value row value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata update
+ */
+ int (*metadata_update)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *key, const char *value);
+
+ /*!
+ * Pack a structure into a buffer.
+ * See ::wiredtiger_struct_pack for details.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+ int (*struct_pack)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ void *buffer, size_t size, const char *format, ...);
+
+ /*!
+ * Calculate the size required to pack a structure.
+ * See ::wiredtiger_struct_size for details.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param sizep a location where the number of bytes needed for the
+ * matching call to WT_EXTENSION_API::struct_pack is returned
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+ int (*struct_size)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ size_t *sizep, const char *format, ...);
+
+ /*!
+ * Unpack a structure from a buffer.
+ * See ::wiredtiger_struct_unpack for details.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+ int (*struct_unpack)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ const void *buffer, size_t size, const char *format, ...);
+
+ /*!
+ * Return the current transaction ID.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @returns the current transaction ID.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction ID
+ */
+ uint64_t (*transaction_id)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session);
+
+ /*!
+ * Return the current transaction's isolation level; returns one of
+ * ::WT_TXN_ISO_READ_COMMITTED, ::WT_TXN_ISO_READ_UNCOMMITTED, or
+ * ::WT_TXN_ISO_SNAPSHOT.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @returns the current transaction's isolation level.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction isolation level
+ */
+ int (*transaction_isolation_level)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session);
+
+ /*!
+ * Request notification of transaction resolution by specifying a
+ * function to be called when the session's current transaction is
+ * either committed or rolled back. If the transaction is being
+ * committed, but the notification function returns an error, the
+ * transaction will be rolled back.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param notify a handler for commit or rollback events
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction notify
+ */
+ int (*transaction_notify)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, WT_TXN_NOTIFY *notify);
+
+ /*!
+ * Return the oldest transaction ID not yet visible to a running
+ * transaction. Unlike the other transaction methods, this method
+ * takes no session handle.
+ *
+ * @param wt_api the extension handle
+ * @returns the oldest transaction ID not yet visible to a running
+ * transaction.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction oldest
+ */
+ uint64_t (*transaction_oldest)(WT_EXTENSION_API *wt_api);
+
+ /*!
+ * Return if the current transaction can see the given transaction ID.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param transaction_id the transaction ID
+ * @returns true (non-zero) if the transaction ID is visible to the
+ * current transaction.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction visible
+ */
+ int (*transaction_visible)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, uint64_t transaction_id);
+
+ /*!
+ * @copydoc wiredtiger_version
+ */
+ const char *(*version)(int *majorp, int *minorp, int *patchp);
+};
+
+/*!
+ * @typedef WT_CONFIG_ARG
+ *
+ * A configuration object passed to some extension interfaces. This is an
+ * opaque type: configuration values can be queried using
+ * WT_EXTENSION_API::config_get
+ */
+
+/*! @} */
+#endif /* SWIG */
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* __WIREDTIGER_EXT_H_ */
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
new file mode 100644
index 00000000000..e9482c688d3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -0,0 +1,337 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************
+ * WiredTiger public include file, and configuration control.
+ *******************************************/
+#include "wiredtiger_config.h"
+#include "wiredtiger_ext.h"
+
+/*******************************************
+ * WiredTiger system include files.
+ *******************************************/
+#ifndef _WIN32
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#endif
+#include <ctype.h>
+#ifndef _WIN32
+#include <dlfcn.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#ifdef _WIN32
+#include <io.h>
+#endif
+#include <limits.h>
+#ifndef _WIN32
+#include <pthread.h>
+#endif
+#ifdef HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
+#include <stddef.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+#include <time.h>
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+/*******************************************
+ * WiredTiger externally maintained include files.
+ *******************************************/
+#include "queue.h"
+
+/*
+ * DO NOT EDIT: automatically built by dist/s_typedef.
+ * Forward type declarations for internal types: BEGIN
+ */
+struct __wt_addr;
+ typedef struct __wt_addr WT_ADDR;
+struct __wt_async;
+ typedef struct __wt_async WT_ASYNC;
+struct __wt_async_cursor;
+ typedef struct __wt_async_cursor WT_ASYNC_CURSOR;
+struct __wt_async_format;
+ typedef struct __wt_async_format WT_ASYNC_FORMAT;
+struct __wt_async_op_impl;
+ typedef struct __wt_async_op_impl WT_ASYNC_OP_IMPL;
+struct __wt_async_worker_state;
+ typedef struct __wt_async_worker_state WT_ASYNC_WORKER_STATE;
+struct __wt_block;
+ typedef struct __wt_block WT_BLOCK;
+struct __wt_block_ckpt;
+ typedef struct __wt_block_ckpt WT_BLOCK_CKPT;
+struct __wt_block_desc;
+ typedef struct __wt_block_desc WT_BLOCK_DESC;
+struct __wt_block_header;
+ typedef struct __wt_block_header WT_BLOCK_HEADER;
+struct __wt_bloom;
+ typedef struct __wt_bloom WT_BLOOM;
+struct __wt_bloom_hash;
+ typedef struct __wt_bloom_hash WT_BLOOM_HASH;
+struct __wt_bm;
+ typedef struct __wt_bm WT_BM;
+struct __wt_btree;
+ typedef struct __wt_btree WT_BTREE;
+struct __wt_cache;
+ typedef struct __wt_cache WT_CACHE;
+struct __wt_cache_pool;
+ typedef struct __wt_cache_pool WT_CACHE_POOL;
+struct __wt_cell;
+ typedef struct __wt_cell WT_CELL;
+struct __wt_cell_unpack;
+ typedef struct __wt_cell_unpack WT_CELL_UNPACK;
+struct __wt_ckpt;
+ typedef struct __wt_ckpt WT_CKPT;
+struct __wt_col;
+ typedef struct __wt_col WT_COL;
+struct __wt_col_rle;
+ typedef struct __wt_col_rle WT_COL_RLE;
+struct __wt_colgroup;
+ typedef struct __wt_colgroup WT_COLGROUP;
+struct __wt_compact;
+ typedef struct __wt_compact WT_COMPACT;
+struct __wt_condvar;
+ typedef struct __wt_condvar WT_CONDVAR;
+struct __wt_config;
+ typedef struct __wt_config WT_CONFIG;
+struct __wt_config_check;
+ typedef struct __wt_config_check WT_CONFIG_CHECK;
+struct __wt_config_entry;
+ typedef struct __wt_config_entry WT_CONFIG_ENTRY;
+struct __wt_config_parser_impl;
+ typedef struct __wt_config_parser_impl WT_CONFIG_PARSER_IMPL;
+struct __wt_connection_impl;
+ typedef struct __wt_connection_impl WT_CONNECTION_IMPL;
+struct __wt_connection_stats;
+ typedef struct __wt_connection_stats WT_CONNECTION_STATS;
+struct __wt_connection_stats_spinlock;
+ typedef struct __wt_connection_stats_spinlock WT_CONNECTION_STATS_SPINLOCK;
+struct __wt_cursor_backup;
+ typedef struct __wt_cursor_backup WT_CURSOR_BACKUP;
+struct __wt_cursor_backup_entry;
+ typedef struct __wt_cursor_backup_entry WT_CURSOR_BACKUP_ENTRY;
+struct __wt_cursor_btree;
+ typedef struct __wt_cursor_btree WT_CURSOR_BTREE;
+struct __wt_cursor_bulk;
+ typedef struct __wt_cursor_bulk WT_CURSOR_BULK;
+struct __wt_cursor_config;
+ typedef struct __wt_cursor_config WT_CURSOR_CONFIG;
+struct __wt_cursor_data_source;
+ typedef struct __wt_cursor_data_source WT_CURSOR_DATA_SOURCE;
+struct __wt_cursor_dump;
+ typedef struct __wt_cursor_dump WT_CURSOR_DUMP;
+struct __wt_cursor_index;
+ typedef struct __wt_cursor_index WT_CURSOR_INDEX;
+struct __wt_cursor_json;
+ typedef struct __wt_cursor_json WT_CURSOR_JSON;
+struct __wt_cursor_log;
+ typedef struct __wt_cursor_log WT_CURSOR_LOG;
+struct __wt_cursor_lsm;
+ typedef struct __wt_cursor_lsm WT_CURSOR_LSM;
+struct __wt_cursor_metadata;
+ typedef struct __wt_cursor_metadata WT_CURSOR_METADATA;
+struct __wt_cursor_stat;
+ typedef struct __wt_cursor_stat WT_CURSOR_STAT;
+struct __wt_cursor_table;
+ typedef struct __wt_cursor_table WT_CURSOR_TABLE;
+struct __wt_data_handle;
+ typedef struct __wt_data_handle WT_DATA_HANDLE;
+struct __wt_data_handle_cache;
+ typedef struct __wt_data_handle_cache WT_DATA_HANDLE_CACHE;
+struct __wt_dlh;
+ typedef struct __wt_dlh WT_DLH;
+struct __wt_dsrc_stats;
+ typedef struct __wt_dsrc_stats WT_DSRC_STATS;
+struct __wt_evict_entry;
+ typedef struct __wt_evict_entry WT_EVICT_ENTRY;
+struct __wt_evict_worker;
+ typedef struct __wt_evict_worker WT_EVICT_WORKER;
+struct __wt_ext;
+ typedef struct __wt_ext WT_EXT;
+struct __wt_extlist;
+ typedef struct __wt_extlist WT_EXTLIST;
+struct __wt_fh;
+ typedef struct __wt_fh WT_FH;
+struct __wt_hazard;
+ typedef struct __wt_hazard WT_HAZARD;
+struct __wt_ikey;
+ typedef struct __wt_ikey WT_IKEY;
+struct __wt_index;
+ typedef struct __wt_index WT_INDEX;
+struct __wt_insert;
+ typedef struct __wt_insert WT_INSERT;
+struct __wt_insert_head;
+ typedef struct __wt_insert_head WT_INSERT_HEAD;
+struct __wt_log_desc;
+ typedef struct __wt_log_desc WT_LOG_DESC;
+struct __wt_log_op_desc;
+ typedef struct __wt_log_op_desc WT_LOG_OP_DESC;
+struct __wt_log_rec_desc;
+ typedef struct __wt_log_rec_desc WT_LOG_REC_DESC;
+struct __wt_lsm_chunk;
+ typedef struct __wt_lsm_chunk WT_LSM_CHUNK;
+struct __wt_lsm_data_source;
+ typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE;
+struct __wt_lsm_manager;
+ typedef struct __wt_lsm_manager WT_LSM_MANAGER;
+struct __wt_lsm_tree;
+ typedef struct __wt_lsm_tree WT_LSM_TREE;
+struct __wt_lsm_work_unit;
+ typedef struct __wt_lsm_work_unit WT_LSM_WORK_UNIT;
+struct __wt_lsm_worker_args;
+ typedef struct __wt_lsm_worker_args WT_LSM_WORKER_ARGS;
+struct __wt_lsm_worker_cookie;
+ typedef struct __wt_lsm_worker_cookie WT_LSM_WORKER_COOKIE;
+struct __wt_multi;
+ typedef struct __wt_multi WT_MULTI;
+struct __wt_named_collator;
+ typedef struct __wt_named_collator WT_NAMED_COLLATOR;
+struct __wt_named_compressor;
+ typedef struct __wt_named_compressor WT_NAMED_COMPRESSOR;
+struct __wt_named_data_source;
+ typedef struct __wt_named_data_source WT_NAMED_DATA_SOURCE;
+struct __wt_ovfl_reuse;
+ typedef struct __wt_ovfl_reuse WT_OVFL_REUSE;
+struct __wt_ovfl_track;
+ typedef struct __wt_ovfl_track WT_OVFL_TRACK;
+struct __wt_ovfl_txnc;
+ typedef struct __wt_ovfl_txnc WT_OVFL_TXNC;
+struct __wt_page;
+ typedef struct __wt_page WT_PAGE;
+struct __wt_page_deleted;
+ typedef struct __wt_page_deleted WT_PAGE_DELETED;
+struct __wt_page_header;
+ typedef struct __wt_page_header WT_PAGE_HEADER;
+struct __wt_page_index;
+ typedef struct __wt_page_index WT_PAGE_INDEX;
+struct __wt_page_modify;
+ typedef struct __wt_page_modify WT_PAGE_MODIFY;
+struct __wt_process;
+ typedef struct __wt_process WT_PROCESS;
+struct __wt_ref;
+ typedef struct __wt_ref WT_REF;
+struct __wt_row;
+ typedef struct __wt_row WT_ROW;
+struct __wt_rwlock;
+ typedef struct __wt_rwlock WT_RWLOCK;
+struct __wt_salvage_cookie;
+ typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE;
+struct __wt_scratch_track;
+ typedef struct __wt_scratch_track WT_SCRATCH_TRACK;
+struct __wt_session_impl;
+ typedef struct __wt_session_impl WT_SESSION_IMPL;
+struct __wt_size;
+ typedef struct __wt_size WT_SIZE;
+struct __wt_split_stash;
+ typedef struct __wt_split_stash WT_SPLIT_STASH;
+struct __wt_stats;
+ typedef struct __wt_stats WT_STATS;
+struct __wt_table;
+ typedef struct __wt_table WT_TABLE;
+struct __wt_txn;
+ typedef struct __wt_txn WT_TXN;
+struct __wt_txn_global;
+ typedef struct __wt_txn_global WT_TXN_GLOBAL;
+struct __wt_txn_op;
+ typedef struct __wt_txn_op WT_TXN_OP;
+struct __wt_txn_state;
+ typedef struct __wt_txn_state WT_TXN_STATE;
+struct __wt_upd_skipped;
+ typedef struct __wt_upd_skipped WT_UPD_SKIPPED;
+struct __wt_update;
+ typedef struct __wt_update WT_UPDATE;
+/*
+ * Forward type declarations for internal types: END
+ * DO NOT EDIT: automatically built by dist/s_typedef.
+ */
+
+/*******************************************
+ * WiredTiger internal include files.
+ *******************************************/
+#if defined(_lint)
+#include "lint.h"
+#elif defined(__GNUC__)
+#include "gcc.h"
+#elif defined(_MSC_VER)
+#include "msvc.h"
+#endif
+#include "hardware.h"
+
+#ifdef _WIN32
+#include "os_windows.h"
+#else
+#include "posix.h"
+#endif
+
+#include "misc.h"
+#include "mutex.h"
+
+#include "stat.h" /* required by dhandle.h */
+#include "dhandle.h" /* required by btree.h */
+
+#include "api.h"
+#include "async.h"
+#include "block.h"
+#include "bloom.h"
+#include "btmem.h"
+#include "btree.h"
+#include "cache.h"
+#include "config.h"
+#include "compact.h"
+#include "cursor.h"
+#include "dlh.h"
+#include "error.h"
+#include "flags.h"
+#include "log.h"
+#include "lsm.h"
+#include "meta.h"
+#include "os.h"
+#include "schema.h"
+#include "txn.h"
+
+#include "session.h" /* required by connection.h */
+#include "connection.h"
+
+#include "extern.h"
+#include "verify_build.h"
+
+#include "buf.i"
+#include "misc.i"
+#include "intpack.i" /* required by cell.i, packing.i */
+#include "packing.i"
+#include "cell.i" /* required by btree.i */
+
+#include "mutex.i" /* required by btree.i */
+#include "txn.i" /* required by btree.i */
+
+#include "btree.i" /* required by cursor.i */
+#include "cache.i" /* required by cursor.i */
+#include "cursor.i"
+
+#include "bitstring.i"
+#include "column.i"
+#include "serial.i"
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
new file mode 100644
index 00000000000..d13002cdc5a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -0,0 +1,1243 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_log_ckpt --
+ *	Store a copy of the caller's LSN as the log's checkpoint LSN, then
+ *	signal the archive condition if the connection has one.
+ */
+int
+__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+	conn->log->ckpt_lsn = *ckp_lsn;
+
+	/* Without an archive condition there is nobody to wake up. */
+	if (conn->arch_cond == NULL)
+		return (0);
+	return (__wt_cond_signal(session, conn->arch_cond));
+}
+
+/*
+ * __wt_log_written_reset --
+ *	Reset the count of log bytes written during the current checkpoint
+ *	period.  Called from the checkpoint code; does nothing when logging
+ *	is not enabled on the connection.
+ */
+void
+__wt_log_written_reset(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+	if (conn->logging)
+		conn->log->log_written = 0;
+}
+
+/*
+ * __wt_log_get_files --
+ *	Return, through filesp and countp, the directory listing of every
+ *	file matching the log file name in the connection's log path.  Both
+ *	outputs are cleared before the directory is read.
+ */
+int
+__wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+	const char *dir;
+
+	*filesp = NULL;
+	*countp = 0;
+
+	/* An unset log path is treated as the empty string. */
+	dir = S2C(session)->log_path;
+	return (__wt_dirlist(session, dir == NULL ? "" : dir,
+	    WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, filesp, countp));
+}
+
+/*
+ * __wt_log_get_active_files --
+ *	Return the list of active log files: every existing log file whose
+ *	number is not below the checkpoint LSN's file number (the rest are
+ *	candidates for archiving).  On error the whole list is freed and the
+ *	outputs are left untouched.
+ */
+int
+__wt_log_get_active_files(
+    WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+	WT_DECL_RET;
+	WT_LOG *log;
+	char **files;
+	uint32_t id;
+	u_int count, i;
+
+	id = 0;
+	log = S2C(session)->log;
+
+	WT_RET(__wt_log_get_files(session, &files, &count));
+
+	/*
+	 * Drop every file below the checkpoint LSN.  A dropped entry is
+	 * replaced by the last entry of the list, so the index only advances
+	 * when the current entry is kept.
+	 */
+	i = 0;
+	while (i < count) {
+		WT_ERR(__wt_log_extract_lognum(session, files[i], &id));
+		if (id >= log->ckpt_lsn.file) {
+			++i;
+			continue;
+		}
+		__wt_free(session, files[i]);
+		files[i] = files[count - 1];
+		files[--count] = NULL;
+	}
+
+	*filesp = files;
+	*countp = count;
+	return (0);
+
+err:	__wt_log_files_free(session, files, count);
+	return (ret);
+}
+
+/*
+ * __wt_log_files_free --
+ *	Release a log file list: each of the count name strings first, then
+ *	the array that holds them.
+ */
+void
+__wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count)
+{
+	u_int slot;
+
+	for (slot = 0; slot != count; ++slot)
+		__wt_free(session, files[slot]);
+
+	__wt_free(session, files);
+}
+
+/*
+ * __wt_log_filename --
+ *	Format the name of the log file with the given number into buf.  The
+ *	name is prefixed with the connection's log path and a '/' when that
+ *	path is set and non-empty.
+ */
+int
+__wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf)
+{
+	const char *dir;
+
+	dir = S2C(session)->log_path;
+
+	/* No usable log path: the bare file name is the whole result. */
+	if (dir == NULL || dir[0] == '\0')
+		return (__wt_buf_fmt(session, buf, "%s.%010" PRIu32,
+		    WT_LOG_FILENAME, id));
+
+	return (__wt_buf_fmt(session, buf, "%s/%s.%010" PRIu32,
+	    dir, WT_LOG_FILENAME, id));
+}
+
+/*
+ * __wt_log_extract_lognum --
+ *	Given a log file name, extract the log number: the unsigned decimal
+ *	value following the last '.' in the name.
+ *
+ *	Returns WT_ERROR if either name or id is NULL, or (with an error
+ *	message) if the name has no '.' or no number follows it.
+ */
+int
+__wt_log_extract_lognum(
+    WT_SESSION_IMPL *session, const char *name, uint32_t *id)
+{
+	const char *p;
+
+	if (id == NULL || name == NULL)
+		return (WT_ERROR);
+
+	/*
+	 * Use the scanf conversion macro (SCNu32), not the printf one
+	 * (PRIu32): the two are not guaranteed to expand to the same
+	 * length modifier, and only SCNu32 matches a uint32_t pointer.
+	 */
+	if ((p = strrchr(name, '.')) == NULL ||
+	    sscanf(++p, "%" SCNu32, id) != 1)
+		WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
+	return (0);
+}
+
+/*
+ * __wt_log_remove --
+ *	Remove the log file with the given number.  The file name is built
+ *	in a scratch buffer that is released on every path.
+ */
+int
+__wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum)
+{
+	WT_DECL_ITEM(path);
+	WT_DECL_RET;
+
+	WT_ERR(__wt_scr_alloc(session, 0, &path));
+	WT_ERR(__wt_log_filename(session, lognum, path));
+	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+	    "log_remove: remove log %s", (char *)path->data));
+
+	/* Last step: its result is the function's result. */
+	ret = __wt_remove(session, path->data);
+
+err:	__wt_scr_free(&path);
+	return (ret);
+}
+
+/*
+ * __log_openfile --
+ *	Open the log file with the given number and return its handle in fh.
+ *	The ok_create flag is passed through to __wt_open.
+ */
+static int
+__log_openfile(WT_SESSION_IMPL *session, int ok_create, WT_FH **fh, uint32_t id)
+{
+	WT_DECL_ITEM(path);
+	WT_DECL_RET;
+
+	WT_RET(__wt_scr_alloc(session, 0, &path));
+
+	WT_ERR(__wt_log_filename(session, id, path));
+	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+	    "opening log %s", (const char *)path->data));
+
+	/* Last step: its result is the function's result. */
+	ret = __wt_open(
+	    session, path->data, ok_create, 0, WT_FILE_TYPE_LOG, fh);
+
+err:	__wt_scr_free(&path);
+	return (ret);
+}
+
+/*
+ * __wt_log_open --
+ *	Open the log for the connection.  Scan the existing log files for
+ *	the lowest and highest log numbers, record them as the first LSN's
+ *	file and the current file ID, then call __wt_log_newfile so logging
+ *	starts at the beginning of a new log file.  If any log files already
+ *	existed, run recovery.
+ */
+int
+__wt_log_open(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_LOG *log;
+	uint32_t firstlog, lastlog, lognum;
+	u_int i, logcount;
+	char **logfiles;
+
+	conn = S2C(session);
+	log = conn->log;
+	lastlog = 0;
+	firstlog = UINT32_MAX;
+
+	WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+	for (i = 0; i < logcount; i++) {
+		WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+		lastlog = WT_MAX(lastlog, lognum);
+		firstlog = WT_MIN(firstlog, lognum);
+	}
+	log->fileid = lastlog;
+
+	/*
+	 * The log numbers are uint32_t: print them with PRIu32, otherwise
+	 * the UINT32_MAX sentinel (no log files found) shows up as -1.
+	 */
+	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+	    "log_open: first log %" PRIu32 " last log %" PRIu32,
+	    firstlog, lastlog));
+
+	/*
+	 * NOTE(review): when no log files exist, firstlog is still
+	 * UINT32_MAX here and is stored as the first LSN's file -- confirm
+	 * that is what callers expect.
+	 */
+	log->first_lsn.file = firstlog;
+	log->first_lsn.offset = 0;
+
+	/*
+	 * Start logging at the beginning of the next log file, no matter
+	 * where the previous log file ends.
+	 */
+	WT_ERR(__wt_log_newfile(session, 1));
+
+	/*
+	 * If there were log files, run recovery.
+	 * XXX belongs at a higher level than this.
+	 */
+	if (logcount > 0) {
+		log->trunc_lsn = log->alloc_lsn;
+		WT_ERR(__wt_txn_recover(conn));
+	}
+
+err:	__wt_log_files_free(session, logfiles, logcount);
+	return (ret);
+}
+
+/*
+ * __wt_log_close --
+ *	Close the log's file handles: the pending "old" handle
+ *	(log_close_fh), if it is set and distinct from the current one, and
+ *	then the current handle (log_fh).
+ *
+ *	Each pointer is cleared before its handle is closed, the same
+ *	hand-off __log_truncate uses, so no stale pointer to a closed handle
+ *	is left in the log structure where it could be used or closed again.
+ */
+int
+__wt_log_close(WT_SESSION_IMPL *session)
+{
+	WT_FH *fh;
+	WT_LOG *log;
+
+	log = S2C(session)->log;
+
+	if ((fh = log->log_close_fh) != NULL && fh != log->log_fh) {
+		WT_RET(__wt_verbose(session, WT_VERB_LOG,
+		    "closing old log %s", fh->name));
+		log->log_close_fh = NULL;
+		WT_RET(__wt_close(session, fh));
+	}
+	if ((fh = log->log_fh) != NULL) {
+		WT_RET(__wt_verbose(session, WT_VERB_LOG,
+		    "closing log %s", fh->name));
+		log->log_fh = NULL;
+		/* The old-handle pointer may alias the handle being closed. */
+		if (log->log_close_fh == fh)
+			log->log_close_fh = NULL;
+		WT_RET(__wt_close(session, fh));
+	}
+	return (0);
+}
+
+/*
+ * __log_fill --
+ *	Copy a thread's log record into its assigned slot: written straight
+ *	to the slot's file handle when direct is set, otherwise copied into
+ *	the slot's buffer.  If lsnp is not NULL it is set to the slot's
+ *	starting LSN advanced by this thread's offset.  The first failure is
+ *	also recorded in the slot's error field.
+ */
+static int
+__log_fill(WT_SESSION_IMPL *session,
+    WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
+{
+	WT_DECL_RET;
+	WT_LOG_RECORD *rec;
+
+	rec = (WT_LOG_RECORD *)record->mem;
+
+	/*
+	 * For a direct write the offset is, for now, the real byte offset.
+	 * If the offset ever becomes a unit of LOG_ALIGN, this is where it
+	 * would be multiplied by LOG_ALIGN to get the file byte offset.
+	 */
+	if (!direct)
+		memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
+		    rec, rec->len);
+	else
+		WT_ERR(__wt_write(session, myslot->slot->slot_fh,
+		    myslot->offset + myslot->slot->slot_start_offset,
+		    (size_t)rec->len, (void *)rec));
+
+	WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, rec->len);
+
+	if (lsnp != NULL) {
+		*lsnp = myslot->slot->slot_start_lsn;
+		lsnp->offset += (wt_off_t)myslot->offset;
+	}
+
+err:	/* Keep only the first error seen on the slot. */
+	if (ret != 0 && myslot->slot->slot_error == 0)
+		myslot->slot->slot_error = ret;
+	return (ret);
+}
+
+/*
+ * __log_size_fit --
+ *	Return non-zero if a record of recsize bytes placed at the LSN's
+ *	offset ends strictly below the connection's maximum log file size.
+ */
+static int
+__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
+{
+	return (lsn->offset + (wt_off_t)recsize <
+	    S2C(session)->log_file_max);
+}
+
+/*
+ * __log_truncate --
+ *	Truncate the log to the given LSN.  The log file named by the LSN is
+ *	always truncated to the LSN's offset.  Unless this_log is set, every
+ *	log file numbered strictly between the LSN's file and the trunc_lsn
+ *	file is also truncated back to the end of its header; those files
+ *	were pre-allocated, and trimming them frees the space and lets the
+ *	log be traversed.  The trunc_lsn is used because logging has already
+ *	opened the new/next log file before recovery ran.  This function
+ *	assumes it runs during recovery or another dedicated time, not
+ *	during live running.
+ */
+static int
+__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, uint32_t this_log)
+{
+	WT_DECL_RET;
+	WT_FH *log_fh, *tmp_fh;
+	WT_LOG *log;
+	uint32_t lognum;
+	u_int i, logcount;
+	char **logfiles;
+
+	log = S2C(session)->log;
+	log_fh = NULL;
+	logfiles = NULL;
+	logcount = 0;
+
+	/*
+	 * Truncate the log file to the given LSN.  The handle is detached
+	 * from log_fh before the close so the cleanup code never closes it
+	 * a second time.
+	 */
+	WT_ERR(__log_openfile(session, 0, &log_fh, lsn->file));
+	WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset));
+	tmp_fh = log_fh;
+	log_fh = NULL;
+	WT_ERR(__wt_close(session, tmp_fh));
+
+	/*
+	 * Only look for intervening logs when asked to truncate more than
+	 * the current log.
+	 */
+	if (!this_log) {
+		WT_ERR(__wt_log_get_files(session, &logfiles, &logcount));
+		for (i = 0; i < logcount; i++) {
+			WT_ERR(__wt_log_extract_lognum(
+			    session, logfiles[i], &lognum));
+			if (lognum <= lsn->file ||
+			    lognum >= log->trunc_lsn.file)
+				continue;
+
+			/*
+			 * An intervening pre-allocated file: truncate it to
+			 * the end of the log file header.
+			 */
+			WT_ERR(__log_openfile(session, 0, &log_fh, lognum));
+			WT_ERR(__wt_ftruncate(session,
+			    log_fh, LOG_FIRST_RECORD));
+			tmp_fh = log_fh;
+			log_fh = NULL;
+			WT_ERR(__wt_close(session, tmp_fh));
+		}
+	}
+
+err:	if (log_fh != NULL)
+		WT_TRET(__wt_close(session, log_fh));
+	if (logfiles != NULL)
+		__wt_log_files_free(session, logfiles, logcount);
+	return (ret);
+}
+
+/*
+ * __log_filesize --
+ *	Returns an estimate of the real end of log file.
+ *
+ *	session: session used for file I/O and memory allocation.
+ *	fh: open handle on the log file to examine.
+ *	eof: output, set to the estimated end offset.  If NULL the function
+ *	    returns 0 immediately without touching the file.
+ *
+ *	Returns 0 on success, otherwise an error from the file size,
+ *	allocation or read helpers.
+ */
+static int
+__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *eof)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_LOG *log;
+	wt_off_t log_size, off, off1;
+	uint32_t allocsize, bufsz;
+	char *buf, *zerobuf;
+
+	conn = S2C(session);
+	log = conn->log;
+	if (eof == NULL)
+		return (0);
+	*eof = 0;
+	WT_RET(__wt_filesize(session, fh, &log_size));
+	/* Without a log structure, fall back to the minimum alignment. */
+	if (log == NULL)
+		allocsize = LOG_ALIGN;
+	else
+		allocsize = log->allocsize;
+
+	/*
+	 * It can be very slow looking for the last real record in the log
+	 * in very small chunks.  Walk backward by a megabyte at a time.  When
+	 * we find a part of the log that is not just zeroes, walk to find
+	 * the last record.
+	 */
+	buf = zerobuf = NULL;
+	if (allocsize < WT_MEGABYTE && log_size > WT_MEGABYTE)
+		bufsz = WT_MEGABYTE;
+	else
+		bufsz = allocsize;
+	WT_RET(__wt_calloc_def(session, bufsz, &buf));
+	WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));
+
+	/*
+	 * Read in a chunk starting at the end of the file.  Keep going until
+	 * we reach the beginning or we find a chunk that contains any non-zero
+	 * bytes.  Compare against a known zero byte chunk.
+	 */
+	for (off = log_size - (wt_off_t)bufsz;
+	    off >= 0;
+	    off -= (wt_off_t)bufsz) {
+		WT_ERR(__wt_read(session, fh, off, bufsz, buf));
+		if (memcmp(buf, zerobuf, bufsz) != 0)
+			break;
+	}
+
+	/*
+	 * If we're walking by large amounts, now walk by the real allocsize
+	 * to find the real end, if we found something.  Otherwise we reached
+	 * the beginning of the file.  Offset can go negative if the log file
+	 * size is not a multiple of the chunk size.  The first chunk of the
+	 * log file will always be non-zero.
+	 */
+	if (off < 0) {
+		off = 0;
+		/*
+		 * When the file is larger than one chunk but not a multiple
+		 * of the chunk size, the backward walk steps over offset 0
+		 * without reading it, and buf still holds the last (all-zero)
+		 * chunk read from further into the file.  Read the first
+		 * chunk so the scan below examines the bytes that really
+		 * live at the start of the file.
+		 */
+		if (log_size > (wt_off_t)bufsz &&
+		    log_size % (wt_off_t)bufsz != 0)
+			WT_ERR(__wt_read(session, fh, off, bufsz, buf));
+	}
+
+	/*
+	 * We know all log records are aligned at log->allocsize.  The first
+	 * item in a log record is always a 32-bit length.  Look for any
+	 * non-zero length at the allocsize boundary.  This may not be a true
+	 * log record since it could be the middle of a large record.  But we
+	 * know no log record starts after it.  Return an estimate of the log
+	 * file size.
+	 */
+	for (off1 = bufsz - allocsize;
+	    off1 > 0; off1 -= (wt_off_t)allocsize)
+		if (memcmp(buf + off1, zerobuf, sizeof(uint32_t)) != 0)
+			break;
+	off = off + off1;
+
+	/*
+	 * Set EOF to the last zero-filled record we saw.
+	 */
+	*eof = off + (wt_off_t)allocsize;
+err:
+	if (buf != NULL)
+		__wt_free(session, buf);
+	if (zerobuf != NULL)
+		__wt_free(session, zerobuf);
+	return (ret);
+}
+
+/*
+ * __log_acquire --
+ * Called with the log slot lock held. Can be called recursively
+ * from __wt_log_newfile when we change log files.
+ *
+ * recsize: number of bytes of log space to reserve.
+ * slot: slot whose release/start/end LSNs, start offset, error field and
+ * file handle are filled in from the log's current allocation state.
+ * Returns 0 on success, otherwise an error from switching log files,
+ * signalling the checkpoint condition or pre-allocating the file.
+ */
+static int
+__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * Called locked. Add recsize to alloc_lsn. Save our starting LSN
+ * where the previous allocation finished for the release LSN.
+ * That way when log files switch, we're waiting for the correct LSN
+ * from outstanding writes.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
+ WT_RET(__wt_log_newfile(session, 0));
+ /*
+ * If the switch left an old handle waiting to be closed, flag
+ * this slot so that releasing it closes that handle.
+ */
+ if (log->log_close_fh != NULL)
+ F_SET(slot, SLOT_CLOSEFH);
+ }
+ /*
+ * Checkpoints can be configured based on amount of log written.
+ * Add in this log record to the sum and if needed, signal the
+ * checkpoint condition. The logging subsystem manages the
+ * accumulated field. There is a bit of layering violation
+ * here checking the connection ckpt field and using its
+ * condition.
+ */
+ if (WT_CKPT_LOGSIZE(conn)) {
+ log->log_written += (wt_off_t)recsize;
+ WT_RET(__wt_checkpoint_signal(session, log->log_written));
+ }
+
+ /*
+ * Need to minimally fill in slot info here. Our slot start LSN
+ * comes after any potential new log file creations.
+ */
+ slot->slot_start_lsn = log->alloc_lsn;
+ slot->slot_start_offset = log->alloc_lsn.offset;
+ /*
+ * Pre-allocate on the first real write into the log file.
+ * Use fallocate when the handle reports it as available; if it is
+ * unavailable or returns ENOTSUP, extend the file with ftruncate
+ * instead.
+ */
+ if (log->alloc_lsn.offset == LOG_FIRST_RECORD) {
+ if (!log->log_fh->fallocate_available ||
+ (ret = __wt_fallocate(session, log->log_fh,
+ LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP)
+ ret = __wt_ftruncate(session, log->log_fh,
+ LOG_FIRST_RECORD + conn->log_file_max);
+ WT_RET(ret);
+ }
+
+ /* Advance the allocation point past the space just reserved. */
+ log->alloc_lsn.offset += (wt_off_t)recsize;
+ slot->slot_end_lsn = log->alloc_lsn;
+ slot->slot_error = 0;
+ slot->slot_fh = log->log_fh;
+ return (0);
+}
+
+/*
+ * __log_release --
+ * Release a log slot.
+ *
+ * Writes the slot's buffered records (if any), waits for earlier
+ * slots to be written, advances log->write_lsn, optionally syncs the
+ * log file, optionally grows the slot buffer and closes a log file
+ * handle that was set aside when the log switched files.
+ * On failure the error is also recorded in slot->slot_error if no error
+ * was recorded there before.
+ */
+static int
+__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *close_fh;
+ WT_LOG *log;
+ WT_LSN sync_lsn;
+ size_t write_size;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * If we're going to have to close our log file, make a local copy
+ * of the file handle structure.
+ *
+ * NOTE(review): log->log_close_fh is cleared here, but close_fh is
+ * only closed at the end of the success path. If a WT_ERR below
+ * jumps to err the handle does not appear to be closed anywhere in
+ * this function -- confirm whether that leaks the handle.
+ */
+ close_fh = NULL;
+ if (F_ISSET(slot, SLOT_CLOSEFH)) {
+ close_fh = log->log_close_fh;
+ log->log_close_fh = NULL;
+ F_CLR(slot, SLOT_CLOSEFH);
+ }
+
+ /* Write the buffered records */
+ if (F_ISSET(slot, SLOT_BUFFERED)) {
+ write_size = (size_t)
+ (slot->slot_end_lsn.offset - slot->slot_start_offset);
+ WT_ERR(__wt_write(session, slot->slot_fh,
+ slot->slot_start_offset, write_size, slot->slot_buf.mem));
+ }
+
+ /*
+ * Wait for earlier groups to finish, otherwise there could be holes
+ * in the log file. We yield until log->write_lsn equals the release
+ * LSN saved for this slot in __log_acquire.
+ */
+ while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
+ __wt_yield();
+ log->write_lsn = slot->slot_end_lsn;
+ /*
+ * Try to consolidate calls to fsync to wait less. Acquire a spin lock
+ * so that threads finishing writing to the log will wait while the
+ * current fsync completes and advance log->write_lsn.
+ */
+ while (F_ISSET(slot, SLOT_SYNC) &&
+ LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ /* Lock is busy: wait on the sync condition, then re-check. */
+ if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
+ (void)__wt_cond_wait(
+ session, log->log_sync_cond, 10000);
+ continue;
+ }
+ /*
+ * Record the current end of log after we grabbed the lock.
+ * That is how far our fsync call will guarantee.
+ */
+ sync_lsn = log->write_lsn;
+ /* Re-check under the lock before issuing the fsync. */
+ if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ WT_STAT_FAST_CONN_INCR(session, log_sync);
+ ret = __wt_fsync(session, log->log_fh);
+ if (ret == 0) {
+ F_CLR(slot, SLOT_SYNC);
+ log->sync_lsn = sync_lsn;
+ ret = __wt_cond_signal(
+ session, log->log_sync_cond);
+ }
+ }
+ /* Always drop the lock before acting on any error. */
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ WT_ERR(ret);
+ }
+ /* Double the slot buffer if a grow was requested for this slot. */
+ if (F_ISSET(slot, SLOT_BUF_GROW)) {
+ WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+ F_CLR(slot, SLOT_BUF_GROW);
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_buffer_size, slot->slot_buf.memsize);
+ WT_ERR(__wt_buf_grow(session,
+ &slot->slot_buf, slot->slot_buf.memsize * 2));
+ }
+ /*
+ * If we have a file to close, close it now.
+ */
+ if (close_fh)
+ WT_ERR(__wt_close(session, close_fh));
+
+err: if (ret != 0 && slot->slot_error == 0)
+ slot->slot_error = ret;
+ return (ret);
+}
+
+/*
+ * __wt_log_newfile --
+ * Create the next log file and write the file header record into it.
+ *
+ * conn_create: non-zero when called from connection creation; in that
+ * case the new file is synced and the log's sync and write LSNs are set
+ * to the end of the header record.
+ * Returns 0 on success, otherwise an error from a failed helper.
+ */
+int
+__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_DESC *desc;
+ WT_LOG_RECORD *logrec;
+ WT_LOGSLOT tmp;
+ WT_MYSLOT myslot;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+ * Set aside the log file handle to be closed later. Other threads
+ * may still be using it to write to the log. If the log file size
+ * is small we could fill a log file before the previous one is closed.
+ * Wait for that to close.
+ */
+ while (log->log_close_fh != NULL) {
+ __wt_errx(session,
+ "log_newfile: Log file size %" PRIuMAX " too small",
+ (uintmax_t)conn->log_file_max);
+ WT_STAT_FAST_CONN_INCR(session, log_close_yields);
+ __wt_yield();
+ }
+ /*
+ * NOTE(review): log_close_fh and fileid are updated before the open
+ * below is known to have succeeded; confirm callers cope with that
+ * state if __log_openfile fails.
+ */
+ log->log_close_fh = log->log_fh;
+ log->fileid++;
+ WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid));
+ /* Allocation restarts in the new file at the handle's current size. */
+ log->alloc_lsn.file = log->fileid;
+ log->alloc_lsn.offset = log->log_fh->size;
+
+ /*
+ * Set up the log descriptor record. Use a scratch buffer to
+ * get correct alignment for direct I/O.
+ */
+ WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
+ WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
+ memset(buf->mem, 0, log->allocsize);
+ logrec = (WT_LOG_RECORD *)buf->mem;
+ desc = (WT_LOG_DESC *)logrec->record;
+ desc->log_magic = WT_LOG_MAGIC;
+ desc->majorv = WT_LOG_MAJOR_VERSION;
+ desc->minorv = WT_LOG_MINOR_VERSION;
+ desc->log_size = (uint64_t)conn->log_file_max;
+
+ /*
+ * Now that the record is set up, initialize the record header.
+ * The checksum field is zeroed first and the checksum is then
+ * computed over the whole allocsize-byte record.
+ */
+ logrec->len = log->allocsize;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, log->allocsize);
+ /* Use a throw-away slot on the stack for the header write. */
+ WT_CLEAR(tmp);
+ myslot.slot = &tmp;
+ myslot.offset = 0;
+
+ /*
+ * Recursively call __log_acquire to allocate log space for the
+ * log descriptor record. Call __log_fill to write it, but we
+ * do not need to call __log_release because we're not waiting for
+ * earlier operations to complete.
+ */
+ WT_ERR(__log_acquire(session, logrec->len, &tmp));
+ WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
+
+ /*
+ * If we're called from connection creation code, we need to update
+ * the LSNs since we're the only write in progress.
+ */
+ if (conn_create) {
+ WT_ERR(__wt_fsync(session, log->log_fh));
+ log->sync_lsn = tmp.slot_end_lsn;
+ log->write_lsn = tmp.slot_end_lsn;
+ }
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_log_read --
+ * Read the log record at the given LSN. Return the record (including
+ * the log header) in the WT_ITEM. Caller is responsible for freeing it.
+ *
+ * record: buffer that receives the record; record->size is set to the
+ * record's length on success.
+ * lsnp: LSN (file and offset) of the record to read.
+ * flags: currently unused.
+ * Returns 0 on success (and also, without doing anything, when record
+ * or lsnp is NULL), WT_NOTFOUND if the offset is not on an allocation
+ * boundary, the file is beyond the current log file or the stored length
+ * is zero, and WT_ERROR on a checksum mismatch.
+ */
+int
+__wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ uint32_t cksum, rdup_len, reclen;
+
+ WT_UNUSED(flags);
+ /*
+ * If the caller didn't give us an LSN or something to return,
+ * there's nothing to do.
+ */
+ if (lsnp == NULL || record == NULL)
+ return (0);
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * If the offset isn't on an allocation boundary it must be wrong.
+ */
+ if (lsnp->offset % log->allocsize != 0 || lsnp->file > log->fileid)
+ return (WT_NOTFOUND);
+
+ WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file));
+ /*
+ * Read the minimum allocation size a record could be.
+ */
+ WT_ERR(__wt_buf_init(session, record, log->allocsize));
+ WT_ERR(__wt_read(session,
+ log_fh, lsnp->offset, (size_t)log->allocsize, record->mem));
+ /*
+ * First 4 bytes is the real record length. See if we
+ * need to read more than the allocation size. We expect
+ * that we rarely will have to read more. Most log records
+ * will be fairly small.
+ */
+ reclen = *(uint32_t *)record->mem;
+ if (reclen == 0) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+ if (reclen > log->allocsize) {
+ /* Re-read the whole record, rounded up to the allocation size. */
+ rdup_len = __wt_rduppo2(reclen, log->allocsize);
+ WT_ERR(__wt_buf_grow(session, record, rdup_len));
+ WT_ERR(__wt_read(session,
+ log_fh, lsnp->offset, (size_t)rdup_len, record->mem));
+ }
+ /*
+ * We read in the record, verify checksum. The stored checksum is
+ * saved, the field is zeroed, and the checksum is recomputed over
+ * logrec->len bytes for comparison.
+ */
+ logrec = (WT_LOG_RECORD *)record->mem;
+ cksum = logrec->checksum;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, logrec->len);
+ if (logrec->checksum != cksum)
+ WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum");
+ record->size = logrec->len;
+ WT_STAT_FAST_CONN_INCR(session, log_reads);
+err:
+ WT_TRET(__wt_close(session, log_fh));
+ return (ret);
+}
+
+/*
+ * __wt_log_scan --
+ * Scan the logs, calling a function on each record found.
+ *
+ * lsnp: optional starting LSN. If NULL, one of WT_LOGSCAN_FIRST or
+ * WT_LOGSCAN_FROM_CKP must be set to choose the start.
+ * flags: WT_LOGSCAN_* flags. WT_LOGSCAN_ONE stops after one record is
+ * passed to the callback; WT_LOGSCAN_RECOVER truncates the log
+ * as files are passed and at the end of the scan.
+ * func: callback invoked for every valid record other than the log file
+ * header record; a non-zero return stops the scan with that error.
+ * If NULL the function returns 0 immediately.
+ * cookie: opaque pointer handed to the callback.
+ * Returns 0 on success. ENOENT from the scan is converted to 0, and
+ * WT_NOTFOUND is returned when WT_LOGSCAN_ONE is set and the scan advanced
+ * past the end of a log file.
+ */
+int
+__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
+ int (*func)(WT_SESSION_IMPL *session,
+ WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_ITEM buf;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN end_lsn, rd_lsn, start_lsn;
+ wt_off_t log_size;
+ uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
+ u_int i, logcount;
+ int eol;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+ eol = 0;
+ WT_CLEAR(buf);
+
+ /*
+ * If the caller did not give us a callback function there is nothing
+ * to do.
+ */
+ if (func == NULL)
+ return (0);
+
+ /*
+ * NOTE(review): log is dereferenced here (and in the truncate check
+ * after the loop) before the log != NULL test below. Presumably
+ * WT_LOGSCAN_RECOVER is only passed when logging is configured --
+ * confirm against the callers.
+ */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_RET(__wt_verbose(session, WT_VERB_LOG,
+ "__wt_log_scan truncating to %u/%" PRIuMAX,
+ log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));
+
+ if (log != NULL) {
+ allocsize = log->allocsize;
+
+ if (lsnp == NULL) {
+ if (LF_ISSET(WT_LOGSCAN_FIRST))
+ start_lsn = log->first_lsn;
+ else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
+ start_lsn = log->ckpt_lsn;
+ else
+ return (WT_ERROR); /* Illegal usage */
+ } else {
+ if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
+ WT_RET_MSG(session, WT_ERROR,
+ "choose either a start LSN or a start flag");
+
+ /* Offsets must be on allocation boundaries. */
+ if (lsnp->offset % allocsize != 0 ||
+ lsnp->file > log->fileid)
+ return (WT_NOTFOUND);
+
+ /*
+ * Log cursors may not know the starting LSN. If an
+ * LSN pointer is passed in, but it is the INIT_LSN,
+ * start from the first_lsn.
+ */
+ start_lsn = *lsnp;
+ if (IS_INIT_LSN(&start_lsn))
+ start_lsn = log->first_lsn;
+ }
+ end_lsn = log->alloc_lsn;
+ } else {
+ /*
+ * If logging is not configured, we can still print out the log
+ * if log files exist. We just need to set the LSNs from what
+ * is in the files versus what is in the live connection.
+ */
+ /*
+ * Set allocsize to the minimum alignment it could be. Larger
+ * records and larger allocation boundaries should always be
+ * a multiple of this.
+ */
+ allocsize = LOG_ALIGN;
+ lastlog = 0;
+ firstlog = UINT32_MAX;
+ WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+ if (logcount == 0)
+ /*
+ * Return not-supported if no log files exist.
+ */
+ return (ENOTSUP);
+ /* Find the lowest and highest log file numbers present. */
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
+ &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ firstlog = WT_MIN(firstlog, lognum);
+ }
+ start_lsn.file = firstlog;
+ end_lsn.file = lastlog;
+ start_lsn.offset = end_lsn.offset = 0;
+ /* Clear the pointer so the err path does not free it again. */
+ __wt_log_files_free(session, logfiles, logcount);
+ logfiles = NULL;
+ }
+ WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
+ WT_ERR(__log_filesize(session, log_fh, &log_size));
+ rd_lsn = start_lsn;
+ WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
+ for (;;) {
+ if (rd_lsn.offset + allocsize > log_size) {
+advance:
+ /*
+ * If we read the last record, go to the next file.
+ */
+ WT_ERR(__wt_close(session, log_fh));
+ log_fh = NULL;
+ eol = 1;
+ /*
+ * Truncate this log file before we move to the next.
+ */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_ERR(__log_truncate(session, &rd_lsn, 1));
+ rd_lsn.file++;
+ rd_lsn.offset = 0;
+ /*
+ * Avoid an error message when we reach end of log
+ * by checking here.
+ */
+ if (rd_lsn.file > end_lsn.file)
+ break;
+ WT_ERR(__log_openfile(
+ session, 0, &log_fh, rd_lsn.file));
+ WT_ERR(__log_filesize(session, log_fh, &log_size));
+ continue;
+ }
+ /*
+ * Read the minimum allocation size a record could be.
+ */
+ WT_ASSERT(session, buf.memsize >= allocsize);
+ WT_ERR(__wt_read(session,
+ log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
+ /*
+ * First 4 bytes is the real record length. See if we
+ * need to read more than the allocation size. We expect
+ * that we rarely will have to read more. Most log records
+ * will be fairly small.
+ */
+ reclen = *(uint32_t *)buf.mem;
+ /*
+ * Log files are pre-allocated. We never expect a zero length
+ * unless we've reached the end of the log. The log can be
+ * written out of order, so when recovery finds the end of
+ * the log, truncate the file and remove any later log files
+ * that may exist.
+ */
+ if (reclen == 0) {
+ /* This LSN is the end. */
+ break;
+ }
+ rdup_len = __wt_rduppo2(reclen, allocsize);
+ if (reclen > allocsize) {
+ /*
+ * The log file end could be the middle of this
+ * log record.
+ */
+ if (rd_lsn.offset + rdup_len > log_size)
+ goto advance;
+ /*
+ * We need to round up and read in the full padded
+ * record, especially for direct I/O.
+ */
+ WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
+ WT_ERR(__wt_read(session,
+ log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem));
+ WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
+ }
+ /*
+ * We read in the record, verify checksum.
+ */
+ buf.size = reclen;
+ logrec = (WT_LOG_RECORD *)buf.mem;
+ cksum = logrec->checksum;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, logrec->len);
+ if (logrec->checksum != cksum) {
+ /*
+ * A checksum mismatch means we have reached the end of
+ * the useful part of the log. This should be found on
+ * the first pass through recovery. In the second pass
+ * where we truncate the log, this is where it should
+ * end.
+ */
+ if (log != NULL)
+ log->trunc_lsn = rd_lsn;
+ break;
+ }
+
+ /*
+ * We have a valid log record. If it is not the log file
+ * header, invoke the callback.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_scan_records);
+ if (rd_lsn.offset != 0) {
+ WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
+ if (LF_ISSET(WT_LOGSCAN_ONE))
+ break;
+ }
+ /* Step to the next record using the padded record length. */
+ rd_lsn.offset += (wt_off_t)rdup_len;
+ }
+
+ /* Truncate if we're in recovery. */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
+ LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+ WT_ERR(__log_truncate(session, &rd_lsn, 0));
+
+err: WT_STAT_FAST_CONN_INCR(session, log_scans);
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+ __wt_buf_free(session, &buf);
+ /*
+ * If the caller wants one record and it is at the end of log,
+ * return WT_NOTFOUND.
+ */
+ if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
+ ret = WT_NOTFOUND;
+ /* A missing file (ENOENT) is not reported as an error. */
+ if (ret == ENOENT)
+ ret = 0;
+ if (log_fh != NULL)
+ WT_TRET(__wt_close(session, log_fh));
+ return (ret);
+}
+
+/*
+ * __log_direct_write --
+ * Write a log record without using the consolidation arrays.
+ *
+ * record: the record to write; record->size bytes of log space are
+ * reserved for it.
+ * lsnp: passed through to __log_fill, which stores the record's LSN
+ * there when it is not NULL.
+ * flags: WT_LOG_FSYNC requests a sync when the slot is released.
+ * Returns EAGAIN, without writing anything, if the log slot lock could
+ * not be taken immediately; otherwise 0 or an error from acquiring,
+ * filling or releasing the temporary slot.
+ */
+static int
+__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT tmp;
+ WT_MYSLOT myslot;
+ int locked;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ log = S2C(session)->log;
+ /* The record is written through a private, stack-allocated slot. */
+ myslot.slot = &tmp;
+ myslot.offset = 0;
+ WT_CLEAR(tmp);
+
+ /* Fast path the contended case. */
+ if (__wt_spin_trylock(session, &log->log_slot_lock, &id) != 0)
+ return (EAGAIN);
+ locked = 1;
+
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(&tmp, SLOT_SYNC);
+ /* Log space is reserved under the lock; the write happens after it. */
+ WT_ERR(__log_acquire(session, record->size, &tmp));
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ locked = 0;
+ WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
+ WT_ERR(__log_release(session, &tmp));
+
+ /* Only unlock here if an error occurred while the lock was held. */
+err: if (locked)
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ return (ret);
+}
+
+/*
+ * __wt_log_write --
+ * Write a record into the log.
+ *
+ * record: buffer holding a WT_LOG_RECORD; it is grown and zero-padded to
+ * a multiple of the log allocation size and its length and checksum
+ * header fields are filled in here.
+ * lsnp: if not NULL, receives the LSN of the written record.
+ * flags: WT_LOG_FSYNC / WT_LOG_DSYNC request synchronous behavior.
+ * Returns 0 on success, otherwise an error code. For synchronous writes
+ * through a consolidated slot, an error recorded on the slot is returned
+ * even if this thread's own steps succeeded.
+ */
+int
+__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN lsn;
+ WT_MYSLOT myslot;
+ uint32_t rdup_len;
+ int locked;
+
+ conn = S2C(session);
+ log = conn->log;
+ locked = 0;
+ INIT_LSN(&lsn);
+ myslot.slot = NULL;
+ /*
+ * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has
+ * a header at the beginning for us to fill in.
+ *
+ * If using direct_io, the caller should pass us an aligned record.
+ * But we need to make sure it is big enough and zero-filled so
+ * that we can write the full amount. Do this whether or not
+ * direct_io is in use because it makes the reading code cleaner.
+ */
+ WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size);
+ rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
+ WT_ERR(__wt_buf_grow(session, record, rdup_len));
+ WT_ASSERT(session, record->data == record->mem);
+ /*
+ * If the caller's record only partially fills the necessary
+ * space, we need to zero-fill the remainder.
+ */
+ if (record->size != rdup_len) {
+ memset((uint8_t *)record->mem + record->size, 0,
+ rdup_len - record->size);
+ record->size = rdup_len;
+ }
+ /* The checksum covers the padded record with its own field zeroed. */
+ logrec = (WT_LOG_RECORD *)record->mem;
+ logrec->len = (uint32_t)record->size;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, record->size);
+
+ WT_STAT_FAST_CONN_INCR(session, log_writes);
+
+ if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
+ ret = __log_direct_write(session, record, lsnp, flags);
+ if (ret == 0)
+ return (0);
+ if (ret != EAGAIN)
+ WT_ERR(ret);
+ /*
+ * An EAGAIN return means we failed to get the try lock -
+ * fall through to the consolidation code in that case.
+ */
+ }
+
+ /*
+ * As soon as we see contention for the log slot, disable direct
+ * log writes. We get better performance by forcing writes through
+ * the consolidation code. This is because individual writes flood
+ * the I/O system faster than they contend on the log slot lock.
+ */
+ F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
+ if ((ret = __wt_log_slot_join(
+ session, rdup_len, flags, &myslot)) == ENOMEM) {
+ /*
+ * If we couldn't find a consolidated slot for this record
+ * write the record directly. Retry for as long as the direct
+ * write reports EAGAIN.
+ */
+ while ((ret = __log_direct_write(
+ session, record, lsnp, flags)) == EAGAIN)
+ ;
+ WT_ERR(ret);
+ /*
+ * Increase the buffer size of any slots we can get access
+ * to, so future consolidations are likely to succeed.
+ */
+ WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len));
+ return (0);
+ }
+ WT_ERR(ret);
+ /*
+ * The thread that joined at offset 0 closes the slot, reserves log
+ * space for the whole group under the slot lock and notifies the
+ * slot; every other thread waits on the slot.
+ */
+ if (myslot.offset == 0) {
+ __wt_spin_lock(session, &log->log_slot_lock);
+ locked = 1;
+ WT_ERR(__wt_log_slot_close(session, myslot.slot));
+ WT_ERR(__log_acquire(
+ session, myslot.slot->slot_group_size, myslot.slot));
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ locked = 0;
+ WT_ERR(__wt_log_slot_notify(session, myslot.slot));
+ } else
+ WT_ERR(__wt_log_slot_wait(session, myslot.slot));
+ WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
+ /* The thread that sees WT_LOG_SLOT_DONE releases and frees the slot. */
+ if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
+ WT_ERR(__log_release(session, myslot.slot));
+ WT_ERR(__wt_log_slot_free(myslot.slot));
+ } else if (LF_ISSET(WT_LOG_FSYNC)) {
+ /* Wait for our writes to reach disk */
+ while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
+ myslot.slot->slot_error == 0)
+ (void)__wt_cond_wait(
+ session, log->log_sync_cond, 10000);
+ }
+err:
+ if (locked)
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ if (ret == 0 && lsnp != NULL)
+ *lsnp = lsn;
+ /*
+ * If we're synchronous and some thread had an error, we don't know
+ * if our write made it out to the file or not. The error could be
+ * before or after us. So, if anyone got an error, we report it.
+ * If we're not synchronous, only report if our own operation got
+ * an error.
+ */
+ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
+ myslot.slot != NULL)
+ ret = myslot.slot->slot_error;
+ return (ret);
+}
+
+/*
+ * __wt_log_vprintf --
+ *	Format a message and write it into the log as a message record.
+ *	Does nothing and returns 0 when the connection is not logging.
+ */
+int
+__wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+{
+	WT_DECL_ITEM(logrec);
+	WT_DECL_RET;
+	va_list ap_size;
+	const char *hdr_fmt = WT_UNCHECKED_STRING(I);
+	uint32_t rectype = WT_LOGREC_MESSAGE;
+	size_t hdr_len, msg_len;
+	char *msg;
+
+	if (!S2C(session)->logging)
+		return (0);
+
+	/* Measure the formatted message, including its trailing NUL. */
+	va_copy(ap_size, ap);
+	msg_len = (size_t)vsnprintf(NULL, 0, fmt, ap_size) + 1;
+	va_end(ap_size);
+
+	WT_RET(__wt_logrec_alloc(
+	    session, sizeof(WT_LOG_RECORD) + msg_len, &logrec));
+
+	/*
+	 * The record is the packed record type followed by the raw,
+	 * NUL-terminated message text.  Packing the type first lets the
+	 * message be formatted straight into the record, with no
+	 * intermediate buffer.
+	 */
+	WT_ERR(__wt_struct_size(session, &hdr_len, hdr_fmt, rectype));
+	WT_ERR(__wt_struct_pack(session,
+	    (uint8_t *)logrec->data + logrec->size, hdr_len,
+	    hdr_fmt, rectype));
+	logrec->size += (uint32_t)hdr_len;
+
+	msg = (char *)logrec->data + logrec->size;
+	(void)vsnprintf(msg, msg_len, fmt, ap);
+
+	WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_printf: %s", msg));
+
+	logrec->size += msg_len;
+	WT_ERR(__wt_log_write(session, logrec, NULL, 0));
+err:	__wt_scr_free(&logrec);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c
new file mode 100644
index 00000000000..f3db79f4daf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log_auto.c
@@ -0,0 +1,437 @@
+/* DO NOT EDIT: automatically built by dist/log.py. */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_logrec_alloc --
+ *	Allocate a scratch buffer for a log record of at least size bytes,
+ *	rounded up to the log alignment, with a zeroed record header.  The
+ *	buffer's size is set to the offset of the record payload.
+ */
+int
+__wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp)
+{
+	WT_ITEM *rec;
+	size_t alloc_len;
+
+	alloc_len = WT_ALIGN(size + 1, LOG_ALIGN);
+	WT_RET(__wt_scr_alloc(session, alloc_len, &rec));
+
+	/* Start from a zeroed header; the payload begins right after it. */
+	WT_CLEAR(*(WT_LOG_RECORD *)rec->data);
+	rec->size = offsetof(WT_LOG_RECORD, record);
+
+	*logrecp = rec;
+	return (0);
+}
+
+/*
+ * __wt_logrec_free --
+ *	Release a log record buffer back to the scratch pool.
+ */
+void
+__wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp)
+{
+	__wt_scr_free(logrecp);
+
+	/* The session is not needed to free a scratch buffer. */
+	WT_UNUSED(session);
+}
+
+/*
+ * __wt_logrec_read --
+ *	Unpack the record type from the front of a log record, advancing *pp
+ *	past it.
+ */
+int
+__wt_logrec_read(WT_SESSION_IMPL *session,
+    const uint8_t **pp, const uint8_t *end, uint32_t *rectypep)
+{
+	uint64_t raw_type;
+	int ret;
+
+	WT_UNUSED(session);
+
+	/* The type is stored as a variable-length unsigned integer. */
+	if ((ret = __wt_vunpack_uint(
+	    pp, WT_PTRDIFF(end, *pp), &raw_type)) != 0)
+		return (ret);
+
+	*rectypep = (uint32_t)raw_type;
+	return (0);
+}
+
+/*
+ * __wt_logop_read --
+ *	Peek at the operation type and size at *pp without advancing it.
+ */
+int
+__wt_logop_read(WT_SESSION_IMPL *session,
+    const uint8_t **pp, const uint8_t *end,
+    uint32_t *optypep, uint32_t *opsizep)
+{
+	const uint8_t *cur;
+	size_t avail;
+
+	cur = *pp;
+	avail = WT_PTRDIFF(end, cur);
+	return (__wt_struct_unpack(
+	    session, cur, avail, "II", optypep, opsizep));
+}
+
+/*
+ * __wt_logop_col_put_pack --
+ *	Append a packed col_put operation (file ID, record number, value)
+ *	to the log record, extending the record buffer as needed.
+ */
+int
+__wt_logop_col_put_pack(
+    WT_SESSION_IMPL *session, WT_ITEM *logrec,
+    uint32_t fileid, uint64_t recno, WT_ITEM *value)
+{
+	const char *pack_fmt = WT_UNCHECKED_STRING(IIIru);
+	uint32_t op, oplen32;
+	size_t oplen;
+
+	op = WT_LOGOP_COL_PUT;
+
+	/* Size the operation using a placeholder for its length field. */
+	WT_RET(__wt_struct_size(session, &oplen, pack_fmt,
+	    op, 0, fileid, recno, value));
+	__wt_struct_size_adjust(session, &oplen);
+
+	/* Make room, then pack at the current end of the record. */
+	WT_RET(__wt_buf_extend(session, logrec, logrec->size + oplen));
+	oplen32 = (uint32_t)oplen;
+	WT_RET(__wt_struct_pack(session,
+	    (uint8_t *)logrec->data + logrec->size, oplen, pack_fmt,
+	    op, oplen32, fileid, recno, value));
+
+	logrec->size += (uint32_t)oplen;
+	return (0);
+}
+
+/*
+ * __wt_logop_col_put_unpack --
+ *	Unpack a col_put operation at *pp and advance *pp by the operation's
+ *	recorded size.
+ */
+int
+__wt_logop_col_put_unpack(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+    uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep)
+{
+	const char *unpack_fmt = WT_UNCHECKED_STRING(IIIru);
+	const uint8_t *cur;
+	uint32_t op, oplen;
+
+	cur = *pp;
+	WT_RET(__wt_struct_unpack(session, cur, WT_PTRDIFF(end, cur),
+	    unpack_fmt, &op, &oplen, fileidp, recnop, valuep));
+	WT_ASSERT(session, op == WT_LOGOP_COL_PUT);
+
+	/* Skip the whole operation, as sized when it was packed. */
+	*pp = cur + oplen;
+	return (0);
+}
+
+/*
+ * __wt_logop_col_put_print --
+ *	Unpack a col_put operation and print its fields to the stream.
+ */
+int
+__wt_logop_col_put_print(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+	WT_ITEM value;
+	uint64_t recno;
+	uint32_t fileid;
+	int value_len;
+
+	WT_RET(__wt_logop_col_put_unpack(
+	    session, pp, end, &fileid, &recno, &value));
+
+	fputs(" \"optype\": \"col_put\",\n", out);
+	fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+	fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno);
+
+	/* The value is not NUL-terminated: print exactly value.size bytes. */
+	value_len = (int)value.size;
+	fprintf(out, " \"value\": \"%.*s\",\n",
+	    value_len, (const char *)value.data);
+	return (0);
+}
+
+/*
+ * __wt_logop_col_remove_pack --
+ *	Append a packed col_remove operation (file ID, record number) to the
+ *	log record, extending the record buffer as needed.
+ */
+int
+__wt_logop_col_remove_pack(
+    WT_SESSION_IMPL *session, WT_ITEM *logrec,
+    uint32_t fileid, uint64_t recno)
+{
+	const char *pack_fmt = WT_UNCHECKED_STRING(IIIr);
+	uint32_t op, oplen32;
+	size_t oplen;
+
+	op = WT_LOGOP_COL_REMOVE;
+
+	/* Size the operation using a placeholder for its length field. */
+	WT_RET(__wt_struct_size(session, &oplen, pack_fmt,
+	    op, 0, fileid, recno));
+	__wt_struct_size_adjust(session, &oplen);
+
+	/* Make room, then pack at the current end of the record. */
+	WT_RET(__wt_buf_extend(session, logrec, logrec->size + oplen));
+	oplen32 = (uint32_t)oplen;
+	WT_RET(__wt_struct_pack(session,
+	    (uint8_t *)logrec->data + logrec->size, oplen, pack_fmt,
+	    op, oplen32, fileid, recno));
+
+	logrec->size += (uint32_t)oplen;
+	return (0);
+}
+
+/*
+ * __wt_logop_col_remove_unpack --
+ *	Unpack a col_remove operation at *pp and advance *pp by the
+ *	operation's recorded size.
+ */
+int
+__wt_logop_col_remove_unpack(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+    uint32_t *fileidp, uint64_t *recnop)
+{
+	const char *unpack_fmt = WT_UNCHECKED_STRING(IIIr);
+	const uint8_t *cur;
+	uint32_t op, oplen;
+
+	cur = *pp;
+	WT_RET(__wt_struct_unpack(session, cur, WT_PTRDIFF(end, cur),
+	    unpack_fmt, &op, &oplen, fileidp, recnop));
+	WT_ASSERT(session, op == WT_LOGOP_COL_REMOVE);
+
+	/* Skip the whole operation, as sized when it was packed. */
+	*pp = cur + oplen;
+	return (0);
+}
+
+/*
+ * __wt_logop_col_remove_print --
+ *	Unpack a col_remove operation and print its fields to the stream.
+ */
+int
+__wt_logop_col_remove_print(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+	uint64_t recno;
+	uint32_t fileid;
+
+	WT_RET(__wt_logop_col_remove_unpack(
+	    session, pp, end, &fileid, &recno));
+
+	fputs(" \"optype\": \"col_remove\",\n", out);
+	fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+	fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno);
+	return (0);
+}
+
+/*
+ * __wt_logop_col_truncate_pack --
+ *	Append a packed col_truncate operation (file ID, start and stop
+ *	record numbers) to the log record, extending the buffer as needed.
+ */
+int
+__wt_logop_col_truncate_pack(
+    WT_SESSION_IMPL *session, WT_ITEM *logrec,
+    uint32_t fileid, uint64_t start, uint64_t stop)
+{
+	const char *pack_fmt = WT_UNCHECKED_STRING(IIIrr);
+	uint32_t op, oplen32;
+	size_t oplen;
+
+	op = WT_LOGOP_COL_TRUNCATE;
+
+	/* Size the operation using a placeholder for its length field. */
+	WT_RET(__wt_struct_size(session, &oplen, pack_fmt,
+	    op, 0, fileid, start, stop));
+	__wt_struct_size_adjust(session, &oplen);
+
+	/* Make room, then pack at the current end of the record. */
+	WT_RET(__wt_buf_extend(session, logrec, logrec->size + oplen));
+	oplen32 = (uint32_t)oplen;
+	WT_RET(__wt_struct_pack(session,
+	    (uint8_t *)logrec->data + logrec->size, oplen, pack_fmt,
+	    op, oplen32, fileid, start, stop));
+
+	logrec->size += (uint32_t)oplen;
+	return (0);
+}
+
+/*
+ * __wt_logop_col_truncate_unpack --
+ *	Unpack a col_truncate operation at *pp and advance *pp by the
+ *	operation's recorded size.
+ */
+int
+__wt_logop_col_truncate_unpack(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+    uint32_t *fileidp, uint64_t *startp, uint64_t *stopp)
+{
+	const char *unpack_fmt = WT_UNCHECKED_STRING(IIIrr);
+	const uint8_t *cur;
+	uint32_t op, oplen;
+
+	cur = *pp;
+	WT_RET(__wt_struct_unpack(session, cur, WT_PTRDIFF(end, cur),
+	    unpack_fmt, &op, &oplen, fileidp, startp, stopp));
+	WT_ASSERT(session, op == WT_LOGOP_COL_TRUNCATE);
+
+	/* Skip the whole operation, as sized when it was packed. */
+	*pp = cur + oplen;
+	return (0);
+}
+
+/*
+ * __wt_logop_col_truncate_print --
+ *	Unpack a col_truncate operation and print its fields to the stream.
+ */
+int
+__wt_logop_col_truncate_print(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+	uint64_t start, stop;
+	uint32_t fileid;
+
+	WT_RET(__wt_logop_col_truncate_unpack(
+	    session, pp, end, &fileid, &start, &stop));
+
+	fputs(" \"optype\": \"col_truncate\",\n", out);
+	fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+	fprintf(out, " \"start\": \"%" PRIu64 "\",\n", start);
+	fprintf(out, " \"stop\": \"%" PRIu64 "\",\n", stop);
+	return (0);
+}
+
+/*
+ * __wt_logop_row_put_pack --
+ *	Append a packed row_put operation (file ID, key, value) to the log
+ *	record, extending the record buffer as needed.
+ */
+int
+__wt_logop_row_put_pack(
+    WT_SESSION_IMPL *session, WT_ITEM *logrec,
+    uint32_t fileid, WT_ITEM *key, WT_ITEM *value)
+{
+	const char *pack_fmt = WT_UNCHECKED_STRING(IIIuu);
+	uint32_t op, oplen32;
+	size_t oplen;
+
+	op = WT_LOGOP_ROW_PUT;
+
+	/* Size the operation using a placeholder for its length field. */
+	WT_RET(__wt_struct_size(session, &oplen, pack_fmt,
+	    op, 0, fileid, key, value));
+	__wt_struct_size_adjust(session, &oplen);
+
+	/* Make room, then pack at the current end of the record. */
+	WT_RET(__wt_buf_extend(session, logrec, logrec->size + oplen));
+	oplen32 = (uint32_t)oplen;
+	WT_RET(__wt_struct_pack(session,
+	    (uint8_t *)logrec->data + logrec->size, oplen, pack_fmt,
+	    op, oplen32, fileid, key, value));
+
+	logrec->size += (uint32_t)oplen;
+	return (0);
+}
+
+/*
+ * __wt_logop_row_put_unpack --
+ *	Unpack a row_put operation at *pp and advance *pp by the operation's
+ *	recorded size.
+ */
+int
+__wt_logop_row_put_unpack(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+    uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep)
+{
+	const char *unpack_fmt = WT_UNCHECKED_STRING(IIIuu);
+	const uint8_t *cur;
+	uint32_t op, oplen;
+
+	cur = *pp;
+	WT_RET(__wt_struct_unpack(session, cur, WT_PTRDIFF(end, cur),
+	    unpack_fmt, &op, &oplen, fileidp, keyp, valuep));
+	WT_ASSERT(session, op == WT_LOGOP_ROW_PUT);
+
+	/* Skip the whole operation, as sized when it was packed. */
+	*pp = cur + oplen;
+	return (0);
+}
+
+/*
+ * __wt_logop_row_put_print --
+ *	Unpack a row-store put operation and write its fields to the output
+ *	stream; the key and value bytes are printed as length-limited strings.
+ */
+int
+__wt_logop_row_put_print(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+	WT_ITEM op_key, op_value;
+	uint32_t file;
+
+	WT_RET(__wt_logop_row_put_unpack(
+	    session, pp, end, &file, &op_key, &op_value));
+
+	fprintf(out, " \"optype\": \"row_put\",\n");
+	fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", file);
+	fprintf(out, " \"key\": \"%.*s\",\n",
+	    (int)op_key.size, (const char *)op_key.data);
+	fprintf(out, " \"value\": \"%.*s\",\n",
+	    (int)op_value.size, (const char *)op_value.data);
+	return (0);
+}
+
+/*
+ * __wt_logop_row_remove_pack --
+ *	Append a WT_LOGOP_ROW_REMOVE operation (file ID, key) to the end of
+ *	the log record buffer and grow logrec->size by the packed size.
+ */
+int
+__wt_logop_row_remove_pack(
+    WT_SESSION_IMPL *session, WT_ITEM *logrec,
+    uint32_t fileid, WT_ITEM *key)
+{
+	const char *pack_fmt = WT_UNCHECKED_STRING(IIIu);
+	size_t need;
+	uint32_t optype, reclen;
+	uint8_t *dest;
+
+	optype = WT_LOGOP_ROW_REMOVE;
+
+	/* Size the operation using zero as a stand-in for the size field. */
+	WT_RET(__wt_struct_size(session, &need, pack_fmt,
+	    optype, 0, fileid, key));
+	__wt_struct_size_adjust(session, &need);
+
+	/* Extend the record, then pack directly at its current end. */
+	WT_RET(__wt_buf_extend(session, logrec, logrec->size + need));
+	reclen = (uint32_t)need;
+	dest = (uint8_t *)logrec->data + logrec->size;
+	WT_RET(__wt_struct_pack(session, dest, need, pack_fmt,
+	    optype, reclen, fileid, key));
+
+	logrec->size += reclen;
+	return (0);
+}
+
+/*
+ * __wt_logop_row_remove_unpack --
+ *	Unpack a WT_LOGOP_ROW_REMOVE operation starting at *pp, returning the
+ *	file ID and key through the output pointers, then advance *pp by the
+ *	size stored in the operation.
+ */
+int
+__wt_logop_row_remove_unpack(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+    uint32_t *fileidp, WT_ITEM *keyp)
+{
+	const uint8_t *rec;
+	uint32_t op, reclen;
+
+	rec = *pp;
+	WT_RET(__wt_struct_unpack(session, rec, WT_PTRDIFF(end, rec),
+	    WT_UNCHECKED_STRING(IIIu),
+	    &op, &reclen, fileidp, keyp));
+	WT_ASSERT(session, op == WT_LOGOP_ROW_REMOVE);
+
+	/* Step over the whole operation using its self-described size. */
+	*pp = rec + reclen;
+	return (0);
+}
+
+/*
+ * __wt_logop_row_remove_print --
+ *	Unpack a row-store remove operation and write its fields to the
+ *	output stream; the key bytes are printed as a length-limited string.
+ */
+int
+__wt_logop_row_remove_print(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+	WT_ITEM op_key;
+	uint32_t file;
+
+	WT_RET(__wt_logop_row_remove_unpack(
+	    session, pp, end, &file, &op_key));
+
+	fprintf(out, " \"optype\": \"row_remove\",\n");
+	fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", file);
+	fprintf(out, " \"key\": \"%.*s\",\n",
+	    (int)op_key.size, (const char *)op_key.data);
+	return (0);
+}
+
+/*
+ * __wt_logop_row_truncate_pack --
+ *	Append a WT_LOGOP_ROW_TRUNCATE operation (file ID, start key, stop
+ *	key, mode) to the end of the log record buffer and grow logrec->size
+ *	by the packed size.
+ */
+int
+__wt_logop_row_truncate_pack(
+    WT_SESSION_IMPL *session, WT_ITEM *logrec,
+    uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode)
+{
+	const char *pack_fmt = WT_UNCHECKED_STRING(IIIuuI);
+	size_t need;
+	uint32_t optype, reclen;
+	uint8_t *dest;
+
+	optype = WT_LOGOP_ROW_TRUNCATE;
+
+	/* Size the operation using zero as a stand-in for the size field. */
+	WT_RET(__wt_struct_size(session, &need, pack_fmt,
+	    optype, 0, fileid, start, stop, mode));
+	__wt_struct_size_adjust(session, &need);
+
+	/* Extend the record, then pack directly at its current end. */
+	WT_RET(__wt_buf_extend(session, logrec, logrec->size + need));
+	reclen = (uint32_t)need;
+	dest = (uint8_t *)logrec->data + logrec->size;
+	WT_RET(__wt_struct_pack(session, dest, need, pack_fmt,
+	    optype, reclen, fileid, start, stop, mode));
+
+	logrec->size += reclen;
+	return (0);
+}
+
+/*
+ * __wt_logop_row_truncate_unpack --
+ *	Unpack a WT_LOGOP_ROW_TRUNCATE operation starting at *pp, returning
+ *	the file ID, start key, stop key and mode through the output
+ *	pointers, then advance *pp by the size stored in the operation.
+ */
+int
+__wt_logop_row_truncate_unpack(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+    uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep)
+{
+	const uint8_t *rec;
+	uint32_t op, reclen;
+
+	rec = *pp;
+	WT_RET(__wt_struct_unpack(session, rec, WT_PTRDIFF(end, rec),
+	    WT_UNCHECKED_STRING(IIIuuI),
+	    &op, &reclen, fileidp, startp, stopp, modep));
+	WT_ASSERT(session, op == WT_LOGOP_ROW_TRUNCATE);
+
+	/* Step over the whole operation using its self-described size. */
+	*pp = rec + reclen;
+	return (0);
+}
+
+/*
+ * __wt_logop_row_truncate_print --
+ *	Unpack a row-store truncate operation and write its fields to the
+ *	output stream; the start and stop keys are printed as length-limited
+ *	strings.
+ */
+int
+__wt_logop_row_truncate_print(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+	WT_ITEM first_key, last_key;
+	uint32_t file, trunc_mode;
+
+	WT_RET(__wt_logop_row_truncate_unpack(
+	    session, pp, end, &file, &first_key, &last_key, &trunc_mode));
+
+	fprintf(out, " \"optype\": \"row_truncate\",\n");
+	fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", file);
+	fprintf(out, " \"start\": \"%.*s\",\n",
+	    (int)first_key.size, (const char *)first_key.data);
+	fprintf(out, " \"stop\": \"%.*s\",\n",
+	    (int)last_key.size, (const char *)last_key.data);
+	fprintf(out, " \"mode\": \"%" PRIu32 "\",\n", trunc_mode);
+	return (0);
+}
+
+/*
+ * __wt_txn_op_printlog --
+ *	Print a single logged operation: read its type and size, then hand
+ *	off to the printer for that operation type. An unrecognized type is
+ *	handled by WT_ILLEGAL_VALUE.
+ */
+int
+__wt_txn_op_printlog(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+	WT_DECL_RET;
+	uint32_t oplen, optype;
+
+	/*
+	 * Read the type and size, then narrow the end pointer to this
+	 * operation. The per-operation printers unpack the type and size
+	 * again from *pp.
+	 */
+	WT_RET(__wt_logop_read(session, pp, end, &optype, &oplen));
+	end = *pp + oplen;
+
+	switch (optype) {
+	case WT_LOGOP_COL_PUT:
+		ret = __wt_logop_col_put_print(session, pp, end, out);
+		break;
+
+	case WT_LOGOP_COL_REMOVE:
+		ret = __wt_logop_col_remove_print(session, pp, end, out);
+		break;
+
+	case WT_LOGOP_COL_TRUNCATE:
+		ret = __wt_logop_col_truncate_print(session, pp, end, out);
+		break;
+
+	case WT_LOGOP_ROW_PUT:
+		ret = __wt_logop_row_put_print(session, pp, end, out);
+		break;
+
+	case WT_LOGOP_ROW_REMOVE:
+		ret = __wt_logop_row_remove_print(session, pp, end, out);
+		break;
+
+	case WT_LOGOP_ROW_TRUNCATE:
+		ret = __wt_logop_row_truncate_print(session, pp, end, out);
+		break;
+
+	WT_ILLEGAL_VALUE(session);
+	}
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
new file mode 100644
index 00000000000..c12f47d231b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -0,0 +1,354 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * This file implements the consolidated array algorithm as described in
+ * the paper:
+ * Scalability of write-ahead logging on multicore and multisocket hardware
+ * by Ryan Johnson, Ippokratis Pandis, Radu Stoica, Manos Athanassoulis
+ * and Anastasia Ailamaki.
+ *
+ * It appeared in The VLDB Journal, DOI 10.1007/s00778-011-0260-8 and can
+ * be found at:
+ * http://infoscience.epfl.ch/record/170505/files/aether-smpfulltext.pdf
+ */
+
+/*
+ * __wt_log_slot_init --
+ *	Set up the slot pool and the active slot array, and allocate a
+ *	buffer for every slot in the pool.
+ */
+int
+__wt_log_slot_init(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+	WT_LOG *log;
+	WT_LOGSLOT *slot;
+	int32_t i;
+
+	log = S2C(session)->log;
+
+	/* Every pool entry starts out free, with no place in the array. */
+	for (i = 0; i < SLOT_POOL; i++) {
+		slot = &log->slot_pool[i];
+		slot->slot_state = WT_LOG_SLOT_FREE;
+		slot->slot_index = SLOT_INVALID_INDEX;
+	}
+
+	/* The first SLOT_ACTIVE pool entries populate the active array. */
+	for (i = 0; i < SLOT_ACTIVE; i++) {
+		slot = &log->slot_pool[i];
+		slot->slot_index = (uint32_t)i;
+		slot->slot_state = WT_LOG_SLOT_READY;
+		log->slot_array[i] = slot;
+	}
+
+	/*
+	 * Buffer allocation is kept in its own loop so that a failure only
+	 * has to free the buffers set up before the failing index.
+	 */
+	for (i = 0; i < SLOT_POOL; i++) {
+		slot = &log->slot_pool[i];
+		WT_ERR(__wt_buf_init(session,
+		    &slot->slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE));
+		F_SET(slot, SLOT_BUFFERED);
+	}
+	WT_STAT_FAST_CONN_INCRV(session,
+	    log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL);
+	return (0);
+
+err:	while (--i >= 0)
+		__wt_buf_free(session, &log->slot_pool[i].slot_buf);
+	return (ret);
+}
+
+/*
+ * __wt_log_slot_destroy --
+ *	Free the buffer of every slot in the pool; always returns 0.
+ */
+int
+__wt_log_slot_destroy(WT_SESSION_IMPL *session)
+{
+	WT_LOG *log;
+	WT_LOGSLOT *slot;
+	int n;
+
+	log = S2C(session)->log;
+
+	for (n = 0; n < SLOT_POOL; n++) {
+		slot = &log->slot_pool[n];
+		__wt_buf_free(session, &slot->slot_buf);
+	}
+	return (0);
+}
+
+/*
+ * __wt_log_slot_join --
+ * Join a consolidated logging slot. Callers should be prepared to deal
+ * with an ENOMEM return - which indicates no slots could accommodate
+ * the log record.
+ *
+ * On success, myslotp->slot is the slot that was joined and
+ * myslotp->offset is the slot state seen before this caller's size was
+ * added, less WT_LOG_SLOT_READY. If WT_LOG_FSYNC is set in flags, the
+ * slot is marked SLOT_SYNC.
+ */
+int
+__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
+ uint32_t flags, WT_MYSLOT *myslotp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t cur_state, new_state, old_state;
+ uint32_t allocated_slot, slot_grow_attempts;
+
+ conn = S2C(session);
+ log = conn->log;
+ slot_grow_attempts = 0;
+find_slot:
+ /* Choose a random entry in the active slot array. */
+ allocated_slot = __wt_random(session->rnd) % SLOT_ACTIVE;
+ slot = log->slot_array[allocated_slot];
+ old_state = slot->slot_state;
+join_slot:
+ /*
+ * WT_LOG_SLOT_READY and higher means the slot is available for
+ * joining. Any other state means it is in use and transitioning
+ * from the active array.
+ */
+ if (old_state < WT_LOG_SLOT_READY) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_transitions);
+ goto find_slot;
+ }
+ /*
+ * Add in our size to the state and then atomically swap that
+ * into place if it is still the same value.
+ */
+ new_state = old_state + (int64_t)mysize;
+ if (new_state < old_state) {
+ /* Our size doesn't fit here. */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
+ goto find_slot;
+ }
+ /*
+ * If the slot buffer isn't big enough to hold this update, mark
+ * the slot for a buffer size increase and find another slot.
+ * Give up with ENOMEM once this has happened more than 5 times.
+ */
+ if (new_state > (int64_t)slot->slot_buf.memsize) {
+ F_SET(slot, SLOT_BUF_GROW);
+ if (++slot_grow_attempts > 5) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall);
+ return (ENOMEM);
+ }
+ goto find_slot;
+ }
+ cur_state = WT_ATOMIC_CAS_VAL8(slot->slot_state, old_state, new_state);
+ /*
+ * We lost a race to add our size into this slot. Check the state
+ * and try again.
+ */
+ if (cur_state != old_state) {
+ old_state = cur_state;
+ WT_STAT_FAST_CONN_INCR(session, log_slot_races);
+ goto join_slot;
+ }
+ WT_ASSERT(session, myslotp != NULL);
+ /*
+ * We joined this slot. Fill in our information to return to
+ * the caller.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(slot, SLOT_SYNC);
+ myslotp->slot = slot;
+ myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY;
+ return (0);
+}
+
+/*
+ * __wt_log_slot_close --
+ * Close a slot and do not allow any other threads to join this slot.
+ * Remove this from the active slot array and move a new slot from
+ * the pool into its place. Set up the size of this group.
+ * Must be called with the logging spinlock held.
+ *
+ * This loops, yielding, until a pool slot in the WT_LOG_SLOT_FREE state
+ * is found; it always returns 0.
+ */
+int
+__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *newslot;
+ int64_t old_state;
+ int32_t yields;
+ uint32_t pool_i, switch_fails;
+
+ conn = S2C(session);
+ log = conn->log;
+ switch_fails = 0;
+retry:
+ /*
+ * Find an unused slot in the pool.
+ */
+ pool_i = log->pool_index;
+ newslot = &log->slot_pool[pool_i];
+ /* Advance the pool index, wrapping back to 0 at SLOT_POOL. */
+ if (++log->pool_index >= SLOT_POOL)
+ log->pool_index = 0;
+ if (newslot->slot_state != WT_LOG_SLOT_FREE) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails);
+ /*
+ * If it takes a number of attempts to find an available slot
+ * it's likely all slots are waiting to be released. This
+ * churn is used to change how long we pause before closing
+ * the slot - which leads to more consolidation and less churn.
+ *
+ * NOTE(review): switch_fails has just been incremented, so the
+ * "!= 0" test below can only fail if the counter wraps.
+ */
+ if (++switch_fails % SLOT_POOL == 0 &&
+ switch_fails != 0 && slot->slot_churn < 5)
+ ++slot->slot_churn;
+ __wt_yield();
+ goto retry;
+ } else if (slot->slot_churn > 0) {
+ --slot->slot_churn;
+ WT_ASSERT(session, slot->slot_churn >= 0);
+ }
+
+ /* Pause to allow other threads a chance to consolidate. */
+ for (yields = slot->slot_churn; yields >= 0; yields--)
+ __wt_yield();
+
+ /*
+ * Swap out the slot we're going to use and put a free one in the
+ * slot array in its place so that threads can use it right away.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
+ newslot->slot_state = WT_LOG_SLOT_READY;
+ newslot->slot_index = slot->slot_index;
+ log->slot_array[newslot->slot_index] = &log->slot_pool[pool_i];
+ /*
+ * Mark the old slot pending. The value that comes back is used as
+ * the slot's previous state, from which the group size is derived.
+ */
+ old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING);
+ slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY);
+ /*
+ * Note that this statistic may be much bigger than in reality,
+ * especially when compared with the total bytes written in
+ * __log_fill. The reason is that this size reflects any
+ * rounding up that is needed and the total bytes in __log_fill
+ * is the amount of user bytes.
+ */
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_slot_consolidated, (uint64_t)slot->slot_group_size);
+ return (0);
+}
+
+/*
+ * __wt_log_slot_notify --
+ *	Set the slot state to WT_LOG_SLOT_DONE less the slot's group size;
+ *	always returns 0.
+ */
+int
+__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+	int64_t group_bytes;
+
+	WT_UNUSED(session);
+
+	group_bytes = (int64_t)slot->slot_group_size;
+	slot->slot_state = (int64_t)WT_LOG_SLOT_DONE - group_bytes;
+	return (0);
+}
+
+/*
+ * __wt_log_slot_wait --
+ *	Yield until the slot state is no longer above WT_LOG_SLOT_DONE;
+ *	always returns 0.
+ */
+int
+__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+	WT_UNUSED(session);
+
+	for (;;) {
+		if (slot->slot_state <= WT_LOG_SLOT_DONE)
+			break;
+		__wt_yield();
+	}
+	return (0);
+}
+
+/*
+ * __wt_log_slot_release --
+ *	Called by each thread in a consolidated group once it has finished
+ *	with its piece of the log: atomically add the thread's size to the
+ *	slot state and return the value produced by the atomic add.
+ */
+int64_t
+__wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size)
+{
+	/*
+	 * The state reaches WT_LOG_SLOT_DONE once every participating
+	 * thread has added its size back in.
+	 */
+	return (WT_ATOMIC_ADD8(slot->slot_state, (int64_t)size));
+}
+
+/*
+ * __wt_log_slot_free --
+ * Free a slot back into the pool by setting its state to
+ * WT_LOG_SLOT_FREE. Always returns 0.
+ */
+int
+__wt_log_slot_free(WT_LOGSLOT *slot)
+{
+ slot->slot_state = WT_LOG_SLOT_FREE;
+ return (0);
+}
+
+/*
+ * __wt_log_slot_grow_buffers --
+ *	Increase the buffer size of all available slots in the buffer pool.
+ *	Go to some lengths to include active (but unused) slots to handle
+ *	the case where all log write record sizes exceed the size of the
+ *	active buffer.
+ *
+ *	Only slots in the WT_LOG_SLOT_FREE or WT_LOG_SLOT_READY state are
+ *	grown. Each one is parked in WT_LOG_SLOT_PENDING while its buffer is
+ *	resized and is put back in its original state afterwards, whether or
+ *	not the resize succeeded. Returns 0, or the first error returned by
+ *	__wt_buf_grow.
+ */
+int
+__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_LOG *log;
+	WT_LOGSLOT *slot;
+	int64_t orig_state;
+	uint64_t old_size, total_growth;
+	int i;
+
+	conn = S2C(session);
+	log = conn->log;
+	total_growth = 0;
+	WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+	/*
+	 * Take the log slot lock to prevent other threads growing buffers
+	 * at the same time. Could tighten the scope of this lock, or have
+	 * a separate lock if there is contention.
+	 */
+	__wt_spin_lock(session, &log->log_slot_lock);
+	for (i = 0; i < SLOT_POOL; i++) {
+		slot = &log->slot_pool[i];
+		/* Avoid atomic operations if they won't succeed. */
+		if (slot->slot_state != WT_LOG_SLOT_FREE &&
+		    slot->slot_state != WT_LOG_SLOT_READY)
+			continue;
+		/* Don't keep growing unrelated buffers. */
+		if (slot->slot_buf.memsize > (10 * newsize) &&
+		    !F_ISSET(slot, SLOT_BUF_GROW))
+			continue;
+		/*
+		 * Claim the slot by moving it from FREE, or failing that from
+		 * READY, to PENDING; skip it if neither swap succeeds.
+		 */
+		orig_state = WT_ATOMIC_CAS_VAL8(
+		    slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING);
+		if (orig_state != WT_LOG_SLOT_FREE) {
+			orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state,
+			    WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING);
+			if (orig_state != WT_LOG_SLOT_READY)
+				continue;
+		}
+
+		/* We have a slot - now go ahead and grow the buffer. */
+		old_size = slot->slot_buf.memsize;
+		F_CLR(slot, SLOT_BUF_GROW);
+		ret = __wt_buf_grow(session, &slot->slot_buf,
+		    WT_MAX(slot->slot_buf.memsize * 2, newsize));
+		/*
+		 * Restore the slot's state before checking for failure:
+		 * jumping to the error label with the slot still in
+		 * WT_LOG_SLOT_PENDING would strand it, as nothing else
+		 * returns it to FREE or READY.
+		 * NOTE(review): this assumes a failed __wt_buf_grow leaves
+		 * the existing buffer usable - confirm.
+		 */
+		slot->slot_state = orig_state;
+		WT_ERR(ret);
+		total_growth += slot->slot_buf.memsize - old_size;
+	}
+err:	__wt_spin_unlock(session, &log->log_slot_lock);
+	WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
new file mode 100644
index 00000000000..f50706fb2e9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -0,0 +1,1519 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_FORALL_CURSORS --
+ * Run the statement that follows once for each non-NULL entry of
+ * (clsm)->cursors, walking the index from nchunks - 1 down to 0. The
+ * index is decremented before the entry is read, so inside the statement
+ * i is the index of c.
+ */
+#define WT_FORALL_CURSORS(clsm, c, i) \
+ for ((i) = (clsm)->nchunks; (i) > 0;) \
+ if (((c) = (clsm)->cursors[--i]) != NULL)
+
+/*
+ * WT_LSM_CURCMP --
+ * Call __wt_compare on the keys of two cursors with the LSM tree's
+ * collator, passing the address of cmp to receive the result.
+ */
+#define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \
+ __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &cmp)
+
+static int __clsm_lookup(WT_CURSOR_LSM *, WT_ITEM *);
+static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t);
+static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *);
+
+/*
+ * __clsm_enter_update --
+ * Make sure an LSM cursor is ready to perform an update.
+ *
+ * Returns 0 immediately if the cursor has no chunks or no cursor open on
+ * its last chunk. Otherwise it may queue a WT_LSM_WORK_SWITCH entry and
+ * may sleep until the tree's dsk_gen differs from the cursor's.
+ */
+static int
+__clsm_enter_update(WT_CURSOR_LSM *clsm)
+{
+ WT_CURSOR *primary;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *primary_chunk;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ int have_primary, ovfl, waited;
+
+ lsm_tree = clsm->lsm_tree;
+ if (clsm->nchunks == 0 ||
+ (primary = clsm->cursors[clsm->nchunks - 1]) == NULL)
+ return (0);
+ session = (WT_SESSION_IMPL *)primary->session;
+ primary_chunk = clsm->primary_chunk;
+ /* A usable primary is one whose switch transaction is still unset. */
+ have_primary = (primary_chunk != NULL &&
+ primary_chunk->switch_txn == WT_TXN_NONE);
+ ovfl = 0;
+
+ /*
+ * In LSM there are multiple btrees active at one time. The tree
+ * switch code needs to use btree API methods, and it wants to
+ * operate on the btree for the primary chunk. Set that up now.
+ *
+ * If the primary chunk has grown too large, set a flag so the worker
+ * thread will switch when it gets a chance to avoid introducing high
+ * latency into application threads. Don't do this indefinitely: if a
+ * chunk grows twice as large as the configured size, block until it
+ * can be switched.
+ */
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ if (have_primary)
+ WT_WITH_BTREE(session,
+ ((WT_CURSOR_BTREE *)primary)->btree,
+ ovfl = __wt_btree_size_overflow(
+ session, lsm_tree->chunk_size));
+
+ if (ovfl || !have_primary) {
+ /*
+ * Check that we are up-to-date: don't set the switch
+ * if the tree has changed since we last opened
+ * cursors: that can lead to switching multiple times
+ * when only one switch is required, creating very
+ * small chunks.
+ */
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ if (clsm->dsk_gen == lsm_tree->dsk_gen &&
+ !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ /*
+ * NOTE(review): the flag is set even if queuing the
+ * work entry failed - confirm that is intended.
+ */
+ ret = __wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree);
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ }
+ WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+ WT_RET(ret);
+ /* A switch was just requested: don't also block below. */
+ ovfl = 0;
+ }
+ } else if (have_primary)
+ WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
+ ovfl = __wt_btree_size_overflow(
+ session, 2 * lsm_tree->chunk_size));
+
+ /*
+ * If there is no primary chunk, or it has really overflowed, which
+ * either means a worker thread has fallen behind or there has just
+ * been a user-level checkpoint, wait until the tree changes.
+ *
+ * We used to switch chunks in the application thread if we got to
+ * here, but that is problematic because there is a transaction in
+ * progress and it could roll back, leaving the metadata inconsistent.
+ */
+ if (ovfl || !have_primary) {
+ /* Re-queue the switch request on every 100th pass of the wait. */
+ for (waited = 0;
+ clsm->dsk_gen == lsm_tree->dsk_gen;
+ ++waited) {
+ if (waited % 100 == 0)
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
+ __wt_sleep(0, 10);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_enter --
+ * Start an operation on an LSM cursor, update if the tree has changed.
+ *
+ * If reset is non-zero, the chunk cursors are reset first. If update is
+ * non-zero, the cursor is prepared for an update operation. Cursors
+ * flagged WT_CLSM_MERGE return 0 without doing any of this.
+ */
+static inline int
+__clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint64_t *switch_txnp;
+ uint64_t snap_min;
+
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ /* Merge cursors never update. */
+ if (F_ISSET(clsm, WT_CLSM_MERGE))
+ return (0);
+
+ if (reset) {
+ WT_ASSERT(session, !F_ISSET(&clsm->iface,
+ WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
+ WT_RET(__clsm_reset_cursors(clsm, NULL));
+ }
+
+ for (;;) {
+ /*
+ * If the cursor looks up-to-date, check if the cache is full.
+ * In case this call blocks, the check will be repeated before
+ * proceeding.
+ */
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ WT_RET(__wt_cache_full_check(session));
+
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ /* Update the maximum transaction ID in the primary chunk. */
+ if (update) {
+ WT_RET(__clsm_enter_update(clsm));
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ /*
+ * Ensure that there is a transaction snapshot active.
+ */
+ WT_RET(__wt_txn_autocommit_check(session));
+
+ if (session->txn.isolation == TXN_ISO_SNAPSHOT)
+ __wt_txn_cursor_op(session);
+
+ /*
+ * Figure out how many updates are required for
+ * snapshot isolation.
+ *
+ * This is not a normal visibility check on the maximum
+ * transaction ID in each chunk: any transaction ID
+ * that overlaps with our snapshot is a potential
+ * conflict.
+ */
+ clsm->nupdates = 1;
+ if (session->txn.isolation == TXN_ISO_SNAPSHOT &&
+ F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
+ WT_ASSERT(session,
+ F_ISSET(&session->txn, TXN_HAS_SNAPSHOT));
+ snap_min = session->txn.snap_min;
+ /*
+ * Walk backwards from the chunk before the last one.
+ * NOTE(review): with fewer than two chunks the initial
+ * address is out of range; it is never dereferenced,
+ * because the loop condition fails on the first test.
+ */
+ for (switch_txnp =
+ &clsm->switch_txn[clsm->nchunks - 2];
+ clsm->nupdates < clsm->nchunks;
+ clsm->nupdates++, switch_txnp--) {
+ if (TXNID_LT(*switch_txnp, snap_min))
+ break;
+ WT_ASSERT(session,
+ !__wt_txn_visible_all(
+ session, *switch_txnp));
+ }
+ }
+ }
+
+ /*
+ * Stop when we are up-to-date, as long as this is:
+ * - a snapshot isolation update and the cursor is set up for
+ * that;
+ * - an update operation with a primary chunk, or
+ * - a read operation and the cursor is open for reading.
+ */
+ if ((!update ||
+ session->txn.isolation != TXN_ISO_SNAPSHOT ||
+ F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
+ ((update && clsm->primary_chunk != NULL) ||
+ (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
+ break;
+
+ /* (Re)open the chunk cursors while holding the schema lock. */
+open: WT_WITH_SCHEMA_LOCK(session,
+ ret = __clsm_open_cursors(clsm, update, 0, 0));
+ WT_RET(ret);
+ }
+
+ /* Enter the cursor only once per active period. */
+ if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
+ WT_RET(__cursor_enter(session));
+ F_SET(clsm, WT_CLSM_ACTIVE);
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_leave --
+ *	End an operation on an LSM cursor: if the cursor is flagged active,
+ *	leave the session's cursor and clear the flag.
+ */
+static int
+__clsm_leave(WT_CURSOR_LSM *clsm)
+{
+	WT_SESSION_IMPL *session;
+
+	/* Nothing to undo unless the cursor is flagged active. */
+	if (!F_ISSET(clsm, WT_CLSM_ACTIVE))
+		return (0);
+
+	session = (WT_SESSION_IMPL *)clsm->iface.session;
+	WT_RET(__cursor_leave(session));
+	F_CLR(clsm, WT_CLSM_ACTIVE);
+	return (0);
+}
+
+/*
+ * We need a tombstone to mark deleted records, and we use the special
+ * value below for that purpose. We use two 0x14 (Device Control 4) bytes to
+ * minimize the likelihood of colliding with an application-chosen encoding
+ * byte; if the application does use two leading DC4 bytes for some reason,
+ * we'll do a wasted data copy each time a new value is inserted into the
+ * object.
+ */
+static const WT_ITEM __tombstone = { "\x14\x14", 2, 0, NULL, 0 };
+
+/*
+ * __clsm_deleted --
+ *	Return non-zero if the item is exactly the tombstone value. Always
+ *	returns zero for cursors flagged WT_CLSM_MINOR_MERGE.
+ */
+static inline int
+__clsm_deleted(WT_CURSOR_LSM *clsm, const WT_ITEM *item)
+{
+	if (F_ISSET(clsm, WT_CLSM_MINOR_MERGE))
+		return (0);
+	if (item->size != __tombstone.size)
+		return (0);
+	return (memcmp(item->data, __tombstone.data, __tombstone.size) == 0);
+}
+
+/*
+ * __clsm_deleted_encode --
+ *	Prepare a value for insertion. A value that begins with the tombstone
+ *	bytes is copied into a scratch buffer (returned through tmpp) with
+ *	one extra tombstone byte appended, and final_value refers to that
+ *	copy; any other value is referenced unchanged.
+ */
+static inline int
+__clsm_deleted_encode(WT_SESSION_IMPL *session,
+    const WT_ITEM *value, WT_ITEM *final_value, WT_ITEM **tmpp)
+{
+	WT_ITEM *scratch;
+	uint8_t *copy;
+
+	/* Common case: no leading tombstone bytes, so no copy is needed. */
+	if (value->size < __tombstone.size ||
+	    memcmp(value->data, __tombstone.data, __tombstone.size) != 0) {
+		final_value->data = value->data;
+		final_value->size = value->size;
+		return (0);
+	}
+
+	/* Copy the value and tack the first tombstone byte onto the end. */
+	WT_RET(__wt_scr_alloc(session, value->size + 1, tmpp));
+	scratch = *tmpp;
+	copy = scratch->mem;
+
+	memcpy(copy, value->data, value->size);
+	memcpy(copy + value->size, __tombstone.data, 1);
+	final_value->data = scratch->mem;
+	final_value->size = value->size + 1;
+	return (0);
+}
+
+/*
+ * __clsm_deleted_decode --
+ *	Drop the trailing byte from a value that is longer than the tombstone
+ *	and begins with the tombstone bytes.
+ */
+static inline void
+__clsm_deleted_decode(WT_ITEM *value)
+{
+	/*
+	 * The size test is deliberately strict: a value that is exactly the
+	 * tombstone is left alone, because a cursor used for a merge and/or
+	 * to create a Bloom filter may validly return the tombstone itself.
+	 */
+	if (value->size <= __tombstone.size)
+		return;
+	if (memcmp(value->data, __tombstone.data, __tombstone.size) != 0)
+		return;
+	--value->size;
+}
+
+/*
+ * __clsm_close_cursors --
+ *	Close the chunk cursors and Bloom filters in the index range
+ *	[start, end), clearing each array entry before closing it. Returns
+ *	at the first close that fails.
+ */
+static int
+__clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end)
+{
+	WT_BLOOM *filter;
+	WT_CURSOR *chunk_cursor;
+	u_int slot;
+
+	if (clsm->nchunks == 0 || clsm->cursors == NULL)
+		return (0);
+
+	/*
+	 * This walk runs forward over a caller-supplied range, so it can't
+	 * use WT_FORALL_CURSORS; counting upwards also avoids wrapping the
+	 * unsigned index.
+	 */
+	for (slot = start; slot < end; slot++) {
+		chunk_cursor = clsm->cursors[slot];
+		if (chunk_cursor != NULL) {
+			clsm->cursors[slot] = NULL;
+			WT_RET(chunk_cursor->close(chunk_cursor));
+		}
+
+		filter = clsm->blooms[slot];
+		if (filter != NULL) {
+			clsm->blooms[slot] = NULL;
+			WT_RET(__wt_bloom_close(filter));
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * __clsm_open_cursors --
+ * Open cursors for the current set of files.
+ *
+ * If update is non-zero the cursor is being opened for an update
+ * operation, otherwise for reading. start_chunk and start_id are only
+ * consulted for cursors flagged WT_CLSM_MERGE, where they identify the
+ * first chunk of the merge in the tree's chunk array. The LSM tree read
+ * lock is taken here and released before returning.
+ */
+static int
+__clsm_open_cursors(
+ WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *c, **cp, *primary;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+ const char *checkpoint, *ckpt_cfg[3];
+ uint64_t saved_gen;
+ u_int i, nchunks, ngood, nupdates;
+ int locked;
+
+ c = &clsm->iface;
+ session = (WT_SESSION_IMPL *)c->session;
+ txn = &session->txn;
+ lsm_tree = clsm->lsm_tree;
+ chunk = NULL;
+
+ /* Configuration used to open a chunk's checkpoint in raw mode. */
+ ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
+ ckpt_cfg[2] = NULL;
+
+ /* Copy the key, so we don't lose the cursor position. */
+ if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
+ WT_RET(__wt_buf_set(
+ session, &c->key, c->key.data, c->key.size));
+
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ if (update) {
+ if (txn->isolation == TXN_ISO_SNAPSHOT)
+ F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
+ } else
+ F_SET(clsm, WT_CLSM_OPEN_READ);
+
+ /* "locked" tracks whether the read lock must be dropped on exit. */
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+
+ /*
+ * If there is no in-memory chunk in the tree for an update operation,
+ * create one.
+ *
+ * !!!
+ * It is exceedingly unlikely that we get here at all, but if we were to
+ * switch chunks in this thread and our transaction roll back, it would
+ * leave the metadata inconsistent. Signal for the LSM worker thread
+ * to create the chunk instead to avoid the issue.
+ */
+ if (update && (lsm_tree->nchunks == 0 ||
+ (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
+ chunk->switch_txn != WT_TXN_NONE)) {
+ /* Release our lock because switch will get a write lock. */
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ /*
+ * Give the worker thread a chance to run before locking the
+ * tree again -- we will loop in __clsm_enter until there is an
+ * in-memory chunk in the tree.
+ */
+ __wt_sleep(0, 1000);
+ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+ }
+
+ /* Merge cursors have already figured out how many chunks they need. */
+retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) {
+ nchunks = clsm->nchunks;
+ ngood = 0;
+
+ /*
+ * We may have raced with another merge completing. Check that
+ * we're starting at the right offset in the chunk array.
+ */
+ if (start_chunk >= lsm_tree->nchunks ||
+ lsm_tree->chunk[start_chunk]->id != start_id) {
+ for (start_chunk = 0;
+ start_chunk < lsm_tree->nchunks;
+ start_chunk++) {
+ chunk = lsm_tree->chunk[start_chunk];
+ if (chunk->id == start_id)
+ break;
+ }
+ /* We have to find the start chunk: merge locked it. */
+ WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
+ }
+
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+ } else {
+ nchunks = lsm_tree->nchunks;
+
+ /*
+ * If we are only opening the cursor for updates, only open the
+ * primary chunk, plus any other chunks that might be required
+ * to detect snapshot isolation conflicts.
+ */
+ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
+ WT_ERR(__wt_realloc_def(session,
+ &clsm->txnid_alloc, nchunks,
+ &clsm->switch_txn));
+ if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
+ ngood = nupdates = 0;
+ else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
+ /*
+ * Keep going until all updates in the next
+ * chunk are globally visible. Copy the maximum
+ * transaction IDs into the cursor as we go.
+ */
+ for (ngood = nchunks - 1, nupdates = 1;
+ ngood > 0;
+ ngood--, nupdates++) {
+ chunk = lsm_tree->chunk[ngood - 1];
+ clsm->switch_txn[ngood - 1] = chunk->switch_txn;
+ if (__wt_txn_visible_all(
+ session, chunk->switch_txn))
+ break;
+ }
+ } else {
+ nupdates = 1;
+ ngood = nchunks - 1;
+ }
+
+ /* Check how many cursors are already open. */
+ for (cp = clsm->cursors + ngood;
+ ngood < clsm->nchunks && ngood < nchunks;
+ cp++, ngood++) {
+ chunk = lsm_tree->chunk[ngood];
+
+ /* If the cursor isn't open yet, we're done. */
+ if (*cp == NULL)
+ break;
+
+ /* Easy case: the URIs don't match. */
+ if (strcmp((*cp)->uri, chunk->uri) != 0)
+ break;
+
+ /* Make sure the checkpoint config matches. */
+ checkpoint = ((WT_CURSOR_BTREE *)*cp)->
+ btree->dhandle->checkpoint;
+ if (checkpoint == NULL &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !chunk->empty)
+ break;
+
+ /* Make sure the Bloom config matches. */
+ if (clsm->blooms[ngood] == NULL &&
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ break;
+ }
+
+ /* Spurious generation bump? */
+ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
+ clsm->dsk_gen = lsm_tree->dsk_gen;
+ goto err;
+ }
+
+ /*
+ * Close any cursors we no longer need. If the cursor is a
+ * pure update cursor, close everything -- we usually only need
+ * a single chunk open in that case and we haven't walked all
+ * of the other slots in the loop above.
+ *
+ * Drop the LSM tree lock while we do this: if the cache is
+ * full, we may block while closing a cursor. Save the
+ * generation number and retry if it has changed under us.
+ */
+ if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0)
+ ngood = 0;
+ if (clsm->cursors != NULL && ngood < clsm->nchunks) {
+ saved_gen = lsm_tree->dsk_gen;
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
+ WT_ERR(__clsm_close_cursors(
+ clsm, ngood, clsm->nchunks));
+ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+ if (lsm_tree->dsk_gen != saved_gen)
+ goto retry;
+ }
+
+ /* Detach from our old primary. */
+ clsm->primary_chunk = NULL;
+ clsm->current = NULL;
+ }
+
+ /* Size the Bloom filter and cursor arrays for nchunks entries. */
+ WT_ERR(__wt_realloc_def(session,
+ &clsm->bloom_alloc, nchunks, &clsm->blooms));
+ WT_ERR(__wt_realloc_def(session,
+ &clsm->cursor_alloc, nchunks, &clsm->cursors));
+
+ clsm->nchunks = nchunks;
+
+ /* Open the cursors for chunks that have changed. */
+ for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
+ chunk = lsm_tree->chunk[i + start_chunk];
+ /* Copy the maximum transaction ID. */
+ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
+ clsm->switch_txn[i] = chunk->switch_txn;
+
+ /*
+ * Read from the checkpoint if the file has been written.
+ * Once all cursors switch, the in-memory tree can be evicted.
+ */
+ WT_ASSERT(session, *cp == NULL);
+ ret = __wt_open_cursor(session, chunk->uri, c,
+ (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
+ ckpt_cfg : NULL, cp);
+
+ /*
+ * XXX kludge: we may have an empty chunk where no checkpoint
+ * was written. If so, try to open the ordinary handle on that
+ * chunk instead.
+ */
+ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ ret = __wt_open_cursor(
+ session, chunk->uri, c, NULL, cp);
+ if (ret == 0)
+ chunk->empty = 1;
+ }
+ WT_ERR(ret);
+
+ /*
+ * Setup all cursors other than the primary to only do conflict
+ * checks on insert operations. This allows us to execute
+ * inserts on non-primary chunks as a way of checking for
+ * write conflicts with concurrent updates.
+ */
+ if (i != nchunks - 1)
+ (*cp)->insert = __wt_curfile_update_check;
+
+ if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
+ lsm_tree->bloom_bit_count,
+ lsm_tree->bloom_hash_count,
+ c, &clsm->blooms[i]));
+
+ /* Child cursors always use overwrite and raw mode. */
+ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
+ }
+
+ /* The last chunk is our new primary. */
+ if (chunk != NULL &&
+ !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ chunk->switch_txn == WT_TXN_NONE) {
+ clsm->primary_chunk = chunk;
+ primary = clsm->cursors[clsm->nchunks - 1];
+ /*
+ * Disable eviction for the in-memory chunk. Also clear the
+ * bulk load flag here, otherwise eviction will be enabled by
+ * the first update.
+ */
+ btree = ((WT_CURSOR_BTREE *)(primary))->btree;
+ if (btree->bulk_load_ok) {
+ btree->bulk_load_ok = 0;
+ WT_WITH_BTREE(session, btree,
+ __wt_btree_evictable(session, 0));
+ }
+ }
+
+ /* Record the tree generation the cursor is now in step with. */
+ clsm->dsk_gen = lsm_tree->dsk_gen;
+
+ /* Reached on success as well as on error. */
+err:
+#ifdef HAVE_DIAGNOSTIC
+ /* Check that all cursors are open as expected. */
+ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
+ for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
+ chunk = lsm_tree->chunk[i + start_chunk];
+
+ /* Make sure the cursor is open. */
+ WT_ASSERT(session, *cp != NULL);
+
+ /* Easy case: the URIs should match. */
+ WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);
+
+ /* Make sure the checkpoint config matches. */
+ checkpoint = ((WT_CURSOR_BTREE *)*cp)->
+ btree->dhandle->checkpoint;
+ WT_ASSERT(session,
+ (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !chunk->empty) ?
+ checkpoint != NULL : checkpoint == NULL);
+
+ /* Make sure the Bloom config matches. */
+ WT_ASSERT(session,
+ (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
+ !F_ISSET(clsm, WT_CLSM_MERGE)) ?
+ clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
+ }
+ }
+#endif
+ if (locked)
+ WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_clsm_init_merge --
+ *	Prepare an LSM cursor for use by a merge: record the number of chunks,
+ *	flag the cursor with WT_CLSM_MERGE (plus WT_CLSM_MINOR_MERGE when
+ *	start_chunk is non-zero), then open the chunk cursors while holding
+ *	the schema lock. Returns whatever __clsm_open_cursors returns.
+ */
+int
+__wt_clsm_init_merge(
+    WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks)
+{
+	WT_CURSOR_LSM *merge_clsm;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	session = (WT_SESSION_IMPL *)cursor->session;
+	merge_clsm = (WT_CURSOR_LSM *)cursor;
+
+	merge_clsm->nchunks = nchunks;
+
+	/* Every merge cursor is flagged; only offset ranges are "minor". */
+	F_SET(merge_clsm, WT_CLSM_MERGE);
+	if (start_chunk > 0)
+		F_SET(merge_clsm, WT_CLSM_MINOR_MERGE);
+
+	WT_WITH_SCHEMA_LOCK(session,
+	    ret = __clsm_open_cursors(merge_clsm, 0, start_chunk, start_id));
+	return (ret);
+}
+
+/*
+ * __clsm_get_current --
+ * Find the smallest / largest of the cursors and copy its key/value.
+ *
+ * Only chunk cursors with WT_CURSTD_KEY_INT set take part. When "smallest"
+ * is non-zero the cursor with the smallest key wins, otherwise the one
+ * with the largest key. The winner is stored in clsm->current, and
+ * WT_CLSM_MULTIPLE is set when another cursor compared equal to it and
+ * cleared otherwise.
+ *
+ * Returns WT_NOTFOUND, without writing *deletedp, when no chunk cursor is
+ * positioned. Otherwise *deletedp receives the result of __clsm_deleted()
+ * on the copied value, and the LSM cursor's key/value are only flagged as
+ * set when that result is zero.
+ */
+static int
+__clsm_get_current(
+ WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, int smallest, int *deletedp)
+{
+ WT_CURSOR *c, *current;
+ int cmp, multiple;
+ u_int i;
+
+ current = NULL;
+ multiple = 0;
+
+ WT_FORALL_CURSORS(clsm, c, i) {
+ /* Unpositioned chunk cursors cannot be the current record. */
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ /* The first positioned cursor is the initial candidate. */
+ if (current == NULL) {
+ current = c;
+ continue;
+ }
+ WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, current, cmp));
+ /* A strictly better key replaces the candidate; a tie is noted. */
+ if (smallest ? cmp < 0 : cmp > 0) {
+ current = c;
+ multiple = 0;
+ } else if (cmp == 0)
+ multiple = 1;
+ }
+
+ /* From here on "c" is the LSM cursor itself, not a chunk cursor. */
+ c = &clsm->iface;
+ if ((clsm->current = current) == NULL) {
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ return (WT_NOTFOUND);
+ }
+
+ if (multiple)
+ F_SET(clsm, WT_CLSM_MULTIPLE);
+ else
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+
+ /* Copy the winning chunk cursor's key and value into the LSM cursor. */
+ WT_RET(current->get_key(current, &c->key));
+ WT_RET(current->get_value(current, &c->value));
+
+ /* Only expose the key/value to the caller if it isn't deleted. */
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((*deletedp = __clsm_deleted(clsm, &c->value)) == 0)
+ F_SET(c, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+ return (0);
+}
+
+/*
+ * __clsm_compare --
+ * WT_CURSOR->compare implementation for the LSM cursor type.
+ *
+ * Fails with EINVAL when the two cursors' URIs differ. Otherwise both
+ * cursors must have a key, and the keys are compared with the LSM tree's
+ * collator taken from cursor "a"; the result is stored in *cmpp.
+ */
+static int
+__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR_LSM *alsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */
+ alsm = (WT_CURSOR_LSM *)a;
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * compare the keys.
+ */
+ if (strcmp(a->uri, b->uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "comparison method cursors must reference the same object");
+
+ WT_CURSOR_NEEDKEY(a);
+ WT_CURSOR_NEEDKEY(b);
+
+ /* Only a's tree is consulted: the URI check above ties b to it. */
+ WT_ERR(__wt_compare(
+ session, alsm->lsm_tree->collator, &a->key, &b->key, cmpp));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __clsm_next --
+ * WT_CURSOR->next method for the LSM cursor type.
+ *
+ * If the cursor is not already set up for a forward scan, every chunk
+ * cursor is first positioned: at its first record when the LSM cursor
+ * has no key, otherwise at or after the LSM cursor's key. The chunk
+ * cursor(s) on the current key are then advanced, and the smallest
+ * remaining key becomes the new position. Records reported as deleted
+ * by __clsm_get_current are skipped by looping back to "retry".
+ */
+static int
+__clsm_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int check, cmp, deleted;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 0));
+
+ /* If we aren't positioned for a forward scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) {
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ /* No starting key: begin at the chunk's first record. */
+ WT_ERR(c->reset(c));
+ ret = c->next(c);
+ } else if (c != clsm->current) {
+ /*
+ * Position this chunk at or after the LSM cursor's
+ * key. The current cursor, if any, is skipped
+ * because it is already on that key.
+ */
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == 0) {
+ if (cmp < 0)
+ ret = c->next(c);
+ else if (cmp == 0) {
+ /* Exact match: adopt or note a duplicate. */
+ if (clsm->current == NULL)
+ clsm->current = c;
+ else
+ F_SET(clsm,
+ WT_CLSM_MULTIPLE);
+ }
+ } else
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ }
+ /* An empty or exhausted chunk is not an error. */
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_NEXT);
+ F_CLR(clsm, WT_CLSM_ITERATE_PREV);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
+retry: /*
+ * If there are multiple cursors on that key, move them
+ * forward.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ /*
+ * "check" becomes true once the current cursor has been
+ * seen, so only cursors visited after it are compared
+ * and advanced here.
+ */
+ check = 0;
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (check) {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, clsm->current,
+ cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+ if (c == clsm->current)
+ check = 1;
+ }
+ }
+
+ /* Move the smallest cursor forward. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+
+ /* Find the cursor(s) with the smallest key. */
+ if ((ret = __clsm_get_current(session, clsm, 1, &deleted)) == 0 &&
+ deleted)
+ goto retry;
+
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ /* On success, post-process the returned value for the caller. */
+ if (ret == 0)
+ __clsm_deleted_decode(&cursor->value);
+ return (ret);
+}
+
+/*
+ * __clsm_prev --
+ * WT_CURSOR->prev method for the LSM cursor type.
+ *
+ * Mirror image of the forward scan: if the cursor is not already set up
+ * for a reverse scan, every chunk cursor is positioned at its last record
+ * (no key set) or at or before the LSM cursor's key. The chunk cursor(s)
+ * on the current key are then moved backwards and the largest remaining
+ * key becomes the new position. Records reported as deleted by
+ * __clsm_get_current are skipped by looping back to "retry".
+ */
+static int
+__clsm_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int check, cmp, deleted;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 0));
+
+ /* If we aren't positioned for a reverse scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_PREV)) {
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ /* No starting key: begin at the chunk's last record. */
+ WT_ERR(c->reset(c));
+ ret = c->prev(c);
+ } else if (c != clsm->current) {
+ /*
+ * Position this chunk at or before the LSM cursor's
+ * key.
+ *
+ * NOTE(review): unlike __clsm_next, a failed
+ * search_near does not clear WT_CURSTD_KEY_SET on
+ * the chunk cursor here -- confirm the asymmetry is
+ * intended.
+ */
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == 0) {
+ if (cmp > 0)
+ ret = c->prev(c);
+ else if (cmp == 0) {
+ /* Exact match: adopt or note a duplicate. */
+ if (clsm->current == NULL)
+ clsm->current = c;
+ else
+ F_SET(clsm,
+ WT_CLSM_MULTIPLE);
+ }
+ }
+ }
+ /* An empty or exhausted chunk is not an error. */
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_PREV);
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
+retry: /*
+ * If there are multiple cursors on that key, move them
+ * backwards.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ /*
+ * "check" becomes true once the current cursor has been
+ * seen, so only cursors visited after it are compared
+ * and moved here.
+ */
+ check = 0;
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (check) {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, clsm->current,
+ cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+ if (c == clsm->current)
+ check = 1;
+ }
+ }
+
+ /* Move the current (largest) cursor backwards. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+
+ /* Find the cursor(s) with the largest key. */
+ if ((ret = __clsm_get_current(session, clsm, 0, &deleted)) == 0 &&
+ deleted)
+ goto retry;
+
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ /* On success, post-process the returned value for the caller. */
+ if (ret == 0)
+ __clsm_deleted_decode(&cursor->value);
+ return (ret);
+}
+
+/*
+ * __clsm_reset_cursors --
+ *	Give up the position held by the chunk cursors.
+ *
+ *	A non-NULL "skip" names a chunk cursor the caller is about to use; it
+ *	is left alone. Reset errors are accumulated and the first one is
+ *	returned, but every positioned cursor is still visited.
+ */
+static int
+__clsm_reset_cursors(WT_CURSOR_LSM *clsm, WT_CURSOR *skip)
+{
+	WT_CURSOR *chunk_cursor;
+	WT_DECL_RET;
+	u_int slot;
+	int iterating, positioned;
+
+	/*
+	 * Nothing to do unless we hold a position on some cursor other than
+	 * "skip", or an iteration is in progress.
+	 */
+	positioned = clsm->current != NULL && clsm->current != skip;
+	iterating =
+	    F_ISSET(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV) ? 1 : 0;
+	if (!positioned && !iterating)
+		return (0);
+
+	WT_FORALL_CURSORS(clsm, chunk_cursor, slot) {
+		if (chunk_cursor != skip &&
+		    F_ISSET(chunk_cursor, WT_CURSTD_KEY_INT)) {
+			WT_TRET(chunk_cursor->reset(chunk_cursor));
+		}
+	}
+
+	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+	clsm->current = NULL;
+
+	return (ret);
+}
+
+/*
+ * __clsm_reset --
+ * WT_CURSOR->reset method for the LSM cursor type.
+ *
+ * Clears the cursor's key/value "set" flags, resets every positioned
+ * chunk cursor and then calls __clsm_leave. Errors from the two helpers
+ * are accumulated rather than short-circuiting, so both always run.
+ */
+static int
+__clsm_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * Don't use the normal __clsm_enter path: that is wasted work when all
+ * we want to do is give up our position.
+ */
+ clsm = (WT_CURSOR_LSM *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ /* NULL skip: no chunk cursor is exempt from the reset. */
+ WT_TRET(__clsm_reset_cursors(clsm, NULL));
+
+ /* In case we were left positioned, clear that. */
+ WT_TRET(__clsm_leave(clsm));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __clsm_lookup --
+ * Position an LSM cursor.
+ *
+ * Searches the chunk cursors for the LSM cursor's key, consulting a
+ * chunk's Bloom filter first when one is open. The search stops at the
+ * first chunk that contains the key; if the value found there is a
+ * tombstone the result is WT_NOTFOUND. The value is copied into "value",
+ * which may or may not be the cursor's own value buffer.
+ *
+ * On success clsm->current is the chunk cursor that matched and the LSM
+ * cursor's key is flagged as set; on failure the last chunk cursor
+ * touched (if any) is reset.
+ */
+static int
+__clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value)
+{
+ WT_BLOOM *bloom;
+ WT_BLOOM_HASH bhash;
+ WT_CURSOR *c, *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int have_hash;
+
+ c = NULL;
+ cursor = &clsm->iface;
+ have_hash = 0;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_FORALL_CURSORS(clsm, c, i) {
+ /* If there is a Bloom filter, see if we can skip the read. */
+ bloom = NULL;
+ if ((bloom = clsm->blooms[i]) != NULL) {
+ /* Hash the key once and reuse it for every filter. */
+ if (!have_hash) {
+ WT_ERR(__wt_bloom_hash(
+ bloom, &cursor->key, &bhash));
+ have_hash = 1;
+ }
+
+ ret = __wt_bloom_hash_get(bloom, &bhash);
+ if (ret == WT_NOTFOUND) {
+ /* The filter rules this chunk out: skip the search. */
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, bloom_miss);
+ continue;
+ } else if (ret == 0)
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, bloom_hit);
+ WT_ERR(ret);
+ }
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search(c)) == 0) {
+ WT_ERR(c->get_key(c, &cursor->key));
+ WT_ERR(c->get_value(c, value));
+ /* A tombstone ends the search: the key was removed. */
+ if (__clsm_deleted(clsm, value))
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ /*
+ * Update stats: the active chunk can't have a bloom filter.
+ *
+ * NOTE(review): if WT_FORALL_CURSORS only yields indexes below
+ * clsm->nchunks, "i != clsm->nchunks" is always true -- confirm
+ * whether "clsm->nchunks - 1" was intended.
+ */
+ if (bloom != NULL)
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, bloom_false_positive);
+ else if (clsm->primary_chunk == NULL || i != clsm->nchunks)
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, lsm_lookup_no_bloom);
+ }
+ /* No chunk had the key. */
+ WT_ERR(WT_NOTFOUND);
+
+done:
+err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if (ret == 0) {
+ clsm->current = c;
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+ /* Only flag the value when it went into the cursor's own buffer. */
+ if (value == &cursor->value)
+ F_SET(cursor, WT_CURSTD_VALUE_INT);
+ } else if (c != NULL)
+ WT_TRET(c->reset(c));
+
+ return (ret);
+}
+
+/*
+ * __clsm_search --
+ * WT_CURSOR->search method for the LSM cursor type.
+ *
+ * Requires a key, then delegates to __clsm_lookup with the cursor's own
+ * value buffer as the destination. On success the value is passed
+ * through __clsm_deleted_decode before returning.
+ */
+static int
+__clsm_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, search, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 1, 0));
+
+ ret = __clsm_lookup(clsm, &cursor->value);
+
+ /* __clsm_leave runs on both the success and the error path. */
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0)
+ __clsm_deleted_decode(&cursor->value);
+ return (ret);
+}
+
+/*
+ * __clsm_search_near --
+ * WT_CURSOR->search_near method for the LSM cursor type.
+ *
+ * On success *exactp is 0 for an exact match, 1 when the cursor is
+ * positioned on the closest larger key and -1 when it is positioned on
+ * the closest smaller key. Returns WT_NOTFOUND when no chunk yields a
+ * record that is not deleted.
+ */
+static int
+__clsm_search_near(WT_CURSOR *cursor, int *exactp)
+{
+ WT_CURSOR *c, *larger, *smaller;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_ITEM v;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int cmp, deleted;
+
+ larger = smaller = NULL;
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 1, 0));
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ /*
+ * search_near is somewhat fiddly: we can't just use a nearby key from
+ * the in-memory chunk because there could be a closer key on disk.
+ *
+ * As we search down the chunks, we stop as soon as we find an exact
+ * match. Otherwise, we maintain the smallest cursor larger than the
+ * search key and the largest cursor smaller than the search key. At
+ * the bottom, we prefer the larger cursor, but if no record is larger,
+ * use the smaller cursor, or if no record at all was found,
+ * WT_NOTFOUND.
+ */
+ WT_FORALL_CURSORS(clsm, c, i) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) {
+ /* Nothing in this chunk: move on to the next one. */
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ ret = 0;
+ continue;
+ } else if (ret != 0)
+ goto err;
+
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(clsm, &v);
+
+ /* An exact, live match ends the search immediately. */
+ if (cmp == 0 && !deleted) {
+ clsm->current = c;
+ *exactp = 0;
+ goto done;
+ }
+
+ /*
+ * Prefer larger cursors. There are two reasons: (1) we expect
+ * prefix searches to be a common case (as in our own indices);
+ * and (2) we need a way to unambiguously know we have the
+ * "closest" result.
+ *
+ * NOTE(review): "deleted" was computed for the record
+ * search_near landed on and is not recomputed after the move
+ * below -- confirm this is intended.
+ */
+ if (cmp < 0) {
+ if ((ret = c->next(c)) == 0)
+ cmp = 1;
+ else if (ret == WT_NOTFOUND)
+ ret = c->prev(c);
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * If we land on a deleted item, try going forwards or
+ * backwards to find one that isn't deleted.
+ */
+ while (deleted && (ret = c->next(c)) == 0) {
+ cmp = 1;
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(clsm, &v);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ while (deleted && (ret = c->prev(c)) == 0) {
+ cmp = -1;
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(clsm, &v);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ /* Every record reachable in this chunk was deleted. */
+ if (deleted)
+ continue;
+
+ /*
+ * We are trying to find the smallest cursor greater than the
+ * search key, or, if there is no larger key, the largest
+ * cursor smaller than the search key.
+ *
+ * It could happen that one cursor contains both of the closest
+ * records. In that case, we will track it in "larger", and it
+ * will be the one we finally choose.
+ */
+ if (cmp > 0) {
+ if (larger == NULL)
+ larger = c;
+ else {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, larger, cmp));
+ if (cmp < 0) {
+ /* Closer than the previous candidate: swap. */
+ WT_ERR(larger->reset(larger));
+ larger = c;
+ }
+ }
+ } else {
+ if (smaller == NULL)
+ smaller = c;
+ else {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, smaller, cmp));
+ if (cmp > 0) {
+ /* Closer than the previous candidate: swap. */
+ WT_ERR(smaller->reset(smaller));
+ smaller = c;
+ }
+ }
+ }
+
+ /* Release cursors that did not become a candidate. */
+ if (c != smaller && c != larger)
+ WT_ERR(c->reset(c));
+ }
+
+ /*
+ * The chosen candidate is cleared from its local variable so the
+ * cleanup below does not reset the cursor we are returning.
+ */
+ if (larger != NULL) {
+ clsm->current = larger;
+ larger = NULL;
+ *exactp = 1;
+ } else if (smaller != NULL) {
+ clsm->current = smaller;
+ smaller = NULL;
+ *exactp = -1;
+ } else
+ ret = WT_NOTFOUND;
+
+done:
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0) {
+ c = clsm->current;
+ WT_TRET(c->get_key(c, &cursor->key));
+ WT_TRET(c->get_value(c, &cursor->value));
+ }
+ /* Reset whichever candidate was not chosen. */
+ if (smaller != NULL)
+ WT_TRET(smaller->reset(smaller));
+ if (larger != NULL)
+ WT_TRET(larger->reset(larger));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if (ret == 0) {
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ __clsm_deleted_decode(&cursor->value);
+ } else
+ clsm->current = NULL;
+
+ return (ret);
+}
+
+/*
+ * __clsm_put --
+ * Put an entry into the in-memory tree, trigger a file switch if
+ * necessary.
+ *
+ * The key/value pair is written through the last clsm->nupdates chunk
+ * cursors, starting with the primary (the last cursor in the array).
+ * When "position" is non-zero the primary is written with update and
+ * becomes clsm->current; every other write goes through insert.
+ * Afterwards the record counters are bumped and, if throttling is
+ * configured, the caller may be put to sleep.
+ */
+static inline int
+__clsm_put(WT_SESSION_IMPL *session,
+ WT_CURSOR_LSM *clsm, const WT_ITEM *key, const WT_ITEM *value, int position)
+{
+ WT_CURSOR *c, *primary;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+
+ lsm_tree = clsm->lsm_tree;
+
+ /*
+ * There must be a primary chunk, and either it has not been switched
+ * out or this transaction's ID is not after the switch transaction.
+ */
+ WT_ASSERT(session,
+ clsm->primary_chunk != NULL &&
+ (clsm->primary_chunk->switch_txn == WT_TXN_NONE ||
+ TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn)));
+
+ /*
+ * Clear the existing cursor position. Don't clear the primary cursor:
+ * we're about to use it anyway.
+ */
+ primary = clsm->cursors[clsm->nchunks - 1];
+ WT_RET(__clsm_reset_cursors(clsm, primary));
+
+ /* If necessary, set the position for future scans. */
+ if (position)
+ clsm->current = primary;
+
+ /*
+ * Walk backwards from the primary. Non-primary chunk cursors have
+ * their insert method replaced with a conflict check when they are
+ * opened in __clsm_open_cursors.
+ */
+ for (i = 0; i < clsm->nupdates; i++) {
+ c = clsm->cursors[(clsm->nchunks - i) - 1];
+ c->set_key(c, key);
+ c->set_value(c, value);
+ WT_RET((position && i == 0) ? c->update(c) : c->insert(c));
+ }
+
+ /*
+ * Update the record count. It is in a shared structure, but it's only
+ * approximate, so don't worry about protecting access.
+ *
+ * Throttle if necessary. Every 100 update operations on each cursor,
+ * check if throttling is required. Don't rely only on the shared
+ * counter because it can race, and because for some workloads, there
+ * may not be enough records per chunk to get effective throttling.
+ *
+ * Note the short-circuit: the per-cursor counter is only incremented
+ * when the shared counter is not a multiple of 100.
+ */
+ if ((++clsm->primary_chunk->count % 100 == 0 ||
+ ++clsm->update_count >= 100) &&
+ lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
+ clsm->update_count = 0;
+ WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+ lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+ WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+ lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
+ /* Sleep for the combined checkpoint and merge throttle. */
+ __wt_sleep(0,
+ lsm_tree->ckpt_throttle + lsm_tree->merge_throttle);
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_insert --
+ * WT_CURSOR->insert method for the LSM cursor type.
+ *
+ * Unless the cursor is configured for overwrite, the key is looked up
+ * first and an existing record results in WT_DUPLICATE_KEY. The value is
+ * then passed through __clsm_deleted_encode and written with
+ * __clsm_put, without positioning the cursor.
+ */
+static int
+__clsm_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 1));
+
+ /*
+ * Without overwrite, only WT_NOTFOUND from the lookup lets the insert
+ * proceed: success means a duplicate, anything else is a real error.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (ret = __clsm_lookup(clsm, &value)) != WT_NOTFOUND) {
+ if (ret == 0)
+ ret = WT_DUPLICATE_KEY;
+ goto err;
+ }
+
+ /* "value" is reused here as the encoded form of the user's value. */
+ WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf));
+ ret = __clsm_put(session, clsm, &cursor->key, &value, 0);
+
+err: __wt_scr_free(&buf);
+ WT_TRET(__clsm_leave(clsm));
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __clsm_update --
+ * WT_CURSOR->update method for the LSM cursor type.
+ *
+ * With overwrite configured the record is written unconditionally;
+ * otherwise the key must already exist, and the lookup's error (for
+ * example WT_NOTFOUND) is returned if it does not. The write positions
+ * the cursor on the primary chunk.
+ */
+static int
+__clsm_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 1));
+
+ /* The lookup is skipped entirely when overwrite is set. */
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
+ (ret = __clsm_lookup(clsm, &value)) == 0) {
+ /* "value" is reused here as the encoded form of the new value. */
+ WT_ERR(__clsm_deleted_encode(
+ session, &cursor->value, &value, &buf));
+ ret = __clsm_put(session, clsm, &cursor->key, &value, 1);
+ }
+
+err: __wt_scr_free(&buf);
+ WT_TRET(__clsm_leave(clsm));
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __clsm_remove --
+ * WT_CURSOR->remove method for the LSM cursor type.
+ *
+ * A remove is implemented as a write of the tombstone value. With
+ * overwrite configured the tombstone is written unconditionally;
+ * otherwise the key must be found first, and the lookup's error is
+ * returned if it is not.
+ */
+static int
+__clsm_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 1));
+
+ /* "value" only receives the lookup result; it is not used after. */
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
+ (ret = __clsm_lookup(clsm, &value)) == 0)
+ ret = __clsm_put(session, clsm, &cursor->key, &__tombstone, 1);
+
+err: WT_TRET(__clsm_leave(clsm));
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __clsm_close --
+ * WT_CURSOR->close method for the LSM cursor type.
+ *
+ * Closes every chunk cursor, frees the per-cursor arrays, releases the
+ * LSM tree reference (if one is held) and finally closes the cursor
+ * itself. Errors are accumulated so that every step still runs.
+ */
+static int
+__clsm_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * Don't use the normal __clsm_enter path: that is wasted work when
+ * closing, and the cursor may never have been used.
+ */
+ clsm = (WT_CURSOR_LSM *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+ WT_TRET(__clsm_close_cursors(clsm, 0, clsm->nchunks));
+ __wt_free(session, clsm->blooms);
+ __wt_free(session, clsm->cursors);
+ __wt_free(session, clsm->switch_txn);
+
+ /* In case we were somehow left positioned, clear that. */
+ WT_TRET(__clsm_leave(clsm));
+
+ /* The WT_LSM_TREE owns the URI. */
+ cursor->uri = NULL;
+ /* lsm_tree is NULL when the open path already released the tree. */
+ if (clsm->lsm_tree != NULL)
+ __wt_lsm_tree_release(session, clsm->lsm_tree);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_clsm_open --
+ * WT_SESSION->open_cursor method for LSM cursors.
+ *
+ * Returns EINVAL when the URI does not start with "lsm:" or when a
+ * checkpoint is named in the configuration. Otherwise the LSM tree is
+ * looked up under the schema lock, a WT_CURSOR_LSM is allocated and
+ * initialized from the tree, and the new cursor is returned through
+ * cursorp. No chunk cursors are opened here.
+ */
+int
+__wt_clsm_open(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __clsm_compare, /* compare */
+ __clsm_next, /* next */
+ __clsm_prev, /* prev */
+ __clsm_reset, /* reset */
+ __clsm_search, /* search */
+ __clsm_search_near, /* search-near */
+ __clsm_insert, /* insert */
+ __clsm_update, /* update */
+ __clsm_remove, /* remove */
+ __clsm_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ clsm = NULL;
+ cursor = NULL;
+
+ if (!WT_PREFIX_MATCH(uri, "lsm:"))
+ return (EINVAL);
+
+ WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0)
+ WT_RET_MSG(session, EINVAL,
+ "LSM does not support opening by checkpoint");
+
+ /* Get the LSM tree. */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree));
+ WT_RET(ret);
+
+ /* From here on, failures must go through "err" to release the tree. */
+ WT_ERR(__wt_calloc_def(session, 1, &clsm));
+
+ /* Start from the static method table, then fill in per-cursor data. */
+ cursor = &clsm->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->uri = lsm_tree->name;
+ cursor->key_format = lsm_tree->key_format;
+ cursor->value_format = lsm_tree->value_format;
+
+ WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 0));
+
+ clsm->lsm_tree = lsm_tree;
+
+ /*
+ * The tree's dsk_gen starts at one, so starting the cursor on zero
+ * will force a call into open_cursors on the first operation.
+ */
+ clsm->dsk_gen = 0;
+
+ /* The casts between WT_CURSOR and WT_CURSOR_LSM rely on this layout. */
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0);
+ WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));
+
+ /*
+ * Error-only cleanup: the block below is reached solely via "goto err".
+ * The tree is released here, so the cursor's reference is cleared
+ * before __clsm_close to keep the close from releasing it again.
+ */
+ if (0) {
+err: __wt_lsm_tree_release(session, lsm_tree);
+ if (clsm != NULL) {
+ clsm->lsm_tree = NULL;
+ WT_TRET(__clsm_close(cursor));
+ }
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
new file mode 100644
index 00000000000..8f4b3ba49ef
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -0,0 +1,667 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_manager_aggressive_update(WT_SESSION_IMPL *, WT_LSM_TREE *);
+static int __lsm_manager_run_server(WT_SESSION_IMPL *);
+static int __lsm_manager_worker_setup(WT_SESSION_IMPL *);
+
+static void * __lsm_worker_manager(void *);
+
+/*
+ * __wt_lsm_manager_config --
+ *	Apply the "lsm_manager" configuration to the connection: a non-zero
+ *	"lsm_manager.merge" sets WT_CONN_LSM_MERGE, and a non-zero
+ *	"lsm_manager.worker_thread_max" replaces the maximum worker count.
+ *	Zero values leave the existing settings untouched.
+ */
+int
+__wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg)
+{
+	WT_CONFIG_ITEM merge_cfg, workers_cfg;
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+
+	WT_RET(__wt_config_gets(
+	    session, cfg, "lsm_manager.merge", &merge_cfg));
+	if (merge_cfg.val != 0)
+		F_SET(conn, WT_CONN_LSM_MERGE);
+
+	WT_RET(__wt_config_gets(
+	    session, cfg, "lsm_manager.worker_thread_max", &workers_cfg));
+	if (workers_cfg.val != 0)
+		conn->lsm_manager.lsm_workers_max =
+		    (uint32_t)workers_cfg.val;
+
+	return (0);
+}
+
+/*
+ * __lsm_general_worker_start --
+ *	Launch general-purpose LSM worker threads until the number of running
+ *	workers reaches the configured maximum. The worker count is only
+ *	advanced after a thread has been started successfully.
+ */
+static int
+__lsm_general_worker_start(WT_SESSION_IMPL *session)
+{
+	WT_LSM_MANAGER *manager;
+	WT_LSM_WORKER_ARGS *cookie;
+
+	manager = &S2C(session)->lsm_manager;
+
+	/*
+	 * Start the remaining worker threads.
+	 * This should get more sophisticated in the future - only launching
+	 * as many worker threads as are required to keep up with demand.
+	 */
+	WT_ASSERT(session, manager->lsm_workers > 1);
+	while (manager->lsm_workers < manager->lsm_workers_max) {
+		cookie = &manager->lsm_worker_cookies[manager->lsm_workers];
+		cookie->work_cond = manager->work_cond;
+		cookie->id = manager->lsm_workers;
+		cookie->type =
+		    WT_LSM_WORK_BLOOM |
+		    WT_LSM_WORK_DROP |
+		    WT_LSM_WORK_FLUSH |
+		    WT_LSM_WORK_SWITCH;
+		F_SET(cookie, WT_LSM_WORKER_RUN);
+
+		/*
+		 * Only allow half of the threads to run merges to avoid
+		 * all workers getting stuck in long-running merge operations.
+		 * Make sure the first worker is allowed, so that there is at
+		 * least one thread capable of running merges. We know the
+		 * first worker is id 2, so set merges on even numbered workers.
+		 */
+		if (manager->lsm_workers % 2 == 0)
+			FLD_SET(cookie->type, WT_LSM_WORK_MERGE);
+
+		WT_RET(__wt_lsm_worker_start(session, cookie));
+		manager->lsm_workers++;
+	}
+	return (0);
+}
+
+/*
+ * __lsm_stop_workers --
+ *	Stop worker threads until the number reaches the configured amount.
+ *
+ *	Workers are stopped from the end of the cookie array so that the
+ *	active threads stay packed at the front. Returns the first error from
+ *	joining a thread; in that case the worker count still includes the
+ *	thread that failed to join.
+ */
+static int
+__lsm_stop_workers(WT_SESSION_IMPL *session)
+{
+	WT_LSM_MANAGER *manager;
+	WT_LSM_WORKER_ARGS *worker_args;
+
+	manager = &S2C(session)->lsm_manager;
+
+	WT_ASSERT(session, manager->lsm_workers != 0);
+
+	/*
+	 * Loop on the worker count itself rather than on a descending unsigned
+	 * index: "i >= lsm_workers_max" can never become false for an unsigned
+	 * index when the maximum is zero, which would wrap the index around
+	 * and walk off the front of the array.
+	 */
+	while (manager->lsm_workers > manager->lsm_workers_max) {
+		worker_args =
+		    &manager->lsm_worker_cookies[manager->lsm_workers - 1];
+		/*
+		 * Clear this worker's flag so it stops.
+		 */
+		F_CLR(worker_args, WT_LSM_WORKER_RUN);
+		WT_ASSERT(session, worker_args->tid != 0);
+		WT_RET(__wt_thread_join(session, worker_args->tid));
+		worker_args->tid = 0;
+		worker_args->type = 0;
+		worker_args->flags = 0;
+		manager->lsm_workers--;
+		/*
+		 * We do not clear the session because they are allocated
+		 * statically when the connection was opened.
+		 */
+	}
+	return (0);
+}
+
+/*
+ * __wt_lsm_manager_reconfig --
+ *	Re-read the LSM manager configuration and, if the manager is already
+ *	running, grow or shrink the pool of worker threads to match the new
+ *	maximum.
+ */
+int
+__wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg)
+{
+	WT_LSM_MANAGER *manager;
+	uint32_t prev_max;
+
+	manager = &S2C(session)->lsm_manager;
+	prev_max = manager->lsm_workers_max;
+
+	WT_RET(__wt_lsm_manager_config(session, cfg));
+
+	/*
+	 * Nothing to do if LSM hasn't started yet (the normal code path will
+	 * start the threads with the new settings) or if the maximum number
+	 * of workers is unchanged.
+	 */
+	if (manager->lsm_workers_max == 0 ||
+	    manager->lsm_workers == 0 ||
+	    manager->lsm_workers_max == prev_max)
+		return (0);
+
+	if (manager->lsm_workers_max > prev_max) {
+		/* More threads are wanted: start them. */
+		return (__lsm_general_worker_start(session));
+	} else {
+		/* Fewer threads are wanted: stop the surplus. */
+		WT_ASSERT(session, manager->lsm_workers_max < prev_max);
+		WT_RET(__lsm_stop_workers(session));
+	}
+	return (0);
+}
+
+/*
+ * __wt_lsm_manager_start --
+ *	Start the LSM management infrastructure. Our queues and locks were
+ *	initialized when the connection was initialized.
+ *
+ *	Opens one internal session per potential worker, then creates the
+ *	manager thread (worker 0) and flags the connection with
+ *	WT_CONN_SERVER_LSM. On failure, the sessions opened so far are closed
+ *	and their slots cleared.
+ */
+int
+__wt_lsm_manager_start(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+	WT_LSM_MANAGER *manager;
+	WT_SESSION_IMPL *worker_session;
+	uint32_t i;
+
+	manager = &S2C(session)->lsm_manager;
+
+	/*
+	 * We need at least a manager, a switch thread and a generic
+	 * worker.
+	 */
+	WT_ASSERT(session, manager->lsm_workers_max > 2);
+
+	/*
+	 * Open sessions for all potential worker threads here - it's not
+	 * safe to have worker threads open/close sessions themselves.
+	 * All the LSM worker threads do their operations on read-only
+	 * files. Use read-uncommitted isolation to avoid keeping
+	 * updates in cache unnecessarily.
+	 */
+	for (i = 0; i < WT_LSM_MAX_WORKERS; i++) {
+		WT_ERR(__wt_open_internal_session(
+		    S2C(session), "lsm-worker", 1, 0, &worker_session));
+		worker_session->isolation = TXN_ISO_READ_UNCOMMITTED;
+		manager->lsm_worker_cookies[i].session = worker_session;
+	}
+
+	/* Start the LSM manager thread. */
+	WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid,
+	    __lsm_worker_manager, &manager->lsm_worker_cookies[0]));
+
+	F_SET(S2C(session), WT_CONN_SERVER_LSM);
+
+	if (0) {
+		/*
+		 * Error-only cleanup. The walk is bounded by the number of
+		 * slots populated above: if every session was opened and the
+		 * thread creation failed, there is no NULL slot to stop on.
+		 * Each slot is cleared so no stale session pointer is left
+		 * behind.
+		 */
+err:		for (i = 0; i < WT_LSM_MAX_WORKERS; i++) {
+			worker_session =
+			    manager->lsm_worker_cookies[i].session;
+			if (worker_session == NULL)
+				break;
+			manager->lsm_worker_cookies[i].session = NULL;
+			WT_TRET((&worker_session->iface)->close(
+			    &worker_session->iface, NULL));
+		}
+	}
+	return (ret);
+}
+
+/*
+ * __wt_lsm_manager_free_work_unit --
+ *	Discard an LSM work unit: drop the reference it holds on its tree's
+ *	queue count and free the unit. A NULL entry is ignored.
+ */
+void
+__wt_lsm_manager_free_work_unit(
+    WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry)
+{
+	if (entry == NULL)
+		return;
+
+	WT_ASSERT(session, entry->lsm_tree->queue_ref > 0);
+	(void)WT_ATOMIC_SUB4(entry->lsm_tree->queue_ref, 1);
+
+	__wt_free(session, entry);
+}
+
+/*
+ * __wt_lsm_manager_destroy --
+ * Destroy the LSM manager threads and subsystem.
+ *
+ * When workers were started: waits for WT_CONN_SERVER_LSM to be cleared,
+ * closes all LSM trees, joins the manager thread, discards any work
+ * units still queued and closes every worker session. The locks and
+ * condition variable are destroyed unconditionally. Errors are
+ * accumulated so that the teardown always runs to completion.
+ */
+int
+__wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *current, *next;
+ WT_SESSION *wt_session;
+ uint32_t i;
+ uint64_t removed;
+
+ conn = S2C(session);
+ manager = &conn->lsm_manager;
+ removed = 0;
+
+ if (manager->lsm_workers > 0) {
+ /*
+ * Stop the main LSM manager thread first.
+ *
+ * This only waits for the flag to clear; nothing in this
+ * function clears it (presumably the manager thread does so
+ * as it shuts down -- verify against __lsm_worker_manager).
+ */
+ while (F_ISSET(conn, WT_CONN_SERVER_LSM))
+ __wt_yield();
+
+ /* Clean up open LSM handles. */
+ ret = __wt_lsm_tree_close_all(session);
+
+ WT_TRET(__wt_thread_join(
+ session, manager->lsm_worker_cookies[0].tid));
+ manager->lsm_worker_cookies[0].tid = 0;
+
+ /*
+ * Release memory from any operations left on the queue. The
+ * successor is fetched before each entry is removed and freed.
+ */
+ for (current = TAILQ_FIRST(&manager->switchqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ TAILQ_REMOVE(&manager->switchqh, current, q);
+ ++removed;
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ for (current = TAILQ_FIRST(&manager->appqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ TAILQ_REMOVE(&manager->appqh, current, q);
+ ++removed;
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ for (current = TAILQ_FIRST(&manager->managerqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ TAILQ_REMOVE(&manager->managerqh, current, q);
+ ++removed;
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+
+ /*
+ * Close all LSM worker sessions. Every slot is assumed to hold
+ * a session: there is no NULL check here.
+ */
+ for (i = 0; i < WT_LSM_MAX_WORKERS; i++) {
+ wt_session =
+ &manager->lsm_worker_cookies[i].session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+ }
+ /* Record how many queued work units were thrown away. */
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_work_units_discarded, removed);
+
+ /* Free resources that are allocated in connection initialize */
+ __wt_spin_destroy(session, &manager->switch_lock);
+ __wt_spin_destroy(session, &manager->app_lock);
+ __wt_spin_destroy(session, &manager->manager_lock);
+ WT_TRET(__wt_cond_destroy(session, &manager->work_cond));
+
+ return (ret);
+}
+
+/*
+ * __lsm_manager_aggressive_update --
+ *	Update the merge aggressiveness for a single LSM tree.
+ *
+ *	The aggressiveness is derived from how long it has been since the
+ *	tree's last flush, measured in chunk fill times and divided by the
+ *	tree's merge_min. It is only ever raised here, never lowered.
+ */
+static int
+__lsm_manager_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	struct timespec now;
+	uint64_t chunk_wait, stallms;
+	u_int new_aggressive;
+
+	WT_RET(__wt_epoch(session, &now));
+	/* Time since the last flush (presumably in milliseconds). */
+	stallms = WT_TIMEDIFF(now, lsm_tree->last_flush_ts) / WT_MILLION;
+	/*
+	 * Get aggressive if more than enough chunks for a merge should have
+	 * been created by now. Use 10 seconds as a default if we don't have an
+	 * estimate.
+	 */
+	if (lsm_tree->nchunks > 1)
+		chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ?
+		    10000 : lsm_tree->chunk_fill_ms);
+	else
+		chunk_wait = 0;
+	/* NOTE(review): assumes merge_min is never zero -- confirm. */
+	new_aggressive = (u_int)(chunk_wait / lsm_tree->merge_min);
+
+	if (new_aggressive > lsm_tree->merge_aggressiveness) {
+		/*
+		 * stallms is a 64-bit value, so it must be printed with
+		 * PRIu64: passing it for a "%u" conversion is a varargs type
+		 * mismatch.
+		 */
+		WT_RET(__wt_verbose(session, WT_VERB_LSM,
+		    "LSM merge %s got aggressive (old %u new %u), "
+		    "merge_min %d, %" PRIu64 " / %" PRIu64,
+		    lsm_tree->name, lsm_tree->merge_aggressiveness,
+		    new_aggressive, lsm_tree->merge_min, stallms,
+		    lsm_tree->chunk_fill_ms));
+		lsm_tree->merge_aggressiveness = new_aggressive;
+	}
+	return (0);
+}
+
+/*
+ * __lsm_manager_worker_setup --
+ *	Setup owned by the LSM manager thread: configure and launch the
+ *	switch/drop worker, then start the general workers.
+ *
+ *	Returns zero on success or the error from whichever start call fails.
+ */
+static int
+__lsm_manager_worker_setup(WT_SESSION_IMPL *session)
+{
+	WT_LSM_MANAGER *manager;
+	WT_LSM_WORKER_ARGS *switch_args;
+
+	manager = &S2C(session)->lsm_manager;
+
+	/* Only the manager itself (worker[0]) may be counted at this point. */
+	WT_ASSERT(session, manager->lsm_workers == 1);
+
+	/*
+	 * worker[1] is the switch/drop thread. It takes the next worker ID,
+	 * bumping the worker count, and shares the manager's work condition.
+	 */
+	switch_args = &manager->lsm_worker_cookies[1];
+	switch_args->work_cond = manager->work_cond;
+	switch_args->id = manager->lsm_workers++;
+	switch_args->type = WT_LSM_WORK_DROP | WT_LSM_WORK_SWITCH;
+	F_SET(switch_args, WT_LSM_WORKER_RUN);
+
+	WT_RET(__wt_lsm_worker_start(session, switch_args));
+
+	/* Everything else is started by the general-worker helper. */
+	return (__lsm_general_worker_start(session));
+}
+
+/*
+ * __lsm_manager_worker_shutdown --
+ *	Signal and join every LSM worker thread other than the manager.
+ *
+ *	Failures do not stop the walk: every worker is still signalled and
+ *	joined, and the accumulated error is returned at the end.
+ */
+static int
+__lsm_manager_worker_shutdown(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+	WT_LSM_MANAGER *manager;
+	WT_LSM_WORKER_ARGS *worker;
+	u_int slot;
+
+	manager = &S2C(session)->lsm_manager;
+
+	/* Slot zero is the manager (the calling thread), so start at one. */
+	slot = 1;
+	while (slot < manager->lsm_workers) {
+		worker = &manager->lsm_worker_cookies[slot];
+		WT_ASSERT(session, worker->tid != 0);
+
+		/* Wake the workers so this one can exit, then wait for it. */
+		WT_TRET(__wt_cond_signal(session, manager->work_cond));
+		WT_TRET(__wt_thread_join(session, worker->tid));
+		++slot;
+	}
+	return (ret);
+}
+
+/*
+ * __lsm_manager_run_server --
+ *	Run manager thread operations.
+ *
+ *	Loops for as long as WT_CONN_SERVER_RUN is set on the connection. Each
+ *	pass sleeps, then walks the connection's LSM trees: for every active
+ *	tree the merge aggressiveness is refreshed and, when the heuristics
+ *	below say the tree needs attention, a batch of work units is queued.
+ *
+ *	Returns zero when the loop ends, or the first error from a helper.
+ */
+static int
+__lsm_manager_run_server(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_LSM_TREE *tree;
+	struct timespec now;
+	uint64_t fillms, pushms;
+	int no_trees, want_work;
+
+	conn = S2C(session);
+	while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+		/* The pause happens whether or not any tree is open. */
+		no_trees = TAILQ_EMPTY(&conn->lsmqh);
+		__wt_sleep(0, 10000);
+		if (no_trees)
+			continue;
+
+		TAILQ_FOREACH(tree, &conn->lsmqh, q) {
+			if (!F_ISSET(tree, WT_LSM_TREE_ACTIVE))
+				continue;
+			WT_RET(__lsm_manager_aggressive_update(session, tree));
+			WT_RET(__wt_epoch(session, &now));
+
+			/* Time since work was last pushed; zero if never. */
+			if (tree->work_push_ts.tv_sec == 0)
+				pushms = 0;
+			else
+				pushms = WT_TIMEDIFF(
+				    now, tree->work_push_ts) / WT_MILLION;
+
+			/* Three chunk-fill periods, or a fixed default. */
+			if ((fillms = 3 * tree->chunk_fill_ms) == 0)
+				fillms = 10000;
+
+			/* A tree with a full queue gets nothing more. */
+			if (tree->queue_ref >= LSM_TREE_MAX_QUEUE) {
+				WT_STAT_FAST_CONN_INCR(session,
+				    lsm_work_queue_max);
+				continue;
+			}
+
+			/*
+			 * Extra work units are harmless and can be needed if
+			 * earlier ones were never completed, so help out when:
+			 * - the tree is unmodified and has more than one chunk
+			 *   (a smaller tree makes queries faster);
+			 * - nothing is queued and there is more than one chunk;
+			 * - merges are getting aggressive and the tree is not
+			 *   being compacted;
+			 * - work is being pushed less often than chunks are
+			 *   being created.
+			 */
+			want_work =
+			    (!tree->modified && tree->nchunks > 1) ||
+			    (tree->queue_ref == 0 && tree->nchunks > 1) ||
+			    (tree->merge_aggressiveness > 3 &&
+			    !F_ISSET(tree, WT_LSM_TREE_COMPACTING)) ||
+			    pushms > fillms;
+			if (!want_work)
+				continue;
+
+			WT_RET(__wt_lsm_manager_push_entry(
+			    session, WT_LSM_WORK_SWITCH, 0, tree));
+			WT_RET(__wt_lsm_manager_push_entry(
+			    session, WT_LSM_WORK_DROP, 0, tree));
+			WT_RET(__wt_lsm_manager_push_entry(
+			    session, WT_LSM_WORK_FLUSH, 0, tree));
+			WT_RET(__wt_lsm_manager_push_entry(
+			    session, WT_LSM_WORK_BLOOM, 0, tree));
+			WT_RET(__wt_verbose(session, WT_VERB_LSM,
+			    "MGR %s: queue %d mod %d nchunks %d"
+			    " flags 0x%x aggressive %d pushms %" PRIu64
+			    " fillms %" PRIu64,
+			    tree->name, tree->queue_ref,
+			    tree->modified, tree->nchunks,
+			    tree->flags,
+			    tree->merge_aggressiveness,
+			    pushms, fillms));
+			WT_RET(__wt_lsm_manager_push_entry(
+			    session, WT_LSM_WORK_MERGE, 0, tree));
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * __lsm_worker_manager --
+ *	Thread entry point for the LSM manager, which looks after all open
+ *	LSM trees and the shared LSM worker threads.
+ *
+ *	The argument is the manager's WT_LSM_WORKER_ARGS cookie; only its
+ *	session is used. Always returns NULL: failures are reported through
+ *	__wt_err.
+ */
+static void *
+__lsm_worker_manager(void *arg)
+{
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	session = ((WT_LSM_WORKER_ARGS *)arg)->session;
+
+	/* Run the three phases in order, stopping at the first failure. */
+	if ((ret = __lsm_manager_worker_setup(session)) == 0 &&
+	    (ret = __lsm_manager_run_server(session)) == 0)
+		ret = __lsm_manager_worker_shutdown(session);
+
+	if (ret != 0)
+		__wt_err(session, ret, "LSM worker manager thread error");
+
+	/* __wt_lsm_manager_destroy spins until this flag is cleared. */
+	F_CLR(S2C(session), WT_CONN_SERVER_LSM);
+	return (NULL);
+}
+
+/*
+ * __wt_lsm_manager_clear_tree --
+ * Remove all entries for a tree from the LSM manager queues. This
+ * introduces an inefficiency if LSM trees are being opened and closed
+ * regularly.
+ */
+int
+__wt_lsm_manager_clear_tree(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *current, *next;
+ uint64_t removed;
+
+ manager = &S2C(session)->lsm_manager;
+ removed = 0;
+
+ /* Clear out the tree from the switch queue */
+ __wt_spin_lock(session, &manager->switch_lock);
+
+ /* Structure the loop so that it's safe to free as we iterate */
+ for (current = TAILQ_FIRST(&manager->switchqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ if (current->lsm_tree != lsm_tree)
+ continue;
+ ++removed;
+ TAILQ_REMOVE(&manager->switchqh, current, q);
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ __wt_spin_unlock(session, &manager->switch_lock);
+ /* Clear out the tree from the application queue */
+ __wt_spin_lock(session, &manager->app_lock);
+ for (current = TAILQ_FIRST(&manager->appqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ if (current->lsm_tree != lsm_tree)
+ continue;
+ ++removed;
+ TAILQ_REMOVE(&manager->appqh, current, q);
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ __wt_spin_unlock(session, &manager->app_lock);
+ /* Clear out the tree from the manager queue */
+ __wt_spin_lock(session, &manager->manager_lock);
+ for (current = TAILQ_FIRST(&manager->managerqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ if (current->lsm_tree != lsm_tree)
+ continue;
+ ++removed;
+ TAILQ_REMOVE(&manager->managerqh, current, q);
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ __wt_spin_unlock(session, &manager->manager_lock);
+ WT_STAT_FAST_CONN_INCRV(session, lsm_work_units_discarded, removed);
+ return (0);
+}
+
+/*
+ * LSM_POP_ENTRY --
+ *	Unlink the first work unit on a queue whose type is set in the
+ *	requested type mask, leaving it in entry (entry is NULL if nothing
+ *	matched), and decrement the queue-length statistic for a match.
+ *
+ * We assume this is only called from __wt_lsm_manager_pop_entry and we
+ * have session, entry and type available to use. If the queue is empty
+ * we may return from the macro: that check is made without taking the
+ * lock, the scan itself is done with the lock held.
+ *
+ * Every use of a macro parameter is parenthesized so an argument that is
+ * an expression expands as a single operand.
+ */
+#define	LSM_POP_ENTRY(qh, qlock, qlen) do {				\
+	if (TAILQ_EMPTY((qh)))						\
+		return (0);						\
+	__wt_spin_lock(session, (qlock));				\
+	TAILQ_FOREACH(entry, (qh), q) {					\
+		if (FLD_ISSET(type, entry->type)) {			\
+			TAILQ_REMOVE((qh), entry, q);			\
+			WT_STAT_FAST_CONN_DECR(session, qlen);		\
+			break;						\
+		}							\
+	}								\
+	__wt_spin_unlock(session, (qlock));				\
+} while (0)
+
+/*
+ * __wt_lsm_manager_pop_entry --
+ *	Remove and return the first queued work unit matching the requested
+ *	work unit type.
+ *
+ *	The queue searched depends on type: the switch queue for
+ *	WT_LSM_WORK_SWITCH, the manager queue for WT_LSM_WORK_MERGE and the
+ *	application queue for anything else. *entryp is set to the unit that
+ *	was removed, or NULL if the queue was empty or held no match. Always
+ *	returns zero.
+ */
+int
+__wt_lsm_manager_pop_entry(
+    WT_SESSION_IMPL *session, uint32_t type, WT_LSM_WORK_UNIT **entryp)
+{
+	WT_LSM_MANAGER *manager;
+	WT_LSM_WORK_UNIT *entry;
+
+	entry = NULL;
+	*entryp = NULL;
+	manager = &S2C(session)->lsm_manager;
+
+	/*
+	 * Search the queue that holds our kind of work. The macro returns
+	 * from this function directly when that queue is empty.
+	 */
+	if (type == WT_LSM_WORK_SWITCH)
+		LSM_POP_ENTRY(&manager->switchqh,
+		    &manager->switch_lock, lsm_work_queue_switch);
+	else if (type == WT_LSM_WORK_MERGE)
+		LSM_POP_ENTRY(&manager->managerqh,
+		    &manager->manager_lock, lsm_work_queue_manager);
+	else
+		LSM_POP_ENTRY(&manager->appqh,
+		    &manager->app_lock, lsm_work_queue_app);
+
+	/* Nothing matched: the caller's pointer is already NULL. */
+	if (entry == NULL)
+		return (0);
+
+	WT_STAT_FAST_CONN_INCR(session, lsm_work_units_done);
+	*entryp = entry;
+	return (0);
+}
+
+/*
+ * LSM_PUSH_ENTRY --
+ *	Push a work unit onto the tail of the given queue while holding that
+ *	queue's lock, and increment the queue-length statistic.
+ *
+ * This macro assumes we are called from __wt_lsm_manager_push_entry and we
+ * have session and entry available for use.
+ *
+ * Every use of a macro parameter is parenthesized so an argument that is
+ * an expression expands as a single operand.
+ */
+#define	LSM_PUSH_ENTRY(qh, qlock, qlen) do {				\
+	__wt_spin_lock(session, (qlock));				\
+	TAILQ_INSERT_TAIL((qh), entry, q);				\
+	WT_STAT_FAST_CONN_INCR(session, qlen);				\
+	__wt_spin_unlock(session, (qlock));				\
+} while (0)
+
+/*
+ * __wt_lsm_manager_push_entry --
+ *	Allocate a work unit for a tree and add it to the end of the queue
+ *	for its type: the switch queue for WT_LSM_WORK_SWITCH, the manager
+ *	queue for WT_LSM_WORK_MERGE and the application queue otherwise.
+ *
+ *	The tree's work_push_ts is refreshed and its queue_ref count is
+ *	incremented, then the manager's work condition is signalled. Returns
+ *	zero on success or the error from the failing helper.
+ */
+int
+__wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
+    uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree)
+{
+	WT_LSM_MANAGER *manager;
+	WT_LSM_WORK_UNIT *entry;
+
+	/* Note when work was last queued for this tree. */
+	WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts));
+
+	/* Build the work unit. */
+	WT_RET(__wt_calloc_def(session, 1, &entry));
+	entry->lsm_tree = lsm_tree;
+	entry->flags = flags;
+	entry->type = type;
+	(void)WT_ATOMIC_ADD4(lsm_tree->queue_ref, 1);
+	WT_STAT_FAST_CONN_INCR(session, lsm_work_units_created);
+
+	/* Queue it according to its type. */
+	manager = &S2C(session)->lsm_manager;
+	if (type == WT_LSM_WORK_SWITCH)
+		LSM_PUSH_ENTRY(&manager->switchqh,
+		    &manager->switch_lock, lsm_work_queue_switch);
+	else if (type == WT_LSM_WORK_MERGE)
+		LSM_PUSH_ENTRY(&manager->managerqh,
+		    &manager->manager_lock, lsm_work_queue_manager);
+	else
+		LSM_PUSH_ENTRY(&manager->appqh,
+		    &manager->app_lock, lsm_work_queue_app);
+
+	/* Tell the workers there is something to do. */
+	return (__wt_cond_signal(session, manager->work_cond));
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
new file mode 100644
index 00000000000..784837092cd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
@@ -0,0 +1,489 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_lsm_merge_update_tree --
+ *	Replace a run of merged chunks in the tree with the chunk that was
+ *	built from them.
+ *	Must be called with the LSM lock held.
+ *
+ *	The nchunks chunks starting at start_chunk are appended to the tree's
+ *	list of obsolete chunks, the chunk array is closed up around them and
+ *	the new chunk is stored at start_chunk. The only step that can fail
+ *	is growing the obsolete list, which happens before anything is
+ *	modified.
+ */
+int
+__wt_lsm_merge_update_tree(WT_SESSION_IMPL *session,
+    WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks,
+    WT_LSM_CHUNK *chunk)
+{
+	WT_LSM_CHUNK **merged;
+	size_t tail_chunks;
+	u_int n;
+
+	WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+
+	/* Make room in the array of obsolete chunks. */
+	WT_RET(__wt_realloc_def(session, &lsm_tree->old_alloc,
+	    lsm_tree->nold_chunks + nchunks, &lsm_tree->old_chunks));
+
+	/* First slot of the run being replaced. */
+	merged = lsm_tree->chunk + start_chunk;
+
+	/* Hand the merged chunks over one entry at a time. */
+	for (n = 0; n < nchunks; n++)
+		lsm_tree->old_chunks[lsm_tree->nold_chunks++] = merged[n];
+
+	/*
+	 * Slide the chunks that follow the run down so exactly one slot is
+	 * left for the new chunk, shrink the count to match and clear the
+	 * slots that are no longer in use.
+	 */
+	tail_chunks = lsm_tree->nchunks - (nchunks + start_chunk);
+	memmove(merged + 1,
+	    merged + nchunks, tail_chunks * sizeof(*lsm_tree->chunk));
+	lsm_tree->nchunks -= nchunks - 1;
+	memset(lsm_tree->chunk + lsm_tree->nchunks, 0,
+	    (nchunks - 1) * sizeof(*lsm_tree->chunk));
+
+	/* Install the merge result. */
+	merged[0] = chunk;
+
+	return (0);
+}
+
+/*
+ * __wt_lsm_merge --
+ * Merge a set of chunks of an LSM tree.
+ *
+ * Selects a run of adjacent chunks while holding the tree write lock,
+ * copies their records into a newly created chunk (optionally building a
+ * Bloom filter) without holding the lock, then takes the lock again to
+ * install the new chunk and write the tree's metadata.
+ *
+ * The id argument is only used when computing max_level, the highest
+ * chunk generation this call is willing to include.
+ *
+ * Returns WT_NOTFOUND if no suitable set of chunks is found, EINTR if the
+ * tree stops being active while records are being copied, otherwise zero
+ * or an error from a failing step.
+ *
+ * NOTE(review): the WT_RET calls between selecting the chunks and the
+ * first WT_ERR return without clearing WT_LSM_CHUNK_MERGING on the
+ * selected chunks, and the err path does not clear it either - confirm
+ * the flag is reset elsewhere.
+ * NOTE(review): bbuf is declared and freed but otherwise unused here.
+ */
+int
+__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *dest, *src;
+ WT_DECL_ITEM(bbuf);
+ WT_DECL_RET;
+ WT_ITEM key, value;
+ WT_LSM_CHUNK *chunk, *previous, *youngest;
+ uint32_t aggressive, generation, max_gap, max_gen, max_level, start_id;
+ uint64_t insert_count, record_count, chunk_size;
+ u_int dest_id, end_chunk, i, merge_max, merge_min, nchunks, start_chunk;
+ u_int verb;
+ int create_bloom, locked, in_sync, tret;
+ const char *cfg[3];
+ const char *drop_cfg[] =
+ { WT_CONFIG_BASE(session, session_drop), "force", NULL };
+
+ bloom = NULL;
+ chunk_size = 0;
+ create_bloom = 0;
+ dest = src = NULL;
+ locked = 0;
+ start_id = 0;
+ in_sync = 0;
+
+ /*
+ * If the tree is open read-only or we are compacting, be very
+ * aggressive. Otherwise, we can spend a long time waiting for merges
+ * to start in read-only applications.
+ */
+ if (!lsm_tree->modified ||
+ F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
+ lsm_tree->merge_aggressiveness = 10;
+
+ aggressive = lsm_tree->merge_aggressiveness;
+ merge_max = (aggressive > 5) ? 100 : lsm_tree->merge_min;
+ merge_min = (aggressive > 5) ? 2 : lsm_tree->merge_min;
+ max_gap = (aggressive + 4) / 5;
+ max_level = (lsm_tree->merge_throttle > 0) ? 0 : id + aggressive;
+
+ /*
+ * If there aren't any chunks to merge, or some of the chunks aren't
+ * yet written, we're done. A non-zero error indicates that the worker
+ * should assume there is no work to do: if there are unwritten chunks,
+ * the worker should write them immediately.
+ */
+ if (lsm_tree->nchunks < merge_min)
+ return (WT_NOTFOUND);
+
+ /*
+ * Use the lsm_tree lock to read the chunks (so no switches occur), but
+ * avoid holding it while the merge is in progress: that may take a
+ * long time.
+ */
+ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+
+ /*
+ * Only include chunks that already have a Bloom filter or are the
+ * result of a merge and not involved in a merge.
+ */
+ for (end_chunk = lsm_tree->nchunks - 1; end_chunk > 0; --end_chunk) {
+ chunk = lsm_tree->chunk[end_chunk];
+ WT_ASSERT(session, chunk != NULL);
+ if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING))
+ continue;
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) || chunk->generation > 0)
+ break;
+ else if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ break;
+ }
+
+ /*
+ * Give up immediately if there aren't enough on disk chunks in the
+ * tree for a merge.
+ */
+ if (end_chunk < merge_min - 1) {
+ WT_RET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * Look for the most efficient merge we can do. We define efficiency
+ * as collapsing as many levels as possible while processing the
+ * smallest number of rows.
+ *
+ * We make a distinction between "major" and "minor" merges. The
+ * difference is whether the oldest chunk is involved: if it is, we can
+ * discard tombstones, because there can be no older record to mark
+ * deleted.
+ *
+ * Respect the configured limit on the number of chunks to merge: start
+ * with the most recent set of chunks and work backwards until going
+ * further becomes significantly less efficient.
+ */
+ for (start_chunk = end_chunk + 1, record_count = 0;
+ start_chunk > 0; ) {
+ chunk = lsm_tree->chunk[start_chunk - 1];
+ youngest = lsm_tree->chunk[end_chunk];
+ nchunks = (end_chunk + 1) - start_chunk;
+
+ /*
+ * If the chunk is already involved in a merge or a Bloom
+ * filter is being built for it, stop.
+ */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING) || chunk->bloom_busy)
+ break;
+
+ /*
+ * Look for small merges before trying a big one: some threads
+ * should stay in low levels until we get more aggressive.
+ */
+ if (chunk->generation > max_level)
+ break;
+
+ /*
+ * If the size of the chunks selected so far exceeds the
+ * configured maximum chunk size, stop. Keep going if we can
+ * slide the window further into the tree: we don't want to
+ * leave small chunks in the middle.
+ */
+ if ((chunk_size += chunk->size) > lsm_tree->chunk_max)
+ if (nchunks < merge_min ||
+ (chunk->generation > youngest->generation &&
+ chunk_size - youngest->size > lsm_tree->chunk_max))
+ break;
+
+ /*
+ * If we have enough chunks for a merge and the next chunk is
+ * in too high a generation, stop.
+ */
+ if (nchunks >= merge_min) {
+ previous = lsm_tree->chunk[start_chunk];
+ max_gen = youngest->generation + max_gap;
+ if (previous->generation <= max_gen &&
+ chunk->generation > max_gen)
+ break;
+ }
+
+ F_SET(chunk, WT_LSM_CHUNK_MERGING);
+ record_count += chunk->count;
+ --start_chunk;
+
+ /*
+ * If we have a full window, or the merge would be too big,
+ * remove the youngest chunk.
+ */
+ if (nchunks == merge_max ||
+ chunk_size > lsm_tree->chunk_max) {
+ WT_ASSERT(session,
+ F_ISSET(youngest, WT_LSM_CHUNK_MERGING));
+ F_CLR(youngest, WT_LSM_CHUNK_MERGING);
+ record_count -= youngest->count;
+ chunk_size -= youngest->size;
+ --end_chunk;
+ }
+ }
+
+ nchunks = (end_chunk + 1) - start_chunk;
+ WT_ASSERT(session, nchunks <= merge_max);
+
+ if (nchunks > 0) {
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+ for (i = 0; i < nchunks; i++) {
+ chunk = lsm_tree->chunk[start_chunk + i];
+ WT_ASSERT(session,
+ F_ISSET(chunk, WT_LSM_CHUNK_MERGING));
+ }
+
+ chunk = lsm_tree->chunk[start_chunk];
+ youngest = lsm_tree->chunk[end_chunk];
+ start_id = chunk->id;
+
+ /*
+ * Don't do merges that are too small or across too many
+ * generations.
+ */
+ if (nchunks < merge_min ||
+ chunk->generation > youngest->generation + max_gap) {
+ for (i = 0; i < nchunks; i++) {
+ chunk = lsm_tree->chunk[start_chunk + i];
+ WT_ASSERT(session,
+ F_ISSET(chunk, WT_LSM_CHUNK_MERGING));
+ F_CLR(chunk, WT_LSM_CHUNK_MERGING);
+ }
+ nchunks = 0;
+ }
+ }
+
+ /* Find the merge generation. */
+ for (generation = 0, i = 0; i < nchunks; i++)
+ generation = WT_MAX(generation,
+ lsm_tree->chunk[start_chunk + i]->generation + 1);
+
+ WT_RET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (nchunks == 0)
+ return (WT_NOTFOUND);
+
+ /* Allocate an ID for the merge. */
+ dest_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+
+ /*
+ * We only want to do the chunk loop if we're running with verbose,
+ * so we wrap these statements in the conditional. Avoid the loop
+ * in the normal path.
+ */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
+ ", generation %" PRIu32,
+ lsm_tree->name,
+ start_chunk, end_chunk, dest_id, record_count, generation));
+ for (verb = start_chunk; verb <= end_chunk; verb++)
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "%s: Chunk[%u] id %u",
+ lsm_tree->name, verb, lsm_tree->chunk[verb]->id));
+ }
+
+ WT_RET(__wt_calloc_def(session, 1, &chunk));
+ chunk->id = dest_id;
+
+ if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
+ (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
+ start_chunk > 0) && record_count > 0)
+ create_bloom = 1;
+
+ /*
+ * Special setup for the merge cursor:
+ * first, reset to open the dependent cursors;
+ * then restrict the cursor to a specific number of chunks;
+ * then set MERGE so the cursor doesn't track updates to the tree.
+ */
+ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
+ F_SET(src, WT_CURSTD_RAW);
+ WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+ WT_ERR(ret);
+ if (create_bloom) {
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, chunk->id, &chunk->bloom_uri));
+
+ WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
+ lsm_tree->bloom_config,
+ record_count, lsm_tree->bloom_bit_count,
+ lsm_tree->bloom_hash_count, &bloom));
+ }
+
+ /* Discard pages we read as soon as we're done with them. */
+ F_SET(session, WT_SESSION_NO_CACHE);
+
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = "bulk,raw,skip_sort_check";
+ cfg[2] = NULL;
+ WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
+
+#define LSM_MERGE_CHECK_INTERVAL 1000
+ for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
+ if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ WT_ERR(EINTR);
+ /*
+ * Help out with switching chunks in case the
+ * checkpoint worker is busy.
+ */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_lsm_tree_switch(session, lsm_tree));
+ WT_ERR(ret);
+ }
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
+ ++lsm_tree->merge_progressing;
+ }
+
+ WT_ERR(src->get_key(src, &key));
+ dest->set_key(dest, &key);
+ WT_ERR(src->get_value(src, &value));
+ dest->set_value(dest, &value);
+ WT_ERR(dest->insert(dest));
+ if (create_bloom)
+ WT_ERR(__wt_bloom_insert(bloom, &key));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
+ ++lsm_tree->merge_progressing;
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
+ record_count, insert_count));
+
+ /*
+ * Closing and syncing the files can take a while. Set the
+ * merge_syncing field so that compact knows it is still in
+ * progress.
+ */
+ (void)WT_ATOMIC_ADD4(lsm_tree->merge_syncing, 1);
+ in_sync = 1;
+ /*
+ * We've successfully created the new chunk. Now install it. We need
+ * to ensure that the NO_CACHE flag is cleared and the bloom filter
+ * is closed (even if a step fails), so track errors but don't return
+ * until we've cleaned up.
+ */
+ WT_TRET(src->close(src));
+ WT_TRET(dest->close(dest));
+ src = dest = NULL;
+
+ F_CLR(session, WT_SESSION_NO_CACHE);
+
+ /*
+ * We're doing advisory reads to fault the new trees into cache.
+ * Don't block if the cache is full: our next unit of work may be to
+ * discard some trees to free space.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+
+ if (create_bloom) {
+ if (ret == 0)
+ WT_TRET(__wt_bloom_finalize(bloom));
+
+ /*
+ * Read in a key to make sure the Bloom filters btree handle is
+ * open before it becomes visible to application threads.
+ * Otherwise application threads will stall while it is opened
+ * and internal pages are read into cache.
+ */
+ if (ret == 0) {
+ WT_CLEAR(key);
+ WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
+ }
+
+ WT_TRET(__wt_bloom_close(bloom));
+ bloom = NULL;
+ }
+ WT_ERR(ret);
+
+ /*
+ * Open a handle on the new chunk before application threads attempt
+ * to access it, opening it pre-loads internal pages into the file
+ * system cache.
+ */
+ cfg[1] = "checkpoint=" WT_CHECKPOINT;
+ WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
+ WT_TRET(dest->close(dest));
+ dest = NULL;
+ ++lsm_tree->merge_progressing;
+ (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ in_sync = 0;
+ WT_ERR_NOTFOUND_OK(ret);
+
+ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /*
+ * Check whether we raced with another merge, and adjust the chunk
+ * array offset as necessary: search the array for the chunk whose ID
+ * was recorded as start_id when the merge was chosen.
+ */
+ if (start_chunk >= lsm_tree->nchunks ||
+ lsm_tree->chunk[start_chunk]->id != start_id)
+ for (start_chunk = 0;
+ start_chunk < lsm_tree->nchunks;
+ start_chunk++)
+ if (lsm_tree->chunk[start_chunk]->id == start_id)
+ break;
+
+ /*
+ * It is safe to error out here - since the update can only fail
+ * prior to making updates to the tree.
+ */
+ WT_ERR(__wt_lsm_merge_update_tree(
+ session, lsm_tree, start_chunk, nchunks, chunk));
+
+ if (create_bloom)
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ chunk->count = insert_count;
+ chunk->generation = generation;
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+
+ /*
+ * We have no current way of continuing if the metadata update fails,
+ * so we will panic in that case. Put some effort into cleaning up
+ * after ourselves here - so things have a chance of shutting down.
+ *
+ * Any errors that happened after the tree was locked are
+ * fatal - we can't guarantee the state of the tree.
+ */
+ if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
+ WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");
+
+ lsm_tree->dsk_gen++;
+
+ /* Update the throttling while holding the tree lock. */
+ __wt_lsm_tree_throttle(session, lsm_tree, 1);
+
+ /*
+ * Schedule a pass to discard old chunks.
+ * NOTE(review): if this fails, the err path below drops and frees the
+ * chunk that was just installed in the tree - confirm that is intended.
+ */
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_DROP, 0, lsm_tree));
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ if (in_sync)
+ (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ if (src != NULL)
+ WT_TRET(src->close(src));
+ if (dest != NULL)
+ WT_TRET(dest->close(dest));
+ if (bloom != NULL)
+ WT_TRET(__wt_bloom_close(bloom));
+ __wt_scr_free(&bbuf);
+ if (ret != 0) {
+ /* Drop the newly-created files on error. */
+ WT_WITH_SCHEMA_LOCK(session,
+ tret = __wt_schema_drop(session, chunk->uri, drop_cfg));
+ WT_TRET(tret);
+ if (create_bloom) {
+ WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(
+ session, chunk->bloom_uri, drop_cfg));
+ WT_TRET(tret);
+ }
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+
+ if (ret == EINTR)
+ WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+ "Merge aborted due to close"));
+ else
+ WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+ "Merge failed with %s", wiredtiger_strerror(ret)));
+ }
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_meta.c b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
new file mode 100644
index 00000000000..fbb5a9958d5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_lsm_meta_read --
+ *	Read the metadata for an LSM tree.
+ *
+ *	Looks up the tree's metadata entry by name, parses it as a
+ *	configuration string and fills in the matching fields of lsm_tree,
+ *	including the arrays of live ("chunks") and obsolete ("old_chunks")
+ *	chunks. String fields that are already set are freed before being
+ *	replaced.
+ *
+ *	Returns zero on success, EINVAL for an unknown collator or a
+ *	malformed entry, or the error from a failing helper.
+ */
+int
+__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_CONFIG cparser, lparser;
+	WT_CONFIG_ITEM ck, cv, lk, lv;
+	WT_DECL_RET;
+	WT_LSM_CHUNK *chunk;
+	WT_NAMED_COLLATOR *ncoll;
+	const char *lsmconfig;
+	u_int nchunks;
+
+	chunk = NULL;	/* -Wconditional-uninitialized */
+
+	WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig));
+	WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
+	while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
+		if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
+			__wt_free(session, lsm_tree->key_format);
+			WT_ERR(__wt_strndup(session,
+			    cv.str, cv.len, &lsm_tree->key_format));
+		} else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) {
+			__wt_free(session, lsm_tree->value_format);
+			WT_ERR(__wt_strndup(session,
+			    cv.str, cv.len, &lsm_tree->value_format));
+		} else if (WT_STRING_MATCH("collator", ck.str, ck.len)) {
+			if (cv.len == 0)
+				continue;
+			TAILQ_FOREACH(ncoll, &S2C(session)->collqh, q) {
+				if (WT_STRING_MATCH(
+				    ncoll->name, cv.str, cv.len)) {
+					lsm_tree->collator = ncoll->collator;
+					break;
+				}
+			}
+			if (lsm_tree->collator == NULL)
+				WT_ERR_MSG(session, EINVAL,
+				    "unknown collator '%.*s'",
+				    (int)cv.len, cv.str);
+			/*
+			 * Release any previous name first, as is done for the
+			 * other string fields, so it isn't leaked.
+			 */
+			__wt_free(session, lsm_tree->collator_name);
+			WT_ERR(__wt_strndup(session,
+			    cv.str, cv.len, &lsm_tree->collator_name));
+		} else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) {
+			/*
+			 * The value is bracketed: anything shorter than the
+			 * two brackets would make the length below wrap.
+			 */
+			if (cv.len < 2)
+				WT_ERR_MSG(session, EINVAL,
+				    "LSM metadata: malformed bloom_config "
+				    "'%.*s'", (int)cv.len, cv.str);
+			__wt_free(session, lsm_tree->bloom_config);
+			/* Don't include the brackets. */
+			WT_ERR(__wt_strndup(session,
+			    cv.str + 1, cv.len - 2, &lsm_tree->bloom_config));
+		} else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) {
+			/* Bracketed, as for bloom_config. */
+			if (cv.len < 2)
+				WT_ERR_MSG(session, EINVAL,
+				    "LSM metadata: malformed file_config "
+				    "'%.*s'", (int)cv.len, cv.str);
+			__wt_free(session, lsm_tree->file_config);
+			/* Don't include the brackets. */
+			WT_ERR(__wt_strndup(session,
+			    cv.str + 1, cv.len - 2, &lsm_tree->file_config));
+		} else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) {
+			if (cv.val)
+				F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
+			else
+				F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
+		} else if (WT_STRING_MATCH("bloom", ck.str, ck.len))
+			lsm_tree->bloom = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len))
+			lsm_tree->bloom_bit_count = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len))
+			lsm_tree->bloom_hash_count = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len))
+			lsm_tree->chunk_max = (uint64_t)cv.val;
+		else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len))
+			lsm_tree->chunk_size = (uint64_t)cv.val;
+		else if (WT_STRING_MATCH("merge_max", ck.str, ck.len))
+			lsm_tree->merge_max = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("merge_min", ck.str, ck.len))
+			lsm_tree->merge_min = (uint32_t)cv.val;
+		else if (WT_STRING_MATCH("last", ck.str, ck.len))
+			lsm_tree->last = (u_int)cv.val;
+		else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) {
+			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+			/*
+			 * Each chunk starts with its "id"; the keys that
+			 * follow describe that chunk. Forget any chunk seen
+			 * earlier so an attribute can't land on it.
+			 */
+			chunk = NULL;
+			for (nchunks = 0; (ret =
+			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+				if (WT_STRING_MATCH("id", lk.str, lk.len)) {
+					WT_ERR(__wt_realloc_def(session,
+					    &lsm_tree->chunk_alloc,
+					    nchunks + 1, &lsm_tree->chunk));
+					WT_ERR(__wt_calloc_def(
+					    session, 1, &chunk));
+					lsm_tree->chunk[nchunks++] = chunk;
+					chunk->id = (uint32_t)lv.val;
+					WT_ERR(__wt_lsm_tree_chunk_name(session,
+					    lsm_tree, chunk->id, &chunk->uri));
+					F_SET(chunk,
+					    WT_LSM_CHUNK_ONDISK |
+					    WT_LSM_CHUNK_STABLE);
+					continue;
+				}
+
+				/* Without a chunk there is nothing to set. */
+				if (chunk == NULL)
+					WT_ERR_MSG(session, EINVAL,
+					    "LSM metadata: '%.*s' appears "
+					    "before any chunk id",
+					    (int)lk.len, lk.str);
+
+				if (WT_STRING_MATCH(
+				    "bloom", lk.str, lk.len)) {
+					WT_ERR(__wt_lsm_tree_bloom_name(
+					    session, lsm_tree,
+					    chunk->id, &chunk->bloom_uri));
+					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+				} else if (WT_STRING_MATCH(
+				    "chunk_size", lk.str, lk.len))
+					chunk->size = (uint64_t)lv.val;
+				else if (WT_STRING_MATCH(
+				    "count", lk.str, lk.len))
+					chunk->count = (uint64_t)lv.val;
+				else if (WT_STRING_MATCH(
+				    "generation", lk.str, lk.len))
+					chunk->generation = (uint32_t)lv.val;
+			}
+			WT_ERR_NOTFOUND_OK(ret);
+			lsm_tree->nchunks = nchunks;
+		} else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) {
+			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+			/*
+			 * Each old chunk is listed by URI, optionally followed
+			 * by a "bloom" key naming its Bloom filter. Forget any
+			 * chunk seen earlier so a leading "bloom" key can't
+			 * land on it.
+			 */
+			chunk = NULL;
+			for (nchunks = 0; (ret =
+			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+				if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
+					if (chunk == NULL)
+						WT_ERR_MSG(session, EINVAL,
+						    "LSM metadata: bloom "
+						    "appears before any old "
+						    "chunk");
+					WT_ERR(__wt_strndup(session,
+					    lv.str, lv.len, &chunk->bloom_uri));
+					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+					continue;
+				}
+				WT_ERR(__wt_realloc_def(session,
+				    &lsm_tree->old_alloc, nchunks + 1,
+				    &lsm_tree->old_chunks));
+				WT_ERR(__wt_calloc_def(session, 1, &chunk));
+				lsm_tree->old_chunks[nchunks++] = chunk;
+				WT_ERR(__wt_strndup(session,
+				    lk.str, lk.len, &chunk->uri));
+				F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+			}
+			WT_ERR_NOTFOUND_OK(ret);
+			lsm_tree->nold_chunks = nchunks;
+		/* Values included for backward compatibility */
+		} else if (WT_STRING_MATCH("merge_threads", ck.str, ck.len)) {
+		} else
+			WT_ERR(__wt_illegal_value(session, "LSM metadata"));
+	}
+	WT_ERR_NOTFOUND_OK(ret);
+
+	/*
+	 * If the default merge_min was not overridden, calculate it now. We
+	 * do this here so that trees created before merge_min was added get a
+	 * sane value.
+	 */
+	if (lsm_tree->merge_min < 2)
+		lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2);
+
+err:	__wt_free(session, lsmconfig);
+	return (ret);
+}
+
+/*
+ * __wt_lsm_meta_write --
+ *	Write the metadata for an LSM tree.
+ *
+ *	Formats the tree's settings, its live chunks and its obsolete chunks
+ *	into a single configuration string in a scratch buffer and stores it
+ *	as the metadata entry for the tree's name. Returns zero on success or
+ *	the error from the failing helper; the scratch buffer is released
+ *	either way.
+ */
+int
+__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
+	WT_LSM_CHUNK *chunk;
+	u_int n;
+
+	WT_RET(__wt_scr_alloc(session, 0, &buf));
+
+	/* Tree-wide settings: formats and bracketed sub-configurations. */
+	WT_ERR(__wt_buf_fmt(session, buf,
+	    "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)",
+	    lsm_tree->key_format, lsm_tree->value_format,
+	    lsm_tree->bloom_config, lsm_tree->file_config));
+
+	/* The collator is only recorded when one is named. */
+	if (lsm_tree->collator_name != NULL)
+		WT_ERR(__wt_buf_catfmt(
+		    session, buf, ",collator=%s", lsm_tree->collator_name));
+
+	WT_ERR(__wt_buf_catfmt(session, buf,
+	    ",last=%" PRIu32
+	    ",chunk_max=%" PRIu64
+	    ",chunk_size=%" PRIu64
+	    ",auto_throttle=%" PRIu32
+	    ",merge_max=%" PRIu32
+	    ",merge_min=%" PRIu32
+	    ",bloom=%" PRIu32
+	    ",bloom_bit_count=%" PRIu32
+	    ",bloom_hash_count=%" PRIu32,
+	    lsm_tree->last, lsm_tree->chunk_max, lsm_tree->chunk_size,
+	    F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0,
+	    lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom,
+	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));
+
+	/*
+	 * Live chunks: each starts with its id; size and count are left out
+	 * when they are zero.
+	 */
+	WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=["));
+	for (n = 0; n < lsm_tree->nchunks; n++) {
+		chunk = lsm_tree->chunk[n];
+		if (n != 0)
+			WT_ERR(__wt_buf_catfmt(session, buf, ","));
+		WT_ERR(__wt_buf_catfmt(session, buf, "id=%" PRIu32, chunk->id));
+		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+			WT_ERR(__wt_buf_catfmt(session, buf, ",bloom"));
+		if (chunk->size != 0)
+			WT_ERR(__wt_buf_catfmt(session, buf,
+			    ",chunk_size=%" PRIu64, chunk->size));
+		if (chunk->count != 0)
+			WT_ERR(__wt_buf_catfmt(
+			    session, buf, ",count=%" PRIu64, chunk->count));
+		WT_ERR(__wt_buf_catfmt(
+		    session, buf, ",generation=%" PRIu32, chunk->generation));
+	}
+	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+
+	/* Obsolete chunks: a quoted URI, plus the Bloom URI if there is one. */
+	WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=["));
+	for (n = 0; n < lsm_tree->nold_chunks; n++) {
+		chunk = lsm_tree->old_chunks[n];
+		WT_ASSERT(session, chunk != NULL);
+		if (n != 0)
+			WT_ERR(__wt_buf_catfmt(session, buf, ","));
+		WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
+		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+			WT_ERR(__wt_buf_catfmt(
+			    session, buf, ",bloom=\"%s\"", chunk->bloom_uri));
+	}
+	WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+
+	/* Store the result; success and failure share the cleanup below. */
+	ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
+
+err:	__wt_scr_free(&buf);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
new file mode 100644
index 00000000000..dc7d17e7a2c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __lsm_stat_init --
+ * Initialize a LSM statistics structure.
+ *
+ * Looks up the LSM tree for the URI and, while holding the tree's read
+ * lock, opens a statistics cursor on each chunk (and on the chunk's Bloom
+ * filter when WT_LSM_CHUNK_BLOOM is set), folding the values into
+ * cst->u.dsrc_stats. Returns 0 or an error code; once the tree lookup has
+ * succeeded, every exit goes through the err label, which drops the lock
+ * (if taken), releases the tree and frees the scratch buffer.
+ */
+static int
+__lsm_stat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst)
+{
+ WT_CURSOR *stat_cursor;
+ WT_DECL_ITEM(uribuf);
+ WT_DECL_RET;
+ WT_DSRC_STATS *new, *stats;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int locked;
+ char config[64];
+ const char *cfg[] = {
+ WT_CONFIG_BASE(session, session_open_cursor), NULL, NULL };
+ const char *disk_cfg[] = {
+ WT_CONFIG_BASE(session, session_open_cursor),
+ "checkpoint=" WT_CHECKPOINT, NULL, NULL };
+
+ locked = 0;
+ WT_RET(__wt_lsm_tree_get(session, uri, 0, &lsm_tree));
+ WT_ERR(__wt_scr_alloc(session, 0, &uribuf));
+
+ /* Propagate all, fast and/or clear to the cursors we open. */
+ if (!F_ISSET(cst, WT_CONN_STAT_NONE)) {
+ (void)snprintf(config, sizeof(config),
+ "statistics=(%s%s%s)",
+ F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "",
+ F_ISSET(cst, WT_CONN_STAT_ALL) ? "all," : "",
+ !F_ISSET(cst, WT_CONN_STAT_ALL) &&
+ F_ISSET(cst, WT_CONN_STAT_FAST) ? "fast," : "");
+ /* Slot 1 is the first free entry in both configuration stacks. */
+ cfg[1] = disk_cfg[1] = config;
+ }
+
+ /*
+ * Set the cursor to reference the data source statistics; we don't
+ * initialize it, instead we copy (rather than aggregate), the first
+ * chunk's statistics, which has the same effect.
+ *
+ * NOTE(review): if the tree has no chunks the loop below never runs, so
+ * nothing is copied into *stats here -- confirm callers never see an
+ * empty tree.
+ */
+ stats = &cst->u.dsrc_stats;
+
+ /* Hold the LSM lock so that we can safely walk through the chunks. */
+ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+
+ /*
+ * For each chunk, aggregate its statistics, as well as any associated
+ * bloom filter statistics, into the total statistics.
+ */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+
+ /*
+ * Get the statistics for the chunk's underlying object.
+ *
+ * XXX kludge: we may have an empty chunk where no checkpoint
+ * was written. If so, try to open the ordinary handle on that
+ * chunk instead.
+ */
+ WT_ERR(__wt_buf_fmt(
+ session, uribuf, "statistics:%s", chunk->uri));
+ ret = __wt_curstat_open(session, uribuf->data,
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg,
+ &stat_cursor);
+ /* Retry without the checkpoint configuration for on-disk chunks. */
+ if (ret == WT_NOTFOUND &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ ret = __wt_curstat_open(
+ session, uribuf->data, cfg, &stat_cursor);
+ WT_ERR(ret);
+
+ /*
+ * The underlying statistics have now been initialized; fill in
+ * values from the chunk's information, then aggregate into the
+ * top-level.
+ */
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ WT_STAT_SET(new, lsm_generation_max, chunk->generation);
+
+ /*
+ * We want to aggregate the table's statistics. Get a base set
+ * of statistics from the first chunk, then aggregate statistics
+ * from each new chunk.
+ */
+ if (i == 0)
+ *stats = *new;
+ else
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ continue;
+
+ /*
+ * Maintain a count of bloom filters.
+ *
+ * NOTE(review): this bumps the tree-level counter on every call and
+ * it is only reset below when "clear" was requested, so the value
+ * appears to grow with each statistics read -- confirm intended.
+ */
+ WT_STAT_INCR(&lsm_tree->stats, bloom_count);
+
+ /* Get the bloom filter's underlying object. */
+ WT_ERR(__wt_buf_fmt(
+ session, uribuf, "statistics:%s", chunk->bloom_uri));
+ WT_ERR(__wt_curstat_open(
+ session, uribuf->data, cfg, &stat_cursor));
+
+ /*
+ * The underlying statistics have now been initialized; fill in
+ * values from the bloom filter's information, then aggregate
+ * into the top-level.
+ */
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ /* Bits per item times item count, divided by 8. */
+ WT_STAT_SET(new,
+ bloom_size, (chunk->count * lsm_tree->bloom_bit_count) / 8);
+ WT_STAT_SET(new, bloom_page_evict,
+ WT_STAT(new, cache_eviction_clean) +
+ WT_STAT(new, cache_eviction_dirty));
+ WT_STAT_SET(new, bloom_page_read, WT_STAT(new, cache_read));
+
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+ }
+
+ /* Set statistics that aren't aggregated directly into the cursor */
+ WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks);
+
+ /* Aggregate, and optionally clear, LSM-level specific information. */
+ __wt_stat_aggregate_dsrc_stats(&lsm_tree->stats, stats);
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ __wt_stat_refresh_dsrc_stats(&lsm_tree->stats);
+
+ __wt_curstat_dsrc_final(cst);
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+ __wt_lsm_tree_release(session, lsm_tree);
+ __wt_scr_free(&uribuf);
+
+ return (ret);
+}
+
+/*
+ * __wt_curstat_lsm_init --
+ * Initialize the statistics for a LSM tree.
+ *
+ * Runs __lsm_stat_init under WT_WITH_SCHEMA_LOCK and returns its result.
+ */
+int
+__wt_curstat_lsm_init(
+ WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst)
+{
+ WT_DECL_RET;
+
+ WT_WITH_SCHEMA_LOCK(session, ret = __lsm_stat_init(session, uri, cst));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
new file mode 100644
index 00000000000..447a8eb60a6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -0,0 +1,1266 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_tree_open_check(WT_SESSION_IMPL *, WT_LSM_TREE *);
+static int __lsm_tree_open(WT_SESSION_IMPL *, const char *, WT_LSM_TREE **);
+static int __lsm_tree_set_name(WT_SESSION_IMPL *, WT_LSM_TREE *, const char *);
+
+/*
+ * __lsm_tree_discard --
+ * Free an LSM tree structure.
+ *
+ * Unlinks the tree from the connection's list when it was published
+ * (WT_LSM_TREE_OPEN), then frees its strings, lock, chunk arrays and the
+ * structure itself. The caller's pointer is dangling afterwards. Returns
+ * the result of destroying the lock; nothing else here can fail.
+ */
+static int
+__lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ u_int i;
+
+ /* We may be destroying an lsm_tree before it was added. */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN))
+ TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
+
+ __wt_free(session, lsm_tree->name);
+ __wt_free(session, lsm_tree->config);
+ __wt_free(session, lsm_tree->key_format);
+ __wt_free(session, lsm_tree->value_format);
+ __wt_free(session, lsm_tree->collator_name);
+ __wt_free(session, lsm_tree->bloom_config);
+ __wt_free(session, lsm_tree->file_config);
+
+ WT_TRET(__wt_rwlock_destroy(session, &lsm_tree->rwlock));
+
+ /* Active chunks: empty slots are tolerated and skipped. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ if ((chunk = lsm_tree->chunk[i]) == NULL)
+ continue;
+
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+ __wt_free(session, lsm_tree->chunk);
+
+ /* Obsolete chunks: every slot is asserted to be populated. */
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ chunk = lsm_tree->old_chunks[i];
+ WT_ASSERT(session, chunk != NULL);
+
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+ __wt_free(session, lsm_tree->old_chunks);
+ __wt_free(session, lsm_tree);
+
+ return (ret);
+}
+
+/*
+ * __lsm_tree_close --
+ * Close an LSM tree structure.
+ *
+ * Clears WT_LSM_TREE_ACTIVE, then spins (yielding) until the caller's is
+ * the only handle reference left (refcnt <= 1) and no queued work units
+ * reference the tree (queue_ref == 0). Returns 0, or the error from
+ * clearing the manager queues. Nothing is freed here.
+ */
+static int
+__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ int i;
+
+ /* Stop any active merges. */
+ F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE);
+
+ /*
+ * Wait for all LSM operations and work units that were in flight to
+ * finish.
+ */
+ for (i = 0; lsm_tree->refcnt > 1 || lsm_tree->queue_ref > 0; ++i) {
+ /*
+ * Remove any work units from the manager queues. Do this step
+ * repeatedly in case a work unit was in the process of being
+ * created when we cleared the active flag.
+ * !! Drop the schema lock whilst completing this step so that
+ * we don't block any operations that require the schema
+ * lock to complete. This is safe because any operation that
+ * is closing the tree should first have gotten exclusive
+ * access to the LSM tree via __wt_lsm_tree_get, so other
+ * schema level operations will return EBUSY, even though
+ * we're dropping the schema lock here.
+ */
+ /* Only on the first pass and every 1000th pass after that. */
+ if (i % 1000 == 0) {
+ WT_WITHOUT_SCHEMA_LOCK(session, ret =
+ __wt_lsm_manager_clear_tree(session, lsm_tree));
+ WT_RET(ret);
+ }
+ __wt_yield();
+ }
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_close_all --
+ * Close all LSM tree structures.
+ *
+ * Repeatedly takes the head of the connection's LSM tree list, closes it
+ * and discards it until the list is empty. Errors are accumulated with
+ * WT_TRET and the loop keeps going.
+ */
+int
+__wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ while ((lsm_tree = TAILQ_FIRST(&S2C(session)->lsmqh)) != NULL) {
+ /*
+ * Tree close assumes that we have a reference to the tree
+ * so it can tell when it's safe to do the close. We could
+ * go through tree get here, but short circuit instead. There
+ * is no need to decrement the reference count since destroy
+ * is unconditional.
+ */
+ (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+ WT_TRET(__lsm_tree_close(session, lsm_tree));
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+
+ return (ret);
+}
+
+/*
+ * __lsm_tree_set_name --
+ * Set or reset the name of an LSM tree
+ *
+ * Replaces lsm_tree->name with a copy of uri and points
+ * lsm_tree->filename into that copy, strlen("lsm:") bytes in; filename
+ * is therefore not separately allocated. Assumes uri starts with "lsm:"
+ * (not checked here -- presumably validated by the caller).
+ */
+static int
+__lsm_tree_set_name(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, const char *uri)
+{
+ if (lsm_tree->name != NULL)
+ __wt_free(session, lsm_tree->name);
+ WT_RET(__wt_strdup(session, uri, &lsm_tree->name));
+ lsm_tree->filename = lsm_tree->name + strlen("lsm:");
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_bloom_name --
+ * Build the Bloom filter URI for a chunk id.
+ *
+ * The URI has the form "file:<tree filename>-<6-digit id>.bf". On
+ * success *retp receives an allocated copy of the string.
+ */
+int
+__wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
+{
+ WT_DECL_RET;
+ WT_DECL_ITEM(uribuf);
+
+ WT_RET(__wt_scr_alloc(session, 0, &uribuf));
+
+ /* Format into the scratch buffer, then copy out only if that worked. */
+ if ((ret = __wt_buf_fmt(session, uribuf,
+ "file:%s-%06" PRIu32 ".bf", lsm_tree->filename, id)) == 0)
+ ret = __wt_strndup(session, uribuf->data, uribuf->size, retp);
+
+ __wt_scr_free(&uribuf);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_chunk_name --
+ * Build the data file URI for a chunk id.
+ *
+ * The URI has the form "file:<tree filename>-<6-digit id>.lsm". On
+ * success *retp receives an allocated copy of the string.
+ */
+int
+__wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
+{
+ WT_DECL_RET;
+ WT_DECL_ITEM(uribuf);
+
+ WT_RET(__wt_scr_alloc(session, 0, &uribuf));
+
+ /* Format into the scratch buffer, then copy out only if that worked. */
+ if ((ret = __wt_buf_fmt(session, uribuf,
+ "file:%s-%06" PRIu32 ".lsm", lsm_tree->filename, id)) == 0)
+ ret = __wt_strndup(session, uribuf->data, uribuf->size, retp);
+
+ __wt_scr_free(&uribuf);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_set_chunk_size --
+ * Record the size of a chunk's file in chunk->size. Intended for
+ * chunks that are on disk, or about to be.
+ *
+ * Fails with EINVAL if the chunk URI does not carry the "file:" prefix,
+ * or with whatever the file size lookup returns.
+ */
+int
+__wt_lsm_tree_set_chunk_size(
+ WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk)
+{
+ const char *path;
+ wt_off_t nbytes;
+
+ /* Step over the "file:" scheme; any other URI is rejected. */
+ path = chunk->uri;
+ if (!WT_PREFIX_SKIP(path, "file:"))
+ WT_RET_MSG(session, EINVAL,
+ "Expected a 'file:' URI: %s", chunk->uri);
+
+ WT_RET(__wt_filesize_name(session, path, &nbytes));
+ chunk->size = (uint64_t)nbytes;
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_setup_chunk --
+ * Initialize a chunk of an LSM tree.
+ *
+ * Stamps the chunk's creation time, allocates its URI from chunk->id
+ * and creates the underlying object with the tree's file_config. On
+ * failure chunk->uri may already be allocated; the caller owns it.
+ */
+int
+__wt_lsm_tree_setup_chunk(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
+{
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_drop), "force", NULL };
+ int exists;
+
+ WT_RET(__wt_epoch(session, &chunk->create_ts));
+
+ WT_RET(__wt_lsm_tree_chunk_name(
+ session, lsm_tree, chunk->id, &chunk->uri));
+
+ /*
+ * If the underlying file exists, drop the chunk first - there may be
+ * some content hanging over from an aborted merge or checkpoint.
+ *
+ * Don't do this for the very first chunk: we are called during
+ * WT_SESSION::create, and doing a drop inside there does interesting
+ * things with handle locks and metadata tracking. It can never have
+ * been the result of an interrupted merge, anyway.
+ */
+ if (chunk->id > 1) {
+ /* The existence check takes the name without the "file:" prefix. */
+ WT_RET(__wt_exist(
+ session, chunk->uri + strlen("file:"), &exists));
+ if (exists)
+ WT_RET(__wt_schema_drop(session, chunk->uri, cfg));
+ }
+ return (__wt_schema_create(session, chunk->uri, lsm_tree->file_config));
+}
+
+/*
+ * __wt_lsm_tree_create --
+ * Create an LSM tree structure for the given name.
+ *
+ * If the tree is already open, or already has metadata, returns EEXIST
+ * when exclusive is set and 0 otherwise. Otherwise builds a temporary
+ * tree structure from the configuration, writes its metadata, discards
+ * the temporary structure and opens the tree through __lsm_tree_open so
+ * it lands in the handle cache. Returns 0 or an error code.
+ */
+int
+__wt_lsm_tree_create(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_create), config, NULL };
+ const char *tmpconfig;
+
+ /* If the tree is open, it already exists. */
+ if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ /*
+ * If the tree has metadata, it already exists.
+ *
+ * !!!
+ * Use a local variable: we don't care what the existing configuration
+ * is, but we don't want to overwrite the real config.
+ *
+ * Capture the search result in ret: without the assignment the check
+ * that follows re-tests the stale value from the tree lookup above and
+ * any real metadata error is silently ignored.
+ */
+ if ((ret = __wt_metadata_search(session, uri, &tmpconfig)) == 0) {
+ __wt_free(session, tmpconfig);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
+ if (WT_STRING_MATCH("r", cval.str, cval.len))
+ WT_RET_MSG(session, EINVAL,
+ "LSM trees cannot be configured as column stores");
+
+ /* From here on, errors must go through err to discard the tree. */
+ WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
+
+ WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->key_format));
+ WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->value_format));
+
+ WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->collator_name));
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
+ if (cval.val)
+ F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
+ else
+ F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
+ FLD_SET(lsm_tree->bloom,
+ (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
+ if (cval.val != 0)
+ FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);
+
+ if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
+ FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
+ WT_ERR_MSG(session, EINVAL,
+ "Bloom filters can only be created on newest and oldest "
+ "chunks if bloom filters are enabled");
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
+ /* For a structured value, drop the first and last character. */
+ if (cval.type == WT_CONFIG_ITEM_STRUCT) {
+ cval.str++;
+ cval.len -= 2;
+ }
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->bloom_config));
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
+ lsm_tree->bloom_bit_count = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
+ lsm_tree->bloom_hash_count = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
+ lsm_tree->chunk_max = (uint64_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
+ lsm_tree->chunk_size = (uint64_t)cval.val;
+ if (lsm_tree->chunk_size > lsm_tree->chunk_max)
+ WT_ERR_MSG(session, EINVAL,
+ "Chunk size (chunk_size) must be smaller than or equal to "
+ "the maximum chunk size (chunk_max)");
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
+ lsm_tree->merge_max = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval));
+ lsm_tree->merge_min = (uint32_t)cval.val;
+ if (lsm_tree->merge_min > lsm_tree->merge_max)
+ WT_ERR_MSG(session, EINVAL,
+ "LSM merge_min must be less than or equal to merge_max");
+
+ /*
+ * Set up the config for each chunk.
+ *
+ * Make the memory_page_max double the chunk size, so application
+ * threads don't immediately try to force evict the chunk when the
+ * worker thread clears the NO_EVICTION flag.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
+ config, 2 * lsm_tree->chunk_max));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &lsm_tree->file_config));
+
+ /* Create the first chunk and flush the metadata. */
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+
+ /* Discard our partially populated handle. */
+ ret = __lsm_tree_discard(session, lsm_tree);
+ lsm_tree = NULL;
+
+ /*
+ * Open our new tree and add it to the handle cache. Don't discard on
+ * error: the returned handle is NULL on error, and the metadata
+ * tracking macros handle cleaning up on failure.
+ */
+ if (ret == 0)
+ ret = __lsm_tree_open(session, uri, &lsm_tree);
+ if (ret == 0)
+ __wt_lsm_tree_release(session, lsm_tree);
+
+ /* The err label is only reachable by goto from the WT_ERR macros. */
+ if (0) {
+err: WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __lsm_tree_open_check --
+ * Validate the configuration of an LSM tree.
+ *
+ * Fails with EINVAL when the connection's cache is smaller than
+ * 3 * chunk_size + 3 * merge_max * leaf_page_max.
+ */
+static int
+__lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_CONFIG_ITEM cval;
+ uint64_t maxleafpage, required;
+ const char *cfg[] = { WT_CONFIG_BASE(
+ session, session_create), lsm_tree->file_config, NULL };
+
+ WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval));
+ maxleafpage = (uint64_t)cval.val;
+
+ /*
+ * Three chunks, plus one page for each participant in up to three
+ * concurrent merges.
+ */
+ required = 3 * lsm_tree->chunk_size +
+ 3 * (lsm_tree->merge_max * maxleafpage);
+ if (S2C(session)->cache_size < required)
+ WT_RET_MSG(session, EINVAL,
+ "LSM cache size %" PRIu64 " (%" PRIu64 "MB) too small, "
+ "must be at least %" PRIu64 " (%" PRIu64 "MB)",
+ S2C(session)->cache_size,
+ S2C(session)->cache_size / WT_MEGABYTE,
+ required, required / WT_MEGABYTE);
+ return (0);
+}
+
+/*
+ * __lsm_tree_open --
+ * Open an LSM tree structure.
+ *
+ * Requires the schema lock (asserted). Allocates a tree, loads its
+ * metadata, validates it against the cache size, creates the first chunk
+ * if there is none, and publishes the tree on the connection's list with
+ * a reference count of one. On failure the partially built tree is
+ * discarded and an error is returned.
+ */
+static int
+__lsm_tree_open(
+ WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ conn = S2C(session);
+ lsm_tree = NULL;
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ /*
+ * Start the LSM manager thread if it isn't running.
+ *
+ * NOTE(review): if the start fails the worker count has already been
+ * swapped to 1 here, so a later open would not retry -- confirm the
+ * manager resets it on failure.
+ */
+ if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1))
+ WT_RET(__wt_lsm_manager_start(session));
+
+ /*
+ * Make sure no one beat us to it.
+ *
+ * NOTE(review): this path returns the existing tree without taking a
+ * reference, unlike the path below which sets refcnt to 1 -- confirm
+ * callers that release the handle afterwards cannot reach it.
+ */
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
+ if (strcmp(uri, lsm_tree->name) == 0) {
+ *treep = lsm_tree;
+ return (0);
+ }
+
+ /* Try to open the tree. */
+ WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+ WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree"));
+
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
+
+ WT_ERR(__wt_lsm_meta_read(session, lsm_tree));
+
+ /*
+ * Sanity check the configuration. Do it now since this is the first
+ * time we have the LSM tree configuration.
+ */
+ WT_ERR(__lsm_tree_open_check(session, lsm_tree));
+
+ /* A tree with no chunks gets its first one via a switch. */
+ if (lsm_tree->nchunks == 0) {
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
+ }
+
+ /* Set the generation number so cursors are opened on first usage. */
+ lsm_tree->dsk_gen = 1;
+
+ /*
+ * Setup reference counting. Use separate reference counts for tree
+ * handles and queue entries, so that queue entries don't interfere
+ * with getting handles exclusive.
+ */
+ lsm_tree->refcnt = 1;
+ lsm_tree->queue_ref = 0;
+
+ /* Set a flush timestamp as a baseline. */
+ WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));
+
+ /* Now the tree is setup, make it visible to others. */
+ TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
+ F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN);
+
+ *treep = lsm_tree;
+
+ /* The err label is only reachable by goto from the WT_ERR macros. */
+ if (0) {
+err: WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_get --
+ * Get an LSM tree structure for the given name. Optionally get exclusive
+ * access to the handle. Exclusive access works separately to the LSM
+ * tree lock - since operations that need exclusive access may also need
+ * to take the LSM tree lock for example outstanding work unit operations.
+ *
+ * Returns 0 with *treep set and a reference held, EBUSY when the handle
+ * is held exclusively (or exclusive access was requested while other
+ * references exist), or the result of opening the tree when it is not
+ * yet on the connection's list.
+ */
+int
+__wt_lsm_tree_get(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, WT_LSM_TREE **treep)
+{
+ WT_LSM_TREE *lsm_tree;
+
+ /* See if the tree is already open. */
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
+ if (strcmp(uri, lsm_tree->name) == 0) {
+ /*
+ * Short circuit if the handle is already held
+ * exclusively or exclusive access is requested and
+ * there are references held.
+ */
+ if ((exclusive && lsm_tree->refcnt > 0) ||
+ F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE))
+ return (EBUSY);
+
+ if (exclusive) {
+ F_SET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
+ if (!WT_ATOMIC_CAS4(lsm_tree->refcnt, 0, 1)) {
+ /*
+ * Lost the race for the only reference: undo
+ * the flag with the atomic variant, matching
+ * how it is set, tested and released.
+ */
+ F_CLR_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
+ return (EBUSY);
+ }
+ } else
+ (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+
+ /*
+ * If we got a reference, but an exclusive reference
+ * beat us to it, give our reference up.
+ */
+ if (!exclusive &&
+ F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE)) {
+ (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
+ return (EBUSY);
+ }
+ *treep = lsm_tree;
+ return (0);
+ }
+
+ /* Open a new tree. */
+ return (__lsm_tree_open(session, uri, treep));
+}
+
+/*
+ * __wt_lsm_tree_release --
+ * Release an LSM tree structure.
+ *
+ * Drops one handle reference and unconditionally clears the exclusive
+ * flag, so the same call releases both shared and exclusive handles.
+ */
+void
+__wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_ASSERT(session, lsm_tree->refcnt > 0);
+ (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
+ F_CLR_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
+}
+
+/*
+ * Throttle tuning helpers. The first two macros expand to expressions
+ * that read a variable named lsm_tree from the scope where they are used.
+ */
+/* How aggressively to ramp up or down throttle due to level 0 merging */
+#define WT_LSM_MERGE_THROTTLE_BUMP_PCT (100 / lsm_tree->merge_max)
+/* Number of level 0 chunks that need to be present to throttle inserts */
+#define WT_LSM_MERGE_THROTTLE_THRESHOLD \
+ (2 * lsm_tree->merge_min)
+/* Minimal throttling time */
+#define WT_LSM_THROTTLE_START 20
+
+/* Grow val by the bump percentage, never leaving it below the minimum. */
+#define WT_LSM_MERGE_THROTTLE_INCREASE(val) do { \
+ (val) += ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
+ if ((val) < WT_LSM_THROTTLE_START) \
+ (val) = WT_LSM_THROTTLE_START; \
+ } while (0)
+
+/* Shrink val by the bump percentage; below the minimum it drops to zero. */
+#define WT_LSM_MERGE_THROTTLE_DECREASE(val) do { \
+ (val) -= ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
+ if ((val) < WT_LSM_THROTTLE_START) \
+ (val) = 0; \
+ } while (0)
+
+/*
+ * __wt_lsm_tree_throttle --
+ * Calculate whether LSM updates need to be throttled. Must be called
+ * with the LSM tree lock held.
+ *
+ * Updates lsm_tree->ckpt_throttle and lsm_tree->merge_throttle (both
+ * capped at 1000000) and, when there is enough history, the smoothed
+ * lsm_tree->chunk_fill_ms. With decrease_only set, neither throttle is
+ * raised by this call.
+ */
+void
+__wt_lsm_tree_throttle(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only)
+{
+ WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
+ uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
+ uint32_t in_memory, gen0_chunks;
+
+ /* Never throttle in small trees. */
+ if (lsm_tree->nchunks < 3) {
+ lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
+ return;
+ }
+
+ cache_sz = S2C(session)->cache_size;
+
+ /*
+ * In the steady state, we expect that the checkpoint worker thread
+ * will keep up with inserts. If not, throttle the insert rate to
+ * avoid filling the cache with in-memory chunks. Threads sleep every
+ * 100 operations, so take that into account in the calculation.
+ *
+ * Also throttle based on whether merge threads are keeping up. If
+ * there are enough chunks that have never been merged we slow down
+ * inserts so that merges have some chance of keeping up.
+ *
+ * Count the number of in-memory chunks, the number of unmerged chunk
+ * on disk, and find the most recent on-disk chunk (if any).
+ */
+ /* Starting at one also keeps the divisor used below non-zero. */
+ record_count = 1;
+ gen0_chunks = in_memory = 0;
+ ondisk = NULL;
+ /* Walk the chunk array from the newest entry back to the oldest. */
+ for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
+ cp >= lsm_tree->chunk;
+ --cp)
+ if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
+ record_count += (*cp)->count;
+ ++in_memory;
+ } else {
+ /*
+ * Assign ondisk to the last chunk that has been
+ * flushed since the tree was last opened (i.e it's on
+ * disk and stable is not set).
+ */
+ if (ondisk == NULL &&
+ ((*cp)->generation == 0 &&
+ !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
+ ondisk = *cp;
+
+ if ((*cp)->generation == 0 &&
+ !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
+ ++gen0_chunks;
+ }
+
+ last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
+
+ /* Checkpoint throttling, based on the number of in-memory chunks. */
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
+ lsm_tree->ckpt_throttle = 0;
+ else if (decrease_only)
+ ; /* Nothing to do */
+ else if (ondisk == NULL) {
+ /*
+ * No checkpoint has completed this run. Keep slowing down
+ * inserts until one does.
+ */
+ lsm_tree->ckpt_throttle =
+ WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
+ } else {
+ WT_ASSERT(session,
+ WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0);
+ timediff =
+ WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
+ lsm_tree->ckpt_throttle =
+ (long)((in_memory - 2) * timediff / (20 * record_count));
+
+ /*
+ * Get more aggressive as the number of in memory chunks
+ * consumes a large proportion of the cache. In memory chunks
+ * are allowed to grow up to twice as large as the configured
+ * value when checkpoints aren't keeping up. That worst case
+ * is when this calculation is relevant.
+ * There is nothing particularly special about the chosen
+ * multipliers.
+ */
+ cache_used = in_memory * lsm_tree->chunk_size * 2;
+ if (cache_used > cache_sz * 0.8)
+ lsm_tree->ckpt_throttle *= 5;
+ }
+
+ /*
+ * Merge throttling, based on the number of on-disk, level 0 chunks.
+ *
+ * Don't throttle if the tree has less than a single level's number
+ * of chunks.
+ */
+ if (lsm_tree->nchunks < lsm_tree->merge_max)
+ lsm_tree->merge_throttle = 0;
+ else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
+ WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle);
+ else if (!decrease_only)
+ WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle);
+
+ /* Put an upper bound of 1s on both throttle calculations. */
+ lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
+ lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);
+
+ /*
+ * Update our estimate of how long each in-memory chunk stays active.
+ * Filter out some noise by keeping a weighted history of the
+ * calculated value. Wait until we have enough chunks that we can
+ * check that the new value is sane: otherwise, after a long idle
+ * period, we can calculate a crazy value.
+ */
+ if (in_memory > 1 && ondisk != NULL) {
+ /* Safe to index: trees with fewer than 3 chunks returned above. */
+ prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
+ WT_ASSERT(session, prev_chunk->generation == 0);
+ WT_ASSERT(session, WT_TIMECMP(
+ last_chunk->create_ts, prev_chunk->create_ts) >= 0);
+ timediff =
+ WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
+ WT_ASSERT(session,
+ WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0);
+ oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
+ /* Weight the history 3:1 against the new sample. */
+ if (timediff < 10 * oldtime)
+ lsm_tree->chunk_fill_ms =
+ (3 * lsm_tree->chunk_fill_ms +
+ timediff / 1000000) / 4;
+ }
+}
+
+/*
+ * __wt_lsm_tree_switch --
+ * Switch to a new in-memory tree.
+ *
+ * Under the tree's write lock: allocates the next chunk id, appends a new
+ * chunk to the chunk array, creates its underlying object and rewrites the
+ * tree metadata. Any failure is escalated with WT_PANIC_RET. On success,
+ * unless this was the tree's first chunk, a flush work unit is queued.
+ */
+int
+__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ uint32_t nchunks, new_id;
+ int first_switch;
+
+ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+
+ nchunks = lsm_tree->nchunks;
+
+ first_switch = nchunks == 0 ? 1 : 0;
+ /*
+ * Check if a switch is still needed: we may have raced while waiting
+ * for a lock.
+ *
+ * Note the early exit leaves ret at zero, so the flush work unit
+ * below is still queued even though no switch happened.
+ */
+ chunk = NULL;
+ if (!first_switch &&
+ (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
+ !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
+ goto err;
+
+ /* Set the switch transaction in the previous chunk, if necessary. */
+ if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE)
+ chunk->switch_txn = __wt_txn_new_id(session);
+
+ /* Update the throttle time. */
+ __wt_lsm_tree_throttle(session, lsm_tree, 0);
+
+ new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+
+ WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
+ nchunks + 1, &lsm_tree->chunk));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Tree %s switch to: %" PRIu32 ", checkpoint throttle %ld, "
+ "merge throttle %ld", lsm_tree->name,
+ new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle));
+
+ /* The new chunk joins the array before its file is created. */
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ chunk->id = new_id;
+ chunk->switch_txn = WT_TXN_NONE;
+ lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
+ WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+ F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ ++lsm_tree->dsk_gen;
+
+ lsm_tree->modified = 1;
+
+err: WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ /*
+ * Errors that happen during a tree switch leave the tree in a state
+ * where we can't make progress. Error out of WiredTiger.
+ */
+ if (ret != 0)
+ WT_PANIC_RET(session, ret, "Failed doing LSM switch");
+ else if (!first_switch)
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_drop --
+ * Drop an LSM tree.
+ *
+ * Takes the tree exclusively, waits for in-flight work to drain, drops
+ * every active and obsolete chunk (and Bloom filter) and removes the
+ * tree's metadata entry. The in-memory tree structure is discarded
+ * whether or not the drop succeeded.
+ */
+int
+__wt_lsm_tree_drop(
+ WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int locked;
+
+ locked = 0;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_ERR(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Drop the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(
+ __wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+
+ /* Drop any chunks on the obsolete list. */
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ if ((chunk = lsm_tree->old_chunks[i]) == NULL)
+ continue;
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(
+ __wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+
+ /* Clear the flag first so the err path does not unlock twice. */
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ ret = __wt_metadata_remove(session, name);
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_rename --
+ * Rename an LSM tree.
+ *
+ * Takes the tree exclusively, waits for in-flight work to drain, then
+ * renames every chunk file and Bloom filter to names derived from
+ * newuri, writes metadata under the new name and removes the metadata
+ * for olduri. The in-memory tree structure is always discarded; the
+ * next operation on the renamed tree re-opens it.
+ */
+int
+__wt_lsm_tree_rename(WT_SESSION_IMPL *session,
+ const char *olduri, const char *newuri, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ const char *old;
+ u_int i;
+ int locked;
+
+ old = NULL;
+ locked = 0;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, olduri, 1, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_ERR(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Set the new name. */
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));
+
+ /* Rename the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ /* Keep the old URI in "old" so the err path can free it. */
+ old = chunk->uri;
+ chunk->uri = NULL;
+
+ WT_ERR(__wt_lsm_tree_chunk_name(
+ session, lsm_tree, chunk->id, &chunk->uri));
+ WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
+ __wt_free(session, old);
+
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+ old = chunk->bloom_uri;
+ chunk->bloom_uri = NULL;
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, chunk->id, &chunk->bloom_uri));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ /*
+ * The Bloom filter must move to its own new name, not
+ * to the chunk file's name.
+ */
+ WT_ERR(__wt_schema_rename(
+ session, old, chunk->bloom_uri, cfg));
+ __wt_free(session, old);
+ }
+ }
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+ /* Clear the flag first so the err path does not unlock twice. */
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ WT_ERR(__wt_metadata_remove(session, olduri));
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ if (old != NULL)
+ __wt_free(session, old);
+ /*
+ * Discard this LSM tree structure. The first operation on the renamed
+ * tree will create a new one.
+ */
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_truncate --
+ * Truncate an LSM tree.
+ *
+ * Takes the tree exclusively, waits for in-flight work to drain, creates
+ * one fresh chunk and makes it replace all existing chunks, then
+ * rewrites the metadata. On success the tree handle is released; on
+ * failure the new chunk is dropped and the in-memory tree is discarded.
+ * The cfg argument is unused.
+ */
+int
+__wt_lsm_tree_truncate(
+ WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ int locked;
+
+ WT_UNUSED(cfg);
+ chunk = NULL;
+ locked = 0;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_ERR(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Create the new chunk. */
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ chunk->id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+ WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+
+ /* Mark all chunks old. */
+ WT_ERR(__wt_lsm_merge_update_tree(
+ session, lsm_tree, 0, lsm_tree->nchunks, chunk));
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+
+ /* Clear the flag first so the err path does not unlock twice. */
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ __wt_lsm_tree_release(session, lsm_tree);
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ if (ret != 0) {
+ /*
+ * NOTE(review): if the failure came after the tree update above
+ * succeeded, the new chunk presumably already belongs to the tree
+ * and the discard below would free it a second time -- confirm
+ * against __wt_lsm_merge_update_tree.
+ */
+ if (chunk != NULL) {
+ /* The URI is unset if chunk setup failed before naming it. */
+ if (chunk->uri != NULL)
+ (void)__wt_schema_drop(session, chunk->uri, NULL);
+ /* Free the URI too, it is a separate allocation. */
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+ /*
+ * Discard the LSM tree structure on error. This will force the
+ * LSM tree to be re-opened the next time it is accessed and
+ * the last good version of the metadata will be used, resulting
+ * in a valid (not truncated) tree.
+ */
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_readlock --
+ *	Take the LSM tree's read/write lock in shared mode.
+ *
+ *	Returns 0 on success or the error from __wt_readlock, in which case
+ *	the session flags are left untouched.
+ */
+int
+__wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_DECL_RET;
+
+	if ((ret = __wt_readlock(session, lsm_tree->rwlock)) != 0)
+		return (ret);
+
+	/*
+	 * Diagnostic: while the tree lock is held the session must not go
+	 * looking for the schema lock (if an operation needs it, we should
+	 * already hold it), so flag the session accordingly.
+	 */
+	F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+	F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);
+	return (0);
+}
+
+/*
+ * __wt_lsm_tree_readunlock --
+ * Release a shared lock on an LSM tree.
+ *
+ * Clears the WT_SESSION_NO_CACHE_CHECK and WT_SESSION_NO_SCHEMA_LOCK
+ * session flags and then releases lsm_tree->rwlock. A failed unlock is
+ * handed to WT_PANIC_RET; otherwise 0 is returned.
+ */
+int
+__wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+
+ F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+
+ if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0)
+ WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_writelock --
+ *	Take the LSM tree's read/write lock in exclusive mode.
+ *
+ *	Returns 0 on success or the error from __wt_writelock, in which case
+ *	the session flags are left untouched.
+ */
+int
+__wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_DECL_RET;
+
+	if ((ret = __wt_writelock(session, lsm_tree->rwlock)) != 0)
+		return (ret);
+
+	/*
+	 * Diagnostic: while the tree lock is held the session must not go
+	 * looking for the schema lock (if an operation needs it, we should
+	 * already hold it), so flag the session accordingly.
+	 */
+	F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+	F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);
+	return (0);
+}
+
+/*
+ * __wt_lsm_tree_writeunlock --
+ * Release an exclusive lock on an LSM tree.
+ *
+ * Clears the WT_SESSION_NO_CACHE_CHECK and WT_SESSION_NO_SCHEMA_LOCK
+ * session flags and then releases lsm_tree->rwlock. A failed unlock is
+ * handed to WT_PANIC_RET; otherwise 0 is returned.
+ */
+int
+__wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+
+ F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+
+ if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0)
+ WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
+ return (0);
+}
+
+/*
+ * __wt_lsm_compact --
+ * Compact an LSM tree called via __wt_schema_worker.
+ *
+ * name: the URI being visited. Anything without the "lsm:" prefix is
+ * ignored: 0 is returned and *skip is not written.
+ * skip: set to 1 for LSM URIs.
+ *
+ * Fails with EINVAL if the connection does not have WT_CONN_LSM_MERGE set
+ * and with ETIMEDOUT once the elapsed time (measured with __wt_seconds)
+ * exceeds a non-zero session->compact->max_time. The tree reference taken
+ * by __wt_lsm_tree_get is released before returning.
+ */
+int
+__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ time_t begin, end;
+ uint64_t progress;
+ int i, compacting, flushing, locked, ref;
+
+ compacting = flushing = locked = ref = 0;
+ chunk = NULL;
+ /*
+ * This function is applied to all matching sources: ignore anything
+ * that is not an LSM tree.
+ */
+ if (!WT_PREFIX_MATCH(name, "lsm:"))
+ return (0);
+
+ /* Tell __wt_schema_worker not to look inside the LSM tree. */
+ *skip = 1;
+
+ WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));
+
+ if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
+ WT_ERR_MSG(session, EINVAL,
+ "LSM compaction requires active merge threads");
+
+ WT_ERR(__wt_seconds(session, &begin));
+
+ /*
+ * Compacting has two distinct phases.
+ * 1. All in-memory chunks up to and including the current
+ * chunk must be flushed. Normally, the flush code
+ * does not flush the last, in-use chunk, so we set a force
+ * flag to include that last chunk. We monitor the state of the
+ * last chunk and periodically push another forced flush work
+ * unit until it is complete.
+ * 2. After all flushing is done, we move onto the merging
+ * phase for compaction. Again, we monitor the state and
+ * continue to push merge work units until all merging is done.
+ */
+
+ /* Lock the tree: single-thread compaction. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Clear any merge throttle: compact throws out that calculation. */
+ lsm_tree->merge_throttle = 0;
+ lsm_tree->merge_aggressiveness = 0;
+ progress = lsm_tree->merge_progressing;
+
+ /* If another thread started a compact on this tree, we're done. */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
+ goto err;
+
+ /*
+ * Set the switch transaction on the current chunk, if it
+ * hasn't been set before. This prevents further writes, so it
+ * can be flushed by the checkpoint worker.
+ */
+ if (lsm_tree->nchunks > 0 &&
+ (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
+ if (chunk->switch_txn == WT_TXN_NONE)
+ chunk->switch_txn = __wt_txn_new_id(session);
+ /*
+ * If we have a chunk, we want to look for it to be on-disk.
+ * So we need to add a reference to keep it available.
+ */
+ (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
+ ref = 1;
+ }
+
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (chunk != NULL) {
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Compact force flush %s flags 0x%" PRIx32
+ " chunk %u flags 0x%"
+ PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags));
+ flushing = 1;
+ /*
+ * Make sure the in-memory chunk gets flushed, but do not push
+ * a switch, because we don't want to create a new in-memory
+ * chunk if the tree is being used read-only now.
+ */
+ WT_ERR(__wt_lsm_manager_push_entry(session,
+ WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
+ } else {
+ /*
+ * If there is no chunk to flush, go straight to the
+ * compacting state.
+ */
+ compacting = 1;
+ progress = lsm_tree->merge_progressing;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "COMPACT: Start compacting %s", lsm_tree->name));
+ }
+
+ /* Wait for the work unit queues to drain. */
+ while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
+ /*
+ * The flush flag is cleared when the chunk has been flushed.
+ * Continue to push forced flushes until the chunk is on disk.
+ * Once it is on disk move to the compacting phase.
+ */
+ if (flushing) {
+ WT_ASSERT(session, chunk != NULL);
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ WT_ERR(__wt_verbose(session,
+ WT_VERB_LSM,
+ "Compact flush done %s chunk %u. "
+ "Start compacting progress %" PRIu64,
+ name, chunk->id,
+ lsm_tree->merge_progressing));
+ (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ flushing = ref = 0;
+ compacting = 1;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ progress = lsm_tree->merge_progressing;
+ } else {
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Compact flush retry %s chunk %u",
+ name, chunk->id));
+ WT_ERR(__wt_lsm_manager_push_entry(session,
+ WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
+ lsm_tree));
+ }
+ }
+
+ /*
+ * The compacting flag is cleared when no merges can be done.
+ * Ensure that we push through some aggressive merges before
+ * stopping otherwise we might not do merges that would
+ * span chunks with different generations.
+ */
+ if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
+ if (lsm_tree->merge_aggressiveness < 10 ||
+ (progress < lsm_tree->merge_progressing) ||
+ lsm_tree->merge_syncing) {
+ progress = lsm_tree->merge_progressing;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ lsm_tree->merge_aggressiveness = 10;
+ } else
+ break;
+ }
+ __wt_sleep(1, 0);
+ WT_ERR(__wt_seconds(session, &end));
+ if (session->compact->max_time > 0 &&
+ session->compact->max_time < (uint64_t)(end - begin)) {
+ WT_ERR(ETIMEDOUT);
+ }
+ /*
+ * Push merge operations while they are still getting work
+ * done. If we are pushing merges, make sure they are
+ * aggressive, to avoid duplicating effort.
+ */
+ if (compacting)
+#define COMPACT_PARALLEL_MERGES 5
+ for (i = lsm_tree->queue_ref;
+ i < COMPACT_PARALLEL_MERGES; i++) {
+ lsm_tree->merge_aggressiveness = 10;
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+ }
+ }
+err:
+ /* Ensure anything we set is cleared. */
+ if (ref)
+ (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ if (compacting) {
+ F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
+ lsm_tree->merge_aggressiveness = 0;
+ }
+ if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+ "Compact %s complete, return %d", name, ret));
+
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (ret);
+
+}
+
+/*
+ * __wt_lsm_tree_worker --
+ * Run a schema worker operation on each level of a LSM tree.
+ *
+ * file_func, name_func, cfg and open_flags are passed through unchanged
+ * to __wt_schema_worker for every chunk URI. The tree is write-locked
+ * when open_flags contains WT_DHANDLE_EXCLUSIVE and read-locked otherwise.
+ * Chunks that are already on disk are skipped when file_func is
+ * __wt_checkpoint; Bloom filter URIs are only visited when name_func is
+ * __wt_backup_list_uri_append.
+ *
+ * Returns 0 or an error code; the lock and the tree reference are
+ * released on every path after the tree has been obtained.
+ */
+int
+__wt_lsm_tree_worker(WT_SESSION_IMPL *session,
+ const char *uri,
+ int (*file_func)(WT_SESSION_IMPL *, const char *[]),
+ int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
+ const char *cfg[], uint32_t open_flags)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int exclusive, locked;
+
+ locked = 0;
+ exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0;
+ WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
+
+ /*
+ * We mark that we're busy using the tree to coordinate
+ * with merges so that merging doesn't change the chunk
+ * array out from underneath us.
+ */
+ WT_ERR(exclusive ?
+ __wt_lsm_tree_writelock(session, lsm_tree) :
+ __wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ if (file_func == __wt_checkpoint &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ continue;
+ WT_ERR(__wt_schema_worker(session, chunk->uri,
+ file_func, name_func, cfg, open_flags));
+ if (name_func == __wt_backup_list_uri_append &&
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
+ file_func, name_func, cfg, open_flags));
+ }
+err: if (locked)
+ WT_TRET(exclusive ?
+ __wt_lsm_tree_writeunlock(session, lsm_tree) :
+ __wt_lsm_tree_readunlock(session, lsm_tree));
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
new file mode 100644
index 00000000000..278c400070f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -0,0 +1,625 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_bloom_create(
+ WT_SESSION_IMPL *, WT_LSM_TREE *, WT_LSM_CHUNK *, u_int);
+static int __lsm_discard_handle(WT_SESSION_IMPL *, const char *, const char *);
+
+/*
+ * __lsm_copy_chunks --
+ * Take a copy of part of the LSM tree chunk array so that we can work on
+ * the contents without holding the LSM tree handle lock long term.
+ *
+ * old_chunks selects the source: non-zero copies lsm_tree->old_chunks,
+ * zero copies lsm_tree->chunk. The copy is made under the tree read lock
+ * into cookie->chunk_array, which is grown if it is smaller than the
+ * source allocation.
+ *
+ * On success cookie->nchunks is the number of chunks copied and every
+ * copied chunk has had its refcnt incremented. On error, or when the tree
+ * is not WT_LSM_TREE_ACTIVE, cookie->nchunks is 0.
+ */
+static int
+__lsm_copy_chunks(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie, int old_chunks)
+{
+ WT_DECL_RET;
+ u_int i, nchunks;
+ size_t alloc;
+
+ /* Always return zero chunks on error. */
+ cookie->nchunks = 0;
+
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ return (__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ /* Take a copy of the current state of the LSM tree. */
+ nchunks = old_chunks ? lsm_tree->nold_chunks : lsm_tree->nchunks;
+ alloc = old_chunks ? lsm_tree->old_alloc : lsm_tree->chunk_alloc;
+
+ /*
+ * If the tree array of active chunks is larger than our current buffer,
+ * increase the size of our current buffer to match.
+ */
+ if (cookie->chunk_alloc < alloc)
+ WT_ERR(__wt_realloc(session,
+ &cookie->chunk_alloc, alloc, &cookie->chunk_array));
+ if (nchunks > 0)
+ memcpy(cookie->chunk_array,
+ old_chunks ? lsm_tree->old_chunks : lsm_tree->chunk,
+ nchunks * sizeof(*cookie->chunk_array));
+
+ /*
+ * Mark each chunk as active, so we don't drop it until after we know
+ * it's safe.
+ */
+ for (i = 0; i < nchunks; i++)
+ (void)WT_ATOMIC_ADD4(cookie->chunk_array[i]->refcnt, 1);
+
+err: WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ if (ret == 0)
+ cookie->nchunks = nchunks;
+ return (ret);
+}
+
+/*
+ * __wt_lsm_get_chunk_to_flush --
+ *	Find and pin a chunk in the LSM tree that is likely to need flushing.
+ *
+ *	force: when non-zero the last chunk in the tree is a candidate too;
+ *	    otherwise it is excluded from the search.
+ *	chunkp: set to the first candidate chunk that is not flagged
+ *	    WT_LSM_CHUNK_ONDISK, with its refcnt incremented, or to NULL if
+ *	    there is none (including when the tree is inactive or empty).
+ *
+ *	Returns 0 or an error code. The tree read lock is released on every
+ *	path, and a chunk is only pinned when it is also returned in *chunkp.
+ */
+int
+__wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
+    WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp)
+{
+	WT_DECL_RET;
+	WT_LSM_CHUNK *chunk;
+	u_int i, end;
+
+	*chunkp = NULL;
+
+	WT_ASSERT(session, lsm_tree->queue_ref > 0);
+	WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+
+	/*
+	 * Nothing to do for an inactive tree. An empty tree is handled here
+	 * as well: without this check "nchunks - 1" below would wrap around
+	 * and the loop would index an empty chunk array.
+	 */
+	if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE) || lsm_tree->nchunks == 0)
+		return (__wt_lsm_tree_readunlock(session, lsm_tree));
+
+	/*
+	 * Normally we don't want to force out the last chunk. But if we're
+	 * doing a forced flush, likely from a compact call, then we want
+	 * to include the final chunk.
+	 */
+	end = force ? lsm_tree->nchunks : lsm_tree->nchunks - 1;
+	for (i = 0; i < end; i++) {
+		chunk = lsm_tree->chunk[i];
+		if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+			continue;
+
+		/*
+		 * Log before pinning and leave through the error label on
+		 * failure, so a failed message neither leaks a reference on
+		 * the chunk nor returns with the tree lock held.
+		 */
+		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+		    "Flush%s: return chunk %u of %u: %s",
+		    force ? " w/ force" : "", i, end - 1, chunk->uri));
+		(void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
+		*chunkp = chunk;
+		break;
+	}
+
+err:	WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+	return (ret);
+}
+
+/*
+ * __lsm_unpin_chunks --
+ *	Drop the reference held on every chunk recorded in the cookie, which
+ *	allows those chunks to be considered for deletion. Slots that are
+ *	NULL are ignored.
+ */
+static void
+__lsm_unpin_chunks(WT_SESSION_IMPL *session, WT_LSM_WORKER_COOKIE *cookie)
+{
+	WT_LSM_CHUNK *chunk;
+	u_int slot;
+
+	for (slot = 0; slot < cookie->nchunks; slot++)
+		if ((chunk = cookie->chunk_array[slot]) != NULL) {
+			WT_ASSERT(session, chunk->refcnt > 0);
+			(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+		}
+
+	/* Zero the count so a repeated call cannot decrement twice. */
+	cookie->nchunks = 0;
+}
+
+/*
+ * __wt_lsm_work_switch --
+ * Do a switch if the LSM tree needs one.
+ *
+ * Takes ownership of *entryp: the caller's pointer is cleared and the
+ * work unit is freed before returning. *ran is set to 1 when a switch was
+ * attempted and the result was anything other than EBUSY; otherwise it is
+ * left at 0. EBUSY from the switch is turned into success, and if the
+ * tree still has WT_LSM_TREE_NEED_SWITCH set another switch work unit is
+ * queued.
+ */
+int
+__wt_lsm_work_switch(
+ WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, int *ran)
+{
+ WT_DECL_RET;
+ WT_LSM_WORK_UNIT *entry;
+
+ /* We've become responsible for freeing the work unit. */
+ entry = *entryp;
+ *ran = 0;
+ *entryp = NULL;
+
+ if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_lsm_tree_switch(session, entry->lsm_tree));
+ /* Failing to complete the switch is fine */
+ if (ret == EBUSY) {
+ if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH))
+ WT_ERR(__wt_lsm_manager_push_entry(session,
+ WT_LSM_WORK_SWITCH, 0, entry->lsm_tree));
+ ret = 0;
+ } else
+ *ran = 1;
+ }
+err: __wt_lsm_manager_free_work_unit(session, entry);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_work_bloom --
+ * Try to create a Bloom filter for the newest on-disk chunk that doesn't
+ * have one.
+ *
+ * Works on a pinned copy of the tree's chunk array taken with
+ * __lsm_copy_chunks. At most one chunk is processed per call: the loop
+ * stops after the first eligible chunk whose bloom_busy flag this thread
+ * wins. If a filter was created successfully a merge work unit is queued.
+ * The pinned chunks are released and the copy freed before returning.
+ */
+int
+__wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_WORKER_COOKIE cookie;
+ u_int i, merge;
+
+ WT_CLEAR(cookie);
+
+ WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));
+
+ /* Create bloom filters in all checkpointed chunks. */
+ merge = 0;
+ for (i = 0; i < cookie.nchunks; i++) {
+ chunk = cookie.chunk_array[i];
+
+ /*
+ * Skip if a thread is still active in the chunk or it
+ * isn't suitable.
+ */
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ||
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING) ||
+ chunk->generation > 0 ||
+ chunk->count == 0)
+ continue;
+
+ /*
+ * See if we win the race to switch on the "busy" flag and
+ * recheck that the chunk still needs a Bloom filter.
+ */
+ if (WT_ATOMIC_CAS4(chunk->bloom_busy, 0, 1)) {
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+ ret = __lsm_bloom_create(
+ session, lsm_tree, chunk, (u_int)i);
+ /*
+ * Record if we were successful so that we can
+ * later push a merge work unit.
+ */
+ if (ret == 0)
+ merge = 1;
+ }
+ chunk->bloom_busy = 0;
+ break;
+ }
+ }
+ /*
+ * If we created any bloom filters, we push a merge work unit now.
+ */
+ if (merge)
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+
+err:
+ __lsm_unpin_chunks(session, &cookie);
+ __wt_free(session, cookie.chunk_array);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_checkpoint_chunk --
+ * Flush a single LSM chunk to disk.
+ *
+ * Returns 0 without flushing when the chunk is already flagged
+ * WT_LSM_CHUNK_ONDISK, or when its switch_txn is unset or not yet visible
+ * to all transactions. Otherwise the chunk's leaves are written, the file
+ * is checkpointed under the schema lock, the chunk is flagged on disk and
+ * the LSM metadata rewritten under the tree write lock, and finally a
+ * Bloom filter or merge work unit is queued.
+ */
+int
+__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
+{
+ WT_DECL_RET;
+ WT_TXN_ISOLATION saved_isolation;
+
+ /*
+ * If the chunk is already checkpointed, make sure it is also evicted.
+ * Either way, there is no point trying to checkpoint it again.
+ */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
+ !chunk->evicted) {
+ if ((ret = __lsm_discard_handle(
+ session, chunk->uri, NULL)) == 0)
+ chunk->evicted = 1;
+ else if (ret == EBUSY)
+ ret = 0;
+ else
+ WT_RET_MSG(session, ret, "discard handle");
+ }
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker %s already on disk",
+ chunk->uri));
+ return (0);
+ }
+
+ /* Stop if a running transaction needs the chunk. */
+ __wt_txn_update_oldest(session);
+ if (chunk->switch_txn == WT_TXN_NONE ||
+ !__wt_txn_visible_all(session, chunk->switch_txn)) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker %s: running transaction, return",
+ chunk->uri));
+ return (0);
+ }
+
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
+ chunk->uri));
+
+ /*
+ * Flush the file before checkpointing: this is the expensive part in
+ * terms of I/O.
+ *
+ * Use the special eviction isolation level to avoid interfering with
+ * an application checkpoint: we have already checked that all of the
+ * updates in this chunk are globally visible.
+ *
+ * !!! We can wait here for checkpoints and fsyncs to complete, which
+ * can be a long time.
+ */
+ if ((ret = __wt_session_get_btree(
+ session, chunk->uri, NULL, NULL, 0)) == 0) {
+ saved_isolation = session->txn.isolation;
+ session->txn.isolation = TXN_ISO_EVICTION;
+ ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
+ session->txn.isolation = saved_isolation;
+ WT_TRET(__wt_session_release_btree(session));
+ }
+ WT_RET(ret);
+
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
+ chunk->uri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, chunk->uri,
+ __wt_checkpoint, NULL, NULL, 0));
+
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "LSM checkpoint");
+
+ /* Now the file is written, get the chunk size. */
+ WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk));
+
+ /* Update the flush timestamp to help track ongoing progress. */
+ WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts));
+
+ /* Lock the tree, mark the chunk as on disk and update the metadata. */
+ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+ ret = __wt_lsm_meta_write(session, lsm_tree);
+ ++lsm_tree->dsk_gen;
+
+ /* Update the throttle time. */
+ __wt_lsm_tree_throttle(session, lsm_tree, 1);
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "LSM metadata write");
+
+ /*
+ * Clear the "cache resident" flag so the primary can be evicted and
+ * eventually closed. Only do this once the checkpoint has succeeded:
+ * otherwise, accessing the leaf page during the checkpoint can trigger
+ * forced eviction.
+ */
+ WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
+ __wt_btree_evictable(session, 1);
+ WT_RET(__wt_session_release_btree(session));
+
+ /* Make sure we aren't pinning a transaction ID. */
+ __wt_txn_release_snapshot(session);
+
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
+ chunk->uri));
+ /*
+ * Schedule a bloom filter create for our newly flushed chunk, or a
+ * merge if Bloom filters are turned off for this tree.
+ */
+ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
+ else
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+ return (0);
+}
+
+/*
+ * __lsm_bloom_create --
+ * Create a bloom filter for a chunk of the LSM tree that has been
+ * checkpointed but not yet been merged.
+ *
+ * chunk_off is passed to __wt_clsm_init_merge together with chunk->id to
+ * restrict the source cursor to this chunk. Every key returned by that
+ * cursor is inserted into the filter. On success the chunk is flagged
+ * WT_LSM_CHUNK_BLOOM and the LSM metadata is rewritten under the tree
+ * write lock. The session's WT_SESSION_NO_CACHE and
+ * WT_SESSION_NO_CACHE_CHECK flags are cleared on every path that reaches
+ * the err label.
+ *
+ * NOTE(review): "src" is only closed on the success path; an error after
+ * the cursor is opened appears to leave it open -- confirm whether the
+ * caller's session cleanup covers this.
+ */
+static int
+__lsm_bloom_create(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *src;
+ WT_DECL_RET;
+ WT_ITEM key;
+ WT_SESSION *wt_session;
+ uint64_t insert_count;
+ int exist;
+
+ /*
+ * Normally, the Bloom URI is populated when the chunk struct is
+ * allocated. After an open, however, it may not have been.
+ * Deal with that here.
+ */
+ if (chunk->bloom_uri == NULL)
+ WT_RET(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, chunk->id, &chunk->bloom_uri));
+
+ /*
+ * Drop the bloom filter first - there may be some content hanging over
+ * from an aborted merge or checkpoint.
+ */
+ wt_session = &session->iface;
+ WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist));
+ if (exist)
+ WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force"));
+
+ bloom = NULL;
+ /*
+ * This is merge-like activity, and we don't want compacts to give up
+ * because we are creating a bunch of bloom filters before merging.
+ */
+ ++lsm_tree->merge_progressing;
+ WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
+ lsm_tree->bloom_config, chunk->count,
+ lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));
+
+ /* Open a special merge cursor just on this chunk. */
+ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
+ F_SET(src, WT_CURSTD_RAW);
+ WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));
+
+ F_SET(session, WT_SESSION_NO_CACHE);
+ for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
+ WT_ERR(src->get_key(src, &key));
+ WT_ERR(__wt_bloom_insert(bloom, &key));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ WT_TRET(src->close(src));
+
+ WT_TRET(__wt_bloom_finalize(bloom));
+ WT_ERR(ret);
+
+ F_CLR(session, WT_SESSION_NO_CACHE);
+
+ /*
+ * Load the new Bloom filter into cache.
+ *
+ * We're doing advisory reads to fault the new trees into cache.
+ * Don't block if the cache is full: our next unit of work may be to
+ * discard some trees to free space.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+
+ WT_CLEAR(key);
+ WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker created bloom filter %s. "
+ "Expected %" PRIu64 " items, got %" PRIu64,
+ chunk->bloom_uri, chunk->count, insert_count));
+
+ /* Ensure the bloom filter is in the metadata. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ ret = __wt_lsm_meta_write(session, lsm_tree);
+ ++lsm_tree->dsk_gen;
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (ret != 0)
+ WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");
+
+err: if (bloom != NULL)
+ WT_TRET(__wt_bloom_close(bloom));
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ return (ret);
+}
+
+/*
+ * __lsm_discard_handle --
+ *	Attempt to get a handle thrown out of the cache: acquire it
+ *	exclusively in lock-only mode, flag it WT_DHANDLE_DISCARD and
+ *	release it again.
+ */
+static int
+__lsm_discard_handle(
+    WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
+{
+	WT_DECL_RET;
+
+	/* A file that is still in use makes this fail with EBUSY. */
+	ret = __wt_session_get_btree(session, uri, checkpoint, NULL,
+	    WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY);
+	if (ret != 0)
+		return (ret);
+
+	F_SET(session->dhandle, WT_DHANDLE_DISCARD);
+	ret = __wt_session_release_btree(session);
+	return (ret);
+}
+
+/*
+ * __lsm_drop_file --
+ *	Helper function to drop part of an LSM tree.
+ *
+ *	Discards the checkpoint handle for "uri", drops the object from the
+ *	schema with "remove_files=false" while holding the schema lock, and
+ *	then removes the underlying file (the URI with its "file:" prefix
+ *	skipped).
+ *
+ *	Returns 0 on success or an error code; EBUSY and ENOENT are logged
+ *	and handed back to the caller, which decides how to treat them.
+ */
+static int
+__lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
+{
+	WT_DECL_RET;
+	const char *drop_cfg[] = {
+	    WT_CONFIG_BASE(session, session_drop), "remove_files=false", NULL
+	};
+
+	/*
+	 * We need to grab the schema lock to drop the file, so first try to
+	 * make sure there is minimal work to freeing space in the cache. Only
+	 * bother trying to discard the checkpoint handle: the in-memory handle
+	 * should have been closed already.
+	 *
+	 * This will fail with EBUSY if the file is still in use.
+	 */
+	WT_RET(__lsm_discard_handle(session, uri, WT_CHECKPOINT));
+
+	/*
+	 * Take the schema lock for the drop operation: __wt_schema_drop only
+	 * takes the hot backup lock when it updates the metadata, which would
+	 * be too late to prevent our drop.
+	 */
+	WT_WITH_SCHEMA_LOCK(session,
+	    ret = __wt_schema_drop(session, uri, drop_cfg));
+
+	if (ret == 0)
+		ret = __wt_remove(session, uri + strlen("file:"));
+
+	/*
+	 * Only claim the file was dropped when it actually was; previously
+	 * the success message was emitted even when the drop had failed.
+	 */
+	if (ret == 0)
+		WT_RET(__wt_verbose(session, WT_VERB_LSM, "Dropped %s", uri));
+	else if (ret == EBUSY || ret == ENOENT)
+		WT_RET(__wt_verbose(session, WT_VERB_LSM,
+		    "LSM worker drop of %s failed with %d", uri, ret));
+
+	return (ret);
+}
+
+/*
+ * __wt_lsm_free_chunks --
+ *	Try to drop chunks from the tree that are no longer required.
+ *
+ *	Returns 0 immediately if the tree has no old chunks or another thread
+ *	is already freeing them. Otherwise returns 0 if at least one file was
+ *	dropped (or found to be missing) and the metadata rewritten,
+ *	WT_NOTFOUND if there was nothing to do, or an error code.
+ */
+int
+__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+	WT_DECL_RET;
+	WT_LSM_CHUNK *chunk;
+	WT_LSM_WORKER_COOKIE cookie;
+	u_int i, skipped;
+	int flush_metadata, drop_ret;
+
+	flush_metadata = 0;
+
+	if (lsm_tree->nold_chunks == 0)
+		return (0);
+
+	/*
+	 * Make sure only a single thread is freeing the old chunk array
+	 * at any time.
+	 */
+	if (!WT_ATOMIC_CAS4(lsm_tree->freeing_old_chunks, 0, 1))
+		return (0);
+	/*
+	 * Take a copy of the current state of the LSM tree and look for chunks
+	 * to drop. We do it this way to avoid holding the LSM tree lock while
+	 * doing I/O or waiting on the schema lock.
+	 *
+	 * This is safe because only one thread will be in this function at a
+	 * time. Merges may complete concurrently, and the old_chunks array
+	 * may be extended, but we shuffle down the pointers each time we free
+	 * one to keep the non-NULL slots at the beginning of the array.
+	 *
+	 * From here on every exit has to go through the error label so the
+	 * "freeing" flag is reset: returning directly would leave it set and
+	 * no thread could ever free old chunks for this tree again.
+	 */
+	WT_CLEAR(cookie);
+	WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 1));
+	for (i = skipped = 0; i < cookie.nchunks; i++) {
+		chunk = cookie.chunk_array[i];
+		WT_ASSERT(session, chunk != NULL);
+		/* Skip the chunk if another worker is using it. */
+		if (chunk->refcnt > 1) {
+			++skipped;
+			continue;
+		}
+
+		/*
+		 * Don't remove files if a hot backup is in progress.
+		 *
+		 * The schema lock protects the set of live files, this check
+		 * prevents us from removing a file that hot backup already
+		 * knows about.
+		 */
+		if (S2C(session)->hot_backup != 0)
+			break;
+
+		/*
+		 * Drop any bloom filters and chunks we can. Don't try to drop
+		 * a chunk if the bloom filter drop fails.
+		 *  An EBUSY return indicates that a cursor is still open in
+		 *       the tree - move to the next chunk in that case.
+		 * An ENOENT return indicates that the LSM tree metadata was
+		 *       out of sync with the on disk state. Update the
+		 *       metadata to match in that case.
+		 */
+		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+			drop_ret = __lsm_drop_file(session, chunk->bloom_uri);
+			if (drop_ret == EBUSY) {
+				++skipped;
+				continue;
+			} else if (drop_ret != ENOENT)
+				WT_ERR(drop_ret);
+
+			flush_metadata = 1;
+			F_CLR(chunk, WT_LSM_CHUNK_BLOOM);
+		}
+		if (chunk->uri != NULL) {
+			drop_ret = __lsm_drop_file(session, chunk->uri);
+			if (drop_ret == EBUSY) {
+				++skipped;
+				continue;
+			} else if (drop_ret != ENOENT)
+				WT_ERR(drop_ret);
+			flush_metadata = 1;
+		}
+
+		/* Lock the tree to clear out the old chunk information. */
+		WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+
+		/*
+		 * The chunk we are looking at should be the first one in the
+		 * tree that we haven't already skipped over.
+		 */
+		WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
+		__wt_free(session, chunk->bloom_uri);
+		__wt_free(session, chunk->uri);
+		__wt_free(session, lsm_tree->old_chunks[skipped]);
+
+		/*
+		 * The chunk is gone: clear it in the cookie straight away so
+		 * we never attempt to decrement its reference count, even if
+		 * the unlock below fails and we leave through the error path.
+		 */
+		cookie.chunk_array[i] = NULL;
+
+		/* Shuffle down to keep all occupied slots at the beginning. */
+		if (--lsm_tree->nold_chunks > skipped) {
+			memmove(lsm_tree->old_chunks + skipped,
+			    lsm_tree->old_chunks + skipped + 1,
+			    (lsm_tree->nold_chunks - skipped) *
+			    sizeof(WT_LSM_CHUNK *));
+			lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
+		}
+
+		WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+	}
+
+err:	/* Flush the metadata unless the system is in panic */
+	if (flush_metadata && ret != WT_PANIC) {
+		WT_TRET(__wt_lsm_tree_writelock(session, lsm_tree));
+		WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
+		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+	}
+	__lsm_unpin_chunks(session, &cookie);
+	__wt_free(session, cookie.chunk_array);
+	lsm_tree->freeing_old_chunks = 0;
+
+	/* Returning non-zero means there is no work to do. */
+	if (!flush_metadata)
+		WT_TRET(WT_NOTFOUND);
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_worker.c b/src/third_party/wiredtiger/src/lsm/lsm_worker.c
new file mode 100644
index 00000000000..f24e58148b1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_worker.c
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_worker_general_op(
+ WT_SESSION_IMPL *, WT_LSM_WORKER_ARGS *, int *);
+static void * __lsm_worker(void *);
+
+/*
+ * __wt_lsm_worker_start --
+ *	Log the start of an LSM worker and create its thread, which runs
+ *	__lsm_worker with "args" as its argument.
+ */
+int
+__wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args)
+{
+	WT_DECL_RET;
+
+	ret = __wt_verbose(session, WT_VERB_LSM,
+	    "Start LSM worker %d type 0x%x", args->id, args->type);
+	if (ret == 0)
+		ret = __wt_thread_create(
+		    session, &args->tid, __lsm_worker, args);
+	return (ret);
+}
+
+/*
+ * __lsm_worker_general_op --
+ *	Execute a single bloom, drop or flush work unit.
+ *
+ *	cookie: the worker's arguments; cookie->type selects which kinds of
+ *	    work unit this worker may pop.
+ *	completed: set to 1 if a work unit was popped and ran without error,
+ *	    otherwise 0.
+ *
+ *	Returns WT_NOTFOUND if this worker handles none of the three types,
+ *	the result of the pop if it failed or produced no entry, or the
+ *	result of the operation. A popped work unit is always freed.
+ */
+static int
+__lsm_worker_general_op(
+    WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *cookie, int *completed)
+{
+	WT_DECL_RET;
+	WT_LSM_CHUNK *chunk;
+	WT_LSM_WORK_UNIT *entry;
+	int force;
+
+	*completed = 0;
+	/*
+	 * Return if this thread cannot process a bloom, drop or flush.
+	 */
+	if (!FLD_ISSET(cookie->type,
+	    WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_FLUSH))
+		return (WT_NOTFOUND);
+
+	if ((ret = __wt_lsm_manager_pop_entry(session,
+	    cookie->type, &entry)) != 0 || entry == NULL)
+		return (ret);
+
+	if (entry->type == WT_LSM_WORK_FLUSH) {
+		force = F_ISSET(entry, WT_LSM_WORK_FORCE);
+		F_CLR(entry, WT_LSM_WORK_FORCE);
+		WT_ERR(__wt_lsm_get_chunk_to_flush(session,
+		    entry->lsm_tree, force, &chunk));
+		/*
+		 * If we got a chunk to flush, checkpoint it.
+		 */
+		if (chunk != NULL) {
+			/*
+			 * The chunk was pinned for us: whatever happens from
+			 * here on, the reference must be dropped before
+			 * leaving, so don't jump to the error label until
+			 * that has been done.
+			 */
+			ret = __wt_verbose(session, WT_VERB_LSM,
+			    "Flush%s chunk %u %s",
+			    force ? " w/ force" : "",
+			    chunk->id, chunk->uri);
+			if (ret == 0)
+				ret = __wt_lsm_checkpoint_chunk(
+				    session, entry->lsm_tree, chunk);
+			WT_ASSERT(session, chunk->refcnt > 0);
+			(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+			WT_ERR(ret);
+		}
+	} else if (entry->type == WT_LSM_WORK_DROP)
+		WT_ERR(__wt_lsm_free_chunks(session, entry->lsm_tree));
+	else if (entry->type == WT_LSM_WORK_BLOOM)
+		WT_ERR(__wt_lsm_work_bloom(session, entry->lsm_tree));
+	*completed = 1;
+
+err:	__wt_lsm_manager_free_work_unit(session, entry);
+	return (ret);
+}
+
+/*
+ * __lsm_worker --
+ * A thread that executes work units for all open LSM trees.
+ *
+ * arg is the WT_LSM_WORKER_ARGS for this thread. The loop runs while the
+ * connection has WT_CONN_SERVER_RUN set and the worker has
+ * WT_LSM_WORKER_RUN set, handling switch work first, then one general
+ * (bloom, drop or flush) operation, then one merge. When a pass made no
+ * progress the thread waits on cookie->work_cond before trying again.
+ *
+ * Always returns NULL; an error ends the loop and is reported with
+ * __wt_err.
+ */
+static void *
+__lsm_worker(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_WORK_UNIT *entry;
+ WT_LSM_WORKER_ARGS *cookie;
+ WT_SESSION_IMPL *session;
+ int progress, ran;
+
+ cookie = (WT_LSM_WORKER_ARGS *)arg;
+ session = cookie->session;
+ conn = S2C(session);
+
+ entry = NULL;
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(cookie, WT_LSM_WORKER_RUN)) {
+ progress = 0;
+
+ /*
+ * Workers process the different LSM work queues. Some workers
+ * can handle several or all work unit types. So the code is
+ * prioritized so important operations happen first.
+ * Switches are the highest priority.
+ */
+ while (FLD_ISSET(cookie->type, WT_LSM_WORK_SWITCH) &&
+ (ret = __wt_lsm_manager_pop_entry(
+ session, WT_LSM_WORK_SWITCH, &entry)) == 0 &&
+ entry != NULL)
+ WT_ERR(
+ __wt_lsm_work_switch(session, &entry, &progress));
+ /* Flag an error if the pop failed. */
+ WT_ERR(ret);
+
+ /*
+ * Next the general operations.
+ */
+ ret = __lsm_worker_general_op(session, cookie, &ran);
+ if (ret == EBUSY || ret == WT_NOTFOUND)
+ ret = 0;
+ WT_ERR(ret);
+ progress = progress || ran;
+
+ /*
+ * Finally see if there is any merge work we can do. This is
+ * last because the earlier operations may result in adding
+ * merge work to the queue.
+ */
+ if (FLD_ISSET(cookie->type, WT_LSM_WORK_MERGE) &&
+ (ret = __wt_lsm_manager_pop_entry(
+ session, WT_LSM_WORK_MERGE, &entry)) == 0 &&
+ entry != NULL) {
+ WT_ASSERT(session, entry->type == WT_LSM_WORK_MERGE);
+ ret = __wt_lsm_merge(session,
+ entry->lsm_tree, cookie->id);
+ /*
+ * WT_NOTFOUND from the merge clears the tree's compacting
+ * flag; both it and EBUSY are treated as success.
+ */
+ if (ret == WT_NOTFOUND) {
+ F_CLR(entry->lsm_tree, WT_LSM_TREE_COMPACTING);
+ ret = 0;
+ } else if (ret == EBUSY)
+ ret = 0;
+ /* Clear any state */
+ WT_CLEAR_BTREE_IN_SESSION(session);
+ __wt_lsm_manager_free_work_unit(session, entry);
+ entry = NULL;
+ progress = 1;
+ }
+ /* Flag an error if the pop failed. */
+ WT_ERR(ret);
+
+ /* Don't busy wait if there was any work to do. */
+ if (!progress) {
+ WT_ERR(
+ __wt_cond_wait(session, cookie->work_cond, 10000));
+ continue;
+ }
+ }
+
+ /*
+ * The err label sits inside this block so that both a jump from the
+ * loop and a non-zero ret after a normal loop exit free any pending
+ * work unit and report the error.
+ */
+ if (ret != 0) {
+err: __wt_lsm_manager_free_work_unit(session, entry);
+ __wt_err(session, ret,
+ "Error in LSM worker thread %d", cookie->id);
+ }
+ return (NULL);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_apply.c b/src/third_party/wiredtiger/src/meta/meta_apply.c
new file mode 100644
index 00000000000..313516148c0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_apply.c
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_meta_btree_apply --
+ * Apply a function to all files listed in the metadata, apart from the
+ * metadata file.
+ *
+ * The metadata cursor is positioned at "file:" and walked forward until a
+ * key without that prefix is seen. For each file the handle is acquired
+ * and func is called with cfg. The session's data handle is saved on
+ * entry and restored before returning.
+ */
+int
+__wt_meta_btree_apply(WT_SESSION_IMPL *session,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CURSOR *cursor;
+ WT_DATA_HANDLE *saved_dhandle;
+ WT_DECL_RET;
+ const char *uri;
+ int cmp, tret;
+
+ /* Saved so it can be put back on exit. */
+ saved_dhandle = session->dhandle;
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, "file:");
+ /*
+ * NOTE(review): cmp < 0 presumably means search_near landed on a key
+ * smaller than "file:", so step forward once -- confirm against the
+ * WT_CURSOR::search_near documentation.
+ */
+ if ((tret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
+ tret = cursor->next(cursor);
+ for (; tret == 0; tret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ /* Stop at the first key that is no longer a "file:" entry. */
+ if (!WT_PREFIX_MATCH(uri, "file:"))
+ break;
+ /* Skip the metadata file itself. */
+ else if (strcmp(uri, WT_METAFILE_URI) == 0)
+ continue;
+
+ /*
+ * We need to pull the handle into the session handle cache
+ * and make sure it's referenced to stop other internal code
+ * dropping the handle (e.g in LSM when cleaning up obsolete
+ * chunks). Holding the metadata lock isn't enough.
+ */
+ ret = __wt_session_get_btree(session, uri, NULL, NULL, 0);
+ if (ret == 0) {
+ ret = func(session, cfg);
+ /*
+ * With metadata tracking on, the handle is recorded in the
+ * tracking log rather than released here.
+ */
+ if (WT_META_TRACKING(session))
+ WT_TRET(
+ __wt_meta_track_handle_lock(session, 0));
+ else
+ WT_TRET(__wt_session_release_btree(session));
+ } else if (ret == EBUSY)
+ /* The handle couldn't be acquired: use the single-handle path. */
+ ret = __wt_conn_btree_apply_single(
+ session, uri, NULL, func, cfg);
+ WT_ERR(ret);
+ }
+
+ /* WT_NOTFOUND from the cursor walk is not reported as an error. */
+ if (tret != WT_NOTFOUND)
+ WT_TRET(tret);
+err: WT_TRET(cursor->close(cursor));
+ session->dhandle = saved_dhandle;
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
new file mode 100644
index 00000000000..998ae7e0d02
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -0,0 +1,528 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_last(WT_SESSION_IMPL *, const char *, WT_CKPT *);
+static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **);
+static int __ckpt_load(WT_SESSION_IMPL *,
+ WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT_CKPT *);
+static int __ckpt_named(
+ WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *);
+static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *);
+static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *);
+
+/*
+ * __wt_meta_checkpoint --
+ * Return a file's checkpoint information.
+ *
+ * fname is the metadata key to look up. If checkpoint is NULL the last
+ * checkpoint is loaded into ckpt, otherwise the checkpoint with that name.
+ * The metadata string fetched for the lookup is freed before returning.
+ */
+int
+__wt_meta_checkpoint(WT_SESSION_IMPL *session,
+ const char *fname, const char *checkpoint, WT_CKPT *ckpt)
+{
+ WT_DECL_RET;
+ const char *config;
+
+ config = NULL;
+
+ /* Retrieve the metadata entry for the file. */
+ WT_ERR(__wt_metadata_search(session, fname, &config));
+
+ /* Check the major/minor version numbers. */
+ WT_ERR(__ckpt_version_chk(session, fname, config));
+
+ /*
+ * Retrieve the named checkpoint or the last checkpoint.
+ *
+ * If we don't find a named checkpoint, we're done, they're read-only.
+ * If we don't find a default checkpoint, it's creation, return "no
+ * data" and let our caller handle it.
+ */
+ if (checkpoint == NULL) {
+ if ((ret = __ckpt_last(session, config, ckpt)) == WT_NOTFOUND) {
+ /* No checkpoint yet: succeed with empty address and raw items. */
+ ret = 0;
+ ckpt->addr.data = ckpt->raw.data = NULL;
+ ckpt->addr.size = ckpt->raw.size = 0;
+ }
+ } else
+ WT_ERR(__ckpt_named(session, checkpoint, config, ckpt));
+
+err: __wt_free(session, config);
+ return (ret);
+}
+
+/*
+ * __wt_meta_checkpoint_last_name --
+ * Return the last unnamed checkpoint's name.
+ *
+ * The name is returned in *namep as a copy made by __ckpt_last_name. The
+ * metadata string fetched for the lookup is freed before returning.
+ */
+int
+__wt_meta_checkpoint_last_name(
+ WT_SESSION_IMPL *session, const char *fname, const char **namep)
+{
+ WT_DECL_RET;
+ const char *config;
+
+ config = NULL;
+
+ /* Retrieve the metadata entry for the file. */
+ WT_ERR(__wt_metadata_search(session, fname, &config));
+
+ /* Check the major/minor version numbers. */
+ WT_ERR(__ckpt_version_chk(session, fname, config));
+
+ /* Retrieve the name of the last unnamed checkpoint. */
+ WT_ERR(__ckpt_last_name(session, config, namep));
+
+err: __wt_free(session, config);
+ return (ret);
+}
+
+/*
+ * __wt_meta_checkpoint_clear --
+ * Clear a file's checkpoint.
+ *
+ * Implemented by calling __ckpt_set with a NULL value, which installs an
+ * empty checkpoint list for fname.
+ */
+int
+__wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname)
+{
+ /*
+ * If we are unrolling a failed create, we may have already removed the
+ * metadata entry. If no entry is found to update and we're trying to
+ * clear the checkpoint, just ignore it.
+ */
+ WT_RET_NOTFOUND_OK(__ckpt_set(session, fname, NULL));
+
+ return (0);
+}
+
+/*
+ * __ckpt_set --
+ * Set a file's checkpoint.
+ *
+ * v is a configuration string carrying the new checkpoint entry; a NULL v
+ * means "checkpoint=()". The file's current metadata and v are collapsed
+ * into one string, which is written back under fname.
+ */
+static int
+__ckpt_set(WT_SESSION_IMPL *session, const char *fname, const char *v)
+{
+ WT_DECL_RET;
+ const char *config, *cfg[3], *newcfg;
+
+ /* Both start NULL so the cleanup below is safe on every path. */
+ config = newcfg = NULL;
+
+ /* Retrieve the metadata for this file. */
+ WT_ERR(__wt_metadata_search(session, fname, &config));
+
+ /* Replace the checkpoint entry. */
+ cfg[0] = config;
+ cfg[1] = v == NULL ? "checkpoint=()" : v;
+ cfg[2] = NULL;
+ WT_ERR(__wt_config_collapse(session, cfg, &newcfg));
+ WT_ERR(__wt_metadata_update(session, fname, newcfg));
+
+err: __wt_free(session, config);
+ __wt_free(session, newcfg);
+ return (ret);
+}
+
+/*
+ * __ckpt_named --
+ * Return the information associated with a file's named checkpoint.
+ *
+ * config is the file's metadata string. Its "checkpoint" entry is scanned
+ * for a key matching the checkpoint name; the match is loaded into ckpt.
+ * Returns WT_NOTFOUND if no entry matches.
+ */
+static int
+__ckpt_named(WT_SESSION_IMPL *session,
+ const char *checkpoint, const char *config, WT_CKPT *ckpt)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM k, v;
+
+ /* Set up a scan over the entries of the "checkpoint" value. */
+ WT_RET(__wt_config_getones(session, config, "checkpoint", &v));
+ WT_RET(__wt_config_subinit(session, &ckptconf, &v));
+
+ /*
+ * Take the first match: there should never be more than a single
+ * checkpoint of any name.
+ */
+ while (__wt_config_next(&ckptconf, &k, &v) == 0)
+ if (WT_STRING_MATCH(checkpoint, k.str, k.len))
+ return (__ckpt_load(session, &k, &v, ckpt));
+
+ return (WT_NOTFOUND);
+}
+
+/*
+ * __ckpt_last --
+ * Return the information associated with the file's last checkpoint.
+ *
+ * "Last" is decided by the "order" value of each entry in the file's
+ * checkpoint list. Returns WT_NOTFOUND if no entry was selected.
+ */
+static int
+__ckpt_last(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM a, k, v;
+ int64_t found;
+
+ WT_RET(__wt_config_getones(session, config, "checkpoint", &v));
+ WT_RET(__wt_config_subinit(session, &ckptconf, &v));
+ /* found is the order of the entry currently loaded; 0 means none. */
+ for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) {
+ /* Ignore checkpoints before the ones we've already seen. */
+ WT_RET(__wt_config_subgets(session, &v, "order", &a));
+ if (found) {
+ if (a.val < found)
+ continue;
+ /* Discard the earlier candidate before loading this one. */
+ __wt_meta_checkpoint_free(session, ckpt);
+ }
+ found = a.val;
+ WT_RET(__ckpt_load(session, &k, &v, ckpt));
+ }
+
+ return (found ? 0 : WT_NOTFOUND);
+}
+
+/*
+ * __ckpt_last_name --
+ *	Return the name associated with the file's last unnamed checkpoint.
+ *
+ *	config is the file's metadata string. On success *namep is an allocated
+ *	copy of the name of the unnamed checkpoint with the highest order.
+ *	Returns WT_NOTFOUND if no unnamed checkpoint is found. On error any name
+ *	already copied into *namep is freed.
+ */
+static int
+__ckpt_last_name(
+    WT_SESSION_IMPL *session, const char *config, const char **namep)
+{
+	WT_CONFIG ckptconf;
+	WT_CONFIG_ITEM a, k, v;
+	WT_DECL_RET;
+	int64_t found;
+
+	*namep = NULL;
+
+	WT_ERR(__wt_config_getones(session, config, "checkpoint", &v));
+	WT_ERR(__wt_config_subinit(session, &ckptconf, &v));
+	for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) {
+		/*
+		 * We only care about unnamed checkpoints; applications may not
+		 * use any matching prefix as a checkpoint name, the comparison
+		 * is pretty simple.
+		 */
+		if (k.len < strlen(WT_CHECKPOINT) ||
+		    strncmp(k.str, WT_CHECKPOINT, strlen(WT_CHECKPOINT)) != 0)
+			continue;
+
+		/* Ignore checkpoints before the ones we've already seen. */
+		WT_ERR(__wt_config_subgets(session, &v, "order", &a));
+		if (found && a.val < found)
+			continue;
+
+		/* Replace any earlier candidate with this newer name. */
+		if (*namep != NULL)
+			__wt_free(session, *namep);
+		WT_ERR(__wt_strndup(session, k.str, k.len, namep));
+		found = a.val;
+	}
+	if (!found)
+		ret = WT_NOTFOUND;
+
+	if (0) {
+		/*
+		 * Free the copied string, not the caller's pointer variable:
+		 * __wt_free takes the pointer to release, as at the other call
+		 * site in this function.
+		 */
+err:		__wt_free(session, *namep);
+	}
+	return (ret);
+}
+
+/*
+ * __ckpt_compare_order --
+ *	Qsort comparison routine for the checkpoint list.
+ *
+ *	Orders WT_CKPT entries by ascending "order" value. Returns a negative
+ *	value, zero or a positive value as qsort requires, including zero for
+ *	equal orders so the comparison is consistent in both directions.
+ */
+static int
+__ckpt_compare_order(const void *a, const void *b)
+{
+	const WT_CKPT *ackpt, *bckpt;
+
+	ackpt = a;
+	bckpt = b;
+
+	if (ackpt->order > bckpt->order)
+		return (1);
+	if (ackpt->order < bckpt->order)
+		return (-1);
+	return (0);
+}
+
+/*
+ * __wt_meta_ckptlist_get --
+ * Load all available checkpoint information for a file.
+ *
+ * On success *ckptbasep is an allocated array of WT_CKPT entries sorted by
+ * order, with two extra slots after the loaded entries. On failure the
+ * array is released with __wt_meta_ckptlist_free and *ckptbasep is left
+ * NULL.
+ */
+int
+__wt_meta_ckptlist_get(
+ WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep)
+{
+ WT_CKPT *ckpt, *ckptbase;
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ size_t allocated, slot;
+ const char *config;
+
+ *ckptbasep = NULL;
+
+ ckptbase = NULL;
+ allocated = slot = 0;
+ config = NULL;
+
+ /* Retrieve the metadata information for the file. */
+ WT_RET(__wt_metadata_search(session, fname, &config));
+
+ /*
+ * Load any existing checkpoints into the array.
+ *
+ * NOTE(review): buf is allocated here and freed on exit but is not
+ * otherwise referenced in this function.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ /* A missing or unparsable "checkpoint" entry just means no entries. */
+ if (__wt_config_getones(session, config, "checkpoint", &v) == 0 &&
+ __wt_config_subinit(session, &ckptconf, &v) == 0)
+ for (; __wt_config_next(&ckptconf, &k, &v) == 0; ++slot) {
+ /* Grow the array by one slot per entry. */
+ WT_ERR(__wt_realloc_def(
+ session, &allocated, slot + 1, &ckptbase));
+ ckpt = &ckptbase[slot];
+
+ WT_ERR(__ckpt_load(session, &k, &v, ckpt));
+ }
+
+ /*
+ * Allocate an extra slot for a new value, plus a slot to mark the end.
+ *
+ * This isn't very clean, but there's necessary cooperation between the
+ * schema layer (that maintains the list of checkpoints), the btree
+ * layer (that knows when the root page is written, creating a new
+ * checkpoint), and the block manager (which actually creates the
+ * checkpoint). All of that cooperation is handled in the WT_CKPT
+ * structure referenced from the WT_BTREE structure.
+ */
+ WT_ERR(__wt_realloc_def(session, &allocated, slot + 2, &ckptbase));
+
+ /* Sort in creation-order. */
+ qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order);
+
+ /* Return the array to our caller. */
+ *ckptbasep = ckptbase;
+
+ /* The error label sits inside "if (0)" so success skips the free. */
+ if (0) {
+err: __wt_meta_ckptlist_free(session, ckptbase);
+ }
+ __wt_free(session, config);
+ __wt_scr_free(&buf);
+
+ return (ret);
+}
+
+/*
+ * __ckpt_load --
+ * Load a single checkpoint's information into a WT_CKPT structure.
+ *
+ * k is the checkpoint's name and v its configuration value. The "addr",
+ * "order", "time", "size" and "write_gen" sub-values are read from v. An
+ * empty "order" or "write_gen", or an empty, over-long or unparsable
+ * "time", is reported as a corrupted checkpoint list (WT_ERROR).
+ */
+static int
+__ckpt_load(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v, WT_CKPT *ckpt)
+{
+ WT_CONFIG_ITEM a;
+ char timebuf[64];
+
+ /*
+ * Copy the name, address (raw and hex), order and time into the slot.
+ * If there's no address, it's a fake.
+ */
+ WT_RET(__wt_strndup(session, k->str, k->len, &ckpt->name));
+
+ WT_RET(__wt_config_subgets(session, v, "addr", &a));
+ WT_RET(__wt_buf_set(session, &ckpt->addr, a.str, a.len));
+ if (a.len == 0)
+ F_SET(ckpt, WT_CKPT_FAKE);
+ else
+ WT_RET(__wt_nhex_to_raw(session, a.str, a.len, &ckpt->raw));
+
+ WT_RET(__wt_config_subgets(session, v, "order", &a));
+ if (a.len == 0)
+ goto format;
+ ckpt->order = a.val;
+
+ WT_RET(__wt_config_subgets(session, v, "time", &a));
+ if (a.len == 0 || a.len > sizeof(timebuf) - 1)
+ goto format;
+ /* Copy into a local NUL-terminated buffer before calling sscanf. */
+ memcpy(timebuf, a.str, a.len);
+ timebuf[a.len] = '\0';
+ if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1)
+ goto format;
+
+ /* Unlike the other values, an empty "size" is not rejected. */
+ WT_RET(__wt_config_subgets(session, v, "size", &a));
+ ckpt->ckpt_size = (uint64_t)a.val;
+
+ WT_RET(__wt_config_subgets(session, v, "write_gen", &a));
+ if (a.len == 0)
+ goto format;
+ /*
+ * The largest value a WT_CONFIG_ITEM can handle is signed: this value
+ * appears on disk and I don't want to sign it there, so I'm casting it
+ * here instead.
+ */
+ ckpt->write_gen = (uint64_t)a.val;
+
+ return (0);
+
+format:
+ WT_RET_MSG(session, WT_ERROR, "corrupted checkpoint list");
+}
+
+/*
+ * __wt_meta_ckptlist_set --
+ * Set a file's checkpoint value from the WT_CKPT list.
+ *
+ * Builds a "checkpoint=(...)" configuration string from the entries of
+ * ckptbase, skipping those flagged WT_CKPT_DELETE, and stores it for fname
+ * through __ckpt_set. If ckptlsn is not NULL a "checkpoint_lsn=(file,
+ * offset)" entry is appended.
+ */
+int
+__wt_meta_ckptlist_set(WT_SESSION_IMPL *session,
+ const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn)
+{
+ WT_CKPT *ckpt;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ time_t secs;
+ int64_t maxorder;
+ const char *sep;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ maxorder = 0;
+ /* Empty before the first entry, "," before every later one. */
+ sep = "";
+ WT_ERR(__wt_buf_fmt(session, buf, "checkpoint=("));
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ /*
+ * Each internal checkpoint name is appended with a generation
+ * to make it a unique name. We're solving two problems: when
+ * two checkpoints are taken quickly, the timer may not be
+ * unique and/or we can even see time travel on the second
+ * checkpoint if we snapshot the time in-between nanoseconds
+ * rolling over. Second, if we reset the generational counter
+ * when new checkpoints arrive, we could logically re-create
+ * specific checkpoints, racing with cursors open on those
+ * checkpoints. I can't think of any way to return incorrect
+ * results by racing with those cursors, but it's simpler not
+ * to worry about it.
+ */
+ /* Deleted entries still count towards the maximum order. */
+ if (ckpt->order > maxorder)
+ maxorder = ckpt->order;
+
+ /* Skip deleted checkpoints. */
+ if (F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ if (F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_UPDATE)) {
+ /*
+ * We fake checkpoints for handles in the middle of a
+ * bulk load. If there is a checkpoint, convert the
+ * raw cookie to a hex string.
+ */
+ if (ckpt->raw.size == 0)
+ ckpt->addr.size = 0;
+ else
+ WT_ERR(__wt_raw_to_hex(session,
+ ckpt->raw.data,
+ ckpt->raw.size, &ckpt->addr));
+
+ /* Set the order and timestamp. */
+ if (F_ISSET(ckpt, WT_CKPT_ADD))
+ ckpt->order = ++maxorder;
+
+ /*
+ * XXX
+ * Assumes a time_t fits into a uintmax_t, which isn't
+ * guaranteed, a time_t has to be an arithmetic type,
+ * but not an integral type.
+ */
+ WT_ERR(__wt_seconds(session, &secs));
+ ckpt->sec = (uintmax_t)secs;
+ }
+ /*
+ * An entry named exactly WT_CHECKPOINT is written with ".<order>"
+ * appended to its name; any other name is written unchanged.
+ */
+ if (strcmp(ckpt->name, WT_CHECKPOINT) == 0)
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ "%s%s.%" PRId64 "=(addr=\"%.*s\",order=%" PRIu64
+ ",time=%" PRIuMAX ",size=%" PRIu64
+ ",write_gen=%" PRIu64 ")",
+ sep, ckpt->name, ckpt->order,
+ (int)ckpt->addr.size, (char *)ckpt->addr.data,
+ ckpt->order, ckpt->sec, ckpt->ckpt_size,
+ ckpt->write_gen));
+ else
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ "%s%s=(addr=\"%.*s\",order=%" PRIu64
+ ",time=%" PRIuMAX ",size=%" PRIu64
+ ",write_gen=%" PRIu64 ")",
+ sep, ckpt->name,
+ (int)ckpt->addr.size, (char *)ckpt->addr.data,
+ ckpt->order, ckpt->sec, ckpt->ckpt_size,
+ ckpt->write_gen));
+ sep = ",";
+ }
+ WT_ERR(__wt_buf_catfmt(session, buf, ")"));
+ if (ckptlsn != NULL)
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")",
+ ckptlsn->file, (uintmax_t)ckptlsn->offset));
+ /* Store the assembled string as the file's checkpoint value. */
+ WT_ERR(__ckpt_set(session, fname, buf->mem));
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_meta_ckptlist_free --
+ *	Release every entry of a checkpoint array, then the array itself. A
+ *	NULL array is a no-op.
+ */
+void
+__wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+	WT_CKPT *entry;
+
+	if (ckptbase != NULL) {
+		/* Free the contents of each entry before the array. */
+		WT_CKPT_FOREACH(ckptbase, entry)
+			__wt_meta_checkpoint_free(session, entry);
+		__wt_free(session, ckptbase);
+	}
+}
+
+/*
+ * __wt_meta_checkpoint_free --
+ *	Release the memory owned by one checkpoint structure and zero it. A
+ *	NULL structure is a no-op.
+ */
+void
+__wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
+{
+	if (ckpt != NULL) {
+		/* Name, hex address, raw address and the private pointer. */
+		__wt_free(session, ckpt->name);
+		__wt_buf_free(session, &ckpt->addr);
+		__wt_buf_free(session, &ckpt->raw);
+		__wt_free(session, ckpt->bpriv);
+
+		/* Leave the slot cleared so it can be used again. */
+		WT_CLEAR(*ckpt);
+	}
+}
+
+/*
+ * __ckpt_version_chk --
+ * Check the version major/minor numbers.
+ *
+ * Reads "version=(major=,minor=)" from the file's metadata string and
+ * returns EACCES with a message if the pair lies outside the range
+ * WT_BTREE_{MAJOR,MINOR}_VERSION_MIN to WT_BTREE_{MAJOR,MINOR}_VERSION_MAX.
+ */
+static int
+__ckpt_version_chk(
+ WT_SESSION_IMPL *session, const char *fname, const char *config)
+{
+ WT_CONFIG_ITEM a, v;
+ int majorv, minorv;
+
+ WT_RET(__wt_config_getones(session, config, "version", &v));
+ WT_RET(__wt_config_subgets(session, &v, "major", &a));
+ majorv = (int)a.val;
+ WT_RET(__wt_config_subgets(session, &v, "minor", &a));
+ minorv = (int)a.val;
+
+ /*
+ * The minor number is only checked when the major number equals the
+ * minimum or the maximum supported major version.
+ */
+ if (majorv < WT_BTREE_MAJOR_VERSION_MIN ||
+ majorv > WT_BTREE_MAJOR_VERSION_MAX ||
+ (majorv == WT_BTREE_MAJOR_VERSION_MIN &&
+ minorv < WT_BTREE_MINOR_VERSION_MIN) ||
+ (majorv == WT_BTREE_MAJOR_VERSION_MAX &&
+ minorv > WT_BTREE_MINOR_VERSION_MAX))
+ WT_RET_MSG(session, EACCES,
+ "%s is an unsupported WiredTiger source file version %d.%d"
+ "; this WiredTiger build only supports versions from %d.%d "
+ "to %d.%d",
+ fname,
+ majorv, minorv,
+ WT_BTREE_MAJOR_VERSION_MIN,
+ WT_BTREE_MINOR_VERSION_MIN,
+ WT_BTREE_MAJOR_VERSION_MAX,
+ WT_BTREE_MINOR_VERSION_MAX);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ext.c b/src/third_party/wiredtiger/src/meta/meta_ext.c
new file mode 100644
index 00000000000..b68058a6e91
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_ext.c
@@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_metadata_insert --
+ *	Extension API entry point for inserting a metadata row. A NULL
+ *	wt_session selects the connection's default session.
+ */
+int
+__wt_ext_metadata_insert(WT_EXTENSION_API *wt_api,
+    WT_SESSION *wt_session, const char *key, const char *value)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_SESSION_IMPL *session;
+
+	conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+	session = wt_session == NULL ?
+	    conn->default_session : (WT_SESSION_IMPL *)wt_session;
+
+	return (__wt_metadata_insert(session, key, value));
+}
+
+/*
+ * __wt_ext_metadata_remove --
+ *	Extension API entry point for removing a metadata row. A NULL
+ *	wt_session selects the connection's default session.
+ */
+int
+__wt_ext_metadata_remove(
+    WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_SESSION_IMPL *session;
+
+	conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+	session = wt_session == NULL ?
+	    conn->default_session : (WT_SESSION_IMPL *)wt_session;
+
+	return (__wt_metadata_remove(session, key));
+}
+
+/*
+ * __wt_ext_metadata_search --
+ *	Extension API entry point for reading a metadata row. The value is
+ *	returned as a copy that the caller must free. A NULL wt_session selects
+ *	the connection's default session.
+ */
+int
+__wt_ext_metadata_search(WT_EXTENSION_API *wt_api,
+    WT_SESSION *wt_session, const char *key, const char **valuep)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_SESSION_IMPL *session;
+
+	conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+	session = wt_session == NULL ?
+	    conn->default_session : (WT_SESSION_IMPL *)wt_session;
+
+	return (__wt_metadata_search(session, key, valuep));
+}
+
+/*
+ * __wt_ext_metadata_update --
+ *	Extension API entry point for updating a metadata row. A NULL
+ *	wt_session selects the connection's default session.
+ */
+int
+__wt_ext_metadata_update(WT_EXTENSION_API *wt_api,
+    WT_SESSION *wt_session, const char *key, const char *value)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_SESSION_IMPL *session;
+
+	conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+	session = wt_session == NULL ?
+	    conn->default_session : (WT_SESSION_IMPL *)wt_session;
+
+	return (__wt_metadata_update(session, key, value));
+}
+
+/*
+ * __wt_metadata_get_ckptlist --
+ *	Public wrapper around __wt_meta_ckptlist_get (for wt list): casts the
+ *	public session handle to the internal type and forwards the call.
+ */
+int
+__wt_metadata_get_ckptlist(
+    WT_SESSION *session, const char *name, WT_CKPT **ckptbasep)
+{
+	WT_SESSION_IMPL *session_impl;
+
+	session_impl = (WT_SESSION_IMPL *)session;
+	return (__wt_meta_ckptlist_get(session_impl, name, ckptbasep));
+}
+
+/*
+ * __wt_metadata_free_ckptlist --
+ *	Public wrapper around __wt_meta_ckptlist_free (for wt list): casts the
+ *	public session handle to the internal type and forwards the call.
+ */
+void
+__wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase)
+{
+	WT_SESSION_IMPL *session_impl;
+
+	session_impl = (WT_SESSION_IMPL *)session;
+	__wt_meta_ckptlist_free(session_impl, ckptbase);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
new file mode 100644
index 00000000000..e66ed609952
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -0,0 +1,206 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __metadata_turtle --
+ *	Return 1 if the key's value is kept in the turtle file, 0 otherwise.
+ *	The first character is tested before any full string comparison.
+ */
+static int
+__metadata_turtle(const char *key)
+{
+	if (key[0] == 'f')
+		return (strcmp(key, WT_METAFILE_URI) == 0);
+	if (key[0] == 'W')
+		return (strcmp(key, "WiredTiger version") == 0 ||
+		    strcmp(key, "WiredTiger version string") == 0);
+	return (0);
+}
+
+/*
+ * __wt_metadata_open --
+ * Opens the metadata file, sets session->metafile.
+ *
+ * Does nothing if session->metafile is already set.
+ */
+int
+__wt_metadata_open(WT_SESSION_IMPL *session)
+{
+ if (session->metafile != NULL)
+ return (0);
+
+ WT_RET(__wt_session_get_btree(session, WT_METAFILE_URI, NULL, NULL, 0));
+
+ /* Cache the btree that the get call left current in the session. */
+ session->metafile = S2BT(session);
+ WT_ASSERT(session, session->metafile != NULL);
+
+ /* The metafile doesn't need to stay locked -- release it. */
+ return (__wt_session_release_btree(session));
+}
+
+/*
+ * __wt_metadata_cursor --
+ * Opens a cursor on the metadata.
+ *
+ * config is an optional cursor configuration string placed after the
+ * session_open_cursor base configuration. The new cursor is returned in
+ * *cursorp. The session's data handle is restored on every path.
+ */
+int
+__wt_metadata_cursor(
+ WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp)
+{
+ WT_DATA_HANDLE *saved_dhandle;
+ WT_DECL_RET;
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_open_cursor), config, NULL };
+
+ saved_dhandle = session->dhandle;
+ WT_ERR(__wt_metadata_open(session));
+
+ /* Make the metadata btree current for the calls below. */
+ WT_SET_BTREE_IN_SESSION(session, session->metafile);
+
+ /*
+ * We use the metadata a lot, so we have a handle cached; lock it and
+ * increment the in-use counter.
+ */
+ WT_ERR(__wt_session_lock_btree(session, 0));
+ __wt_session_dhandle_incr_use(session);
+
+ ret = __wt_curfile_create(session, NULL, cfg, 0, 0, cursorp);
+
+ /* Restore the caller's btree. */
+err: session->dhandle = saved_dhandle;
+ return (ret);
+}
+
+/*
+ * __wt_metadata_insert --
+ * Insert a row into the metadata.
+ *
+ * Returns EINVAL for keys that live in the turtle file. When metadata
+ * tracking is on, the insert is recorded after it succeeds.
+ */
+int
+__wt_metadata_insert(
+ WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Insert: key: %s, value: %s, tracking: %s, %s" "turtle",
+ key, value, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ WT_RET_MSG(session, EINVAL,
+ "%s: insert not supported on the turtle file", key);
+
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ WT_ERR(cursor->insert(cursor));
+ /* Log the insert for the tracking code. */
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_insert(session, key));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_metadata_update --
+ * Update a row in the metadata.
+ *
+ * Keys that live in the turtle file are handed to __wt_turtle_update.
+ * Other keys are written through a metadata cursor opened with the
+ * "overwrite" configuration. When metadata tracking is on, the update is
+ * recorded before the row is written.
+ */
+int
+__wt_metadata_update(
+ WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Update: key: %s, value: %s, tracking: %s, %s" "turtle",
+ key, value, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ return (__wt_turtle_update(session, key, value));
+
+ /* Track first: the tracking code reads the current value of the row. */
+ if (WT_META_TRACKING(session))
+ WT_RET(__wt_meta_track_update(session, key));
+
+ WT_RET(__wt_metadata_cursor(session, "overwrite", &cursor));
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ WT_ERR(cursor->insert(cursor));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_metadata_remove --
+ * Remove a row from the metadata.
+ *
+ * Returns EINVAL for keys that live in the turtle file. The row is
+ * searched for first, so a failed search is returned before anything is
+ * tracked or removed.
+ */
+int
+__wt_metadata_remove(WT_SESSION_IMPL *session, const char *key)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Remove: key: %s, tracking: %s, %s" "turtle",
+ key, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ WT_RET_MSG(session, EINVAL,
+ "%s: remove not supported on the turtle file", key);
+
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, key);
+ WT_ERR(cursor->search(cursor));
+ /* Record the current value before the row is removed. */
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_update(session, key));
+ WT_ERR(cursor->remove(cursor));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_metadata_search --
+ * Return a copied row from the metadata.
+ * The caller is responsible for freeing the allocated memory.
+ *
+ * *valuep is set to NULL on entry. Keys that live in the turtle file are
+ * read with __wt_turtle_read; other keys are read through a metadata
+ * cursor.
+ */
+int
+__wt_metadata_search(
+ WT_SESSION_IMPL *session, const char *key, const char **valuep)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *value;
+
+ *valuep = NULL;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Search: key: %s, tracking: %s, %s" "turtle",
+ key, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ return (__wt_turtle_read(session, key, valuep));
+
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, key);
+ WT_ERR(cursor->search(cursor));
+ WT_ERR(cursor->get_value(cursor, &value));
+ /* Duplicate the value before the cursor is closed. */
+ WT_ERR(__wt_strdup(session, value, valuep));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
new file mode 100644
index 00000000000..55e61f8d1bc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -0,0 +1,365 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_META_TRACK -- A tracked metadata operation: a non-transactional log,
+ * maintained to make it easy to unroll simple metadata and filesystem
+ * operations.
+ *
+ * The op field names the action taken when the record is applied, so an
+ * insert is logged as WT_ST_REMOVE and an update as WT_ST_SET.
+ */
+typedef struct __wt_meta_track {
+ enum {
+ WT_ST_EMPTY, /* Unused slot */
+ WT_ST_CHECKPOINT, /* Complete a checkpoint */
+ WT_ST_FILEOP, /* File operation */
+ WT_ST_LOCK, /* Lock a handle */
+ WT_ST_REMOVE, /* Remove a metadata entry */
+ WT_ST_SET /* Reset a metadata entry */
+ } op;
+ /*
+ * For WT_ST_REMOVE and WT_ST_SET, a is the metadata key and b the
+ * previous value. For WT_ST_FILEOP, a is the old URI and b the new URI.
+ */
+ const char *a, *b; /* Strings */
+ WT_BTREE *btree; /* Locked handle */
+ int created; /* Handle on newly created file */
+} WT_META_TRACK;
+
+/*
+ * __meta_track_next --
+ * Extend the list of operations we're tracking, as necessary, and
+ * optionally return the next slot.
+ *
+ * If trkp is NULL no slot is consumed. Calling with NULL while
+ * session->meta_track_next is NULL still makes meta_track_next non-NULL.
+ */
+static int
+__meta_track_next(WT_SESSION_IMPL *session, WT_META_TRACK **trkp)
+{
+ size_t offset, sub_off;
+
+ if (session->meta_track_next == NULL)
+ session->meta_track_next = session->meta_track;
+
+ /*
+ * Record both positions as offsets from the start of the array, so they
+ * can be re-established if the reallocation below moves the memory.
+ */
+ offset = WT_PTRDIFF(session->meta_track_next, session->meta_track);
+ sub_off = WT_PTRDIFF(session->meta_track_sub, session->meta_track);
+ /* The array is full: at least double it, minimum 20 records. */
+ if (offset == session->meta_track_alloc) {
+ WT_RET(__wt_realloc(session, &session->meta_track_alloc,
+ WT_MAX(2 * session->meta_track_alloc,
+ 20 * sizeof(WT_META_TRACK)), &session->meta_track));
+
+ /* Maintain positions in the new chunk of memory. */
+ session->meta_track_next =
+ (uint8_t *)session->meta_track + offset;
+ if (session->meta_track_sub != NULL)
+ session->meta_track_sub =
+ (uint8_t *)session->meta_track + sub_off;
+ }
+
+ WT_ASSERT(session, session->meta_track_next != NULL);
+
+ /* Hand out the current slot and advance past it. */
+ if (trkp != NULL) {
+ *trkp = session->meta_track_next;
+ session->meta_track_next = *trkp + 1;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_meta_track_discard --
+ * Cleanup metadata tracking when closing a session.
+ *
+ * Frees the tracking array and resets the next-slot pointer and the
+ * allocation size.
+ *
+ * NOTE(review): session->meta_track_sub is not reset here -- confirm it
+ * is always NULL by the time this is called.
+ */
+void
+__wt_meta_track_discard(WT_SESSION_IMPL *session)
+{
+ __wt_free(session, session->meta_track);
+ session->meta_track_next = NULL;
+ session->meta_track_alloc = 0;
+}
+
+/*
+ * __wt_meta_track_on --
+ * Turn on metadata operation tracking.
+ *
+ * Calls nest: the counter is incremented on every call and the tracking
+ * array is only set up on the outermost one.
+ */
+int
+__wt_meta_track_on(WT_SESSION_IMPL *session)
+{
+ /* A NULL slot pointer sets up the array without consuming a slot. */
+ if (session->meta_track_nest++ == 0)
+ WT_RET(__meta_track_next(session, NULL));
+
+ return (0);
+}
+
+/*
+ * __meta_track_apply --
+ * Apply the changes in a metadata tracking record.
+ *
+ * If unroll is non-zero the tracked operation is undone. If it is zero
+ * only checkpoint and handle-lock records do any work. In both cases the
+ * record's strings are freed and the slot is marked empty.
+ */
+static int
+__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ int tret;
+
+ /*
+ * Unlock handles and complete checkpoints regardless of whether we are
+ * unrolling.
+ */
+ if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK)
+ goto free;
+
+ switch (trk->op) {
+ case WT_ST_EMPTY: /* Unused slot */
+ break;
+ case WT_ST_CHECKPOINT: /* Checkpoint, see above */
+ /* Checkpoints are only resolved on success. */
+ if (!unroll) {
+ bm = trk->btree->bm;
+ WT_WITH_BTREE(session, trk->btree,
+ WT_TRET(bm->checkpoint_resolve(bm, session)));
+ }
+ break;
+ case WT_ST_LOCK: /* Handle lock, see above */
+ /* Mark a handle for a file created by the failed operation. */
+ if (unroll && trk->created)
+ F_SET(trk->btree->dhandle, WT_DHANDLE_DISCARD);
+ WT_WITH_BTREE(session, trk->btree,
+ WT_TRET(__wt_session_release_btree(session)));
+ break;
+ case WT_ST_FILEOP: /* File operation */
+ /*
+ * For renames, both a and b are set.
+ * For creates, a is NULL.
+ * For removes, b is NULL.
+ */
+ /* Undo a rename by renaming b back to a, minus the "file:" prefix. */
+ if (trk->a != NULL && trk->b != NULL &&
+ (tret = __wt_rename(session,
+ trk->b + strlen("file:"),
+ trk->a + strlen("file:"))) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll rename %s to %s",
+ trk->b, trk->a);
+ WT_TRET(tret);
+ } else if (trk->a == NULL) {
+ /* Undo a create by removing the new file. */
+ if ((tret = __wt_remove(session,
+ trk->b + strlen("file:"))) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll create %s",
+ trk->b);
+ WT_TRET(tret);
+ }
+ }
+ /*
+ * We can't undo removes yet: that would imply
+ * some kind of temporary rename and remove in
+ * roll forward.
+ */
+ break;
+ case WT_ST_REMOVE: /* Remove trk.a */
+ if ((tret = __wt_metadata_remove(session, trk->a)) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll remove: %s",
+ trk->a);
+ WT_TRET(tret);
+ }
+ break;
+ case WT_ST_SET: /* Set trk.a to trk.b */
+ if ((tret = __wt_metadata_update(
+ session, trk->a, trk->b)) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll update %s to %s",
+ trk->a, trk->b);
+ WT_TRET(tret);
+ }
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Reset the slot; the created flag is left as it was. */
+free: trk->op = WT_ST_EMPTY;
+ __wt_free(session, trk->a);
+ __wt_free(session, trk->b);
+ trk->btree = NULL;
+
+ return (ret);
+}
+
+/*
+ * __wt_meta_track_off --
+ * Turn off metadata operation tracking, unrolling on error.
+ *
+ * Only the outermost call does any work. It applies the tracked records
+ * from newest to oldest, passing unroll through to each. On success with
+ * logging off, the metadata file is then checkpointed.
+ */
+int
+__wt_meta_track_off(WT_SESSION_IMPL *session, int unroll)
+{
+ WT_DECL_RET;
+ WT_META_TRACK *trk, *trk_orig;
+
+ WT_ASSERT(session,
+ WT_META_TRACKING(session) && session->meta_track_nest > 0);
+
+ /* Capture the range of records before tracking is switched off. */
+ trk_orig = session->meta_track;
+ trk = session->meta_track_next;
+
+ /* If it was a nested transaction, there is nothing to do. */
+ if (--session->meta_track_nest != 0)
+ return (0);
+
+ /* Turn off tracking for unroll. */
+ session->meta_track_next = session->meta_track_sub = NULL;
+
+ /*
+ * If there were no operations logged, return now and avoid unnecessary
+ * metadata checkpoints. For example, this happens if attempting to
+ * create a data source that already exists (or drop one that doesn't).
+ */
+ if (trk == trk_orig)
+ return (0);
+
+ /* Walk backwards so the most recent operation is handled first. */
+ while (--trk >= trk_orig)
+ WT_TRET(__meta_track_apply(session, trk, unroll));
+
+ /*
+ * If the operation succeeded and we aren't relying on the log for
+ * durability, checkpoint the metadata.
+ */
+ if (!unroll && ret == 0 && session->metafile != NULL &&
+ !S2C(session)->logging)
+ WT_WITH_BTREE(session, session->metafile,
+ ret = __wt_checkpoint(session, NULL));
+
+ return (ret);
+}
+
+/*
+ * __wt_meta_track_sub_on --
+ * Start a group of operations that can be committed independent of the
+ * main transaction.
+ *
+ * Marks the current next-slot position as the start of the group. Groups
+ * do not nest: the marker must be NULL on entry.
+ */
+int
+__wt_meta_track_sub_on(WT_SESSION_IMPL *session)
+{
+ WT_ASSERT(session, session->meta_track_sub == NULL);
+ session->meta_track_sub = session->meta_track_next;
+ return (0);
+}
+
+/*
+ * __wt_meta_track_sub_off --
+ * Commit a group of operations independent of the main transaction.
+ *
+ * Applies, without unrolling, every record logged since the matching
+ * __wt_meta_track_sub_on, then rewinds the next-slot pointer to the start
+ * of the group. Does nothing if tracking is off or no group is open.
+ */
+int
+__wt_meta_track_sub_off(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_META_TRACK *trk, *trk_orig;
+
+ if (!WT_META_TRACKING(session) || session->meta_track_sub == NULL)
+ return (0);
+
+ trk_orig = session->meta_track_sub;
+ trk = session->meta_track_next;
+
+ /* Turn off tracking for unroll. */
+ session->meta_track_next = session->meta_track_sub = NULL;
+
+ /* Apply from newest to oldest. */
+ while (--trk >= trk_orig)
+ WT_TRET(__meta_track_apply(session, trk, 0));
+
+ /* Resume tracking, reusing the slots the group occupied. */
+ session->meta_track_next = trk_orig;
+ return (ret);
+}
+
+/*
+ * __wt_meta_track_checkpoint --
+ * Track a handle involved in a checkpoint.
+ *
+ * Logs a WT_ST_CHECKPOINT record for the session's current btree.
+ */
+int
+__wt_meta_track_checkpoint(WT_SESSION_IMPL *session)
+{
+ WT_META_TRACK *trk;
+
+ WT_ASSERT(session, session->dhandle != NULL);
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_CHECKPOINT;
+ trk->btree = S2BT(session);
+ return (0);
+}
+/*
+ * __wt_meta_track_insert --
+ * Track an insert operation.
+ *
+ * The record is logged as WT_ST_REMOVE with a copy of the key, so that
+ * unrolling removes the inserted row.
+ */
+int
+__wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key)
+{
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_REMOVE;
+ WT_RET(__wt_strdup(session, key, &trk->a));
+
+ return (0);
+}
+
+/*
+ * __wt_meta_track_update --
+ * Track a metadata update operation.
+ *
+ * Logs a WT_ST_SET record holding a copy of the key and of the row's
+ * current value. If the row does not exist yet the record becomes
+ * WT_ST_REMOVE instead.
+ */
+int
+__wt_meta_track_update(WT_SESSION_IMPL *session, const char *key)
+{
+ WT_DECL_RET;
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_SET;
+ WT_RET(__wt_strdup(session, key, &trk->a));
+
+ /*
+ * If there was a previous value, keep it around -- if not, then this
+ * "update" is really an insert.
+ */
+ if ((ret =
+ __wt_metadata_search(session, key, &trk->b)) == WT_NOTFOUND) {
+ trk->op = WT_ST_REMOVE;
+ ret = 0;
+ }
+ return (ret);
+}
+
+/*
+ * __wt_meta_track_fileop --
+ * Track a filesystem operation.
+ *
+ * Logs a WT_ST_FILEOP record with copies of olduri and newuri. Either may
+ * be NULL, in which case the matching string in the record is not set.
+ */
+int
+__wt_meta_track_fileop(
+ WT_SESSION_IMPL *session, const char *olduri, const char *newuri)
+{
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_FILEOP;
+ if (olduri != NULL)
+ WT_RET(__wt_strdup(session, olduri, &trk->a));
+ if (newuri != NULL)
+ WT_RET(__wt_strdup(session, newuri, &trk->b));
+ return (0);
+}
+
+/*
+ * __wt_meta_track_handle_lock --
+ * Track a locked handle.
+ *
+ * Logs a WT_ST_LOCK record for the session's current btree. created is
+ * stored in the record and consulted when the record is unrolled.
+ */
+int
+__wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created)
+{
+ WT_META_TRACK *trk;
+
+ WT_ASSERT(session, session->dhandle != NULL);
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_LOCK;
+ trk->btree = S2BT(session);
+ trk->created = created;
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
new file mode 100644
index 00000000000..d6060ebf47b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -0,0 +1,318 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __metadata_config --
+ *	Build the default configuration string for the metadata file; the
+ *	caller owns (and must free) the string returned in metaconfp.
+ */
+static int
+__metadata_config(WT_SESSION_IMPL *session, const char **metaconfp)
+{
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	const char *cfg[] = { WT_CONFIG_BASE(session, file_meta), NULL, NULL };
+	const char *collapsed;
+
+	collapsed = NULL;
+	*metaconfp = NULL;
+
+	/* Format the metadata file's own settings into a scratch buffer. */
+	WT_RET(__wt_scr_alloc(session, 0, &tmp));
+	WT_ERR(__wt_buf_fmt(session, tmp,
+	    "key_format=S,value_format=S,id=%d,version=(major=%d,minor=%d)",
+	    WT_METAFILE_ID,
+	    WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
+
+	/*
+	 * Collapse the base file_meta configuration plus the string we just
+	 * formatted into a single allocated string.
+	 */
+	cfg[1] = tmp->data;
+	WT_ERR(__wt_config_collapse(session, cfg, &collapsed));
+
+	*metaconfp = collapsed;
+
+	if (0) {
+err:		__wt_free(session, collapsed);
+	}
+	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __metadata_init --
+ *	Create the metadata file by running a schema create on its URI.
+ */
+static int
+__metadata_init(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+
+	/*
+	 * Nothing else is running at this point, but the lower-level code
+	 * checks that it is appropriately synchronized, so do the create
+	 * while holding the schema lock.
+	 */
+	WT_WITH_SCHEMA_LOCK(session,
+	    ret = __wt_schema_create(session, WT_METAFILE_URI, NULL));
+	return (ret);
+}
+
+/*
+ * __metadata_load_hot_backup --
+ *	If a hot backup file is present, load its key/value line pairs into
+ *	the metadata file.
+ */
+static int
+__metadata_load_hot_backup(WT_SESSION_IMPL *session)
+{
+	FILE *backup;
+	WT_DECL_ITEM(name);
+	WT_DECL_ITEM(config);
+	WT_DECL_RET;
+	char *path;
+
+	backup = NULL;
+	path = NULL;
+
+	/* A backup file we can't open is treated as "no backup to load". */
+	WT_RET(__wt_filename(session, WT_METADATA_BACKUP, &path));
+	backup = fopen(path, "r");
+	__wt_free(session, path);
+	if (backup == NULL)
+		return (0);
+
+	/*
+	 * The file is a sequence of line pairs: a key line followed by its
+	 * value line.  An empty key line is the end of the file, an empty
+	 * value line is a malformed file.
+	 */
+	WT_ERR(__wt_scr_alloc(session, 512, &name));
+	WT_ERR(__wt_scr_alloc(session, 512, &config));
+	for (;;) {
+		WT_ERR(__wt_getline(session, name, backup));
+		if (name->size == 0)
+			break;
+		WT_ERR(__wt_getline(session, config, backup));
+		if (config->size == 0)
+			WT_ERR(__wt_illegal_value(session, WT_METADATA_BACKUP));
+		WT_ERR(__wt_metadata_update(session, name->data, config->data));
+	}
+
+	/* Flag the connection as having been started from a backup. */
+	F_SET(S2C(session), WT_CONN_WAS_BACKUP);
+
+err:	if (backup != NULL)
+		WT_TRET(fclose(backup) == 0 ? 0 : __wt_errno());
+	__wt_scr_free(&name);
+	__wt_scr_free(&config);
+	return (ret);
+}
+
+/*
+ * __metadata_load_bulk --
+ *	Create any bulk-loaded file stubs: walk the metadata and re-create
+ *	every "file:" object whose underlying file doesn't exist.
+ */
+static int
+__metadata_load_bulk(WT_SESSION_IMPL *session)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	uint32_t allocsize;
+	int exist;
+	const char *filecfg[] = { WT_CONFIG_BASE(session, file_meta), NULL };
+	const char *key;
+
+	/*
+	 * The error path tests the cursor before closing it, so it must have
+	 * a known value even if opening the cursor fails.
+	 */
+	cursor = NULL;
+
+	/*
+	 * If a file was being bulk-loaded during the hot backup, it will appear
+	 * in the metadata file, but the file won't exist.  Create on demand.
+	 */
+	WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+	while ((ret = cursor->next(cursor)) == 0) {
+		/* Only "file:" entries are of interest; skip the prefix. */
+		WT_ERR(cursor->get_key(cursor, &key));
+		if (!WT_PREFIX_SKIP(key, "file:"))
+			continue;
+
+		/* If the file exists, it's all good. */
+		WT_ERR(__wt_exist(session, key, &exist));
+		if (exist)
+			continue;
+
+		/*
+		 * If the file doesn't exist, assume it's a bulk-loaded file;
+		 * retrieve the allocation size and re-create the file.
+		 */
+		WT_ERR(__wt_direct_io_size_check(
+		    session, filecfg, "allocation_size", &allocsize));
+		WT_ERR(__wt_block_manager_create(session, key, allocsize));
+	}
+	/* Running off the end of the metadata is the normal way out. */
+	WT_ERR_NOTFOUND_OK(ret);
+
+err:	if (cursor != NULL)
+		WT_TRET(cursor->close(cursor));
+
+	return (ret);
+}
+
+/*
+ * __wt_turtle_init --
+ *	Check for the turtle file, creating it (and the metadata file) when
+ *	it's missing.
+ */
+int
+__wt_turtle_init(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+	int found;
+	const char *config;
+
+	config = NULL;
+
+	/*
+	 * A turtle setup file may be left over from a previous run; it has no
+	 * effect on correctness, removing it is just housekeeping.
+	 */
+	WT_RET(__wt_exist(session, WT_METADATA_TURTLE_SET, &found));
+	if (found)
+		WT_RET(__wt_remove(session, WT_METADATA_TURTLE_SET));
+
+	/*
+	 * The turtle file is created last, after the metadata file has been
+	 * created and populated.  If we die part-way through, there's no
+	 * turtle file and the next open repeats the whole process, so an
+	 * existing turtle file means there's nothing left to do.
+	 */
+	WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &found));
+	if (found)
+		return (0);
+
+	/*
+	 * In order: create the metadata file, load any hot-backup
+	 * information, then create stubs for any bulk-loaded files.
+	 */
+	WT_RET(__metadata_init(session));
+	WT_RET(__metadata_load_hot_backup(session));
+	WT_RET(__metadata_load_bulk(session));
+
+	/* Write the turtle file with the metadata file's configuration. */
+	WT_RET(__metadata_config(session, &config));
+	WT_ERR(__wt_turtle_update(session, WT_METAFILE_URI, config));
+
+	/* The backup file is never read again, discard it if it's there. */
+	WT_ERR(__wt_exist(session, WT_METADATA_BACKUP, &found));
+	if (found)
+		WT_ERR(__wt_remove(session, WT_METADATA_BACKUP));
+
+err:	__wt_free(session, config);
+	return (ret);
+}
+
+/*
+ * __wt_turtle_read --
+ *	Look up a key in the turtle file, returning an allocated copy of its
+ *	value.
+ */
+int
+__wt_turtle_read(WT_SESSION_IMPL *session, const char *key, const char **valuep)
+{
+	FILE *turtle;
+	WT_DECL_ITEM(line);
+	WT_DECL_RET;
+	int found;
+	char *path;
+
+	*valuep = NULL;
+
+	turtle = NULL;
+	path = NULL;
+
+	/*
+	 * Open the turtle file, saving the system error before anything else
+	 * is called.
+	 */
+	WT_RET(__wt_filename(session, WT_METADATA_TURTLE, &path));
+	if ((turtle = fopen(path, "r")) == NULL)
+		ret = __wt_errno();
+	__wt_free(session, path);
+
+	/*
+	 * The metadata file is created before the turtle file, so a lookup of
+	 * the metadata file's own URI can arrive before there's a turtle file
+	 * to read: answer it with the default configuration.  Any other key
+	 * gets the open error.
+	 */
+	if (turtle == NULL) {
+		if (strcmp(key, WT_METAFILE_URI) == 0)
+			return (__metadata_config(session, valuep));
+		return (ret);
+	}
+
+	/*
+	 * The file is key/value line pairs.  Read a pair at a time, stopping
+	 * after the pair whose key line matches; the value line is always
+	 * read so the buffer holds the value when we leave the loop.
+	 */
+	WT_ERR(__wt_scr_alloc(session, 512, &line));
+	for (;;) {
+		WT_ERR(__wt_getline(session, line, turtle));
+		if (line->size == 0)
+			WT_ERR(WT_NOTFOUND);
+		found = strcmp(key, line->data) == 0;
+
+		WT_ERR(__wt_getline(session, line, turtle));
+		if (line->size == 0)
+			WT_ERR(__wt_illegal_value(session, WT_METADATA_TURTLE));
+		if (found)
+			break;
+	}
+
+	/* Hand the caller a private copy of the value. */
+	WT_ERR(__wt_strdup(session, line->data, valuep));
+
+err:	if (turtle != NULL)
+		WT_TRET(fclose(turtle) == 0 ? 0 : __wt_errno());
+	__wt_scr_free(&line);
+	return (ret);
+}
+
+/*
+ * __wt_turtle_update --
+ * Update the turtle file.
+ *
+ * The new contents (version information followed by the key and value
+ * lines) are written to the turtle setup file, which is then renamed over
+ * the turtle file. On a failure after the setup file was opened, the setup
+ * file is removed.
+ */
+int
+__wt_turtle_update(
+ WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+ FILE *fp;
+ WT_DECL_RET;
+ int vmajor, vminor, vpatch;
+ const char *version;
+ char *path;
+
+ fp = NULL;
+ path = NULL;
+
+ /*
+ * Create the turtle setup file: we currently re-write it from scratch
+ * every time.
+ */
+ WT_RET(__wt_filename(session, WT_METADATA_TURTLE_SET, &path));
+ /* Save the system error before freeing the path. */
+ if ((fp = fopen(path, "w")) == NULL)
+ ret = __wt_errno();
+ __wt_free(session, path);
+ if (fp == NULL)
+ return (ret);
+
+ /*
+ * Six lines are written: the version-string label and library version
+ * string, the version label and major/minor/patch numbers, then the
+ * caller's key and value.
+ */
+ version = wiredtiger_version(&vmajor, &vminor, &vpatch);
+ WT_ERR_TEST((fprintf(fp,
+ "%s\n%s\n%s\n" "major=%d,minor=%d,patch=%d\n%s\n%s\n",
+ WT_METADATA_VERSION_STR, version,
+ WT_METADATA_VERSION, vmajor, vminor, vpatch,
+ key, value) < 0), __wt_errno());
+
+ /*
+ * Close before renaming; the handle is cleared first so the cleanup
+ * code at the end of the function can't close it a second time.
+ */
+ ret = fclose(fp);
+ fp = NULL;
+ WT_ERR_TEST(ret == EOF, __wt_errno());
+
+ WT_ERR(
+ __wt_rename(session, WT_METADATA_TURTLE_SET, WT_METADATA_TURTLE));
+
+ /* Only reached by jumping to the error label: discard the setup file. */
+ if (0) {
+err: WT_TRET(__wt_remove(session, WT_METADATA_TURTLE_SET));
+ }
+
+ /* The handle is still open here only if the write failed. */
+ if (fp != NULL)
+ WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_abort.c b/src/third_party/wiredtiger/src/os_posix/os_abort.c
new file mode 100644
index 00000000000..3d99ffe20b2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_abort.c
@@ -0,0 +1,26 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_abort --
+ *	Log a message and abort the process, dropping core.
+ */
+void
+__wt_abort(WT_SESSION_IMPL *session)
+    WT_GCC_FUNC_ATTRIBUTE((noreturn))
+{
+	/* Say what's happening before the process goes away. */
+	__wt_errx(session, "aborting WiredTiger library");
+
+#ifdef HAVE_DIAGNOSTIC
+	/* Diagnostic builds call the attach hook before aborting. */
+	__wt_attach(session);
+#endif
+
+	abort();
+	/* NOTREACHED */
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_alloc.c b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
new file mode 100644
index 00000000000..f7344032a15
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There's no malloc interface, WiredTiger never calls malloc.
+ *
+ * The problem is an application might allocate memory, write secret stuff in
+ * it, free the memory, then WiredTiger allocates the memory and uses it for a
+ * file page or log record, then writes it to disk, without having overwritten
+ * it fully. That results in the secret stuff being protected by WiredTiger's
+ * permission mechanisms, potentially inappropriate for the secret stuff.
+ */
+
+/*
+ * __wt_calloc --
+ *	ANSI calloc function: allocate zeroed memory, returning the address
+ *	through retp.
+ */
+int
+__wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
+{
+	void *mem;
+
+	/*
+	 * !!!
+	 * The WT_SESSION_IMPL handle may be NULL, this function MUST cope.
+	 */
+	WT_ASSERT(session, number != 0 && size != 0);
+
+	/* Statistics live in the connection, count only with a session. */
+	if (session != NULL)
+		WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+
+	mem = calloc(number, size);
+	if (mem == NULL)
+		WT_RET_MSG(session, __wt_errno(), "memory allocation");
+
+	*(void **)retp = mem;
+	return (0);
+}
+
+/*
+ * __wt_realloc --
+ *	ANSI realloc function; memory beyond the previously allocated length
+ *	is cleared.
+ */
+int
+__wt_realloc(WT_SESSION_IMPL *session,
+    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
+{
+	void *mem;
+	size_t old_len;
+
+	/*
+	 * !!!
+	 * The WT_SESSION_IMPL handle may be NULL, this function MUST cope.
+	 *
+	 * Callers that don't track the allocation length pass a NULL
+	 * bytes_allocated_ret; the previous length is then taken to be 0.
+	 */
+	mem = *(void **)retp;
+	old_len = 0;
+	if (bytes_allocated_ret != NULL)
+		old_len = *bytes_allocated_ret;
+	WT_ASSERT(session,
+	    (mem == NULL && old_len == 0) ||
+	    (mem != NULL &&
+	    (bytes_allocated_ret == NULL || old_len != 0)));
+	WT_ASSERT(session, bytes_to_allocate != 0);
+	WT_ASSERT(session, old_len < bytes_to_allocate);
+
+	/* A NULL starting address is an allocation, anything else a grow. */
+	if (session != NULL) {
+		if (mem == NULL)
+			WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+		else
+			WT_STAT_FAST_CONN_INCR(session, memory_grow);
+	}
+
+	mem = realloc(mem, bytes_to_allocate);
+	if (mem == NULL)
+		WT_RET_MSG(session, __wt_errno(), "memory allocation");
+
+	/*
+	 * Zero everything past the old length.  The memory may previously
+	 * have held an application's secrets; if we later write it to disk in
+	 * a file page or log record without fully overwriting it, those
+	 * secrets would end up protected only by WiredTiger's permission
+	 * mechanisms, which may not be appropriate for them.
+	 *
+	 * Note the old length is 0 when bytes_allocated_ret is NULL, so in
+	 * that case the entire buffer is cleared.
+	 */
+	memset((uint8_t *)mem + old_len, 0, bytes_to_allocate - old_len);
+
+	/* Report the new length to callers that track it. */
+	if (bytes_allocated_ret != NULL)
+		*bytes_allocated_ret = bytes_to_allocate;
+
+	*(void **)retp = mem;
+	return (0);
+}
+
+/*
+ * __wt_realloc_aligned --
+ * ANSI realloc function that aligns to buffer boundaries, configured with
+ * the "buffer_alignment" key to wiredtiger_open.
+ *
+ * When posix_memalign is available, the session is non-NULL and the
+ * connection's buffer_alignment is greater than zero, a new aligned buffer
+ * is allocated, the old contents are copied into it and the old buffer is
+ * freed. In every other case the call is passed through to __wt_realloc.
+ */
+int
+__wt_realloc_aligned(WT_SESSION_IMPL *session,
+ size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
+{
+#if defined(HAVE_POSIX_MEMALIGN)
+ WT_DECL_RET;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ */
+ if (session != NULL && S2C(session)->buffer_alignment > 0) {
+ void *p, *newp;
+ size_t bytes_allocated;
+
+ /*
+ * Sometimes we're allocating memory and we don't care about the
+ * final length -- bytes_allocated_ret may be NULL.
+ */
+ p = *(void **)retp;
+ bytes_allocated =
+ (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
+ WT_ASSERT(session,
+ (p == NULL && bytes_allocated == 0) ||
+ (p != NULL &&
+ (bytes_allocated_ret == NULL || bytes_allocated != 0)));
+ WT_ASSERT(session, bytes_to_allocate != 0);
+ WT_ASSERT(session, bytes_allocated < bytes_to_allocate);
+
+ /*
+ * NOTE(review): the session test is redundant here, the enclosing
+ * condition already established the session is not NULL.
+ */
+ if (session != NULL)
+ WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+
+ /* posix_memalign returns the error number directly. */
+ if ((ret = posix_memalign(&newp,
+ S2C(session)->buffer_alignment,
+ bytes_to_allocate)) != 0)
+ WT_RET_MSG(session, ret, "memory allocation");
+
+ /*
+ * Carry the old contents over (bytes_allocated is 0 when the caller
+ * isn't tracking the length, so nothing is copied in that case),
+ * then release the old buffer.
+ */
+ if (p != NULL)
+ memcpy(newp, p, bytes_allocated);
+ __wt_free(session, p);
+ p = newp;
+
+ /* Clear the allocated memory (see above). */
+ memset((uint8_t *)p + bytes_allocated, 0,
+ bytes_to_allocate - bytes_allocated);
+
+ /* Update caller's bytes allocated value. */
+ if (bytes_allocated_ret != NULL)
+ *bytes_allocated_ret = bytes_to_allocate;
+
+ *(void **)retp = p;
+ return (0);
+ }
+#endif
+ /*
+ * If there is no posix_memalign function, or no alignment configured,
+ * fall back to realloc.
+ *
+ * Windows note: Visual C CRT memalign does not match Posix behavior
+ * and would also double each allocation so it is bad for memory use
+ */
+ return (__wt_realloc(
+ session, bytes_allocated_ret, bytes_to_allocate, retp));
+}
+
+/*
+ * __wt_strndup --
+ *	Duplicate len bytes of a byte string, NUL-terminating the copy; a
+ *	NULL source yields a NULL copy.
+ */
+int
+__wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp)
+{
+	void *copy;
+
+	if (str != NULL) {
+		/* One extra, zeroed byte supplies the trailing NUL. */
+		WT_RET(__wt_calloc(session, len + 1, 1, &copy));
+
+		/*
+		 * This must remain a memcpy and not become a strncpy: callers
+		 * use this function to duplicate "strings" with embedded nul
+		 * bytes.
+		 */
+		memcpy(copy, str, len);
+
+		*(void **)retp = copy;
+	} else
+		*(void **)retp = NULL;
+	return (0);
+}
+
+/*
+ * __wt_strdup --
+ *	ANSI strdup function, built on __wt_strndup.
+ */
+int
+__wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
+{
+	size_t len;
+
+	/* A NULL string is passed through with a length of 0. */
+	len = 0;
+	if (str != NULL)
+		len = strlen(str);
+	return (__wt_strndup(session, str, len, retp));
+}
+
+/*
+ * __wt_free_int --
+ *	ANSI free function; takes the address of the pointer to free and
+ *	clears that pointer.
+ */
+void
+__wt_free_int(WT_SESSION_IMPL *session, const void *p_arg)
+{
+	void **slot, *mem;
+
+	slot = (void **)p_arg;
+
+	/* ANSI C free semantics: freeing NULL does nothing. */
+	if ((mem = *slot) == NULL)
+		return;
+
+	/*
+	 * Clear the caller's pointer before freeing.  With a serialization
+	 * bug another thread might be racing to free the same memory; we
+	 * can't close that race (and aren't willing to flush memory), but
+	 * clearing early narrows the window in the hope the other thread sees
+	 * a NULL pointer and leaves it alone.
+	 */
+	*slot = NULL;
+
+	/*
+	 * !!!
+	 * The WT_SESSION_IMPL handle may be NULL, this function MUST cope.
+	 */
+	if (session != NULL)
+		WT_STAT_FAST_CONN_INCR(session, memory_free);
+
+	free(mem);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_dir.c b/src/third_party/wiredtiger/src/os_posix/os_dir.c
new file mode 100644
index 00000000000..98b2d4926cd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_dir.c
@@ -0,0 +1,94 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+/* I'm sure we need to config this */
+#include <dirent.h>
+
+/*
+ * __wt_dirlist --
+ *	Get a list of files from a directory, optionally filtered by
+ *	a given prefix.
+ *
+ *	With a prefix, WT_DIRLIST_INCLUDE returns the names starting with the
+ *	prefix and WT_DIRLIST_EXCLUDE the names that don't; flags of 0 means
+ *	WT_DIRLIST_INCLUDE.  Without a prefix every name is returned.  The
+ *	"." and ".." entries are never returned.
+ *
+ *	On success *dirlist references an allocated array of allocated names
+ *	(NULL if nothing matched) and *countp holds the number of names.  On
+ *	failure *dirlist is NULL, *countp is 0 and any partially built list
+ *	has been freed.
+ */
+int
+__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
+    uint32_t flags, char ***dirlist, u_int *countp)
+{
+	struct dirent *dp;
+	DIR *dirp;
+	WT_DECL_RET;
+	size_t dirallocsz;
+	u_int count, dirsz;
+	int match;
+	char **entries, *path;
+
+	*dirlist = NULL;
+	*countp = 0;
+
+	WT_RET(__wt_filename(session, dir, &path));
+
+	/*
+	 * Everything the error path looks at is initialized before the first
+	 * statement that can jump there.
+	 */
+	dirp = NULL;
+	dirallocsz = 0;
+	count = dirsz = 0;
+	entries = NULL;
+	if (flags == 0)
+		LF_SET(WT_DIRLIST_INCLUDE);
+
+	WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS,
+	    "wt_dirlist of %s %s prefix %s",
+	    path, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude",
+	    prefix == NULL ? "all" : prefix));
+
+	WT_SYSCALL_RETRY(((dirp = opendir(path)) == NULL ? 1 : 0), ret);
+	if (ret != 0)
+		WT_ERR_MSG(session, ret, "%s: opendir", path);
+	while ((dp = readdir(dirp)) != NULL) {
+		/*
+		 * Skip . and ..
+		 */
+		if (strcmp(dp->d_name, ".") == 0 ||
+		    strcmp(dp->d_name, "..") == 0)
+			continue;
+		match = 0;
+		if (prefix != NULL &&
+		    ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
+		    WT_PREFIX_MATCH(dp->d_name, prefix)) ||
+		    (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
+		    !WT_PREFIX_MATCH(dp->d_name, prefix))))
+			match = 1;
+		if (prefix != NULL && !match)
+			continue;
+
+		/*
+		 * We have a file name we want to return.  The array size and
+		 * the entry count are only advanced after the corresponding
+		 * allocation succeeds, so on error "count" is exactly the
+		 * number of names that need to be freed.
+		 */
+		if (count >= dirsz) {
+			WT_ERR(__wt_realloc_def(session,
+			    &dirallocsz, dirsz + WT_DIR_ENTRY, &entries));
+			dirsz += WT_DIR_ENTRY;
+		}
+		WT_ERR(__wt_strdup(session, dp->d_name, &entries[count]));
+		++count;
+	}
+	if (count > 0)
+		*dirlist = entries;
+	*countp = count;
+err:
+	if (dirp != NULL)
+		(void)closedir(dirp);
+	__wt_free(session, path);
+
+	if (ret == 0)
+		return (0);
+
+	/*
+	 * Discard whatever was built before the failure.  The caller's
+	 * pointer is still NULL at this point, so the test has to be on the
+	 * local array.
+	 */
+	if (entries != NULL) {
+		for (; count > 0; count--)
+			__wt_free(session, entries[count - 1]);
+		__wt_free(session, entries);
+	}
+	WT_RET_MSG(session, ret, "dirlist %s prefix %s",
+	    dir, prefix == NULL ? "all" : prefix);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_dlopen.c b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c
new file mode 100644
index 00000000000..91410c54c04
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c
@@ -0,0 +1,83 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dlopen --
+ *	Open a dynamic library, returning an allocated handle structure.
+ */
+int
+__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
+{
+	WT_DECL_RET;
+	WT_DLH *lib;
+
+	/* The structure keeps its own copy of the library's path. */
+	WT_RET(__wt_calloc_def(session, 1, &lib));
+	WT_ERR(__wt_strdup(session, path, &lib->name));
+
+	lib->handle = dlopen(path, RTLD_LAZY);
+	if (lib->handle == NULL)
+		WT_ERR_MSG(
+		    session, __wt_errno(), "dlopen(%s): %s", path, dlerror());
+
+	*dlhp = lib;
+	return (0);
+
+	/* Failure: discard the partially built structure. */
+err:	__wt_free(session, lib->name);
+	__wt_free(session, lib);
+	return (ret);
+}
+
+/*
+ * __wt_dlsym --
+ *	Look up a symbol in a dynamic library; a missing symbol is an error
+ *	only if the caller asked for it to be.
+ */
+int
+__wt_dlsym(WT_SESSION_IMPL *session,
+    WT_DLH *dlh, const char *name, int fail, void *sym_ret)
+{
+	void *sym;
+
+	/* The caller sees NULL unless the symbol is found. */
+	*(void **)sym_ret = NULL;
+
+	sym = dlsym(dlh->handle, name);
+	if (sym == NULL && fail)
+		WT_RET_MSG(session, __wt_errno(),
+		    "dlsym(%s in %s): %s", name, dlh->name, dlerror());
+
+	/* Either the symbol's address, or NULL for a tolerated miss. */
+	*(void **)sym_ret = sym;
+	return (0);
+}
+
+/*
+ * __wt_dlclose --
+ *	Close a dynamic library and free its handle structure.
+ */
+int
+__wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh)
+{
+	WT_DECL_RET;
+
+	/*
+	 * Closing handles on FreeBSD dies inside __cxa_finalize, so the
+	 * dlclose is skipped there for now.  That may leak some resources
+	 * until the process exits, which beats hard-to-debug crashes during
+	 * exit.
+	 */
+#ifndef __FreeBSD__
+	if (dlclose(dlh->handle) != 0) {
+		ret = __wt_errno();
+		__wt_err(session, ret, "dlclose: %s", dlerror());
+	}
+#endif
+
+	/* The structure is discarded whether or not the close worked. */
+	__wt_free(session, dlh->name);
+	__wt_free(session, dlh);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_errno.c b/src/third_party/wiredtiger/src/os_posix/os_errno.c
new file mode 100644
index 00000000000..9290f7d651f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_errno.c
@@ -0,0 +1,22 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_errno --
+ *	Return the system error code, or WT_ERROR if errno isn't set.
+ */
+int
+__wt_errno(void)
+{
+	int error;
+
+	/*
+	 * Callers know something failed and want the system's error code,
+	 * but there's some chance it was never set: make sure what we return
+	 * is never 0.
+	 */
+	error = errno;
+	if (error == 0)
+		return (WT_ERROR);
+	return (error);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_exist.c b/src/third_party/wiredtiger/src/os_posix/os_exist.c
new file mode 100644
index 00000000000..723f07026e1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_exist.c
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_exist --
+ *	Return if the file exists: *existp is set to 1 if the named file can
+ *	be stat'd and to 0 if the stat fails with ENOENT.  Any other failure
+ *	is returned as an error and *existp is not set.
+ */
+int
+__wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp)
+{
+	struct stat sb;
+	WT_DECL_RET;
+	char *path;
+
+	WT_RET(__wt_filename(session, filename, &path));
+
+	WT_SYSCALL_RETRY(stat(path, &sb), ret);
+
+	/* The path was only needed for the system call. */
+	__wt_free(session, path);
+
+	if (ret == 0) {
+		*existp = 1;
+		return (0);
+	}
+	if (ret == ENOENT) {
+		*existp = 0;
+		return (0);
+	}
+
+	/* The call that failed is stat, name it correctly in the message. */
+	WT_RET_MSG(session, ret, "%s: stat", filename);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
new file mode 100644
index 00000000000..28cd1979c77
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
@@ -0,0 +1,97 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#if defined(HAVE_FALLOCATE)
+#include <linux/falloc.h>
+#endif
+
+/*
+ * __wt_fallocate_config --
+ * Configure fallocate behavior for a file handle.
+ *
+ * Sets the handle's fallocate_available and fallocate_requires_locking
+ * fields from the platform and the functions found at build time; both are
+ * left at 0 when neither fallocate nor posix_fallocate was found.
+ */
+void
+__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_UNUSED(session);
+
+ /* Start from "not available", the code below may override it. */
+ fh->fallocate_available = 0;
+ fh->fallocate_requires_locking = 0;
+
+#ifdef __linux__
+ /*
+ * We've seen Linux systems where posix_fallocate corrupts existing data
+ * (even though that is explicitly disallowed by POSIX). We've not seen
+ * problems with fallocate, it's unlocked for now.
+ */
+#if defined(HAVE_FALLOCATE)
+ fh->fallocate_available = 1;
+ fh->fallocate_requires_locking = 0;
+#elif defined(HAVE_POSIX_FALLOCATE)
+ /* Linux with only posix_fallocate: available, but requires locking. */
+ fh->fallocate_available = 1;
+ fh->fallocate_requires_locking = 1;
+#endif
+#elif defined(HAVE_POSIX_FALLOCATE)
+ /*
+ * FreeBSD and Solaris support posix_fallocate, and so far we've seen
+ * no problems leaving it unlocked.
+ */
+ fh->fallocate_available = 1;
+ fh->fallocate_requires_locking = 0;
+#endif
+}
+
+/*
+ * __wt_fallocate --
+ * Allocate space for a file handle.
+ *
+ * Returns 0 if the underlying call succeeds. If the call reports the
+ * operation isn't supported (or no allocation function was found at build
+ * time), the handle's fallocate fields are cleared and ENOTSUP is returned.
+ * Any other failure is reported and returned as an error.
+ */
+int
+__wt_fallocate(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
+{
+ WT_DECL_RET;
+
+#if defined(HAVE_FALLOCATE)
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: fallocate", fh->name));
+ /* FALLOC_FL_KEEP_SIZE: the call is asked not to change the file size. */
+ WT_SYSCALL_RETRY(
+ fallocate(fh->fd, FALLOC_FL_KEEP_SIZE, offset, len), ret);
+ if (ret == 0)
+ return (0);
+
+ /*
+ * Linux returns ENOTSUP for fallocate on some file systems; we return
+ * ENOTSUP, and our caller should avoid calling us again.
+ */
+ if (ret != ENOTSUP)
+ WT_RET_MSG(session, ret, "%s: fallocate", fh->name);
+#elif defined(HAVE_POSIX_FALLOCATE)
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: posix_fallocate", fh->name));
+ WT_SYSCALL_RETRY(posix_fallocate(fh->fd, offset, len), ret);
+ if (ret == 0)
+ return (0);
+
+ /*
+ * Solaris returns EINVAL for posix_fallocate on some file systems; we
+ * return ENOTSUP, and our caller should avoid calling us again.
+ */
+ if (ret != EINVAL)
+ WT_RET_MSG(session, ret, "%s: posix_fallocate", fh->name);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(fh);
+ WT_UNUSED(offset);
+ WT_UNUSED(len);
+ WT_UNUSED(ret);
+#endif
+
+ /*
+ * Only reached when allocation isn't supported: turn it off in the
+ * handle before returning ENOTSUP.
+ */
+ fh->fallocate_available = 0;
+ fh->fallocate_requires_locking = 0;
+ return (ENOTSUP);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_filesize.c b/src/third_party/wiredtiger/src/os_posix/os_filesize.c
new file mode 100644
index 00000000000..3692b135d73
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_filesize.c
@@ -0,0 +1,55 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_filesize --
+ *	Return the size in bytes of an open file.
+ */
+int
+__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+{
+	struct stat sb;
+	WT_DECL_RET;
+
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fstat", fh->name));
+
+	WT_SYSCALL_RETRY(fstat(fh->fd, &sb), ret);
+	if (ret != 0)
+		WT_RET_MSG(session, ret, "%s: fstat", fh->name);
+
+	/* The size is only stored on success. */
+	*sizep = sb.st_size;
+	return (0);
+}
+
+/*
+ * __wt_filesize_name --
+ *	Return the size of a file in bytes, given a file name.  On failure
+ *	the error is reported and returned, and *sizep is not set.
+ */
+int
+__wt_filesize_name(
+    WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep)
+{
+	struct stat sb;
+	WT_DECL_RET;
+	char *path;
+
+	WT_RET(__wt_filename(session, filename, &path));
+
+	WT_SYSCALL_RETRY(stat(path, &sb), ret);
+
+	/* The path was only needed for the system call. */
+	__wt_free(session, path);
+
+	if (ret == 0) {
+		*sizep = sb.st_size;
+		return (0);
+	}
+
+	/* The call that failed is stat, name it correctly in the message. */
+	WT_RET_MSG(session, ret, "%s: stat", filename);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_flock.c b/src/third_party/wiredtiger/src/os_posix/os_flock.c
new file mode 100644
index 00000000000..e9e653d73e6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_flock.c
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bytelock --
+ *	Lock or unlock a single byte in a file.
+ */
+int
+__wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock)
+{
+	struct flock fl;
+	WT_DECL_RET;
+
+	/*
+	 * WiredTiger requires locks can be acquired past the end of the file.
+	 *
+	 * This is fcntl(2) locking, which has a trap: when a process closes
+	 * any file descriptor for a file, all of the process' fcntl locks on
+	 * that file are removed, even if the descriptor being closed was
+	 * never used to request a lock.
+	 */
+	fl.l_type = lock ? F_WRLCK : F_UNLCK;
+	fl.l_whence = SEEK_SET;
+	fl.l_start = byte;
+	fl.l_len = 1;
+
+	/* F_SETLK doesn't wait: a conflicting lock is an error return. */
+	WT_SYSCALL_RETRY(fcntl(fhp->fd, F_SETLK, &fl), ret);
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fsync.c b/src/third_party/wiredtiger/src/os_posix/os_fsync.c
new file mode 100644
index 00000000000..c181809df95
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_fsync.c
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fsync --
+ *	Flush a file handle, waiting for the flush to complete.
+ */
+int
+__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+	WT_DECL_RET;
+
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fsync", fh->name));
+
+	/* Use fdatasync if the build found one, fsync otherwise. */
+#ifdef HAVE_FDATASYNC
+	WT_SYSCALL_RETRY(fdatasync(fh->fd), ret);
+#else
+	WT_SYSCALL_RETRY(fsync(fh->fd), ret);
+#endif
+	if (ret == 0)
+		return (0);
+
+	WT_RET_MSG(session, ret, "%s fsync error", fh->name);
+}
+
+/*
+ * __wt_fsync_async --
+ *	Flush a file handle and don't wait for the result.  This is a no-op
+ *	returning 0 when sync_file_range wasn't found at build time.
+ */
+int
+__wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+#ifdef HAVE_SYNC_FILE_RANGE
+	WT_DECL_RET;
+
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_FILEOPS, "%s: sync_file_range", fh->name));
+
+	/*
+	 * sync_file_range reports failure by returning -1 and setting errno,
+	 * its return value is not an error code.  Make the call the same way
+	 * the other system calls in this directory are made, so the error we
+	 * report and return is the system error and not -1.
+	 */
+	WT_SYSCALL_RETRY(sync_file_range(fh->fd,
+	    (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE), ret);
+	if (ret == 0)
+		return (0);
+	WT_RET_MSG(session, ret, "%s: sync_file_range", fh->name);
+#else
+	WT_UNUSED(session);
+	WT_UNUSED(fh);
+	return (0);
+#endif
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c b/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c
new file mode 100644
index 00000000000..3f3034de551
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c
@@ -0,0 +1,26 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ftruncate --
+ *	Truncate a file to the given length.
+ */
+int
+__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
+{
+	WT_DECL_RET;
+
+	WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret);
+	if (ret != 0)
+		WT_RET_MSG(session, ret, "%s ftruncate error", fh->name);
+
+	/* Keep the handle's sizes in step with the file. */
+	fh->size = fh->extend_size = len;
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_getline.c b/src/third_party/wiredtiger/src/os_posix/os_getline.c
new file mode 100644
index 00000000000..7ef4065ac3b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_getline.c
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_getline --
+ *	Read the next non-empty line from a stream into a buffer.
+ *
+ * A local version of POSIX getline or BSD fgetln: finding either of them in a
+ * portable way is hard, and the function is simple enough to write.
+ *
+ * Note: this differs from the standard calls in two ways, the trailing newline
+ * character isn't included in the returned buffer and empty lines are skipped
+ * (which makes a returned line length of 0 the caller's EOF marker).
+ */
+int
+__wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, FILE *fp)
+{
+	int ch;
+
+	/*
+	 * The returned string is always NUL-terminated, even when it's empty,
+	 * so the buffer must have room for a trailing NUL in all cases.
+	 */
+	WT_RET(__wt_buf_init(session, buf, 100));
+
+	for (;;) {
+		if ((ch = fgetc(fp)) == EOF)
+			break;
+
+		/* Room for this character and a trailing NUL. */
+		WT_RET(__wt_buf_extend(session, buf, buf->size + 2));
+
+		if (ch != '\n') {
+			((char *)buf->mem)[buf->size++] = (char)ch;
+			continue;
+		}
+
+		/* A newline ends the line, unless the line is still empty. */
+		if (buf->size != 0)
+			break;
+	}
+	if (ch == EOF && ferror(fp))
+		WT_RET_MSG(session, __wt_errno(), "file read");
+
+	((char *)buf->mem)[buf->size] = '\0';
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_getopt.c b/src/third_party/wiredtiger/src/os_posix/os_getopt.c
new file mode 100644
index 00000000000..1c25521dacd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_getopt.c
@@ -0,0 +1,150 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* $NetBSD: getopt.c,v 1.26 2003/08/07 16:43:40 agc Exp $ */
+
+/*
+ * Copyright (c) 1987, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "wt_internal.h"
+
+extern int __wt_opterr, __wt_optind, __wt_optopt, __wt_optreset;
+int __wt_opterr = 1, /* non-zero: print error messages to stderr */
+ __wt_optind = 1, /* index into parent argv vector */
+ __wt_optopt, /* option character most recently examined */
+ __wt_optreset; /* set non-zero to restart scanning */
+
+extern char *__wt_optarg;
+char *__wt_optarg; /* argument associated with option, or NULL */
+
+#define BADCH (int)'?' /* returned for an unknown option or missing argument */
+#define BADARG (int)':' /* returned for a missing argument if ostr starts ':' */
+#define EMSG "" /* empty scan position: the next call moves to a new argument */
+
+/*
+ * __wt_getopt --
+ *	Parse argc/argv argument vector: returns the next option letter, -1 at
+ * the end of the options, or BADCH/BADARG for a bad or incomplete option.
+ */
+int
+__wt_getopt(
+    const char *progname, int nargc, char * const *nargv, const char *ostr)
+{
+	static const char *place = EMSG;	/* option letter processing */
+	const char *oli;			/* option letter list index */
+
+	if (__wt_optreset || *place == 0) {	/* update scanning pointer */
+		__wt_optreset = 0;
+		/* Check the bound first: don't index past the end of nargv. */
+		place = __wt_optind < nargc ? nargv[__wt_optind] : NULL;
+		if (place == NULL || *place++ != '-') {
+			/* Argument is absent or is not an option */
+			place = EMSG;
+			return (-1);
+		}
+		__wt_optopt = *place++;
+		if (__wt_optopt == '-' && *place == 0) {
+			/* "--" => end of options */
+			++__wt_optind;
+			place = EMSG;
+			return (-1);
+		}
+		if (__wt_optopt == 0) {
+			/* Solitary '-', treat as a '-' option
+			   if the program (eg su) is looking for it. */
+			place = EMSG;
+			if (strchr(ostr, '-') == NULL)
+				return (-1);
+			__wt_optopt = '-';
+		}
+	} else
+		__wt_optopt = *place++;
+
+	/* See if option letter is one the caller wanted... */
+	if (__wt_optopt == ':' || (oli = strchr(ostr, __wt_optopt)) == NULL) {
+		if (*place == 0)
+			++__wt_optind;
+		if (__wt_opterr && *ostr != ':')
+			(void)fprintf(stderr, "%s: illegal option -- %c\n",
+			    progname, __wt_optopt);
+		return (BADCH);
+	}
+
+	/* Does this option need an argument? */
+	if (oli[1] != ':') {			/* don't need argument */
+		__wt_optarg = NULL;
+		if (*place == 0)
+			++__wt_optind;
+	} else {
+		/* Option-argument is either the rest of this argument or the
+		   entire next argument. */
+		if (*place)
+			__wt_optarg = (char *)place;
+		else if (nargc > ++__wt_optind)
+			__wt_optarg = nargv[__wt_optind];
+		else {
+			/* option-argument absent */
+			place = EMSG;
+			if (*ostr == ':')
+				return (BADARG);
+			if (__wt_opterr)
+				(void)fprintf(stderr,
+				    "%s: option requires an argument -- %c\n",
+				    progname, __wt_optopt);
+			return (BADCH);
+		}
+		place = EMSG;
+		++__wt_optind;
+	}
+	return (__wt_optopt);			/* return option letter */
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_map.c b/src/third_party/wiredtiger/src/os_posix/os_map.c
new file mode 100644
index 00000000000..be4d27e96a3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_map.c
@@ -0,0 +1,136 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_mmap --
+ *	Create a private, read-only mapping of a file.
+ */
+int
+__wt_mmap(WT_SESSION_IMPL *session,
+    WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie)
+{
+	size_t map_len;
+	void *addr;
+	int flags;
+
+	WT_UNUSED(mappingcookie);
+
+	/*
+	 * Snapshot the file size and use it for both the mapping and the
+	 * returned length, as the size could change between the map call
+	 * and setting the length. For the same reason we might map past
+	 * the end of the file; we never read bytes past the end of the
+	 * file though, so as long as the map call succeeds, it's all OK.
+	 */
+	map_len = (size_t)fh->size;
+	flags = MAP_PRIVATE;
+#ifdef MAP_NOCORE
+	flags |= MAP_NOCORE;
+#endif
+	addr = mmap(NULL, map_len, PROT_READ, flags, fh->fd, (wt_off_t)0);
+	if (addr == MAP_FAILED)
+		WT_RET_MSG(session, __wt_errno(),
+		    "%s map error: failed to map %" WT_SIZET_FMT " bytes",
+		    fh->name, map_len);
+
+	(void)__wt_verbose(session, WT_VERB_FILEOPS,
+	    "%s: map %p: %" WT_SIZET_FMT " bytes", fh->name, addr, map_len);
+
+	*(void **)mapp = addr;
+	*lenp = map_len;
+	return (0);
+}
+
+#define WT_VM_PAGESIZE 4096 /* page size assumed when aligning madvise ranges */
+
+/*
+ * __wt_mmap_preload --
+ * Advise the system that a section of a memory map will be needed soon.
+ */
+int
+__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
+{
+#ifdef HAVE_POSIX_MADVISE
+ /* Linux requires the address be aligned to a 4KB boundary. */
+ WT_BM *bm = S2BT(session)->bm;
+ WT_DECL_RET;
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ size += WT_PTRDIFF(p, blk); /* cover the bytes added by rounding down */
+
+ /* XXX proxy for "am I doing a scan?" -- manual read-ahead */
+ if (F_ISSET(session, WT_SESSION_NO_CACHE)) {
+ /* Read in 2MB blocks every 1MB of data. */
+ if (((uintptr_t)((uint8_t *)blk + size) &
+ (uintptr_t)((1<<20) - 1)) < (uintptr_t)blk)
+ return (0); /* NOTE(review): 1MB offset vs. full address -- confirm */
+ size = WT_MIN(WT_MAX(20 * size, 2 << 20),
+ WT_PTRDIFF((uint8_t *)bm->map + bm->maplen, blk));
+ }
+
+ /*
+ * Manual pages aren't clear on whether alignment is required for the
+ * size, so we will be conservative and round it down to whole pages.
+ */
+ size &= ~(size_t)(WT_VM_PAGESIZE - 1);
+
+ if (size > WT_VM_PAGESIZE && /* a page or less isn't worth the call */
+ (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0)
+ WT_RET_MSG(session, ret, "posix_madvise will need");
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(p);
+ WT_UNUSED(size);
+#endif
+
+ return (0);
+}
+
+/*
+ * __wt_mmap_discard --
+ *	Advise the system a chunk of the memory map is no longer needed.
+ */
+int
+__wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size)
+{
+#ifndef HAVE_POSIX_MADVISE
+	WT_UNUSED(session);
+	WT_UNUSED(p);
+	WT_UNUSED(size);
+#else
+	WT_DECL_RET;
+	void *start = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+	/* Linux wants a 4KB-aligned address: round down, lengthen to match. */
+	size += WT_PTRDIFF(p, start);
+	ret = posix_madvise(start, size, POSIX_MADV_DONTNEED);
+	if (ret != 0)
+		WT_RET_MSG(session, ret, "posix_madvise don't need");
+#endif
+	return (0);
+}
+
+/*
+ * __wt_munmap --
+ *	Remove a memory mapping, reporting failure against the file's name.
+ */
+int
+__wt_munmap(WT_SESSION_IMPL *session,
+    WT_FH *fh, void *map, size_t len, void **mappingcookie)
+{
+	WT_UNUSED(mappingcookie);
+
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+	    "%s: unmap %p: %" WT_SIZET_FMT " bytes", fh->name, map, len));
+
+	/* Anything other than success is reported with the system error. */
+	if (munmap(map, len) != 0)
+		WT_RET_MSG(session, __wt_errno(),
+		    "%s unmap error: failed to unmap %" WT_SIZET_FMT " bytes",
+		    fh->name, len);
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
new file mode 100644
index 00000000000..3a76cceb3f0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
@@ -0,0 +1,157 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cond_alloc --
+ *	Allocate and initialize a condition variable.
+ */
+int
+__wt_cond_alloc(WT_SESSION_IMPL *session,
+    const char *name, int is_signalled, WT_CONDVAR **condp)
+{
+	WT_CONDVAR *cond;
+	WT_DECL_RET;
+
+	/*
+	 * !!!
+	 * This function MUST handle a NULL session handle.
+	 */
+	WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));
+	WT_ERR(pthread_mutex_init(&cond->mtx, NULL));
+
+	/* Initialize the condition variable; on failure destroy the mutex. */
+	if ((ret = pthread_cond_init(&cond->cond, NULL)) != 0) {
+		(void)pthread_mutex_destroy(&cond->mtx);
+		goto err;
+	}
+	cond->name = name;
+	cond->waiters = is_signalled ? -1 : 0;	/* -1: already signalled */
+	*condp = cond;
+	return (0);
+
+err:	__wt_free(session, cond);
+	return (ret);
+}
+
+/*
+ * __wt_cond_wait --
+ *	Wait on a mutex, optionally timing out: usecs is the timeout in
+ * microseconds, where zero means wait until signalled.
+ */
+int
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
+{
+	struct timespec ts;
+	WT_DECL_RET;
+	int locked;
+
+	locked = 0;
+	WT_ASSERT(session, usecs >= 0);
+
+	/* Fast path if already signalled. */
+	if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+		return (0);
+
+	/*
+	 * !!!
+	 * This function MUST handle a NULL session handle.
+	 */
+	if (session != NULL) {
+		WT_ERR(__wt_verbose(session, WT_VERB_MUTEX,
+		    "wait %s cond (%p)", cond->name, cond));
+		WT_STAT_FAST_CONN_INCR(session, cond_wait);
+	}
+
+	WT_ERR(pthread_mutex_lock(&cond->mtx));
+	locked = 1;
+
+	if (usecs > 0) {
+		WT_ERR(__wt_epoch(session, &ts));
+		/* Split usecs: 1000 * usecs can overflow a 32-bit long. */
+		ts.tv_nsec += 1000 * (usecs % 1000000);
+		ts.tv_sec += usecs / 1000000 + ts.tv_nsec / WT_BILLION;
+		ts.tv_nsec %= WT_BILLION;
+		ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts);
+	} else
+		ret = pthread_cond_wait(&cond->cond, &cond->mtx);
+
+	/* Some systems return EINTR, ETIME or ETIMEDOUT: not errors here. */
+	if (ret == EINTR ||
+#ifdef ETIME
+	    ret == ETIME ||
+#endif
+	    ret == ETIMEDOUT)
+		ret = 0;
+
+	/* Give back our waiter count on every path, including errors. */
+err:	(void)WT_ATOMIC_SUB4(cond->waiters, 1);
+	if (locked)
+		WT_TRET(pthread_mutex_unlock(&cond->mtx));
+	if (ret == 0)
+		return (0);
+	WT_RET_MSG(session, ret, "pthread_cond_wait");
+}
+
+/*
+ * __wt_cond_signal --
+ * Wake the threads waiting on a condition, or mark it signalled if none.
+ */
+int
+__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
+{
+ WT_DECL_RET;
+ int locked;
+
+ locked = 0;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ if (session != NULL)
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+ "signal %s cond (%p)", cond->name, cond));
+
+ /* Fast path if already signalled (a waiters count of -1). */
+ if (cond->waiters == -1)
+ return (0);
+ /* Broadcast if there are waiters or the swap of 0 to -1 fails. */
+ if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+ WT_ERR(pthread_mutex_lock(&cond->mtx));
+ locked = 1;
+ WT_ERR(pthread_cond_broadcast(&cond->cond));
+ }
+
+err: if (locked)
+ WT_TRET(pthread_mutex_unlock(&cond->mtx));
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "pthread_cond_broadcast");
+}
+
+/*
+ * __wt_cond_destroy --
+ *	Tear down a condition variable and free its memory.
+ */
+int
+__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
+{
+	WT_CONDVAR *cv;
+	WT_DECL_RET;
+
+	/* Nothing to do if the caller's reference is NULL. */
+	if ((cv = *condp) == NULL)
+		return (0);
+
+	/* Destroy both pthread objects before freeing the memory. */
+	ret = pthread_cond_destroy(&cv->cond);
+	WT_TRET(pthread_mutex_destroy(&cv->mtx));
+	__wt_free(session, *condp);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
new file mode 100644
index 00000000000..1a692f71dce
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
@@ -0,0 +1,227 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
+ * http://locklessinc.com/articles/locks/
+ *
+ * Dr. Fuerst further credits:
+ * There exists a form of the ticket lock that is designed for read-write
+ * locks. An example written in assembly was posted to the Linux kernel mailing
+ * list in 2002 by David Howells from RedHat. This was a highly optimized
+ * version of a read-write ticket lock developed at IBM in the early 90's by
+ * Joseph Seigh. Note that a similar (but not identical) algorithm was published
+ * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
+ * Reader-Writer Synchronization for Shared-Memory Multiprocessors".
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rwlock_alloc --
+ *	Allocate a read/write lock and record its name.
+ */
+int
+__wt_rwlock_alloc(
+    WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name)
+{
+	WT_RWLOCK *lock;
+
+	WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name));
+
+	/* The caller's name pointer is stored, not copied. */
+	WT_RET(__wt_calloc_def(session, 1, &lock));
+	lock->name = name;
+
+	*rwlockp = lock;
+	return (0);
+}
+
+/*
+ * __wt_try_readlock --
+ * Try to get a shared lock, returning EBUSY immediately if unavailable.
+ */
+int
+__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t old, new, pad, users, writers;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_read);
+ /* The swap expects readers == users and, if so, advances both by one. */
+ l = &rwlock->rwlock;
+ pad = l->s.pad;
+ users = l->s.users;
+ writers = l->s.writers;
+ old = (pad << 48) + (users << 32) + (users << 16) + writers;
+ new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers;
+ return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY);
+}
+
+/*
+ * __wt_readlock --
+ * Get a shared lock, spinning until it is granted.
+ */
+int
+__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t me;
+ uint16_t val;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: readlock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_read);
+
+ l = &rwlock->rwlock;
+ me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); /* take a ticket */
+ val = (uint16_t)(me >> 32); /* our ticket: the prior 16 bits at bit 32 */
+ while (val != l->s.readers)
+ WT_PAUSE();
+ /* Our turn: advance readers so a reader holding the next ticket can go. */
+ ++l->s.readers;
+
+ return (0);
+}
+
+/*
+ * __wt_readunlock --
+ * Release a shared lock.
+ */
+int
+__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
+ /* Writers spin until the writers count reaches their ticket: advance it. */
+ l = &rwlock->rwlock;
+ WT_ATOMIC_ADD2(l->s.writers, 1);
+
+ return (0);
+}
+
+/*
+ * __wt_try_writelock --
+ * Try to get an exclusive lock, returning EBUSY immediately if unavailable.
+ */
+int
+__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t old, new, pad, readers, users;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+ /* The swap expects writers == users and, if so, advances only users. */
+ l = &rwlock->rwlock;
+ pad = l->s.pad;
+ readers = l->s.readers;
+ users = l->s.users;
+ old = (pad << 48) + (users << 32) + (readers << 16) + users;
+ new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users;
+ return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY);
+}
+
+/*
+ * __wt_writelock --
+ * Wait to get an exclusive lock, spinning until it is granted.
+ */
+int
+__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t me;
+ uint16_t val;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+ /*
+ * Possibly wrap: if we have more than 64K lockers waiting, the count
+ * of writers will wrap and two lockers will simultaneously be granted
+ * the write lock.
+ */
+ l = &rwlock->rwlock;
+ me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); /* take a ticket */
+ val = (uint16_t)(me >> 32); /* our ticket: the prior 16 bits at bit 32 */
+ while (val != l->s.writers) /* spin until writers reaches our ticket */
+ WT_PAUSE();
+
+ return (0);
+}
+
+/*
+ * __wt_writeunlock --
+ * Release an exclusive lock.
+ */
+int
+__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l, copy;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writeunlock %s", rwlock->name));
+
+ l = &rwlock->rwlock;
+ /* Update a private copy so both counters are published by one store. */
+ copy = *l;
+
+ WT_BARRIER();
+ /* Advance both counters, letting either kind of waiter proceed next. */
+ ++copy.s.writers;
+ ++copy.s.readers;
+ /* NOTE(review): presumably "us" overlays writers and readers -- confirm. */
+ l->us = copy.us;
+ return (0);
+}
+
+/*
+ * __wt_rwlock_destroy --
+ *	Destroy a read/write lock, clearing the caller's reference to it.
+ */
+int
+__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp)
+{
+	WT_DECL_RET;
+	WT_RWLOCK *rwlock;
+
+	rwlock = *rwlockp;
+	if (rwlock == NULL)
+		return (0);
+	*rwlockp = NULL;		/* Clear our caller's reference. */
+	/* Free the lock even if the message fails: the caller can't retry. */
+	ret = __wt_verbose(
+	    session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name);
+	__wt_free(session, rwlock);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_once.c b/src/third_party/wiredtiger/src/os_posix/os_once.c
new file mode 100644
index 00000000000..22eaf5f0ee5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_once.c
@@ -0,0 +1,20 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_once --
+ *	One-time initialization per process.
+ */
+int
+__wt_once(void (*init_routine)(void))
+{
+	static pthread_once_t process_once = PTHREAD_ONCE_INIT;
+	int ret = pthread_once(&process_once, init_routine);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_open.c b/src/third_party/wiredtiger/src/os_posix/os_open.c
new file mode 100644
index 00000000000..a1bc3feb7d2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_open.c
@@ -0,0 +1,253 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __open_directory_sync --
+ *	Fsync the directory containing a file (path is restored on return).
+ */
+static int
+__open_directory_sync(WT_SESSION_IMPL *session, char *path)
+{
+#ifdef __linux__
+	WT_DECL_RET;
+	int fd, tret;
+	char *dir;
+
+	/*
+	 * According to the Linux fsync man page:
+	 *	Calling fsync() does not necessarily ensure that the entry in
+	 *	the directory containing the file has also reached disk. For
+	 *	that an explicit fsync() on a file descriptor for the directory
+	 *	is also needed.
+	 *
+	 * Open the file's directory and sync it, I don't want the rest of
+	 * the system to have to wonder if opening a file creates it.
+	 */
+	if ((dir = strrchr(path, '/')) == NULL)
+		path = (char *)".";
+	else
+		*dir = '\0';
+	WT_SYSCALL_RETRY(((fd =
+	    open(path, O_RDONLY, 0444)) == -1 ? 1 : 0), ret);
+	if (dir != NULL)
+		*dir = '/';
+	if (ret != 0)
+		WT_RET_MSG(session, ret, "%s: open", path);
+
+	WT_SYSCALL_RETRY(fsync(fd), ret);
+	if (ret != 0)
+		WT_ERR_MSG(session, ret, "%s: fsync", path);
+	/* Close into a second result so an fsync error isn't overwritten. */
+err:	WT_SYSCALL_RETRY(close(fd), tret);
+	if (tret != 0)
+		__wt_err(session, tret, "%s: close", path);
+	return (ret == 0 ? tret : ret);
+#else
+	WT_UNUSED(session);
+	WT_UNUSED(path);
+	return (0);
+#endif
+}
+
+/*
+ * __wt_open --
+ * Open a file handle, sharing an existing handle if the name is already open.
+ */
+int
+__wt_open(WT_SESSION_IMPL *session,
+ const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *fh, *tfh;
+ mode_t mode;
+ int direct_io, f, fd, matched;
+ char *path;
+
+ conn = S2C(session);
+ fh = NULL;
+ fd = -1;
+ path = NULL;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));
+
+ /* Increment the reference count if we already have the file open. */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched)
+ return (0);
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ f = O_RDWR;
+#ifdef O_BINARY
+ /* Windows clones: we always want to treat the file as a binary. */
+ f |= O_BINARY;
+#endif
+#ifdef O_CLOEXEC
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles.
+ */
+ f |= O_CLOEXEC;
+#endif
+#ifdef O_NOATIME
+ /* Avoid updating metadata for read-only workloads. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ f |= O_NOATIME;
+#endif
+
+ if (ok_create) {
+ f |= O_CREAT;
+ if (exclusive)
+ f |= O_EXCL;
+ mode = 0666;
+ } else
+ mode = 0;
+
+ direct_io = 0;
+#ifdef O_DIRECT
+ if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
+ f |= O_DIRECT;
+ direct_io = 1;
+ }
+#endif
+ if (dio_type == WT_FILE_TYPE_LOG &&
+ FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC))
+#ifdef O_DSYNC
+ f |= O_DSYNC;
+#elif defined(O_SYNC)
+ f |= O_SYNC;
+#else
+ WT_ERR_MSG(session, ENOTSUP,
+ "Unsupported log sync mode requested");
+#endif
+ WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret);
+ if (ret != 0)
+ WT_ERR_MSG(session, ret,
+ direct_io ?
+ "%s: open failed with direct I/O configured, some "
+ "filesystem types do not support direct I/O" : "%s", path);
+
+#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles. There's an obvious
+ * race here, so we prefer the flag to open if available.
+ */
+ if ((f = fcntl(fd, F_GETFD)) == -1 ||
+ fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1)
+ WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name);
+#endif
+
+#if defined(HAVE_POSIX_FADVISE)
+ /* Disable read-ahead on trees: it slows down random read workloads. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM));
+#endif
+
+ if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
+ WT_ERR(__open_directory_sync(session, path));
+
+ WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
+ WT_ERR(__wt_strdup(session, name, &fh->name));
+ fh->fd = fd;
+ fh->ref = 1;
+ fh->direct_io = direct_io;
+
+ /* Record the file's current size in the handle. */
+ WT_ERR(__wt_filesize(session, fh, &fh->size));
+
+ /* Configure file extension. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ fh->extend_len = conn->data_extend_len;
+
+ /* Configure fallocate/posix_fallocate calls. */
+ __wt_fallocate_config(session, fh);
+
+ /*
+ * Repeat the check for a match (the lock was released during the open),
+ * and if there's still no match, link onto the connection's list of files.
+ */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ if (!matched) {
+ TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
+ WT_STAT_FAST_CONN_INCR(session, file_open);
+
+ *fhp = fh;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched) { /* the name was opened meanwhile: discard our handle */
+err: if (fh != NULL) { /* error paths jump here, into the cleanup */
+ __wt_free(session, fh->name);
+ __wt_free(session, fh);
+ }
+ if (fd != -1)
+ (void)close(fd);
+ }
+
+ __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __wt_close --
+ *	Drop a reference to a file handle, closing it when none remain.
+ */
+int
+__wt_close(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	int last_ref;
+
+	conn = S2C(session);
+
+	/* Only the holder of the last reference unlinks and closes. */
+	__wt_spin_lock(session, &conn->fh_lock);
+	last_ref = fh != NULL && fh->ref != 0 && !(--fh->ref > 0);
+	if (last_ref) {
+		TAILQ_REMOVE(&conn->fhqh, fh, q);
+		WT_STAT_FAST_CONN_DECR(session, file_open);
+	}
+	__wt_spin_unlock(session, &conn->fh_lock);
+	if (!last_ref)
+		return (0);
+
+	/* Close the descriptor, then discard the memory either way. */
+	if (close(fh->fd) != 0) {
+		ret = __wt_errno();
+		__wt_err(session, ret, "close: %s", fh->name);
+	}
+
+	__wt_free(session, fh->name);
+	__wt_free(session, fh);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_path.c b/src/third_party/wiredtiger/src/os_posix/os_path.c
new file mode 100644
index 00000000000..aed99d1d027
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_path.c
@@ -0,0 +1,28 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_absolute_path --
+ *	Return non-zero if a filename begins with the path separator.
+ */
+int
+__wt_absolute_path(const char *path)
+{
+	return (*path == '/');
+}
+
+/*
+ * __wt_path_separator --
+ * Return the path separator string, "/" in this POSIX implementation.
+ */
+const char *
+__wt_path_separator(void)
+{
+ return ("/");
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_priv.c b/src/third_party/wiredtiger/src/os_posix/os_priv.c
new file mode 100644
index 00000000000..7d56359da4f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_priv.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_has_priv --
+ *	Return if the process has special privileges, defined as having
+ * different effective and real UIDs or GIDs.
+ */
+int
+__wt_has_priv(void)
+{
+	return (!(getuid() == geteuid() && getgid() == getegid()));
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_remove.c b/src/third_party/wiredtiger/src/os_posix/os_remove.c
new file mode 100644
index 00000000000..a52a4db6bc7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_remove.c
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __remove_file_check --
+ *	Diagnostic check that a file being removed isn't still open.
+ */
+static void
+__remove_file_check(WT_SESSION_IMPL *session, const char *name)
+{
+#ifndef HAVE_DIAGNOSTIC
+	WT_UNUSED(session);
+	WT_UNUSED(name);
+#else
+	WT_CONNECTION_IMPL *conn;
+	WT_FH *fh;
+
+	conn = S2C(session);
+
+	/*
+	 * Search the connection's list of open file handles for the name: a
+	 * higher level should have closed the file before removing it, so
+	 * finding a match is an error.
+	 */
+	__wt_spin_lock(session, &conn->fh_lock);
+	TAILQ_FOREACH(fh, &conn->fhqh, q)
+		if (strcmp(fh->name, name) == 0)
+			break;
+	__wt_spin_unlock(session, &conn->fh_lock);
+
+	/* The loop leaves fh NULL unless it stopped on a matching handle. */
+	WT_ASSERT(session, fh == NULL);
+#endif
+}
+
+/*
+ * __wt_remove --
+ *	Remove a file; a file that doesn't exist is not an error.
+ */
+int
+__wt_remove(WT_SESSION_IMPL *session, const char *name)
+{
+	WT_DECL_RET;
+	char *path;
+
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name));
+
+	/* In diagnostic builds, complain if the file is still open. */
+	__remove_file_check(session, name);
+
+	/* Build the path, remove the file, then release the path. */
+	WT_RET(__wt_filename(session, name, &path));
+	WT_SYSCALL_RETRY(remove(path), ret);
+	__wt_free(session, path);
+
+	/* A file that doesn't exist is already gone: treat it as success. */
+	if (ret != 0 && ret != ENOENT)
+		WT_RET_MSG(session, ret, "%s: remove", name);
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_rename.c b/src/third_party/wiredtiger/src/os_posix/os_rename.c
new file mode 100644
index 00000000000..ddbb59aaf37
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_rename.c
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rename --
+ *	Rename a file; both names are passed through __wt_filename before the
+ * rename is attempted.
+ */
+int
+__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+{
+	WT_DECL_RET;
+	char *old_path, *new_path;
+
+	old_path = new_path = NULL;
+
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_FILEOPS, "rename %s to %s", from, to));
+
+	/* Build both paths; only rename if both were built. */
+	WT_RET(__wt_filename(session, from, &old_path));
+	WT_ERR(__wt_filename(session, to, &new_path));
+	WT_SYSCALL_RETRY(rename(old_path, new_path), ret);
+
+	/* The paths are released on the success and failure paths alike. */
+err:	__wt_free(session, old_path);
+	__wt_free(session, new_path);
+
+	if (ret != 0)
+		WT_RET_MSG(session, ret, "rename %s to %s", from, to);
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_rw.c b/src/third_party/wiredtiger/src/os_posix/os_rw.c
new file mode 100644
index 00000000000..4247fb30fd1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_rw.c
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_read --
+ *	Read len bytes from a file at offset into buf.
+ */
+int
+__wt_read(
+    WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+{
+	size_t want;
+	ssize_t n;
+	uint8_t *dst;
+
+	WT_STAT_FAST_CONN_INCR(session, read_io);
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+	    "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+	    fh->name, len, (uintmax_t)offset));
+
+	/* Direct I/O: the buffer and length must honor the alignment. */
+	WT_ASSERT(session,
+	    !fh->direct_io ||
+	    S2C(session)->buffer_alignment == 0 ||
+	    (!((uintptr_t)buf &
+	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+	    len >= S2C(session)->buffer_alignment &&
+	    len % S2C(session)->buffer_alignment == 0));
+
+	/* Issue reads of at most 1GB until the request is satisfied. */
+	for (dst = buf; len > 0; dst += n, len -= (size_t)n, offset += n) {
+		want = WT_MIN(len, WT_GIGABYTE);
+		n = pread(fh->fd, dst, want, offset);
+		if (n <= 0)
+			WT_RET_MSG(session, n == 0 ? WT_ERROR : __wt_errno(),
+			    "%s read error: failed to read %" WT_SIZET_FMT
+			    " bytes at offset %" PRIuMAX,
+			    fh->name, want, (uintmax_t)offset);
+	}
+	return (0);
+}
+
+/*
+ * __wt_write --
+ *	Write len bytes from buf to a file at offset.
+ */
+int
+__wt_write(WT_SESSION_IMPL *session,
+    WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+{
+	size_t want;
+	ssize_t n;
+	const uint8_t *src;
+
+	WT_STAT_FAST_CONN_INCR(session, write_io);
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+	    "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+	    fh->name, len, (uintmax_t)offset));
+
+	/* Direct I/O: the buffer and length must honor the alignment. */
+	WT_ASSERT(session,
+	    !fh->direct_io ||
+	    S2C(session)->buffer_alignment == 0 ||
+	    (!((uintptr_t)buf &
+	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+	    len >= S2C(session)->buffer_alignment &&
+	    len % S2C(session)->buffer_alignment == 0));
+
+	/* Issue writes of at most 1GB until the request is satisfied. */
+	for (src = buf; len > 0; src += n, len -= (size_t)n, offset += n) {
+		want = WT_MIN(len, WT_GIGABYTE);
+		n = pwrite(fh->fd, src, want, offset);
+		if (n < 0)
+			WT_RET_MSG(session, __wt_errno(),
+			    "%s write error: failed to write %" WT_SIZET_FMT
+			    " bytes at offset %" PRIuMAX,
+			    fh->name, want, (uintmax_t)offset);
+	}
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_sleep.c b/src/third_party/wiredtiger/src/os_posix/os_sleep.c
new file mode 100644
index 00000000000..665330a26e7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_sleep.c
@@ -0,0 +1,23 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_sleep --
+ *	Pause the calling thread for the given seconds plus microseconds, using
+ *	select(2) with no descriptors as the timer.
+ */
+void
+__wt_sleep(long seconds, long micro_seconds)
+{
+	struct timeval timeout;
+
+	/* Whole seconds carried in the microsecond argument go to tv_sec. */
+	timeout.tv_usec = (suseconds_t)(micro_seconds % 1000000);
+	timeout.tv_sec = seconds + micro_seconds / 1000000;
+
+	(void)select(0, NULL, NULL, NULL, &timeout);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_strtouq.c b/src/third_party/wiredtiger/src/os_posix/os_strtouq.c
new file mode 100644
index 00000000000..97f9759f76f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_strtouq.c
@@ -0,0 +1,24 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_strtouq --
+ *	Convert a string to an unsigned 64-bit integer, using strtouq when the
+ *	build found it and strtoull otherwise.
+ */
+uint64_t
+__wt_strtouq(const char *nptr, char **endptr, int base)
+{
+#if !defined(HAVE_STRTOUQ)
+	/* The strtoull result is only usable if it is exactly 64 bits wide. */
+	WT_STATIC_ASSERT(sizeof(uint64_t) == sizeof(unsigned long long));
+
+	return (strtoull(nptr, endptr, base));
+#else
+	return (strtouq(nptr, endptr, base));
+#endif
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c
new file mode 100644
index 00000000000..7c447710b46
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_thread_create --
+ *	Start a new thread running func(arg) with default attributes, storing
+ *	its identifier in tidret. Returns 0 or the pthread_create error.
+ */
+int
+__wt_thread_create(WT_SESSION_IMPL *session,
+    wt_thread_t *tidret, void *(*func)(void *), void *arg)
+{
+	WT_DECL_RET;
+
+	ret = pthread_create(tidret, NULL, func, arg);
+	if (ret != 0)
+		WT_RET_MSG(session, ret, "pthread_create");
+	return (0);
+}
+
+/*
+ * __wt_thread_join --
+ *	Block until the given thread exits, discarding its exit value. Returns
+ *	0 or the pthread_join error.
+ */
+int
+__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
+{
+	WT_DECL_RET;
+
+	ret = pthread_join(tid, NULL);
+	if (ret != 0)
+		WT_RET_MSG(session, ret, "pthread_join");
+	return (0);
+}
+
+/*
+ * __wt_thread_id --
+ *	Format "<process ID>:<thread ID>" into buf, writing at most buflen
+ *	bytes.
+ */
+void
+__wt_thread_id(char *buf, size_t buflen)
+{
+	pthread_t tid;
+	uint64_t pid;
+
+	/*
+	 * pthread_t may be an opaque type under POSIX 1003.1. Every platform
+	 * supported so far makes it a pointer, so it is printed as one, which
+	 * also matches what gdb shows.
+	 */
+	tid = pthread_self();
+	pid = (uint64_t)getpid();
+
+	(void)snprintf(buf, buflen, "%" PRIu64 ":%p", pid, (void *)tid);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c
new file mode 100644
index 00000000000..56f688a1e14
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_time.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_seconds --
+ *	Store the whole seconds since the Epoch in *timep, taken from
+ *	__wt_epoch. Returns 0 or the error from __wt_epoch.
+ */
+int
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+	struct timespec now;
+
+	WT_RET(__wt_epoch(session, &now));
+
+	/* Only the seconds field is reported; nanoseconds are dropped. */
+	*timep = now.tv_sec;
+	return (0);
+}
+
+/*
+ * __wt_epoch --
+ *	Fill *tsp with the time since the Epoch, preferring clock_gettime and
+ *	falling back to gettimeofday. Returns 0 or the system call's error.
+ */
+int
+__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+	WT_DECL_RET;
+
+#if defined(HAVE_CLOCK_GETTIME)
+	WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret);
+	if (ret != 0)
+		WT_RET_MSG(session, ret, "clock_gettime");
+	return (0);
+#elif defined(HAVE_GETTIMEOFDAY)
+	struct timeval tv;
+
+	WT_SYSCALL_RETRY(gettimeofday(&tv, NULL), ret);
+	if (ret != 0)
+		WT_RET_MSG(session, ret, "gettimeofday");
+
+	/* Convert the microsecond field to nanoseconds. */
+	tsp->tv_sec = tv.tv_sec;
+	tsp->tv_nsec = tv.tv_usec * 1000;
+	return (0);
+#else
+	NO TIME-OF-DAY IMPLEMENTATION: see src/os_posix/os_time.c
+#endif
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_yield.c b/src/third_party/wiredtiger/src/os_posix/os_yield.c
new file mode 100644
index 00000000000..6af30803e81
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_yield.c
@@ -0,0 +1,18 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_yield --
+ *	Give up the processor by calling sched_yield; its result is ignored.
+ */
+void
+__wt_yield(void)
+{
+	(void)sched_yield();
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_dir.c b/src/third_party/wiredtiger/src/os_win/os_dir.c
new file mode 100644
index 00000000000..076c64670d4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_dir.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dirlist --
+ *	Get a list of files from a directory, optionally filtered by
+ *	a given prefix.
+ *
+ *	With a non-NULL prefix, WT_DIRLIST_INCLUDE keeps the names starting
+ *	with the prefix and WT_DIRLIST_EXCLUDE keeps the names that do not;
+ *	flags of 0 is treated as WT_DIRLIST_INCLUDE. With a NULL prefix every
+ *	name other than "." and ".." is returned.
+ *
+ *	On success *dirlist is the allocated array of allocated names (NULL if
+ *	nothing matched) and *countp the number of names. On failure both stay
+ *	NULL/0 and everything collected so far is freed.
+ */
+int
+__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
+    uint32_t flags, char ***dirlist, u_int *countp)
+{
+	HANDLE findhandle;
+	WIN32_FIND_DATA finddata;
+	WT_DECL_ITEM(pathbuf);
+	WT_DECL_RET;
+	size_t dirallocsz, pathlen;
+	u_int count, dirsz;
+	int match;
+	char **entries, *path;
+
+	*dirlist = NULL;
+	*countp = 0;
+
+	/*
+	 * Everything the cleanup code reads is set before the first WT_ERR,
+	 * so an early jump to the error label never sees garbage.
+	 */
+	findhandle = INVALID_HANDLE_VALUE;
+	count = 0;
+	dirallocsz = 0;
+	dirsz = 0;
+	entries = NULL;
+
+	WT_RET(__wt_filename(session, dir, &path));
+
+	/* Drop a trailing backslash; an empty path has nothing to inspect. */
+	pathlen = strlen(path);
+	if (pathlen > 0 && path[pathlen - 1] == '\\')
+		path[pathlen - 1] = '\0';
+
+	WT_ERR(__wt_scr_alloc(session, 0, &pathbuf));
+	WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", path));
+
+	if (flags == 0)
+		LF_SET(WT_DIRLIST_INCLUDE);
+
+	WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS,
+	    "wt_dirlist of %s %s prefix %s",
+	    pathbuf->data, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude",
+	    prefix == NULL ? "all" : prefix));
+
+	findhandle = FindFirstFileA(pathbuf->data, &finddata);
+	if (findhandle == INVALID_HANDLE_VALUE)
+		WT_ERR_MSG(session, __wt_errno(), "%s: FindFirstFile",
+		    pathbuf->data);
+
+	do {
+		/*
+		 * Skip . and ..
+		 */
+		if (strcmp(finddata.cFileName, ".") == 0 ||
+		    strcmp(finddata.cFileName, "..") == 0)
+			continue;
+		match = 0;
+		if (prefix != NULL &&
+		    ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
+		    WT_PREFIX_MATCH(finddata.cFileName, prefix)) ||
+		    (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
+		    !WT_PREFIX_MATCH(finddata.cFileName, prefix))))
+			match = 1;
+		if (prefix == NULL || match) {
+			/*
+			 * We have a file name we want to return. The count is
+			 * only advanced once the slot is filled, so on error
+			 * entries[0..count-1] are exactly the slots in use.
+			 */
+			if (count >= dirsz) {
+				dirsz += WT_DIR_ENTRY;
+				WT_ERR(__wt_realloc_def(session,
+				    &dirallocsz, dirsz, &entries));
+			}
+			WT_ERR(__wt_strdup(session,
+			    finddata.cFileName, &entries[count]));
+			++count;
+		}
+	} while (FindNextFileA(findhandle, &finddata) != 0);
+
+	if (count > 0)
+		*dirlist = entries;
+	*countp = count;
+
+err:
+	if (findhandle != INVALID_HANDLE_VALUE)
+		(void)FindClose(findhandle);
+	__wt_free(session, path);
+	/*
+	 * pathbuf is still unset if the scratch allocation itself failed.
+	 * NOTE(review): pathbuf comes from __wt_scr_alloc; confirm that
+	 * __wt_buf_free is the intended way to release a scratch buffer.
+	 */
+	if (pathbuf != NULL)
+		__wt_buf_free(session, pathbuf);
+
+	if (ret == 0)
+		return (0);
+
+	/* Failure: release every name collected so far, then the array. */
+	if (entries != NULL) {
+		for (; count > 0; --count)
+			__wt_free(session, entries[count - 1]);
+		__wt_free(session, entries);
+	}
+
+	WT_RET_MSG(session, ret, "dirlist %s prefix %s",
+	    dir, prefix == NULL ? "all" : prefix);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_dlopen.c b/src/third_party/wiredtiger/src/os_win/os_dlopen.c
new file mode 100644
index 00000000000..ebc90edd2b2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_dlopen.c
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dlopen --
+ *	Open a dynamic library.
+ *
+ *	A NULL path means the current binary, resolved with
+ *	GetModuleHandleExA; any other path is loaded with LoadLibraryA. On
+ *	success *dlhp is set to a newly allocated handle and 0 is returned. On
+ *	failure the handle is freed, *dlhp is left untouched and the error is
+ *	returned.
+ */
+int
+__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
+{
+	WT_DECL_RET;
+	WT_DLH *dlh;
+
+	WT_RET(__wt_calloc_def(session, 1, &dlh));
+	WT_ERR(__wt_strdup(session, path, &dlh->name));
+
+	if (path == NULL) {
+		/*
+		 * NULL means load from the current binary. The path is NULL
+		 * here, so it is not printed in the error message.
+		 */
+		if (GetModuleHandleExA(0, NULL, &dlh->handle) == FALSE)
+			WT_ERR_MSG(session,
+			    __wt_errno(), "GetModuleHandleEx(local)");
+	} else {
+		dlh->handle = LoadLibraryA(path);
+		if (dlh->handle == NULL)
+			WT_ERR_MSG(session,
+			    __wt_errno(), "LoadLibrary(%s)", path);
+	}
+
+	/* ret is still 0 here: both Windows calls above succeeded. */
+	*dlhp = dlh;
+	if (0) {
+err:		__wt_free(session, dlh->name);
+		__wt_free(session, dlh);
+	}
+	return (ret);
+}
+
+/*
+ * __wt_dlsym --
+ *	Lookup a symbol in a dynamic library.
+ *
+ *	The address is stored through sym_ret, which is treated as a void **.
+ *	A missing symbol is an error only when fail is non-zero; otherwise NULL
+ *	is stored and 0 is returned.
+ */
+int
+__wt_dlsym(WT_SESSION_IMPL *session,
+    WT_DLH *dlh, const char *name, int fail, void *sym_ret)
+{
+	void *sym;
+
+	*(void **)sym_ret = NULL;
+
+	sym = GetProcAddress(dlh->handle, name);
+	if (sym == NULL && fail)
+		/* dlh->name is NULL when the handle is the current binary. */
+		WT_RET_MSG(session, __wt_errno(),
+		    "GetProcAddress(%s in %s)",
+		    name, dlh->name == NULL ? "local" : dlh->name);
+
+	*(void **)sym_ret = sym;
+	return (0);
+}
+
+/*
+ * __wt_dlclose --
+ *	Close a dynamic library and free its handle structure.
+ *
+ *	The structure is freed whether or not FreeLibrary succeeds. Returns 0,
+ *	or the system error code reported for the FreeLibrary failure.
+ */
+int
+__wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh)
+{
+	WT_DECL_RET;
+
+	/* Windows returns 0 on failure, WT expects 0 on success. */
+	if (FreeLibrary(dlh->handle) == FALSE) {
+		ret = __wt_errno();
+		__wt_err(session, ret, "FreeLibrary");
+	}
+
+	__wt_free(session, dlh->name);
+	__wt_free(session, dlh);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_errno.c b/src/third_party/wiredtiger/src/os_win/os_errno.c
new file mode 100644
index 00000000000..ce50106b0cc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_errno.c
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_errno --
+ *	Return the calling thread's last Windows error code, or WT_ERROR if no
+ *	error code is set.
+ */
+int
+__wt_errno(void)
+{
+	DWORD lasterr;
+
+	/*
+	 * Callers only get here after a failure, so a clear error code is
+	 * unexpected: assert on it, and fall back to the generic error.
+	 */
+	lasterr = GetLastError();
+	WT_ASSERT(NULL, lasterr != ERROR_SUCCESS);
+
+	if (lasterr == ERROR_SUCCESS)
+		return (WT_ERROR);
+	return (lasterr);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_exist.c b/src/third_party/wiredtiger/src/os_win/os_exist.c
new file mode 100644
index 00000000000..ab3805f19df
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_exist.c
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_exist --
+ *	Set *existp to 1 if attributes can be read for the file, otherwise to
+ *	0. Only a failure to build the path is reported as an error.
+ */
+int
+__wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp)
+{
+	WT_DECL_RET;
+	char *path;
+
+	WT_RET(__wt_filename(session, filename, &path));
+	ret = GetFileAttributesA(path);
+	__wt_free(session, path);
+
+	/* Any failure to read the attributes is reported as "not there". */
+	*existp = (ret != INVALID_FILE_ATTRIBUTES) ? 1 : 0;
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_fallocate.c b/src/third_party/wiredtiger/src/os_win/os_fallocate.c
new file mode 100644
index 00000000000..bd71c780dc5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_fallocate.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fallocate_config --
+ *	Mark the file handle as supporting fallocate without locking.
+ */
+void
+__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+	/*
+	 * File size changes go through a separate handle, so they need no
+	 * locking.
+	 */
+	fh->fallocate_requires_locking = 0;
+
+	fh->fallocate_available = 1;
+}
+
+/*
+ * __wt_fallocate --
+ *	Allocate space for a file handle by moving end-of-file to offset + len
+ *	through the secondary handle.
+ *
+ *	On success the handle's size and extend_size are updated to the new
+ *	end-of-file and 0 is returned; otherwise the system error is returned.
+ */
+int
+__wt_fallocate(
+    WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
+{
+	WT_DECL_RET;
+	LARGE_INTEGER largeint;
+
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_FILEOPS, "%s: fallocate", fh->name));
+
+	largeint.QuadPart = offset + len;
+
+	if ((ret = SetFilePointerEx(
+	    fh->filehandle_secondary, largeint, NULL, FILE_BEGIN)) == FALSE)
+		WT_RET_MSG(session,
+		    __wt_errno(), "%s SetFilePointerEx error", fh->name);
+
+	if ((ret = SetEndOfFile(fh->filehandle_secondary)) != FALSE) {
+		/*
+		 * Record the end-of-file just set (offset + len), not the
+		 * length of the extension alone.
+		 * NOTE(review): confirm callers expect fh->size to follow the
+		 * physical end-of-file here.
+		 */
+		fh->size = fh->extend_size = offset + len;
+		return (0);
+	}
+
+	WT_RET_MSG(session, __wt_errno(), "%s SetEndOfFile error", fh->name);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_filesize.c b/src/third_party/wiredtiger/src/os_win/os_filesize.c
new file mode 100644
index 00000000000..309ee1db40b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_filesize.c
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_filesize --
+ *	Store the size in bytes of an open file in *sizep. Returns 0 or the
+ *	system error from GetFileSizeEx.
+ */
+int
+__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+{
+	LARGE_INTEGER bytes;
+
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_FILEOPS, "%s: GetFileSizeEx", fh->name));
+
+	/* A zero result from GetFileSizeEx means failure. */
+	if (GetFileSizeEx(fh->filehandle, &bytes) == 0)
+		WT_RET_MSG(session,
+		    __wt_errno(), "%s: GetFileSizeEx", fh->name);
+
+	*sizep = bytes.QuadPart;
+	return (0);
+}
+
+/*
+ * __wt_filesize_name --
+ *	Store the size in bytes of the named file in *sizep, without opening
+ *	it. Returns 0 or the system error from GetFileAttributesExA.
+ */
+int
+__wt_filesize_name(
+    WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep)
+{
+	WT_DECL_RET;
+	WIN32_FILE_ATTRIBUTE_DATA attrs;
+	char *path;
+
+	WT_RET(__wt_filename(session, filename, &path));
+	ret = GetFileAttributesExA(path, GetFileExInfoStandard, &attrs);
+	__wt_free(session, path);
+
+	if (ret == 0)
+		WT_RET_MSG(session,
+		    __wt_errno(), "%s: GetFileAttributesEx", filename);
+
+	/* The size is reported as two 32-bit halves. */
+	*sizep = ((int64_t)attrs.nFileSizeHigh << 32) | attrs.nFileSizeLow;
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_flock.c b/src/third_party/wiredtiger/src/os_win/os_flock.c
new file mode 100644
index 00000000000..4b3ca34d65f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_flock.c
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bytelock --
+ *	Lock (lock != 0) or unlock (lock == 0) the single byte at the given
+ *	offset of a file. Returns 0 or the system error.
+ */
+int
+__wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock)
+{
+	WT_DECL_RET;
+
+	/*
+	 * WiredTiger requires this function be able to acquire locks past
+	 * the end of file. LockFile allows that:
+	 *
+	 * http://msdn.microsoft.com/
+	 *	en-us/library/windows/desktop/aa365202%28v=vs.85%29.aspx
+	 *
+	 * You can lock bytes that are beyond the end of the current file.
+	 * This is useful to coordinate adding records to the end of a file.
+	 *
+	 * The 64-bit offset is passed as two 32-bit halves; the range is one
+	 * byte long.
+	 */
+	ret = lock ?
+	    LockFile(fhp->filehandle, UINT32_MAX & byte,
+	    UINT32_MAX & (byte >> 32), 1, 0) :
+	    UnlockFile(fhp->filehandle, UINT32_MAX & byte,
+	    UINT32_MAX & (byte >> 32), 1, 0);
+
+	if (ret != FALSE)
+		return (0);
+
+	WT_RET_MSG(NULL, __wt_errno(), "%s: LockFile", fhp->name);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_fsync.c b/src/third_party/wiredtiger/src/os_win/os_fsync.c
new file mode 100644
index 00000000000..cd509131649
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_fsync.c
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fsync --
+ *	Flush a file handle's buffered data with FlushFileBuffers. Returns 0
+ *	or the system error.
+ */
+int
+__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+	WT_DECL_RET;
+
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: FlushFileBuffers",
+	    fh->name));
+
+	ret = FlushFileBuffers(fh->filehandle);
+	if (ret != FALSE)
+		return (0);
+
+	WT_RET_MSG(session,
+	    __wt_errno(), "%s FlushFileBuffers error", fh->name);
+}
+
+/*
+ * __wt_fsync_async --
+ *	Flush a file handle without waiting for the result. This version does
+ *	nothing and always returns 0.
+ */
+int
+__wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+	WT_UNUSED(fh);
+	WT_UNUSED(session);
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_ftruncate.c b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c
new file mode 100644
index 00000000000..5d87f1ce06a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ftruncate --
+ *	Truncate a file to len bytes through the secondary handle.
+ *
+ *	On success the handle's size and extend_size are set to len and 0 is
+ *	returned. EBUSY is returned, without logging, when the file cannot be
+ *	resized because it is currently memory mapped; any other failure is
+ *	logged and its system error code returned.
+ */
+int
+__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
+{
+	WT_DECL_RET;
+	LARGE_INTEGER largeint;
+	uint32_t lasterror;
+
+	largeint.QuadPart = len;
+
+	if ((ret = SetFilePointerEx(
+	    fh->filehandle_secondary, largeint, NULL, FILE_BEGIN)) == FALSE)
+		WT_RET_MSG(session, __wt_errno(), "%s SetFilePointerEx error",
+		    fh->name);
+
+	ret = SetEndOfFile(fh->filehandle_secondary);
+	if (ret != FALSE) {
+		fh->size = fh->extend_size = len;
+		return (0);
+	}
+
+	lasterror = GetLastError();
+
+	/* Only a mapped file is "busy"; compare, do not assign. */
+	if (lasterror == ERROR_USER_MAPPED_FILE)
+		return (EBUSY);
+
+	WT_RET_MSG(session, lasterror, "%s SetEndOfFile error", fh->name);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_map.c b/src/third_party/wiredtiger/src/os_win/os_map.c
new file mode 100644
index 00000000000..b3b4f0f7501
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_map.c
@@ -0,0 +1,106 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_mmap --
+ *	Map a file into memory, read-only.
+ *
+ *	On success the view's address is stored through mapp (treated as a
+ *	void **), its length in *lenp and the file-mapping handle in
+ *	*mappingcookie, which must later be handed to __wt_munmap. On failure
+ *	*mappingcookie is NULL and the system error is returned.
+ */
+int
+__wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp,
+    void** mappingcookie)
+{
+	WT_DECL_RET;
+	void *map;
+	size_t orig_size;
+
+	/*
+	 * Record the current size and only map and set that as the length, it
+	 * could change between the map call and when we set the return length.
+	 * For the same reason we could actually map past the end of the file;
+	 * we don't read bytes past the end of the file though, so as long as
+	 * the map call succeeds, it's all OK.
+	 */
+	orig_size = (size_t)fh->size;
+	*mappingcookie =
+	    CreateFileMappingA(fh->filehandle, NULL, PAGE_READONLY, 0, 0, NULL);
+	if (*mappingcookie == NULL)
+		WT_RET_MSG(session, __wt_errno(),
+		    "%s CreateFileMapping error: failed to map %"
+		    WT_SIZET_FMT " bytes",
+		    fh->name, orig_size);
+
+	if ((map = MapViewOfFile(
+	    *mappingcookie, FILE_MAP_READ, 0, 0, orig_size)) == NULL) {
+		/*
+		 * Read the error code first: the CloseHandle call below may
+		 * replace the thread's last-error value.
+		 */
+		ret = __wt_errno();
+
+		(void)CloseHandle(*mappingcookie);
+		*mappingcookie = NULL;
+
+		WT_RET_MSG(session, ret,
+		    "%s map error: failed to map %" WT_SIZET_FMT " bytes",
+		    fh->name, orig_size);
+	}
+	(void)__wt_verbose(session, WT_VERB_FILEOPS,
+	    "%s: MapViewOfFile %p: %" WT_SIZET_FMT " bytes",
+	    fh->name, map, orig_size);
+
+	*(void **)mapp = map;
+	*lenp = orig_size;
+	return (0);
+}
+
+/*
+ * __wt_mmap_preload --
+ *	Cause a section of a memory map to be faulted in. This version does
+ *	nothing and always returns 0.
+ */
+int
+__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
+{
+	WT_UNUSED(size);
+	WT_UNUSED(p);
+	WT_UNUSED(session);
+
+	return (0);
+}
+
+/*
+ * __wt_mmap_discard --
+ *	Discard a chunk of the memory map. This version does nothing and
+ *	always returns 0.
+ */
+int
+__wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size)
+{
+	WT_UNUSED(size);
+	WT_UNUSED(p);
+	WT_UNUSED(session);
+
+	return (0);
+}
+
+/*
+ * __wt_munmap --
+ *	Remove a memory mapping: unmap the view, then close the file-mapping
+ *	handle kept in *mappingcookie and clear it. If the unmap fails the
+ *	handle is left open and the system error is returned.
+ */
+int
+__wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len,
+    void** mappingcookie)
+{
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+	    "%s: UnmapViewOfFile %p: %" WT_SIZET_FMT " bytes",
+	    fh->name, map, len));
+
+	if (UnmapViewOfFile(map) != 0) {
+		CloseHandle(*mappingcookie);
+		*mappingcookie = 0;
+		return (0);
+	}
+
+	WT_RET_MSG(session, __wt_errno(),
+	    "%s UnmapViewOfFile error: failed to unmap %" WT_SIZET_FMT
+	    " bytes",
+	    fh->name, len);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
new file mode 100644
index 00000000000..9c9907bd8be
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cond_alloc --
+ *	Allocate and initialize a condition variable together with the
+ *	critical section that protects it. A waiters count of -1 marks the
+ *	condition as already signalled.
+ */
+int
+__wt_cond_alloc(WT_SESSION_IMPL *session,
+    const char *name, int is_signalled, WT_CONDVAR **condp)
+{
+	WT_CONDVAR *cv;
+
+	/*
+	 * !!!
+	 * This function MUST handle a NULL session handle.
+	 */
+	WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cv));
+
+	cv->name = name;
+	if (is_signalled)
+		cv->waiters = -1;
+	else
+		cv->waiters = 0;
+
+	InitializeCriticalSection(&cv->mtx);
+
+	/* Initialize the condition variable to permit self-blocking. */
+	InitializeConditionVariable(&cv->cond);
+
+	*condp = cv;
+	return (0);
+}
+
+/*
+ * __wt_cond_wait --
+ *	Wait on a condition variable, optionally timing out.
+ *
+ *	usecs is the timeout in microseconds; 0 means wait without a timeout.
+ *	Returns 0 when woken, when the wait times out, or when the condition
+ *	was already signalled; any other failure of the wait is logged and its
+ *	system error code returned.
+ */
+int
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
+{
+	BOOL woken;
+	DWORD lasterror, milliseconds;
+
+	WT_ASSERT(session, usecs >= 0);
+
+	/* Fast path if already signalled. */
+	if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+		return (0);
+
+	/*
+	 * !!!
+	 * This function MUST handle a NULL session handle.
+	 */
+	if (session != NULL) {
+		WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+		    "wait %s cond (%p)", cond->name, cond));
+		WT_STAT_FAST_CONN_INCR(session, cond_wait);
+	}
+
+	if (usecs > 0) {
+		milliseconds = (DWORD)(usecs / 1000);
+		/*
+		 * 0 would mean the CV sleep becomes a TryCV which we do not
+		 * want
+		 */
+		if (milliseconds == 0)
+			milliseconds = 1;
+	} else
+		milliseconds = INFINITE;
+
+	EnterCriticalSection(&cond->mtx);
+
+	woken = SleepConditionVariableCS(
+	    &cond->cond, &cond->mtx, milliseconds);
+	/* Capture the error before any other call can replace it. */
+	lasterror = woken ? ERROR_SUCCESS : GetLastError();
+
+	(void)WT_ATOMIC_SUB4(cond->waiters, 1);
+
+	LeaveCriticalSection(&cond->mtx);
+
+	/* A timeout is a normal way for the wait to end, not an error. */
+	if (woken || lasterror == ERROR_TIMEOUT)
+		return (0);
+
+	WT_RET_MSG(session, lasterror, "SleepConditionVariableCS");
+}
+
+/*
+ * __wt_cond_signal --
+ *	Signal a condition variable, waking every thread waiting on it.
+ *
+ *	If nobody is waiting the condition is marked as signalled (waiters set
+ *	to -1) so the next waiter returns immediately. Returns 0, or the error
+ *	from the verbose message.
+ */
+int
+__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
+{
+	/*
+	 * !!!
+	 * This function MUST handle a NULL session handle.
+	 */
+	if (session != NULL)
+		WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+		    "signal %s cond (%p)", cond->name, cond));
+
+	/* Fast path if already signalled. */
+	if (cond->waiters == -1)
+		return (0);
+
+	/*
+	 * Wake the waiters if there are any, or if the attempt to flag the
+	 * condition as signalled lost a race with a new waiter. The wake call
+	 * returns nothing, so there is no failure to report.
+	 */
+	if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+		EnterCriticalSection(&cond->mtx);
+		WakeAllConditionVariable(&cond->cond);
+		LeaveCriticalSection(&cond->mtx);
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_cond_destroy --
+ *	Destroy a condition variable: delete its critical section and free the
+ *	structure. A NULL *condp is accepted and ignored. Always returns 0.
+ */
+int
+__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
+{
+	WT_CONDVAR *cv;
+
+	if ((cv = *condp) == NULL)
+		return (0);
+
+	/* The condition variable itself needs no tear-down call. */
+	DeleteCriticalSection(&cv->mtx);
+	__wt_free(session, *condp);
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c
new file mode 100644
index 00000000000..ec0894a2f29
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c
@@ -0,0 +1,126 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rwlock_alloc --
+ *	Allocate a read/write lock, record its name and initialize the slim
+ *	reader/writer lock inside it. Returns 0 or the error from the verbose
+ *	message or the allocation.
+ */
+int
+__wt_rwlock_alloc(
+    WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name)
+{
+	WT_RWLOCK *lock;
+
+	WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name));
+
+	WT_RET(__wt_calloc_def(session, 1, &lock));
+
+	InitializeSRWLock(&lock->rwlock);
+	lock->name = name;
+
+	*rwlockp = lock;
+	return (0);
+}
+
+/*
+ * __wt_readlock --
+ *	Get a shared lock.
+ *
+ *	Returns the error from the verbose message if that fails, in which
+ *	case the lock is not acquired; otherwise returns 0 with the lock held
+ *	in shared mode.
+ */
+int
+__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_MUTEX, "rwlock: readlock %s", rwlock->name));
+	WT_STAT_FAST_CONN_INCR(session, rwlock_read);
+
+	/* The acquire call returns nothing, so no failure is reported. */
+	AcquireSRWLockShared(&rwlock->rwlock);
+
+	return (0);
+}
+
+/*
+ * __wt_readunlock --
+ *	Release a shared lock.
+ *
+ *	Returns the error from the verbose message if that fails, in which
+ *	case the lock is NOT released; otherwise returns 0.
+ */
+int
+__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
+
+	ReleaseSRWLockShared(&rwlock->rwlock);
+	return (0);
+}
+
+/*
+ * __wt_try_writelock --
+ *	Try to get an exclusive lock without waiting. Returns 0 when the lock
+ *	was acquired and EBUSY when it was not.
+ */
+int
+__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
+	WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+	/* A zero result from the try-acquire call means the lock is held. */
+	if (TryAcquireSRWLockExclusive(&rwlock->rwlock) == 0)
+		return (EBUSY);
+	return (0);
+}
+
+/*
+ * __wt_writelock --
+ *	Wait to get an exclusive lock.
+ *
+ *	Returns the error from the verbose message if that fails, in which
+ *	case the lock is not acquired; otherwise returns 0 with the lock held
+ *	in exclusive mode.
+ */
+int
+__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
+	WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+	/* The acquire call returns nothing, so no failure is reported. */
+	AcquireSRWLockExclusive(&rwlock->rwlock);
+
+	return (0);
+}
+
+/*
+ * __wt_writeunlock --
+ *	Release an exclusive lock.
+ *
+ *	Returns the error from the verbose message if that fails, in which
+ *	case the lock is NOT released; otherwise returns 0.
+ */
+int
+__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_MUTEX, "rwlock: writeunlock %s", rwlock->name));
+
+	ReleaseSRWLockExclusive(&rwlock->rwlock);
+	return (0);
+}
+
+/*
+ * __wt_rwlock_destroy --
+ *	Destroy a read/write lock and clear the caller's pointer to it. A NULL
+ *	*rwlockp is accepted and ignored.
+ */
+int
+__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp)
+{
+	WT_RWLOCK *lock;
+
+	if ((lock = *rwlockp) == NULL)
+		return (0);
+
+	/* Clear our caller's reference before anything can fail. */
+	*rwlockp = NULL;
+
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_MUTEX, "rwlock: destroy %s", lock->name));
+
+	/* A slim reader/writer lock needs no tear-down; free the memory. */
+	__wt_free(session, lock);
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_once.c b/src/third_party/wiredtiger/src/os_win/os_once.c
new file mode 100644
index 00000000000..40640acf129
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_once.c
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * _wt_init_once_callback --
+ *	One-time initialization callback: Parameter carries the routine to
+ *	run. InitOnce and Context are not used. Always reports success.
+ */
+BOOL CALLBACK _wt_init_once_callback(
+    _Inout_      PINIT_ONCE InitOnce,
+    _Inout_opt_  PVOID Parameter,
+    _Out_opt_    PVOID *Context
+    )
+{
+	void (*routine)(void);
+
+	routine = Parameter;
+	routine();
+
+	return (TRUE);
+}
+
+/*
+ * __wt_once --
+ *	Run init_routine exactly once for the process, however many times and
+ *	from however many threads this function is called. Returns 0 on
+ *	success and non-zero if the one-time execution failed.
+ */
+int
+__wt_once(void(*init_routine)(void))
+{
+	/*
+	 * The control block must outlive the call and be shared by every
+	 * caller: an automatic variable would be re-initialized on each call
+	 * and the routine would run every time.
+	 */
+	static INIT_ONCE once_control = INIT_ONCE_STATIC_INIT;
+	PVOID lpContext = NULL;
+
+	return !InitOnceExecuteOnce(&once_control, &_wt_init_once_callback,
+	    init_routine, lpContext);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_open.c b/src/third_party/wiredtiger/src/os_win/os_open.c
new file mode 100644
index 00000000000..7be98b604ec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_open.c
@@ -0,0 +1,219 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_open --
+ *	Open a file handle.
+ *
+ *	If the connection already has a handle for this name its reference
+ *	count is incremented and it is returned. Otherwise the file is opened
+ *	twice (a primary handle, and a secondary handle used for size
+ *	changes), a WT_FH is allocated and linked onto the connection's list.
+ *
+ *	ok_create allows the file to be created, dio_type selects the direct
+ *	I/O, write-through and access-pattern flags. Returns 0 with *fhp set,
+ *	or an error with every resource acquired here released.
+ */
+int
+__wt_open(WT_SESSION_IMPL *session,
+    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
+{
+	DWORD dwCreationDisposition;
+	HANDLE filehandle, filehandle_secondary;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_FH *fh, *tfh;
+	int direct_io, f, matched, share_mode;
+	char *path;
+
+	conn = S2C(session);
+	fh = NULL;
+	path = NULL;
+	filehandle = INVALID_HANDLE_VALUE;
+	filehandle_secondary = INVALID_HANDLE_VALUE;
+
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));
+
+	/* Increment the reference count if we already have the file open. */
+	matched = 0;
+	__wt_spin_lock(session, &conn->fh_lock);
+	TAILQ_FOREACH(tfh, &conn->fhqh, q)
+		if (strcmp(name, tfh->name) == 0) {
+			++tfh->ref;
+			*fhp = tfh;
+			matched = 1;
+			break;
+		}
+	__wt_spin_unlock(session, &conn->fh_lock);
+	if (matched)
+		return (0);
+
+	WT_RET(__wt_filename(session, name, &path));
+
+	share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
+	/*
+	 * Security:
+	 * The application may spawn a new process, and we don't want another
+	 * process to have access to our file handles.
+	 *
+	 * TODO: Set tighter file permissions but set bInheritHandle to false
+	 * to prevent inheritance
+	 */
+
+	f = FILE_ATTRIBUTE_NORMAL;
+
+	/*
+	 * NOTE(review): an exclusive create uses CREATE_ALWAYS, which
+	 * replaces an existing file rather than failing; confirm that is the
+	 * intended meaning of "exclusive" here.
+	 */
+	dwCreationDisposition = 0;
+	if (ok_create) {
+		dwCreationDisposition = CREATE_NEW;
+		if (exclusive)
+			dwCreationDisposition = CREATE_ALWAYS;
+	} else
+		dwCreationDisposition = OPEN_EXISTING;
+
+	direct_io = 0;
+
+	if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
+		f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+		direct_io = 1;
+	}
+
+	if (dio_type == WT_FILE_TYPE_LOG &&
+	    FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
+		f |= FILE_FLAG_WRITE_THROUGH;
+	}
+
+	/* Disable read-ahead on trees: it slows down random read workloads. */
+	if (dio_type == WT_FILE_TYPE_DATA ||
+	    dio_type == WT_FILE_TYPE_CHECKPOINT)
+		f |= FILE_FLAG_RANDOM_ACCESS;
+
+	filehandle = CreateFileA(path,
+				(GENERIC_READ | GENERIC_WRITE),
+				share_mode,
+				NULL,
+				dwCreationDisposition,
+				f,
+				NULL);
+	if (filehandle == INVALID_HANDLE_VALUE) {
+		/* Creation is allowed but the file exists: just open it. */
+		if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
+			filehandle = CreateFileA(path,
+						(GENERIC_READ | GENERIC_WRITE),
+						share_mode,
+						NULL,
+						OPEN_EXISTING,
+						f,
+						NULL);
+
+		if (filehandle == INVALID_HANDLE_VALUE)
+			WT_ERR_MSG(session, __wt_errno(),
+			    direct_io ?
+			    "%s: open failed with direct I/O configured, some "
+			    "filesystem types do not support direct I/O" :
+			    "%s", path);
+	}
+
+	/*
+	 * Open a second handle to file to support allocation/truncation
+	 * concurrently with reads on the file. Writes would also move the file
+	 * pointer.
+	 */
+	filehandle_secondary = CreateFileA(path,
+	    (GENERIC_READ | GENERIC_WRITE),
+	    share_mode,
+	    NULL,
+	    OPEN_EXISTING,
+	    f,
+	    NULL);
+	/* Check the handle that was just opened, not the primary one. */
+	if (filehandle_secondary == INVALID_HANDLE_VALUE)
+		WT_ERR_MSG(session, __wt_errno(),
+		    "open failed for secondary handle: %s", path);
+
+	WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
+	WT_ERR(__wt_strdup(session, name, &fh->name));
+	fh->filehandle = filehandle;
+	fh->filehandle_secondary = filehandle_secondary;
+	fh->ref = 1;
+	fh->direct_io = direct_io;
+
+	/* Set the file's size. */
+	WT_ERR(__wt_filesize(session, fh, &fh->size));
+
+	/* Configure file extension. */
+	if (dio_type == WT_FILE_TYPE_DATA ||
+	    dio_type == WT_FILE_TYPE_CHECKPOINT)
+		fh->extend_len = conn->data_extend_len;
+
+	/* Configure fallocate/posix_fallocate calls. */
+	__wt_fallocate_config(session, fh);
+
+	/*
+	 * Repeat the check for a match, but then link onto the database's list
+	 * of files.
+	 */
+	matched = 0;
+	__wt_spin_lock(session, &conn->fh_lock);
+	TAILQ_FOREACH(tfh, &conn->fhqh, q)
+		if (strcmp(name, tfh->name) == 0) {
+			++tfh->ref;
+			*fhp = tfh;
+			matched = 1;
+			break;
+		}
+	if (!matched) {
+		TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
+		WT_STAT_FAST_CONN_INCR(session, file_open);
+
+		*fhp = fh;
+	}
+	__wt_spin_unlock(session, &conn->fh_lock);
+
+	/*
+	 * Both a lost race (another thread linked the same file first) and
+	 * any error discard what was built here.
+	 */
+	if (matched) {
+err:		if (fh != NULL) {
+			__wt_free(session, fh->name);
+			__wt_free(session, fh);
+		}
+		if (filehandle != INVALID_HANDLE_VALUE)
+			(void)CloseHandle(filehandle);
+		if (filehandle_secondary != INVALID_HANDLE_VALUE)
+			(void)CloseHandle(filehandle_secondary);
+	}
+
+	__wt_free(session, path);
+	return (ret);
+}
+
+/*
+ * __wt_close --
+ *	Drop a reference to a file handle. When the last reference goes away
+ *	the handle is unlinked from the connection's list, both system handles
+ *	are closed and the memory is freed. Returns 0, or the error from the
+ *	last CloseHandle call that failed.
+ */
+int
+__wt_close(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	int last_ref;
+
+	conn = S2C(session);
+
+	__wt_spin_lock(session, &conn->fh_lock);
+	last_ref = fh != NULL && fh->ref != 0 && --fh->ref <= 0;
+	if (last_ref) {
+		/* Remove from the list. */
+		TAILQ_REMOVE(&conn->fhqh, fh, q);
+		WT_STAT_FAST_CONN_DECR(session, file_open);
+	}
+	__wt_spin_unlock(session, &conn->fh_lock);
+
+	/* Nothing more to do for a NULL, unreferenced or still-shared handle. */
+	if (!last_ref)
+		return (0);
+
+	/* Close both system handles, reporting but not stopping on errors. */
+	if (CloseHandle(fh->filehandle) == 0) {
+		ret = __wt_errno();
+		__wt_err(session, ret, "CloseHandle: %s", fh->name);
+	}
+
+	if (CloseHandle(fh->filehandle_secondary) == 0) {
+		ret = __wt_errno();
+		__wt_err(session, ret, "CloseHandle: secondary: %s", fh->name);
+	}
+
+	/* Discard the memory. */
+	__wt_free(session, fh->name);
+	__wt_free(session, fh);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_path.c b/src/third_party/wiredtiger/src/os_win/os_path.c
new file mode 100644
index 00000000000..9f6b79c565c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_path.c
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_absolute_path --
+ *	Return if a filename is an absolute path: 1 when the name, after an
+ * optional drive prefix, starts with a slash or backslash, otherwise 0.
+ */
+int
+__wt_absolute_path(const char *path)
+{
+	/*
+	 * Check for a drive name (for example, "D:"), allow both forward and
+	 * backward slashes.
+	 *
+	 * The character classification functions are only defined for values
+	 * representable as an unsigned char (or EOF): cast so a byte with the
+	 * high bit set can't be passed as a negative int.
+	 */
+	if (strlen(path) >= 3 &&
+	    isalpha((unsigned char)path[0]) && path[1] == ':')
+		path += 2;
+	return (path[0] == '/' || path[0] == '\\' ? 1 : 0);
+}
+
+/*
+ * __wt_path_separator --
+ *	Return the string placed between path components: a single backslash.
+ */
+const char *
+__wt_path_separator(void)
+{
+	const char *separator;
+
+	separator = "\\";
+	return (separator);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_priv.c b/src/third_party/wiredtiger/src/os_win/os_priv.c
new file mode 100644
index 00000000000..7b5152b4652
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_priv.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_has_priv --
+ * Return if the process has special privileges, defined as having
+ * different effective and real UIDs or GIDs.
+ *
+ * This implementation performs no check: it unconditionally returns 0
+ * (no special privileges).
+ */
+int
+__wt_has_priv(void)
+{
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_remove.c b/src/third_party/wiredtiger/src/os_win/os_remove.c
new file mode 100644
index 00000000000..d15ee929c00
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_remove.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __remove_file_check --
+ *	Diagnostic builds only: assert the named file has no open handle on
+ * the connection's list before it is removed.
+ */
+static inline void
+__remove_file_check(WT_SESSION_IMPL *session, const char *name)
+{
+#ifndef HAVE_DIAGNOSTIC
+	WT_UNUSED(session);
+	WT_UNUSED(name);
+#else
+	WT_CONNECTION_IMPL *conn;
+	WT_FH *fh;
+
+	conn = S2C(session);
+
+	/*
+	 * Walk the list of open handles under the lock.  The loop leaves fh
+	 * pointing at a handle with a matching name, or NULL if the walk ran
+	 * off the end of the list.
+	 */
+	__wt_spin_lock(session, &conn->fh_lock);
+	TAILQ_FOREACH(fh, &conn->fhqh, q)
+		if (strcmp(name, fh->name) == 0)
+			break;
+	__wt_spin_unlock(session, &conn->fh_lock);
+
+	/*
+	 * Finding the file open is an error: a higher level should have
+	 * closed it before removing.
+	 */
+	WT_ASSERT(session, fh == NULL);
+#endif
+}
+
+/*
+ * __wt_remove --
+ *	Remove a file, given its name relative to the database home.
+ */
+int
+__wt_remove(WT_SESSION_IMPL *session, const char *name)
+{
+	BOOL removed;
+	uint32_t lasterror;
+	char *path;
+
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name));
+
+	__remove_file_check(session, name);
+
+	WT_RET(__wt_filename(session, name, &path));
+
+	/* Capture the error before the free can disturb it. */
+	removed = DeleteFileA(path);
+	if (removed == FALSE)
+		lasterror = __wt_errno();
+
+	__wt_free(session, path);
+
+	if (removed == FALSE)
+		WT_RET_MSG(session, lasterror, "%s: remove", name);
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_rename.c b/src/third_party/wiredtiger/src/os_win/os_rename.c
new file mode 100644
index 00000000000..092f5d62a40
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_rename.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rename --
+ *	Rename a file, replacing the destination if it already exists.
+ *
+ * Returns 0 on success.  A failure to build either path returns that
+ * error; a failure to delete an existing destination or to move the file
+ * returns the Windows error code and logs a message.
+ */
+int
+__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+{
+	WT_DECL_RET;
+	uint32_t lasterror;
+	int failed;
+	char *from_path, *to_path;
+
+	WT_RET(__wt_verbose(
+	    session, WT_VERB_FILEOPS, "rename %s to %s", from, to));
+
+	from_path = to_path = NULL;
+	lasterror = 0;
+	failed = 0;
+
+	/*
+	 * Both paths must be built before touching the file system: don't
+	 * let a later Windows call overwrite a path construction error, and
+	 * never hand a NULL path to the Windows API.
+	 */
+	WT_ERR(__wt_filename(session, from, &from_path));
+	WT_ERR(__wt_filename(session, to, &to_path));
+
+	/*
+	 * Check if file exists since Windows does not override the file if
+	 * it exists.  The result of the move itself decides success: the
+	 * outcome of the existence check or of the delete must not be
+	 * mistaken for it.
+	 */
+	if (GetFileAttributesA(to_path) != INVALID_FILE_ATTRIBUTES &&
+	    DeleteFileA(to_path) == FALSE) {
+		lasterror = GetLastError();
+		failed = 1;
+	} else if (MoveFileA(from_path, to_path) == FALSE) {
+		lasterror = GetLastError();
+		failed = 1;
+	}
+
+err:	__wt_free(session, from_path);
+	__wt_free(session, to_path);
+
+	if (ret != 0)
+		return (ret);
+	if (!failed)
+		return (0);
+
+	WT_RET_MSG(session, lasterror, "MoveFile %s to %s", from, to);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_rw.c b/src/third_party/wiredtiger/src/os_win/os_rw.c
new file mode 100644
index 00000000000..291533bc6bc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_rw.c
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_read --
+ *	Read len bytes from the file at the given offset into buf.
+ *
+ * Returns 0 once every byte has been read; returns the system error if
+ * ReadFile fails and WT_ERROR if the file ends before len bytes were read.
+ */
+int
+__wt_read(
+    WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+{
+	DWORD chunk;
+	DWORD nr;
+	uint8_t *addr;
+	OVERLAPPED overlapped = { 0 };
+
+	nr = 0;
+
+	WT_STAT_FAST_CONN_INCR(session, read_io);
+
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+	    "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+	    fh->name, len, (uintmax_t)offset));
+
+	/* Assert direct I/O is aligned and a multiple of the alignment. */
+	WT_ASSERT(session,
+	    !fh->direct_io ||
+	    S2C(session)->buffer_alignment == 0 ||
+	    (!((uintptr_t)buf &
+	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+	    len >= S2C(session)->buffer_alignment &&
+	    len % S2C(session)->buffer_alignment == 0));
+
+	/* Break reads larger than 1GB into 1GB chunks. */
+	for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
+		chunk = (DWORD)WT_MIN(len, WT_GIGABYTE);
+
+		/* The file position is passed as two 32-bit halves. */
+		overlapped.Offset = UINT32_MAX & offset;
+		overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);
+
+		/*
+		 * The chunk size is a DWORD but is printed with the size_t
+		 * format: widen it explicitly so the variadic argument matches
+		 * the conversion.
+		 */
+		if (!ReadFile(fh->filehandle, addr, chunk, &nr, &overlapped))
+			WT_RET_MSG(session, __wt_errno(),
+			    "%s read error: failed to read %" WT_SIZET_FMT
+			    " bytes at offset %" PRIuMAX,
+			    fh->name, (size_t)chunk, (uintmax_t)offset);
+
+		/*
+		 * A successful read of zero bytes means end-of-file: fail
+		 * rather than loop forever making no progress.
+		 */
+		if (nr == 0)
+			WT_RET_MSG(session, WT_ERROR,
+			    "%s read error: failed to read %" WT_SIZET_FMT
+			    " bytes at offset %" PRIuMAX,
+			    fh->name, (size_t)chunk, (uintmax_t)offset);
+	}
+	return (0);
+}
+
+/*
+ * __wt_write --
+ *	Write len bytes from buf to the file at the given offset.
+ *
+ * Returns 0 once every byte has been written, or the system error of the
+ * first WriteFile call that fails.
+ */
+int
+__wt_write(WT_SESSION_IMPL *session,
+    WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+{
+	DWORD chunk;
+	DWORD nw;
+	const uint8_t *addr;
+	OVERLAPPED overlapped = { 0 };
+
+	nw = 0;
+
+	WT_STAT_FAST_CONN_INCR(session, write_io);
+
+	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+	    "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+	    fh->name, len, (uintmax_t)offset));
+
+	/* Assert direct I/O is aligned and a multiple of the alignment. */
+	WT_ASSERT(session,
+	    !fh->direct_io ||
+	    S2C(session)->buffer_alignment == 0 ||
+	    (!((uintptr_t)buf &
+	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+	    len >= S2C(session)->buffer_alignment &&
+	    len % S2C(session)->buffer_alignment == 0));
+
+	/* Break writes larger than 1GB into 1GB chunks. */
+	for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
+		chunk = (DWORD)WT_MIN(len, WT_GIGABYTE);
+
+		/* The file position is passed as two 32-bit halves. */
+		overlapped.Offset = UINT32_MAX & offset;
+		overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);
+
+		/*
+		 * The chunk size is a DWORD but is printed with the size_t
+		 * format: widen it explicitly so the variadic argument matches
+		 * the conversion.
+		 */
+		if (!WriteFile(fh->filehandle, addr, chunk, &nw, &overlapped))
+			WT_RET_MSG(session, __wt_errno(),
+			    "%s write error: failed to write %" WT_SIZET_FMT
+			    " bytes at offset %" PRIuMAX,
+			    fh->name, (size_t)chunk, (uintmax_t)offset);
+	}
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_sleep.c b/src/third_party/wiredtiger/src/os_win/os_sleep.c
new file mode 100644
index 00000000000..b9a8cc2e545
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_sleep.c
@@ -0,0 +1,18 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_sleep --
+ *	Pause the thread of control.  The request is converted to whole
+ * milliseconds, discarding any sub-millisecond remainder.
+ */
+void
+__wt_sleep(long seconds, long micro_seconds)
+{
+	long milliseconds;
+
+	milliseconds = seconds * 1000 + micro_seconds / 1000;
+	Sleep(milliseconds);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_thread.c b/src/third_party/wiredtiger/src/os_win/os_thread.c
new file mode 100644
index 00000000000..4d8cf89f264
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_thread.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_thread_create --
+ *	Start a new thread running func(arg) and return its handle in tidret.
+ */
+int
+__wt_thread_create(WT_SESSION_IMPL *session,
+    wt_thread_t *tidret, void *(*func)(void *), void *arg)
+{
+	/* CreateThread reports failure by returning a NULL handle. */
+	if ((*tidret = CreateThread(NULL, 0, func, arg, 0, NULL)) == NULL)
+		WT_RET_MSG(session, __wt_errno(), "CreateThread");
+
+	return (0);
+}
+
+/*
+ * __wt_thread_join --
+ *	Wait for a thread of control to exit, then release its handle.
+ *
+ * Returns 0 on success.  If the wait fails outright the system error is
+ * returned; any other unexpected wait status is returned as-is.
+ */
+int
+__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
+{
+	DWORD status;
+
+	/*
+	 * WAIT_FAILED is a status, not an error code: fetch the real error
+	 * in that case so the message is meaningful.
+	 */
+	status = WaitForSingleObject(tid, INFINITE);
+	if (status != WAIT_OBJECT_0)
+		WT_RET_MSG(session,
+		    status == WAIT_FAILED ? __wt_errno() : (int)status,
+		    "WaitForSingleObject");
+
+	/*
+	 * The thread has exited, but its handle stays allocated until it is
+	 * closed: release it here so joined threads don't leak handles.
+	 */
+	if (CloseHandle(tid) == 0)
+		WT_RET_MSG(session, __wt_errno(), "CloseHandle: thread join");
+
+	return (0);
+}
+
+/*
+ * __wt_thread_id --
+ *	Fill in a printable version of the process and thread IDs, formatted
+ * as "<process ID>:<thread ID>".
+ */
+void
+__wt_thread_id(char* buf, size_t buflen)
+{
+	/*
+	 * Both IDs must come from calling the functions: without the call
+	 * parentheses the cast is applied to the function's address.
+	 */
+	(void)snprintf(buf, buflen,
+	    "%" PRIu64 ":%" PRIu64,
+	    (uint64_t)GetCurrentProcessId(), (uint64_t)GetCurrentThreadId());
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c
new file mode 100644
index 00000000000..b49b738fe54
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_time.c
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_seconds --
+ *	Return the whole seconds since the Epoch, as reported by __wt_epoch.
+ */
+int
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+	struct timespec now;
+
+	WT_RET(__wt_epoch(session, &now));
+	*timep = now.tv_sec;
+
+	return (0);
+}
+
+/*
+ * __wt_epoch --
+ *	Return the time since the Epoch as seconds and nanoseconds.
+ */
+int
+__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+	FILETIME now;
+	int64_t filetime;
+	uint64_t ns100;
+
+	GetSystemTimeAsFileTime(&now);
+
+	/*
+	 * The system time arrives as two 32-bit halves of a count of 100
+	 * nanosecond units; join them, then subtract the fixed offset that
+	 * rebases the count to the Epoch.
+	 */
+	filetime = ((int64_t)now.dwHighDateTime << 32) + now.dwLowDateTime;
+	ns100 = filetime - 116444736000000000LL;
+
+	/* Ten million 100ns units per second; the rest scales to nanoseconds. */
+	tsp->tv_sec = ns100 / 10000000;
+	tsp->tv_nsec = (long)((ns100 % 10000000) * 100);
+
+	return (0);
+}
+
+/*
+ * localtime_r --
+ *	Convert a time value to broken-down local time in the caller's
+ * structure, implemented on top of localtime_s.  Returns the result
+ * structure, or NULL (after reporting the error) if conversion fails.
+ */
+struct tm *
+localtime_r(const time_t *timer, struct tm *result)
+{
+	errno_t err;
+
+	if ((err = localtime_s(result, timer)) == 0)
+		return (result);
+
+	__wt_err(NULL, err, "localtime_s");
+	return (NULL);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c b/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c
new file mode 100644
index 00000000000..1058203e326
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#undef vsnprintf
+
+/*
+ * _wt_vsnprintf --
+ * Wrapper around the C runtime's vsnprintf that never returns -1 for a
+ * truncated result: when the runtime reports -1, the value returned is
+ * the formatted length reported by _vscprintf plus one.
+ */
+_Check_return_opt_ int __cdecl _wt_vsnprintf(
+ _Out_writes_(_MaxCount) char * _DstBuf,
+ _In_ size_t _MaxCount,
+ _In_z_ _Printf_format_string_ const char * _Format,
+ va_list _ArgList)
+{
+ int len;
+
+ len = (size_t)vsnprintf(_DstBuf, _MaxCount, _Format, _ArgList);
+
+ /*
+ * The MSVC implementation returns -1 on truncation instead of what
+ * it would have written. We could iteratively grow the buffer, or
+ * just ask the runtime how big a buffer it would like.
+ *
+ * NOTE(review): the same va_list is walked a second time here without
+ * a va_copy; confirm that is safe on every supported compiler/target.
+ * NOTE(review): the "+ 1" makes the result one larger than the
+ * formatted length; presumably it accounts for the trailing nul --
+ * confirm callers expect that.
+ */
+ if (len == -1)
+ len = _vscprintf(_Format, _ArgList) + 1;
+
+ return (len);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_yield.c b/src/third_party/wiredtiger/src/os_win/os_yield.c
new file mode 100644
index 00000000000..970bfa139d0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_yield.c
@@ -0,0 +1,18 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_yield --
+ *	Give up the processor to another ready thread, if there is one.
+ */
+void
+__wt_yield(void)
+{
+	/* Whether a switch actually happened is of no interest here. */
+	(void)SwitchToThread();
+}
diff --git a/src/third_party/wiredtiger/src/packing/pack_api.c b/src/third_party/wiredtiger/src/packing/pack_api.c
new file mode 100644
index 00000000000..c0c1e53c8ca
--- /dev/null
+++ b/src/third_party/wiredtiger/src/packing/pack_api.c
@@ -0,0 +1,137 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_struct_pack --
+ *	Pack the trailing arguments into the caller's buffer as described by
+ * the format string.
+ */
+int
+wiredtiger_struct_pack(WT_SESSION *wt_session,
+    void *buffer, size_t size, const char *fmt, ...)
+{
+	WT_DECL_RET;
+	va_list args;
+
+	va_start(args, fmt);
+	ret = __wt_struct_packv(
+	    (WT_SESSION_IMPL *)wt_session, buffer, size, fmt, args);
+	va_end(args);
+
+	return (ret);
+}
+
+/*
+ * wiredtiger_struct_size --
+ *	Calculate the number of bytes needed to pack the trailing arguments
+ * as described by the format string.
+ */
+int
+wiredtiger_struct_size(WT_SESSION *wt_session,
+    size_t *sizep, const char *fmt, ...)
+{
+	WT_DECL_RET;
+	va_list args;
+
+	va_start(args, fmt);
+	ret = __wt_struct_sizev(
+	    (WT_SESSION_IMPL *)wt_session, sizep, fmt, args);
+	va_end(args);
+
+	return (ret);
+}
+
+/*
+ * wiredtiger_struct_unpack --
+ *	Unpack a packed buffer into the trailing pointer arguments as
+ * described by the format string.
+ */
+int
+wiredtiger_struct_unpack(WT_SESSION *wt_session,
+    const void *buffer, size_t size, const char *fmt, ...)
+{
+	WT_DECL_RET;
+	va_list args;
+
+	va_start(args, fmt);
+	ret = __wt_struct_unpackv(
+	    (WT_SESSION_IMPL *)wt_session, buffer, size, fmt, args);
+	va_end(args);
+
+	return (ret);
+}
+
+/*
+ * __wt_ext_struct_pack --
+ *	Pack a byte string (extension API).  When no session is supplied, the
+ * connection's default session is used.
+ */
+int
+__wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+    void *buffer, size_t size, const char *fmt, ...)
+{
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	va_list args;
+
+	if (wt_session == NULL)
+		session =
+		    ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+	else
+		session = (WT_SESSION_IMPL *)wt_session;
+
+	va_start(args, fmt);
+	ret = __wt_struct_packv(session, buffer, size, fmt, args);
+	va_end(args);
+
+	return (ret);
+}
+
+/*
+ * __wt_ext_struct_size --
+ *	Calculate the size of a packed byte string (extension API).  When no
+ * session is supplied, the connection's default session is used.
+ */
+int
+__wt_ext_struct_size(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+    size_t *sizep, const char *fmt, ...)
+{
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	va_list args;
+
+	if (wt_session == NULL)
+		session =
+		    ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+	else
+		session = (WT_SESSION_IMPL *)wt_session;
+
+	va_start(args, fmt);
+	ret = __wt_struct_sizev(session, sizep, fmt, args);
+	va_end(args);
+
+	return (ret);
+}
+
+/*
+ * __wt_ext_struct_unpack --
+ *	Unpack a byte string (extension API).  When no session is supplied,
+ * the connection's default session is used.
+ */
+int
+__wt_ext_struct_unpack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+    const void *buffer, size_t size, const char *fmt, ...)
+{
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	va_list args;
+
+	if (wt_session == NULL)
+		session =
+		    ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+	else
+		session = (WT_SESSION_IMPL *)wt_session;
+
+	va_start(args, fmt);
+	ret = __wt_struct_unpackv(session, buffer, size, fmt, args);
+	va_end(args);
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/packing/pack_impl.c b/src/third_party/wiredtiger/src/packing/pack_impl.c
new file mode 100644
index 00000000000..12b1582e6d0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/packing/pack_impl.c
@@ -0,0 +1,96 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_struct_check --
+ *	Validate a packing format and, if the caller asks, report whether it
+ * fits into a fixed-sized bitfield and how wide that field is.
+ */
+int
+__wt_struct_check(WT_SESSION_IMPL *session,
+    const char *fmt, size_t len, int *fixedp, uint32_t *fixed_lenp)
+{
+	WT_DECL_PACK_VALUE(pv);
+	WT_DECL_RET;
+	WT_PACK pack;
+	int fields;
+
+	WT_RET(__pack_initn(session, &pack, fmt, len));
+
+	/* Count the fields; walking the whole format also validates it. */
+	fields = 0;
+	while ((ret = __pack_next(&pack, &pv)) == 0)
+		++fields;
+
+	/* Running out of fields is the only acceptable way to stop. */
+	if (ret != WT_NOTFOUND)
+		return (ret);
+
+	/* Both output arguments are required to report the fixed-size case. */
+	if (fixedp == NULL || fixed_lenp == NULL)
+		return (0);
+
+	if (fields == 0) {
+		*fixedp = 1;
+		*fixed_lenp = 0;
+	} else if (fields == 1 && pv.type == 't') {
+		*fixedp = 1;
+		*fixed_lenp = pv.size;
+	} else
+		*fixedp = 0;
+
+	return (0);
+}
+
+/*
+ * __wt_struct_size --
+ *	Calculate the number of bytes needed to pack the trailing arguments
+ * as described by the format string.
+ */
+int
+__wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...)
+{
+	WT_DECL_RET;
+	va_list args;
+
+	/* Hand the variable arguments to the va_list flavor of the call. */
+	va_start(args, fmt);
+	ret = __wt_struct_sizev(session, sizep, fmt, args);
+	va_end(args);
+
+	return (ret);
+}
+
+/*
+ * __wt_struct_pack --
+ *	Pack the trailing arguments into the caller's buffer as described by
+ * the format string.
+ */
+int
+__wt_struct_pack(WT_SESSION_IMPL *session,
+    void *buffer, size_t size, const char *fmt, ...)
+{
+	WT_DECL_RET;
+	va_list args;
+
+	/* Hand the variable arguments to the va_list flavor of the call. */
+	va_start(args, fmt);
+	ret = __wt_struct_packv(session, buffer, size, fmt, args);
+	va_end(args);
+
+	return (ret);
+}
+
+/*
+ * __wt_struct_unpack --
+ *	Unpack a packed buffer into the trailing pointer arguments as
+ * described by the format string.
+ */
+int
+__wt_struct_unpack(WT_SESSION_IMPL *session,
+    const void *buffer, size_t size, const char *fmt, ...)
+{
+	WT_DECL_RET;
+	va_list args;
+
+	/* Hand the variable arguments to the va_list flavor of the call. */
+	va_start(args, fmt);
+	ret = __wt_struct_unpackv(session, buffer, size, fmt, args);
+	va_end(args);
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/packing/pack_stream.c b/src/third_party/wiredtiger/src/packing/pack_stream.c
new file mode 100644
index 00000000000..efbbd5d9adb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/packing/pack_stream.c
@@ -0,0 +1,296 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Streaming interface to packing.
+ *
+ * This allows applications to pack or unpack records one field at a time.
+ *
+ * A stream couples the parsing state of a format string with a cursor
+ * into the caller's buffer.
+ */
+struct __wt_pack_stream {
+ /* Format parsing state, initialized from the format string. */
+ WT_PACK pack;
+ /*
+ * end: one past the last byte of the caller's buffer.
+ * p: current position in the buffer.
+ * start: first byte of the buffer; the bytes used are p - start.
+ */
+ uint8_t *end, *p, *start;
+};
+
+/*
+ * wiredtiger_pack_start --
+ *	Open a stream for packing: allocate the stream, parse the format and
+ * position the stream at the start of the caller's buffer.
+ */
+int
+wiredtiger_pack_start(WT_SESSION *wt_session,
+    const char *format, void *buffer, size_t len, WT_PACK_STREAM **psp)
+{
+	WT_DECL_RET;
+	WT_PACK_STREAM *ps;
+	WT_SESSION_IMPL *session;
+
+	session = (WT_SESSION_IMPL *)wt_session;
+
+	WT_RET(__wt_calloc_def(session, 1, &ps));
+
+	/* A bad format discards the new stream; nothing is handed back. */
+	if ((ret = __pack_init(session, &ps->pack, format)) != 0) {
+		(void)wiredtiger_pack_close(ps, NULL);
+		return (ret);
+	}
+
+	ps->start = buffer;
+	ps->p = ps->start;
+	ps->end = ps->p + len;
+	*psp = ps;
+
+	return (ret);
+}
+
+/*
+ * wiredtiger_unpack_start --
+ *	Open a stream for unpacking.  Packing and unpacking streams are the
+ * same object, so this only drops the const qualifier from the buffer and
+ * opens a packing stream over it.
+ */
+int
+wiredtiger_unpack_start(WT_SESSION *wt_session, const char *format,
+    const void *buffer, size_t size, WT_PACK_STREAM **psp)
+{
+	void *writable;
+
+	writable = (void *)buffer;
+	return (wiredtiger_pack_start(wt_session, format, writable, size, psp));
+}
+
+/*
+ * wiredtiger_pack_close --
+ *	Close a packing stream, optionally reporting how many bytes of the
+ * buffer were used.
+ *
+ * A NULL stream is accepted: nothing is freed and, if requested, the
+ * bytes used are reported as zero.  Always returns 0.
+ */
+int
+wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp)
+{
+	/*
+	 * Check the stream before looking inside it: the bytes-used
+	 * calculation must not dereference a NULL stream.
+	 */
+	if (ps == NULL) {
+		if (usedp != NULL)
+			*usedp = 0;
+		return (0);
+	}
+
+	if (usedp != NULL)
+		*usedp = WT_PTRDIFF(ps->p, ps->start);
+
+	__wt_free(ps->pack.session, ps);
+
+	return (0);
+}
+
+/*
+ * wiredtiger_pack_item --
+ * Pack an item: fetch the next field of the stream's format, which must
+ * be of type 'U' or 'u', and write the item's data and size at the
+ * stream's current position.
+ */
+int
+wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ /* Fails if the format has no further field. */
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'U':
+ case 'u':
+ pv.u.item.data = item->data;
+ pv.u.item.size = item->size;
+ /* The space offered is what remains between p and end. */
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ /* Any other field type is handled by WT_ILLEGAL_VALUE. */
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_int --
+ * Pack a signed integer: fetch the next field of the stream's format,
+ * which must be one of the types 'b', 'h', 'i', 'l' or 'q', and write
+ * the value at the stream's current position.
+ */
+int
+wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ /* Fails if the format has no further field. */
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ pv.u.i = i;
+ /* The space offered is what remains between p and end. */
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ /* Any other field type is handled by WT_ILLEGAL_VALUE. */
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_str --
+ * Pack a string: fetch the next field of the stream's format, which
+ * must be of type 'S' or 's', and write the string at the stream's
+ * current position.
+ */
+int
+wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ /* Fails if the format has no further field. */
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'S':
+ case 's':
+ pv.u.s = s;
+ /* The space offered is what remains between p and end. */
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ /* Any other field type is handled by WT_ILLEGAL_VALUE. */
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_uint --
+ * Pack an unsigned int: fetch the next field of the stream's format,
+ * which must be one of the types 'B', 'H', 'I', 'L', 'Q', 'R', 'r' or
+ * 't', and write the value at the stream's current position.
+ */
+int
+wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ /* Fails if the format has no further field. */
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'R':
+ case 'r':
+ case 't':
+ pv.u.u = u;
+ /* The space offered is what remains between p and end. */
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ /* Any other field type is handled by WT_ILLEGAL_VALUE. */
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_item --
+ * Unpack an item: fetch the next field of the stream's format, which
+ * must be of type 'U' or 'u', read it from the stream's current position
+ * and return its data pointer and size in the caller's item.
+ */
+int
+wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ /* Fails if the format has no further field. */
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'U':
+ case 'u':
+ /* The bytes offered are what remain between p and end. */
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ item->data = pv.u.item.data;
+ item->size = pv.u.item.size;
+ break;
+ /* Any other field type is handled by WT_ILLEGAL_VALUE. */
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_int --
+ * Unpack a signed integer: fetch the next field of the stream's format,
+ * which must be one of the types 'b', 'h', 'i', 'l' or 'q', read it from
+ * the stream's current position and store the value through ip.
+ */
+int
+wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ /* Fails if the format has no further field. */
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ /* The bytes offered are what remain between p and end. */
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *ip = pv.u.i;
+ break;
+ /* Any other field type is handled by WT_ILLEGAL_VALUE. */
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_str --
+ * Unpack a string: fetch the next field of the stream's format, which
+ * must be of type 'S' or 's', read it from the stream's current position
+ * and store the string pointer through sp.
+ */
+int
+wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ /* Fails if the format has no further field. */
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'S':
+ case 's':
+ /* The bytes offered are what remain between p and end. */
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *sp = pv.u.s;
+ break;
+ /* Any other field type is handled by WT_ILLEGAL_VALUE. */
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_uint --
+ * Unpack an unsigned integer: fetch the next field of the stream's
+ * format, which must be one of the types 'B', 'H', 'I', 'L', 'Q', 'R',
+ * 'r' or 't', read it from the stream's current position and store the
+ * value through up.
+ */
+int
+wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ /* Fails if the format has no further field. */
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'R':
+ case 'r':
+ case 't':
+ /* The bytes offered are what remain between p and end. */
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *up = pv.u.u;
+ break;
+ /* Any other field type is handled by WT_ILLEGAL_VALUE. */
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c
new file mode 100644
index 00000000000..398fea4476f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_create.c
@@ -0,0 +1,595 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_direct_io_size_check --
+ *	Look up a size in the configuration and return it, failing with
+ * EINVAL if direct I/O is configured and the size doesn't suit the buffer
+ * alignment.
+ */
+int
+__wt_direct_io_size_check(WT_SESSION_IMPL *session,
+    const char **cfg, const char *config_name, uint32_t *allocsizep)
+{
+	WT_CONFIG_ITEM cval;
+	WT_CONNECTION_IMPL *conn;
+	int64_t align;
+
+	conn = S2C(session);
+
+	/* Hand back a zero size on any failure path. */
+	*allocsizep = 0;
+
+	WT_RET(__wt_config_gets(session, cfg, config_name, &cval));
+
+	/*
+	 * This function exists as a place to hang this comment: if direct I/O
+	 * is configured, page sizes must be at least as large as any buffer
+	 * alignment as well as a multiple of the alignment. Linux gets unhappy
+	 * if you configure direct I/O and then don't do I/O in alignments and
+	 * units of its happy place.
+	 *
+	 * An alignment of zero means there is nothing to enforce, which is
+	 * also how the check is skipped when direct I/O isn't configured for
+	 * checkpoint or data files.
+	 */
+	align = FLD_ISSET(conn->direct_io,
+	    WT_FILE_TYPE_CHECKPOINT | WT_FILE_TYPE_DATA) ?
+	    (int64_t)conn->buffer_alignment : 0;
+	if (align != 0 && (cval.val < align || cval.val % align != 0))
+		WT_RET_MSG(session, EINVAL,
+		    "when direct I/O is configured, the %s size must "
+		    "be at least as large as the buffer alignment as "
+		    "well as a multiple of the buffer alignment",
+		    config_name);
+
+	*allocsizep = (uint32_t)cval.val;
+	return (0);
+}
+
+/*
+ * __create_file --
+ * Create a new 'file:' object: create the underlying file, record its
+ * configuration in the metadata (unless it is the metadata file itself)
+ * and open it once to check it was set up correctly.
+ *
+ * If the object already has a metadata entry, nothing is created: the
+ * call fails with EEXIST when exclusive is set and otherwise returns the
+ * result of the metadata search.
+ */
+static int
+__create_file(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, const char *config)
+{
+ WT_DECL_ITEM(val);
+ WT_DECL_RET;
+ uint32_t allocsize;
+ int is_metadata;
+ const char *fileconf, *filename;
+ /*
+ * Configuration stack: base file metadata, then the caller's
+ * configuration, then a spare slot filled in below with the file ID
+ * and version, then the terminating NULL.
+ */
+ const char **p, *filecfg[] =
+ { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
+
+ fileconf = NULL;
+
+ is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;
+
+ /* The file name is the URI with the "file:" prefix removed. */
+ filename = uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);
+
+ /* Check if the file already exists. */
+ if (!is_metadata && (ret =
+ __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) {
+ if (exclusive)
+ WT_TRET(EEXIST);
+ goto err;
+ }
+
+ /* Sanity check the allocation size. */
+ WT_RET(__wt_direct_io_size_check(
+ session, filecfg, "allocation_size", &allocsize));
+
+ /* Create the file. */
+ WT_ERR(__wt_block_manager_create(session, filename, allocsize));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_fileop(session, NULL, uri));
+
+ /*
+ * If creating an ordinary file, append the file ID and current version
+ * numbers to the passed-in configuration and insert the resulting
+ * configuration into the metadata.
+ */
+ if (!is_metadata) {
+ WT_ERR(__wt_scr_alloc(session, 0, &val));
+ WT_ERR(__wt_buf_fmt(session, val,
+ "id=%" PRIu32 ",version=(major=%d,minor=%d)",
+ ++S2C(session)->next_file_id,
+ WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
+ /* Find the first empty slot in the configuration stack. */
+ for (p = filecfg; *p != NULL; ++p)
+ ;
+ *p = val->data;
+ WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
+ WT_ERR(__wt_metadata_insert(session, uri, fileconf));
+ }
+
+ /*
+ * Open the file to check that it was setup correctly. We don't need
+ * to pass the configuration, we just wrote the collapsed configuration
+ * into the metadata file, and it's going to be read/used by underlying
+ * functions.
+ *
+ * Keep the handle exclusive until it is released at the end of the
+ * call, otherwise we could race with a drop.
+ */
+ WT_ERR(__wt_session_get_btree(
+ session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_handle_lock(session, 1));
+ else
+ WT_ERR(__wt_session_release_btree(session));
+
+ /* Reached on success as well as on error. */
+err: __wt_scr_free(&val);
+ __wt_free(session, fileconf);
+ return (ret);
+}
+
+/*
+ * __wt_schema_colgroup_source --
+ *	Build the URI of the data source for a column group into buf.
+ */
+int
+__wt_schema_colgroup_source(WT_SESSION_IMPL *session,
+    WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf)
+{
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	size_t len;
+	const char *prefix, *suffix, *tablename;
+
+	/* The default is a file source with a ".wt" extension. */
+	prefix = "file";
+	len = strlen(prefix);
+	suffix = ".wt";
+
+	/* A configured type other than "file" replaces prefix and suffix. */
+	ret = __wt_config_getones(session, config, "type", &cval);
+	if (ret == 0 && !WT_STRING_MATCH("file", cval.str, cval.len)) {
+		prefix = cval.str;
+		len = cval.len;
+		suffix = "";
+	}
+	WT_RET_NOTFOUND_OK(ret);
+
+	/* Strip the leading "table:" from the table's name. */
+	tablename = table->name + strlen("table:");
+
+	/* A named column group is appended to the table name. */
+	if (cgname != NULL)
+		return (__wt_buf_fmt(session, buf, "%.*s:%s_%s%s",
+		    (int)len, prefix, tablename, cgname, suffix));
+	return (__wt_buf_fmt(session, buf, "%.*s:%s%s",
+	    (int)len, prefix, tablename, suffix));
+}
+
+/*
+ * __create_colgroup --
+ * Create a column group: create its data source, insert the column
+ * group's collapsed configuration into the metadata and re-open the
+ * table's column groups.
+ *
+ * The name has the form "colgroup:<table>" or "colgroup:<table>:<name>".
+ */
+static int
+__create_colgroup(WT_SESSION_IMPL *session,
+ const char *name, int exclusive, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_ITEM confbuf, fmt, namebuf;
+ WT_TABLE *table;
+ size_t tlen;
+ const char *cfg[4] =
+ { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL };
+ const char *sourcecfg[] = { config, NULL, NULL };
+ const char **cfgp;
+ const char *cgconf, *cgname, *sourceconf, *oldconf;
+ const char *source, *tablename;
+
+ /* NOTE(review): oldconf is never assigned again, only freed. */
+ cgconf = sourceconf = oldconf = NULL;
+ WT_CLEAR(fmt);
+ WT_CLEAR(confbuf);
+ WT_CLEAR(namebuf);
+
+ /* Split the name into the table name and optional group name. */
+ tablename = name;
+ if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
+ return (EINVAL);
+ cgname = strchr(tablename, ':');
+ if (cgname != NULL) {
+ tlen = (size_t)(cgname - tablename);
+ ++cgname;
+ } else
+ tlen = strlen(tablename);
+
+ /*
+ * Failures before this point return directly; once the table has been
+ * acquired, failures go through the err label, which releases it.
+ */
+ if ((ret =
+ __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
+ WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret,
+ "Can't create '%s' for non-existent table '%.*s'",
+ name, (int)tlen, tablename);
+
+ /* Make sure the column group is referenced from the table. */
+ if (cgname != NULL && (ret =
+ __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Column group '%s' not found in table '%.*s'",
+ cgname, (int)tlen, tablename);
+
+ /* Find the first NULL entry in the cfg stack. */
+ for (cfgp = &cfg[1]; *cfgp; cfgp++)
+ ;
+
+ /* Add the source to the colgroup config before collapsing. */
+ if (__wt_config_getones(
+ session, config, "source", &cval) == 0 && cval.len != 0) {
+ WT_ERR(__wt_buf_fmt(
+ session, &namebuf, "%.*s", (int)cval.len, cval.str));
+ source = namebuf.data;
+ } else {
+ /* No source configured: derive one and record it. */
+ WT_ERR(__wt_schema_colgroup_source(
+ session, table, cgname, config, &namebuf));
+ source = namebuf.data;
+ WT_ERR(__wt_buf_fmt(
+ session, &confbuf, "source=\"%s\"", source));
+ *cfgp++ = confbuf.data;
+ }
+
+ /* Calculate the key/value formats: these go into the source config. */
+ WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format));
+ if (cgname == NULL)
+ WT_ERR(__wt_buf_catfmt
+ (session, &fmt, ",value_format=%s", table->value_format));
+ else {
+ /* A named column group must list its columns. */
+ if (__wt_config_getones(session, config, "columns", &cval) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "No 'columns' configuration for '%s'", name);
+ WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format="));
+ WT_ERR(__wt_struct_reformat(session,
+ table, cval.str, cval.len, NULL, 1, &fmt));
+ }
+ sourcecfg[1] = fmt.data;
+ WT_ERR(__wt_config_concat(session, sourcecfg, &sourceconf));
+
+ /* Create the data source before recording the column group. */
+ WT_ERR(__wt_schema_create(session, source, sourceconf));
+
+ WT_ERR(__wt_config_collapse(session, cfg, &cgconf));
+ if ((ret = __wt_metadata_insert(session, name, cgconf)) != 0) {
+ /*
+ * If the entry already exists in the metadata, we're done.
+ * This is an error for exclusive creates but okay otherwise.
+ */
+ if (ret == WT_DUPLICATE_KEY)
+ ret = exclusive ? EEXIST : 0;
+ goto err;
+ }
+
+ WT_ERR(__wt_schema_open_colgroups(session, table));
+
+ /* Reached on success as well as on error. */
+err: __wt_free(session, cgconf);
+ __wt_free(session, sourceconf);
+ __wt_free(session, oldconf);
+ __wt_buf_free(session, &confbuf);
+ __wt_buf_free(session, &fmt);
+ __wt_buf_free(session, &namebuf);
+
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __wt_schema_index_source --
+ *	Build the URI of the data source for an index into buf.
+ */
+int
+__wt_schema_index_source(WT_SESSION_IMPL *session,
+    WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf)
+{
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	size_t len;
+	const char *prefix, *suffix, *tablename;
+
+	/* The default is a file source with a ".wti" extension. */
+	prefix = "file";
+	len = strlen(prefix);
+	suffix = ".wti";
+
+	/* A configured type other than "file" replaces prefix and suffix. */
+	ret = __wt_config_getones(session, config, "type", &cval);
+	if (ret == 0 && !WT_STRING_MATCH("file", cval.str, cval.len)) {
+		prefix = cval.str;
+		len = cval.len;
+		suffix = "_idx";
+	}
+	WT_RET_NOTFOUND_OK(ret);
+
+	/* Strip the leading "table:" from the table's name. */
+	tablename = table->name + strlen("table:");
+
+	return (__wt_buf_fmt(session, buf, "%.*s:%s_%s%s",
+	    (int)len, prefix, tablename, idxname, suffix));
+}
+
+/*
+ * __create_index --
+ * Create an index. The name must have the form
+ * "index:<table name>:<index name>" and the table must already exist.
+ * The index's data source is created first, with a key format built
+ * from the index columns followed by any primary key columns not
+ * already among them; the index's metadata entry is inserted after.
+ * If the metadata entry already exists, returns EEXIST for exclusive
+ * creates and 0 otherwise.
+ */
+static int
+__create_index(WT_SESSION_IMPL *session,
+ const char *name, int exclusive, const char *config)
+{
+ WT_CONFIG pkcols;
+ WT_CONFIG_ITEM ckey, cval, icols;
+ WT_DECL_RET;
+ WT_ITEM confbuf, extra_cols, fmt, namebuf;
+ WT_TABLE *table;
+ const char *cfg[4] =
+ { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL };
+ const char *sourcecfg[] = { config, NULL, NULL };
+ const char *sourceconf, *source, *idxconf, *idxname;
+ const char *tablename;
+ size_t tlen;
+ u_int i;
+
+ idxconf = sourceconf = NULL;
+ WT_CLEAR(confbuf);
+ WT_CLEAR(fmt);
+ WT_CLEAR(extra_cols);
+ WT_CLEAR(namebuf);
+
+ tablename = name;
+ if (!WT_PREFIX_SKIP(tablename, "index:"))
+ return (EINVAL);
+ idxname = strchr(tablename, ':');
+ if (idxname == NULL)
+ WT_RET_MSG(session, EINVAL, "Invalid index name, "
+ "should be <table name>:<index name>: %s", name);
+
+ /* The table name ends at the colon, the index name follows it. */
+ tlen = (size_t)(idxname++ - tablename);
+ if ((ret =
+ __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
+ WT_RET_MSG(session, ret,
+ "Can't create an index for a non-existent table: %.*s",
+ (int)tlen, tablename);
+
+ if (__wt_config_getones(session, config, "source", &cval) == 0) {
+ WT_ERR(__wt_buf_fmt(session, &namebuf,
+ "%.*s", (int)cval.len, cval.str));
+ source = namebuf.data;
+ } else {
+ WT_ERR(__wt_schema_index_source(
+ session, table, idxname, config, &namebuf));
+ source = namebuf.data;
+
+ /* Add the source name to the index config before collapsing. */
+ WT_ERR(__wt_buf_catfmt(session, &confbuf,
+ ",source=\"%s\"", source));
+ }
+
+ /* Calculate the key/value formats. */
+ if (__wt_config_getones(session, config, "columns", &icols) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "No 'columns' configuration for '%s'", name);
+
+ /*
+ * The key format for an index is somewhat subtle: the application
+ * specifies a set of columns that it will use for the key, but the
+ * engine usually adds some hidden columns in order to derive the
+ * primary key. These hidden columns are part of the source's
+ * key_format, which we are calculating now, but not part of an index
+ * cursor's key_format.
+ */
+ WT_ERR(__wt_config_subinit(session, &pkcols, &table->colconf));
+ for (i = 0; i < table->nkey_columns &&
+ (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0;
+ i++) {
+ /*
+ * If the primary key column is already in the secondary key,
+ * don't add it again.
+ */
+ if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0)
+ continue;
+ WT_ERR(__wt_buf_catfmt(
+ session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str));
+ }
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ /*
+ * Index values are normally empty: all columns are packed into the
+ * index key. The exception is LSM, which (currently) reserves empty
+ * values as tombstones. Use a single padding byte in that case.
+ *
+ * The key format is appended to the value format chosen here: it must
+ * not be formatted from scratch or the LSM padding byte is lost.
+ */
+ if (WT_PREFIX_MATCH(source, "lsm:"))
+ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=x,"));
+ else
+ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,"));
+ WT_ERR(__wt_buf_catfmt(session, &fmt, "key_format="));
+ WT_ERR(__wt_struct_reformat(session, table,
+ icols.str, icols.len, (const char *)extra_cols.data, 0, &fmt));
+
+ /* Check for a record number index key, which makes no sense. */
+ WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval));
+ if (cval.len == 1 && cval.str[0] == 'r')
+ WT_ERR_MSG(session, EINVAL,
+ "column-store index may not use the record number as its "
+ "index key");
+
+ sourcecfg[1] = fmt.data;
+ WT_ERR(__wt_config_concat(session, sourcecfg, &sourceconf));
+
+ WT_ERR(__wt_schema_create(session, source, sourceconf));
+
+ cfg[1] = sourceconf;
+ cfg[2] = confbuf.data;
+ WT_ERR(__wt_config_collapse(session, cfg, &idxconf));
+ if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) {
+ /*
+ * If the entry already exists in the metadata, we're done.
+ * This is an error for exclusive creates but okay otherwise.
+ */
+ if (ret == WT_DUPLICATE_KEY)
+ ret = exclusive ? EEXIST : 0;
+ goto err;
+ }
+
+err: __wt_free(session, idxconf);
+ __wt_free(session, sourceconf);
+ __wt_buf_free(session, &confbuf);
+ __wt_buf_free(session, &extra_cols);
+ __wt_buf_free(session, &fmt);
+ __wt_buf_free(session, &namebuf);
+
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __create_table --
+ * Create a table: insert its metadata entry, open the handle to catch
+ * configuration errors and, when no column groups are configured,
+ * create the table's single default column group "colgroup:<table>".
+ * If the table already exists, returns EEXIST for exclusive creates and
+ * 0 otherwise. On failure after the handle is opened, the handle is
+ * removed from the session.
+ */
+static int
+__create_table(WT_SESSION_IMPL *session,
+ const char *name, int exclusive, const char *config)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM cgkey, cgval, cval;
+ WT_DECL_RET;
+ WT_TABLE *table;
+ size_t cgsize;
+ int ncolgroups;
+ char *cgname;
+ const char *cfg[4] =
+ { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL };
+ const char *tableconf, *tablename;
+
+ cgname = NULL;
+ table = NULL;
+ tableconf = NULL;
+
+ tablename = name;
+ if (!WT_PREFIX_SKIP(tablename, "table:"))
+ return (EINVAL);
+
+ if ((ret = __wt_schema_get_table(session,
+ tablename, strlen(tablename), 0, &table)) == 0) {
+ __wt_schema_release_table(session, table);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_config_gets(session, cfg, "colgroups", &cval));
+ WT_RET(__wt_config_subinit(session, &conf, &cval));
+ for (ncolgroups = 0;
+ (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0;
+ ncolgroups++)
+ ;
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_config_collapse(session, cfg, &tableconf));
+ if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) {
+ /*
+ * If the entry already exists in the metadata, we're done.
+ * This is an error for exclusive creates but okay otherwise.
+ */
+ if (ret == WT_DUPLICATE_KEY)
+ ret = exclusive ? EEXIST : 0;
+ goto err;
+ }
+
+ /* Attempt to open the table now to catch any errors. */
+ WT_ERR(__wt_schema_get_table(
+ session, tablename, strlen(tablename), 1, &table));
+
+ if (ncolgroups == 0) {
+ cgsize = strlen("colgroup:") + strlen(tablename) + 1;
+ WT_ERR(__wt_calloc_def(session, cgsize, &cgname));
+ snprintf(cgname, cgsize, "colgroup:%s", tablename);
+ WT_ERR(__create_colgroup(session, cgname, exclusive, config));
+ }
+
+ if (0) {
+err: if (table != NULL) {
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+ }
+ }
+ if (table != NULL)
+ __wt_schema_release_table(session, table);
+ __wt_free(session, cgname);
+ __wt_free(session, tableconf);
+ return (ret);
+}
+
+/*
+ * __create_data_source --
+ * Create an object in a custom data source: validate the key and value
+ * formats, reject user-specified collators, then call the data source's
+ * create method.
+ */
+static int
+__create_data_source(WT_SESSION_IMPL *session,
+ const char *uri, const char *config, WT_DATA_SOURCE *dsrc)
+{
+ static const char * const formats[] = { "key_format", "value_format" };
+ WT_CONFIG_ITEM item;
+ const char *cfg[] = {
+ WT_CONFIG_BASE(session, session_create), config, NULL };
+ size_t i;
+ int collator_ret;
+
+ /*
+ * Check to be sure the key/value formats are legal: the underlying
+ * data source doesn't have access to the functions that check.
+ */
+ for (i = 0; i < sizeof(formats) / sizeof(formats[0]); i++) {
+ WT_RET(__wt_config_gets(session, cfg, formats[i], &item));
+ WT_RET(__wt_struct_check(
+ session, item.str, item.len, NULL, NULL));
+ }
+
+ /*
+ * User-specified collators aren't supported for data-source objects:
+ * anything other than "not found" from the lookup is rejected.
+ */
+ collator_ret =
+ __wt_config_getones(session, config, "collator", &item);
+ if (collator_ret != WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "WT_DATA_SOURCE objects do not support WT_COLLATOR "
+ "ordering");
+
+ return (dsrc->create(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg));
+}
+
+/*
+ * __wt_schema_create --
+ * Process a WT_SESSION::create operation for all supported types. The
+ * URI prefix selects the type-specific create function; a URI with no
+ * built-in prefix is handed to a matching custom data source, if there
+ * is one, and is otherwise a bad object type. The "exclusive" setting
+ * is read from the configuration and passed down.
+ */
+int
+__wt_schema_create(
+ WT_SESSION_IMPL *session, const char *uri, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ int exclusive;
+
+ exclusive = (
+ __wt_config_getones(session, config, "exclusive", &cval) == 0 &&
+ cval.val != 0);
+
+ /*
+ * We track create operations: if we fail in the middle of creating a
+ * complex object, we want to back it all out.
+ */
+ WT_RET(__wt_meta_track_on(session));
+
+ if (WT_PREFIX_MATCH(uri, "colgroup:"))
+ ret = __create_colgroup(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "file:"))
+ ret = __create_file(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_create(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "index:"))
+ ret = __create_index(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __create_table(session, uri, exclusive, config);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->create == NULL ?
+ __wt_object_unsupported(session, uri) :
+ __create_data_source(session, uri, config, dsrc);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ session->dhandle = NULL;
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c
new file mode 100644
index 00000000000..6df7e6930c9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_drop.c
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __drop_file --
+ * Drop a file: close the handles open on it, remove its metadata entry
+ * and, if "remove_files" is configured, remove the underlying physical
+ * file when it exists. A missing metadata entry is ignored when force
+ * is set. Returns EINVAL if the URI does not start with "file:".
+ */
+static int
+__drop_file(
+ WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ int exist, remove_files;
+ const char *filename;
+
+ WT_RET(__wt_config_gets(session, cfg, "remove_files", &cval));
+ remove_files = (cval.val != 0);
+
+ filename = uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ return (EINVAL);
+
+ /* Close all btree handles associated with this file. */
+ WT_RET(__wt_conn_dhandle_close_all(session, uri, force));
+
+ /* Remove the metadata entry (ignore missing items). */
+ WT_TRET(__wt_metadata_remove(session, uri));
+ if (force && ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (!remove_files)
+ return (ret);
+
+ /* Remove the underlying physical file. */
+ exist = 0;
+ WT_TRET(__wt_exist(session, filename, &exist));
+ if (exist) {
+ /*
+ * There is no point tracking this operation: there is no going
+ * back from here.
+ */
+ WT_TRET(__wt_remove(session, filename));
+ }
+
+ return (ret);
+}
+
+/*
+ * __drop_colgroup --
+ * WT_SESSION::drop for a colgroup: drop the column group's source, if
+ * the column group can be found, and remove its metadata entry.
+ */
+static int
+__drop_colgroup(
+ WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_COLGROUP *colgroup;
+ WT_DECL_RET;
+ WT_TABLE *table;
+
+ ret = __wt_schema_get_colgroup(session, uri, &table, &colgroup);
+ if (ret == 0) {
+ /* Detach the column group: the table is no longer complete. */
+ table->cg_complete = 0;
+ WT_TRET(__wt_schema_drop(session, colgroup->source, cfg));
+ }
+
+ /* The metadata entry is removed whether or not the lookup worked. */
+ WT_TRET(__wt_metadata_remove(session, uri));
+ return (ret);
+}
+
+/*
+ * __drop_index --
+ * WT_SESSION::drop for an index: drop the index's source, if the index
+ * can be found, and remove its metadata entry.
+ */
+static int
+__drop_index(
+ WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_INDEX *idx;
+ WT_DECL_RET;
+ WT_TABLE *table;
+
+ ret = __wt_schema_get_index(session, uri, &table, &idx);
+ if (ret == 0) {
+ /* Detach the index: the table's index list must be rebuilt. */
+ table->idx_complete = 0;
+ WT_TRET(__wt_schema_drop(session, idx->source, cfg));
+ }
+
+ /* The metadata entry is removed whether or not the lookup worked. */
+ WT_TRET(__wt_metadata_remove(session, uri));
+ return (ret);
+}
+
+/*
+ * __drop_table --
+ * WT_SESSION::drop for a table: for every column group and index,
+ * remove its metadata entry and drop its source; then discard the
+ * session's table handle and remove the table's own metadata entry.
+ * The first failure stops the drop. WT_NOTFOUND is mapped to success
+ * when force is set.
+ */
+static int
+__drop_table(
+ WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
+{
+ WT_COLGROUP *colgroup;
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_TABLE *table;
+ const char *name;
+ u_int i;
+
+ name = uri;
+ (void)WT_PREFIX_SKIP(name, "table:");
+
+ table = NULL;
+ WT_ERR(__wt_schema_get_table(session, name, strlen(name), 1, &table));
+
+ /* Drop the column groups. */
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ if ((colgroup = table->cgroups[i]) == NULL)
+ continue;
+ WT_ERR(__wt_metadata_remove(session, colgroup->name));
+ WT_ERR(__wt_schema_drop(session, colgroup->source, cfg));
+ }
+
+ /* Drop the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ if ((idx = table->indices[i]) == NULL)
+ continue;
+ WT_ERR(__wt_metadata_remove(session, idx->name));
+ WT_ERR(__wt_schema_drop(session, idx->source, cfg));
+ }
+
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+
+ /* Remove the metadata entry (ignore missing items). */
+ WT_ERR(__wt_metadata_remove(session, uri));
+
+err: if (force && ret == WT_NOTFOUND)
+ ret = 0;
+ if (table != NULL)
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __wt_schema_drop --
+ * Process a WT_SESSION::drop operation for all supported types. The
+ * URI prefix selects the type-specific drop function; a URI with no
+ * built-in prefix is handed to a matching custom data source, if there
+ * is one. The connection's schema generation is incremented whether or
+ * not the drop succeeded.
+ */
+int
+__wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ int force;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
+ force = (cval.val != 0);
+
+ WT_RET(__wt_meta_track_on(session));
+
+ /* Be careful to ignore any btree handle in our caller. */
+ WT_CLEAR_BTREE_IN_SESSION(session);
+
+ if (WT_PREFIX_MATCH(uri, "colgroup:"))
+ ret = __drop_colgroup(session, uri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "file:"))
+ ret = __drop_file(session, uri, force, cfg);
+ else if (WT_PREFIX_MATCH(uri, "index:"))
+ ret = __drop_index(session, uri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_drop(session, uri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __drop_table(session, uri, force, cfg);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->drop == NULL ?
+ __wt_object_unsupported(session, uri) :
+ dsrc->drop(
+ dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ /*
+ * Map WT_NOTFOUND to ENOENT (or to 0 if "force" is set), based on the
+ * assumption WT_NOTFOUND means there was no metadata entry. The
+ * underlying drop functions should handle this case (we passed them
+ * the "force" value), but better safe than sorry.
+ */
+ if (ret == WT_NOTFOUND)
+ ret = force ? 0 : ENOENT;
+
+ /* Bump the schema generation so that stale data is ignored. */
+ ++S2C(session)->schema_gen;
+
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_list.c b/src/third_party/wiredtiger/src/schema/schema_list.c
new file mode 100644
index 00000000000..05421283bf6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_list.c
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __schema_add_table --
+ * Open the named table and put the new handle at the head of the
+ * session's table cache, stamped with the current schema generation.
+ */
+static int
+__schema_add_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, WT_TABLE **tablep)
+{
+ WT_TABLE *opened;
+ int ret;
+
+ if ((ret = __wt_schema_open_table(
+ session, name, namelen, &opened)) != 0)
+ return (ret);
+
+ /* Remember which schema generation this handle was opened in. */
+ opened->schema_gen = S2C(session)->schema_gen;
+
+ TAILQ_INSERT_HEAD(&session->tables, opened, q);
+ *tablep = opened;
+
+ return (0);
+}
+
+/*
+ * __schema_find_table --
+ * Find the table handle for the named table in the session cache. The
+ * name is given without the "table:" prefix. Handles from a different
+ * schema generation are skipped, and removed from the cache when they
+ * are no longer referenced (the search restarts after a removal).
+ * Returns WT_NOTFOUND if there is no current handle.
+ */
+static int
+__schema_find_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, WT_TABLE **tablep)
+{
+ WT_TABLE *table;
+ const char *tablename;
+
+restart:
+ TAILQ_FOREACH(table, &session->tables, q) {
+ tablename = table->name;
+ (void)WT_PREFIX_SKIP(tablename, "table:");
+ if (WT_STRING_MATCH(tablename, name, namelen)) {
+ /*
+ * Ignore stale tables.
+ *
+ * XXX: should be managed the same as btree handles,
+ * with a local cache in each session and a shared list
+ * in the connection. There is still a race here
+ * between checking the generation and opening the
+ * first column group.
+ */
+ if (table->schema_gen != S2C(session)->schema_gen) {
+ if (table->refcnt == 0) {
+ __wt_schema_remove_table(
+ session, table);
+ goto restart;
+ }
+ continue;
+ }
+ *tablep = table;
+ return (0);
+ }
+ }
+
+ return (WT_NOTFOUND);
+}
+
+/*
+ * __wt_schema_get_table --
+ * Get the table handle for the named table, adding it to the session
+ * cache (inside WT_WITH_SCHEMA_LOCK) if it is not already there. On
+ * success the handle's reference count is incremented; it is dropped
+ * again with __wt_schema_release_table. Unless ok_incomplete is set,
+ * a table whose column groups are not all created fails with EINVAL.
+ * *tablep is NULL on any failure.
+ */
+int
+__wt_schema_get_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, int ok_incomplete, WT_TABLE **tablep)
+{
+ WT_DECL_RET;
+ WT_TABLE *table;
+
+ *tablep = table = NULL;
+ ret = __schema_find_table(session, name, namelen, &table);
+
+ if (ret == WT_NOTFOUND)
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __schema_add_table(session, name, namelen, &table));
+
+ if (ret == 0) {
+ if (!ok_incomplete && !table->cg_complete)
+ WT_RET_MSG(session, EINVAL, "'%s' cannot be used "
+ "until all column groups are created",
+ table->name);
+
+ ++table->refcnt;
+ *tablep = table;
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_schema_release_table --
+ * Release a table handle by dropping one reference. The handle stays
+ * in the session's cache.
+ */
+void
+__wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ /* Releasing a handle nobody holds would underflow the count. */
+ WT_ASSERT(session, table->refcnt > 0);
+ table->refcnt -= 1;
+}
+
+/*
+ * __wt_schema_destroy_colgroup --
+ * Free a column group handle along with the strings it owns.
+ */
+void
+__wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP *colgroup)
+{
+ /* Owned strings first, the handle itself last. */
+ __wt_free(session, colgroup->config);
+ __wt_free(session, colgroup->source);
+ __wt_free(session, colgroup->name);
+
+ __wt_free(session, colgroup);
+}
+
+/*
+ * __wt_schema_destroy_index --
+ * Free an index handle along with the strings it owns.
+ */
+void
+__wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx)
+{
+ /* Identity and configuration. */
+ __wt_free(session, idx->config);
+ __wt_free(session, idx->source);
+ __wt_free(session, idx->name);
+
+ /* Formats and projection plans. */
+ __wt_free(session, idx->idxkey_format);
+ __wt_free(session, idx->key_format);
+ __wt_free(session, idx->value_plan);
+ __wt_free(session, idx->key_plan);
+
+ /* The handle itself goes last. */
+ __wt_free(session, idx);
+}
+
+/*
+ * __wt_schema_destroy_table --
+ * Free a table handle, including every column group and index handle
+ * it holds. Empty slots in either array are skipped.
+ */
+void
+__wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ u_int slot;
+
+ __wt_free(session, table->name);
+ __wt_free(session, table->config);
+ __wt_free(session, table->plan);
+ __wt_free(session, table->key_format);
+ __wt_free(session, table->value_format);
+
+ /* Column groups: destroy the populated slots, then the array. */
+ if (table->cgroups != NULL) {
+ for (slot = 0; slot < WT_COLGROUPS(table); slot++)
+ if (table->cgroups[slot] != NULL)
+ __wt_schema_destroy_colgroup(
+ session, table->cgroups[slot]);
+ __wt_free(session, table->cgroups);
+ }
+
+ /* Indices: destroy the populated slots, then the array. */
+ if (table->indices != NULL) {
+ for (slot = 0; slot < table->nindices; slot++)
+ if (table->indices[slot] != NULL)
+ __wt_schema_destroy_index(
+ session, table->indices[slot]);
+ __wt_free(session, table->indices);
+ }
+
+ __wt_free(session, table);
+}
+
+/*
+ * __wt_schema_remove_table --
+ * Remove the table handle from the session, closing if necessary. The
+ * handle is unlinked from the session's table list and freed, so it
+ * must not be used afterwards; at most one reference may be
+ * outstanding (asserted).
+ */
+void
+__wt_schema_remove_table(
+ WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_ASSERT(session, table->refcnt <= 1);
+
+ TAILQ_REMOVE(&session->tables, table, q);
+ __wt_schema_destroy_table(session, table);
+}
+
+/*
+ * __wt_schema_close_tables --
+ * Close all of the tables in a session, emptying its table cache.
+ */
+void
+__wt_schema_close_tables(WT_SESSION_IMPL *session)
+{
+ WT_TABLE *head;
+
+ /* Keep removing the head of the list until the list is empty. */
+ for (;;) {
+ head = TAILQ_FIRST(&session->tables);
+ if (head == NULL)
+ break;
+ __wt_schema_remove_table(session, head);
+ }
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c
new file mode 100644
index 00000000000..0332569a8e3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_open.c
@@ -0,0 +1,510 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_colgroup_name --
+ * Format the URI for a column group into buf; the URI is used for
+ * metadata lookups. Tables with no named column groups have a single
+ * column group called "colgroup:<table>"; otherwise the URI is
+ * "colgroup:<table>:<column group>".
+ */
+int
+__wt_schema_colgroup_name(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *cgname, size_t len, WT_ITEM *buf)
+{
+ const char *tablename;
+
+ tablename = table->name;
+ (void)WT_PREFIX_SKIP(tablename, "table:");
+
+ /* Simple tables: the column group name is not part of the URI. */
+ if (table->ncolgroups == 0)
+ return (__wt_buf_fmt(session, buf, "colgroup:%s", tablename));
+
+ return (__wt_buf_fmt(session, buf, "colgroup:%s:%.*s",
+ tablename, (int)len, cgname));
+}
+
+/*
+ * __wt_schema_open_colgroups --
+ * Open the column groups for a table. Does nothing if the table is
+ * already marked complete. Each column group is rebuilt from its
+ * metadata entry. A column group missing from the metadata is not an
+ * error: the function returns 0 and leaves the table incomplete. Once
+ * every column group is open, the table's plan is built (non-simple
+ * tables only) and the table is marked complete.
+ */
+int
+__wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_COLGROUP *colgroup;
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM ckey, cval;
+ WT_DECL_RET;
+ WT_DECL_ITEM(buf);
+ const char *cgconfig;
+ u_int i;
+
+ if (table->cg_complete)
+ return (0);
+
+ colgroup = NULL;
+ cgconfig = NULL;
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+
+ WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf));
+
+ /* Open each column group. */
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ if (table->ncolgroups > 0)
+ WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
+ else
+ WT_CLEAR(ckey);
+
+ /*
+ * Always open from scratch: we may have failed part of the way
+ * through opening a table, or column groups may have changed.
+ */
+ if (table->cgroups[i] != NULL) {
+ __wt_schema_destroy_colgroup(
+ session, table->cgroups[i]);
+ table->cgroups[i] = NULL;
+ }
+
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_schema_colgroup_name(session, table,
+ ckey.str, ckey.len, buf));
+ if ((ret = __wt_metadata_search(
+ session, buf->data, &cgconfig)) != 0) {
+ /* It is okay if the table is incomplete. */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+
+ WT_ERR(__wt_calloc_def(session, 1, &colgroup));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &colgroup->name));
+ colgroup->config = cgconfig;
+ cgconfig = NULL;
+ WT_ERR(__wt_config_getones(session,
+ colgroup->config, "columns", &colgroup->colconf));
+ WT_ERR(__wt_config_getones(
+ session, colgroup->config, "source", &cval));
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_buf_fmt(
+ session, buf, "%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &colgroup->source));
+ table->cgroups[i] = colgroup;
+ colgroup = NULL;
+ }
+
+ if (!table->is_simple) {
+ WT_ERR(__wt_table_check(session, table));
+
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_struct_plan(session,
+ table, table->colconf.str, table->colconf.len, 1, buf));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &table->plan));
+ }
+
+ table->cg_complete = 1;
+
+err: __wt_scr_free(&buf);
+ if (colgroup != NULL)
+ __wt_schema_destroy_colgroup(session, colgroup);
+ if (cgconfig != NULL)
+ __wt_free(session, cgconfig);
+ return (ret);
+}
+
+/*
+ * __open_index --
+ * Open an index: from the configuration already stored in the handle,
+ * fill in the index's source, key format, key plan, cursor key format
+ * and value plan. The caller owns the handle and is responsible for
+ * destroying it if this function fails.
+ */
+static int
+__open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx)
+{
+ WT_CONFIG colconf;
+ WT_CONFIG_ITEM ckey, cval;
+ WT_DECL_ITEM(buf);
+ WT_DECL_ITEM(plan);
+ WT_DECL_RET;
+ u_int cursor_key_cols, i;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ /* Get the data source from the index config. */
+ WT_ERR(__wt_config_getones(session, idx->config, "source", &cval));
+ WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->source));
+
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_config_getones(session, idx->config, "key_format", &cval));
+ WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->key_format));
+
+ /*
+ * The key format for an index is somewhat subtle: the application
+ * specifies a set of columns that it will use for the key, but the
+ * engine usually adds some hidden columns in order to derive the
+ * primary key. These hidden columns are part of the file's key.
+ *
+ * The file's key_format is stored persistently, we need to calculate
+ * the index cursor key format (which will usually omit some of those
+ * keys).
+ */
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_config_getones(
+ session, idx->config, "columns", &idx->colconf));
+
+ /* Start with the declared index columns. */
+ WT_ERR(__wt_config_subinit(session, &colconf, &idx->colconf));
+ cursor_key_cols = 0;
+ while ((ret = __wt_config_next(&colconf, &ckey, &cval)) == 0) {
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, "%.*s,", (int)ckey.len, ckey.str));
+ ++cursor_key_cols;
+ }
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ /*
+ * Now add any primary key columns from the table that are not
+ * already part of the index key.
+ */
+ WT_ERR(__wt_config_subinit(session, &colconf, &table->colconf));
+ for (i = 0; i < table->nkey_columns &&
+ (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0;
+ i++) {
+ /*
+ * If the primary key column is already in the secondary key,
+ * don't add it again.
+ */
+ if (__wt_config_subgetraw(
+ session, &idx->colconf, &ckey, &cval) == 0)
+ continue;
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, "%.*s,", (int)ckey.len, ckey.str));
+ }
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &plan));
+ WT_ERR(__wt_struct_plan(session, table, buf->data, buf->size, 0, plan));
+ WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->key_plan));
+
+ /* Set up the cursor key format (the visible columns). */
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_struct_truncate(session,
+ idx->key_format, cursor_key_cols, buf));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &idx->idxkey_format));
+
+ /* By default, index cursor values are the table value columns. */
+ /* TODO Optimize to use index columns in preference to table lookups. */
+ WT_ERR(__wt_buf_init(session, plan, 0));
+ WT_ERR(__wt_struct_plan(session,
+ table, table->colconf.str, table->colconf.len, 1, plan));
+ WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->value_plan));
+
+err: __wt_scr_free(&buf);
+ __wt_scr_free(&plan);
+ return (ret);
+}
+
+/*
+ * __wt_schema_open_index --
+ * Open one or more indices for a table. With a NULL idxname every
+ * index of the table is opened and the table's index list is marked
+ * complete; otherwise only the named index is opened. If indexp is
+ * not NULL it is set to the handle of the (last) matching index. The
+ * table's in-memory index list is kept in the same order as the
+ * metadata: entries for indices that no longer exist are destroyed,
+ * and empty slots are inserted for indices not opened by this call.
+ */
+int
+__wt_schema_open_index(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ u_int i;
+ int cmp, match;
+ const char *idxconf, *name, *tablename, *uri;
+
+ /* Check if we've already done the work. */
+ if (idxname == NULL && table->idx_complete)
+ return (0);
+
+ cursor = NULL;
+ idx = NULL;
+
+ /* Build a search key. */
+ tablename = table->name;
+ (void)WT_PREFIX_SKIP(tablename, "table:");
+ WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+ WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));
+
+ /* Find matching indices. */
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, tmp->data);
+ if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
+ ret = cursor->next(cursor);
+ for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ name = uri;
+ if (!WT_PREFIX_SKIP(name, tmp->data))
+ break;
+
+ /* Is this the index we are looking for? */
+ match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);
+
+ /*
+ * Ensure there is space, including if we have to make room for
+ * a new entry in the middle of the list.
+ */
+ WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
+ WT_MAX(i, table->nindices) + 1, &table->indices));
+
+ /* Keep the in-memory list in sync with the metadata. */
+ cmp = 0;
+ while (table->indices[i] != NULL &&
+ (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
+ /*
+ * Index no longer exists, remove it. Destroy the whole
+ * handle: freeing only the structure would leak the
+ * strings it owns.
+ */
+ __wt_schema_destroy_index(session, table->indices[i]);
+ memmove(&table->indices[i], &table->indices[i + 1],
+ (table->nindices - i) * sizeof(WT_INDEX *));
+ table->indices[--table->nindices] = NULL;
+ }
+ if (cmp < 0) {
+ /* Make room for a new index. */
+ memmove(&table->indices[i + 1], &table->indices[i],
+ (table->nindices - i) * sizeof(WT_INDEX *));
+ table->indices[i] = NULL;
+ ++table->nindices;
+ }
+
+ if (!match)
+ continue;
+
+ if (table->indices[i] == NULL) {
+ WT_ERR(cursor->get_value(cursor, &idxconf));
+ WT_ERR(__wt_calloc_def(session, 1, &idx));
+ WT_ERR(__wt_strdup(session, uri, &idx->name));
+ WT_ERR(__wt_strdup(session, idxconf, &idx->config));
+ WT_ERR(__open_index(session, table, idx));
+
+ /* The table owns the handle from here on. */
+ table->indices[i] = idx;
+ idx = NULL;
+ }
+
+ /* If we were looking for a single index, we're done. */
+ if (indexp != NULL)
+ *indexp = table->indices[i];
+ if (idxname != NULL)
+ break;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* If we did a full pass, we won't need to do it again. */
+ if (idxname == NULL) {
+ table->nindices = i;
+ table->idx_complete = 1;
+ }
+
+err: __wt_scr_free(&tmp);
+ if (idx != NULL)
+ __wt_schema_destroy_index(session, idx);
+ if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_schema_open_indices --
+ * Open all of the indices for a table: __wt_schema_open_index with no
+ * index name and no handle returned.
+ */
+int
+__wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ int ret;
+
+ ret = __wt_schema_open_index(session, table, NULL, 0, NULL);
+ return (ret);
+}
+
+/*
+ * __wt_schema_open_table --
+ * Open a named table: read its configuration from the metadata, build
+ * a new handle (formats, column and column group configuration) and
+ * open its column groups. The name is given without the "table:"
+ * prefix. On success the new handle is returned in *tablep; on error
+ * any partially built handle is destroyed.
+ */
+int
+__wt_schema_open_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, WT_TABLE **tablep)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM ckey, cval;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_TABLE *table;
+ const char *tconfig;
+ char *tablename;
+
+ cursor = NULL;
+ table = NULL;
+ tablename = NULL;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename));
+
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, tablename);
+ WT_ERR(cursor->search(cursor));
+ WT_ERR(cursor->get_value(cursor, &tconfig));
+
+ WT_ERR(__wt_calloc_def(session, 1, &table));
+ table->name = tablename;
+ tablename = NULL;
+
+ WT_ERR(__wt_config_getones(session, tconfig, "columns", &cval));
+
+ WT_ERR(__wt_config_getones(session, tconfig, "key_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->key_format));
+ WT_ERR(__wt_config_getones(session, tconfig, "value_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->value_format));
+ WT_ERR(__wt_strdup(session, tconfig, &table->config));
+
+ /* Point to some items in the copy to save re-parsing. */
+ WT_ERR(__wt_config_getones(session, table->config,
+ "columns", &table->colconf));
+
+ /*
+ * Count the number of columns: tables are "simple" if the columns
+ * are not named.
+ */
+ WT_ERR(__wt_config_subinit(session, &cparser, &table->colconf));
+ table->is_simple = 1;
+ while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+ table->is_simple = 0;
+ if (ret != WT_NOTFOUND)
+ goto err;
+
+ /* Check that the columns match the key and value formats. */
+ if (!table->is_simple)
+ WT_ERR(__wt_schema_colcheck(session,
+ table->key_format, table->value_format, &table->colconf,
+ &table->nkey_columns, NULL));
+
+ WT_ERR(__wt_config_getones(session, table->config,
+ "colgroups", &table->cgconf));
+
+ /* Count the number of column groups. */
+ WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf));
+ table->ncolgroups = 0;
+ while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+ ++table->ncolgroups;
+ if (ret != WT_NOTFOUND)
+ goto err;
+
+ WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->cgroups));
+ WT_ERR(__wt_schema_open_colgroups(session, table));
+ *tablep = table;
+
+ if (0) {
+err: if (table != NULL)
+ __wt_schema_destroy_table(session, table);
+ }
+ if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+
+ __wt_free(session, tablename);
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_schema_get_colgroup --
+ * Find a column group by URI ("colgroup:<table>[:<column group>]").
+ * On success the handle is returned in *colgroupp; if tablep is not
+ * NULL the table handle is returned through it still referenced,
+ * otherwise the table is released. Returns ENOENT if the table has no
+ * such column group.
+ */
+int
+__wt_schema_get_colgroup(WT_SESSION_IMPL *session,
+ const char *uri, WT_TABLE **tablep, WT_COLGROUP **colgroupp)
+{
+ WT_COLGROUP *candidate;
+ WT_TABLE *table;
+ const char *tablename, *tend;
+ u_int slot;
+
+ *colgroupp = NULL;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
+ return (__wt_bad_object_type(session, uri));
+
+ /* The table name runs to the next colon or the end of the URI. */
+ tend = strchr(tablename, ':');
+ if (tend == NULL)
+ tend = tablename + strlen(tablename);
+
+ WT_RET(__wt_schema_get_table(session,
+ tablename, WT_PTRDIFF(tend, tablename), 0, &table));
+
+ for (slot = 0; slot < WT_COLGROUPS(table); slot++) {
+ candidate = table->cgroups[slot];
+ if (strcmp(candidate->name, uri) != 0)
+ continue;
+
+ *colgroupp = candidate;
+ if (tablep == NULL)
+ __wt_schema_release_table(session, table);
+ else
+ *tablep = table;
+ return (0);
+ }
+
+ __wt_schema_release_table(session, table);
+ WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
+}
+
+/*
+ * __wt_schema_get_index --
+ * Find an index by URI ("index:<table>:<index>"), opening it if it is
+ * not already in the table's list of open indices. On success the
+ * handle is returned in *indexp and, if tablep is not NULL, the table
+ * handle in *tablep. Returns ENOENT if the table has no such index.
+ */
+int
+__wt_schema_get_index(WT_SESSION_IMPL *session,
+ const char *uri, WT_TABLE **tablep, WT_INDEX **indexp)
+{
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_TABLE *table;
+ const char *tablename, *tend;
+ u_int i;
+
+ *indexp = NULL;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "index:") ||
+ (tend = strchr(tablename, ':')) == NULL)
+ return (__wt_bad_object_type(session, uri));
+
+ WT_RET(__wt_schema_get_table(session,
+ tablename, WT_PTRDIFF(tend, tablename), 0, &table));
+
+ /*
+ * Try to find the index in the table. Slots may be empty: opening a
+ * single index leaves placeholders for the table's other indices.
+ */
+ for (i = 0; i < table->nindices; i++) {
+ idx = table->indices[i];
+ if (idx != NULL && strcmp(idx->name, uri) == 0) {
+ if (tablep != NULL)
+ *tablep = table;
+ else
+ __wt_schema_release_table(session, table);
+ *indexp = idx;
+ return (0);
+ }
+ }
+
+ /* Otherwise, open it. */
+ WT_ERR(__wt_schema_open_index(
+ session, table, tend + 1, strlen(tend + 1), indexp));
+
+err: __wt_schema_release_table(session, table);
+ WT_RET(ret);
+
+ if (*indexp != NULL) {
+ /*
+ * Callers passing tablep use the table on success, so it must
+ * be set on this path too.
+ *
+ * NOTE(review): unlike the lookup above, the table reference
+ * has already been released here, the caller only gets the
+ * pointer. Confirm no caller relies on holding a reference.
+ */
+ if (tablep != NULL)
+ *tablep = table;
+ return (0);
+ }
+
+ WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_plan.c b/src/third_party/wiredtiger/src/schema/schema_plan.c
new file mode 100644
index 00000000000..5abe0dd67d4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_plan.c
@@ -0,0 +1,394 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __find_next_col --
+ * Find the next column to use for a plan.
+ *
+ * Scans the table's columns for one whose name equals colname. On entry,
+ * (*cgnump, *colnump, *coltype) identify a previously returned position:
+ * the match that follows that position in scan order is chosen, and if no
+ * match follows it (or that position is never matched), the first match
+ * in scan order is chosen instead.
+ *
+ * On success the chosen position is written back through cgnump, colnump
+ * and coltype. Columns numbered below table->nkey_columns are reported as
+ * WT_PROJ_KEY; all others are reported as WT_PROJ_VALUE with the number
+ * of key columns subtracted from the column number.
+ *
+ * Returns WT_NOTFOUND if no column has the requested name, or any other
+ * error raised while walking the column configuration.
+ */
+static int
+__find_next_col(WT_SESSION_IMPL *session, WT_TABLE *table,
+ WT_CONFIG_ITEM *colname, u_int *cgnump, u_int *colnump, char *coltype)
+{
+ WT_COLGROUP *colgroup;
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_RET;
+ u_int cg, col, foundcg, foundcol, matchcg, matchcol;
+ int getnext;
+
+ foundcg = foundcol = UINT_MAX;
+ matchcg = *cgnump;
+ matchcol = (*coltype == WT_PROJ_KEY) ?
+ *colnump : *colnump + table->nkey_columns;
+
+ getnext = 1;
+ for (colgroup = NULL, cg = 0; cg < WT_COLGROUPS(table); cg++) {
+ colgroup = table->cgroups[cg];
+
+ /*
+ * If there is only one column group, we just scan through all
+ * of the columns. For tables with multiple column groups, we
+ * look at the key columns once, then go through the value
+ * columns for each group.
+ *
+ * The "cgcols" label below is also the target of a jump from
+ * inside the scan loop: once the last key column of group 0
+ * has been seen and table->ncolgroups is non-zero, scanning
+ * restarts on the group's own column list with col reset to
+ * the first value-column number.
+ */
+ if (cg == 0) {
+ cval = table->colconf;
+ col = 0;
+ } else {
+cgcols: cval = colgroup->colconf;
+ col = table->nkey_columns;
+ }
+ WT_RET(__wt_config_subinit(session, &conf, &cval));
+ for (; (ret = __wt_config_next(&conf, &k, &v)) == 0; col++) {
+ if (k.len == colname->len &&
+ strncmp(colname->str, k.str, k.len) == 0) {
+ if (getnext) {
+ foundcg = cg;
+ foundcol = col;
+ }
+ getnext = (cg == matchcg && col == matchcol);
+ }
+ if (cg == 0 && table->ncolgroups > 0 &&
+ col == table->nkey_columns - 1)
+ goto cgcols;
+ }
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ colgroup = NULL;
+ }
+
+ if (foundcg == UINT_MAX)
+ return (WT_NOTFOUND);
+
+ *cgnump = foundcg;
+ if (foundcol < table->nkey_columns) {
+ *coltype = WT_PROJ_KEY;
+ *colnump = foundcol;
+ } else {
+ *coltype = WT_PROJ_VALUE;
+ *colnump = foundcol - table->nkey_columns;
+ }
+ return (0);
+}
+
+/*
+ * __wt_schema_colcheck --
+ *	Confirm that a column list names exactly as many columns as the key
+ *	and value formats contain fields.  An empty column list is always
+ *	accepted.  The key and value field counts are returned through kcolsp
+ *	and vcolsp when those pointers are not NULL.
+ */
+int
+__wt_schema_colcheck(WT_SESSION_IMPL *session,
+    const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf,
+    u_int *kcolsp, u_int *vcolsp)
+{
+	WT_CONFIG names;
+	WT_CONFIG_ITEM name, unused;
+	WT_DECL_PACK_VALUE(field);
+	WT_DECL_RET;
+	WT_PACK fmt;
+	u_int nkey, nnamed, nvalue;
+
+	/* Count the fields in the key format. */
+	WT_RET(__pack_init(session, &fmt, key_format));
+	nkey = 0;
+	while ((ret = __pack_next(&fmt, &field)) == 0)
+		++nkey;
+	WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+	/* Count the fields in the value format. */
+	WT_RET(__pack_init(session, &fmt, value_format));
+	nvalue = 0;
+	while ((ret = __pack_next(&fmt, &field)) == 0)
+		++nvalue;
+	WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+	/* Count the named columns. */
+	WT_RET(__wt_config_subinit(session, &names, colconf));
+	nnamed = 0;
+	while ((ret = __wt_config_next(&names, &name, &unused)) == 0)
+		++nnamed;
+	WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+	/* Either no columns are named, or every field has a name. */
+	if (nnamed != 0 && nnamed != nkey + nvalue)
+		WT_RET_MSG(session, EINVAL, "Number of columns in '%.*s' "
+		    "does not match key format '%s' plus value format '%s'",
+		    (int)colconf->len, colconf->str, key_format, value_format);
+
+	if (kcolsp != NULL)
+		*kcolsp = nkey;
+	if (vcolsp != NULL)
+		*vcolsp = nvalue;
+
+	return (0);
+}
+
+/*
+ * __wt_table_check --
+ *	Verify that every value column of a table is stored in some column
+ *	group.  Simple tables need no check.
+ */
+int
+__wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+	WT_CONFIG cols;
+	WT_CONFIG_ITEM name, unused;
+	WT_DECL_RET;
+	u_int cg, col, nskip;
+	char coltype;
+
+	if (table->is_simple)
+		return (0);
+
+	WT_RET(__wt_config_subinit(session, &cols, &table->colconf));
+
+	/* The key columns come first in the list: step past them. */
+	for (nskip = table->nkey_columns; nskip > 0; nskip--)
+		WT_RET(__wt_config_next(&cols, &name, &unused));
+
+	/* Every remaining column must be found in a column group. */
+	cg = col = 0;
+	coltype = 0;
+	for (;;) {
+		if ((ret = __wt_config_next(&cols, &name, &unused)) != 0)
+			break;
+
+		if (__find_next_col(
+		    session, table, &name, &cg, &col, &coltype) != 0)
+			WT_RET_MSG(session, EINVAL,
+			    "Column '%.*s' in '%s' does not appear in a "
+			    "column group",
+			    (int)name.len, name.str, table->name);
+
+		/*
+		 * Column groups can't store key columns in their value:
+		 * __wt_struct_reformat should have already detected this case.
+		 */
+		WT_ASSERT(session, coltype == WT_PROJ_VALUE);
+	}
+	WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+	return (0);
+}
+
+/*
+ * __wt_struct_plan --
+ * Given a table cursor containing a complete table, build the "projection
+ * plan" to distribute the columns to dependent stores. A string
+ * representing the plan will be appended to the plan buffer.
+ *
+ * The column names are read from the first len bytes of columns. If
+ * value_only is set, the first table->nkey_columns names are skipped
+ * before planning starts.
+ *
+ * For each remaining name, every position reported by __find_next_col
+ * is visited until the search returns to the first position found for
+ * that name: the first visit emits WT_PROJ_NEXT and later visits emit
+ * WT_PROJ_REUSE. A name that __find_next_col cannot locate adds nothing
+ * to the plan; no error is raised for it here.
+ *
+ * If the column list is empty and nothing has been written to plan, the
+ * plan is set to an empty string.
+ */
+int
+__wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
+ const char *columns, size_t len, int value_only, WT_ITEM *plan)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ u_int cg, col, current_cg, current_col, i, start_cg, start_col;
+ int have_it;
+ char coltype, current_coltype;
+
+ start_cg = start_col = UINT_MAX; /* -Wuninitialized */
+
+ /* Work through the value columns by skipping over the key columns. */
+ WT_RET(__wt_config_initn(session, &conf, columns, len));
+ if (value_only)
+ for (i = 0; i < table->nkey_columns; i++)
+ WT_RET(__wt_config_next(&conf, &k, &v));
+
+ current_cg = cg = 0;
+ current_col = col = INT_MAX;
+ current_coltype = coltype = WT_PROJ_KEY; /* Keep lint quiet. */
+ for (i = 0; (ret = __wt_config_next(&conf, &k, &v)) == 0; i++) {
+ have_it = 0;
+
+ while (__find_next_col(session, table,
+ &k, &cg, &col, &coltype) == 0 &&
+ (!have_it || cg != start_cg || col != start_col)) {
+ /*
+ * First we move to the column. If that is in a
+ * different column group to the last column we
+ * accessed, or before the last column in the same
+ * column group, or moving from the key to the value,
+ * we need to switch column groups or rewind.
+ */
+ if (current_cg != cg || current_col > col ||
+ current_coltype != coltype) {
+ WT_ASSERT(session, !value_only ||
+ coltype == WT_PROJ_VALUE);
+ WT_RET(__wt_buf_catfmt(
+ session, plan, "%d%c", cg, coltype));
+
+ /*
+ * Set the current column group and column
+ * within the table.
+ */
+ current_cg = cg;
+ current_col = 0;
+ current_coltype = coltype;
+ }
+ /* Now move to the column we want. */
+ if (current_col < col) {
+ if (col - current_col > 1)
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%d", col - current_col));
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%c", WT_PROJ_SKIP));
+ }
+ /*
+ * Now copy the value in / out. In the common case,
+ * where each value is used in one column, we do a
+ * "next" operation. If the value is used again, we do
+ * a "reuse" operation to avoid making another copy.
+ */
+ if (!have_it) {
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%c", WT_PROJ_NEXT));
+
+ start_cg = cg;
+ start_col = col;
+ have_it = 1;
+ } else
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%c", WT_PROJ_REUSE));
+ current_col = col + 1;
+ }
+ }
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ /* Special case empty plans. */
+ if (i == 0 && plan->size == 0)
+ WT_RET(__wt_buf_set(session, plan, "", 1));
+
+ return (0);
+}
+
+/*
+ * __find_column_format --
+ * Find the format of the named column.
+ *
+ * Walks table->colconf in step with the table's key format and then its
+ * value format, so each column name is paired with one packing value.
+ * When a name equal to colname is reached, the packing value for that
+ * column is left in *pv and 0 is returned, except that EINVAL is
+ * returned if value_only is set and the match is still within the key
+ * format.
+ *
+ * If the column list ends without a match, the error that ended the walk
+ * is returned. An error from the format iterators is returned as-is.
+ */
+static int
+__find_column_format(WT_SESSION_IMPL *session,
+ WT_TABLE *table, WT_CONFIG_ITEM *colname, int value_only, WT_PACK_VALUE *pv)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ WT_PACK pack;
+ int inkey;
+
+ WT_RET(__wt_config_subinit(session, &conf, &table->colconf));
+ WT_RET(__pack_init(session, &pack, table->key_format));
+ inkey = 1;
+
+ while ((ret = __wt_config_next(&conf, &k, &v)) == 0) {
+ if ((ret = __pack_next(&pack, pv)) == WT_NOTFOUND && inkey) {
+ ret = __pack_init(session, &pack, table->value_format);
+ if (ret == 0)
+ ret = __pack_next(&pack, pv);
+ inkey = 0;
+ }
+ if (ret != 0)
+ return (ret);
+
+ if (k.len == colname->len &&
+ strncmp(colname->str, k.str, k.len) == 0) {
+ if (value_only && inkey)
+ return (EINVAL);
+ return (0);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_struct_reformat --
+ * Given a table and a list of columns (which could be values in a column
+ * group or index keys), calculate the resulting new format string.
+ * The result will be appended to the format buffer.
+ *
+ * The column names are read from the first len bytes of columns. If
+ * extra_cols is not NULL, its columns are processed after those in
+ * columns; the first read from extra_cols is not allowed to fail, so it
+ * must name at least one column.
+ *
+ * Each column's packing type is looked up with __find_column_format,
+ * which is passed value_only. A lookup failure is reported as EINVAL:
+ * with a "cannot store key column" message when value_only is set and
+ * the lookup itself returned EINVAL, otherwise as "not found".
+ *
+ * If columns is empty, 0 is returned and, when nothing has been written
+ * to format, format is set to an empty string.
+ */
+int
+__wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table,
+ const char *columns, size_t len, const char *extra_cols, int value_only,
+ WT_ITEM *format)
+{
+ WT_CONFIG config;
+ WT_CONFIG_ITEM k, next_k, next_v;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ int have_next;
+
+ WT_RET(__wt_config_initn(session, &config, columns, len));
+ /*
+ * If an empty column list is specified, this will fail with
+ * WT_NOTFOUND, that's okay.
+ */
+ WT_RET_NOTFOUND_OK(ret = __wt_config_next(&config, &next_k, &next_v));
+ if (ret == WT_NOTFOUND) {
+ if (format->size == 0)
+ WT_RET(__wt_buf_set(session, format, "", 1));
+ return (0);
+ }
+ do {
+ k = next_k;
+ ret = __wt_config_next(&config, &next_k, &next_v);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ have_next = (ret == 0);
+
+ if (!have_next && extra_cols != NULL) {
+ WT_RET(__wt_config_init(session, &config, extra_cols));
+ WT_RET(__wt_config_next(&config, &next_k, &next_v));
+ have_next = 1;
+ extra_cols = NULL;
+ }
+
+ if ((ret = __find_column_format(session,
+ table, &k, value_only, &pv)) != 0) {
+ if (value_only && ret == EINVAL)
+ WT_RET_MSG(session, EINVAL,
+ "A column group cannot store key column "
+ "'%.*s' in its value", (int)k.len, k.str);
+ WT_RET_MSG(session, EINVAL,
+ "Column '%.*s' not found", (int)k.len, k.str);
+ }
+
+ /*
+ * Check whether we're moving an unsized WT_ITEM from the end
+ * to the middle, or vice-versa. This determines whether the
+ * size needs to be prepended. This is the only case where the
+ * destination size can be larger than the source size.
+ */
+ if (pv.type == 'u' && !pv.havesize && have_next)
+ pv.type = 'U';
+ else if (pv.type == 'U' && !have_next)
+ pv.type = 'u';
+
+ if (pv.havesize)
+ WT_RET(__wt_buf_catfmt(
+ session, format, "%d%c", (int)pv.size, pv.type));
+ else
+ WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type));
+ } while (have_next);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_truncate --
+ *	Append to the format buffer the packing string describing the first
+ *	ncols fields of input_fmt.  Running out of fields before ncols have
+ *	been read is returned as an error.
+ */
+int
+__wt_struct_truncate(WT_SESSION_IMPL *session,
+    const char *input_fmt, u_int ncols, WT_ITEM *format)
+{
+	WT_DECL_PACK_VALUE(field);
+	WT_PACK fmt;
+
+	WT_RET(__pack_init(session, &fmt, input_fmt));
+	for (; ncols > 0; --ncols) {
+		WT_RET(__pack_next(&fmt, &field));
+
+		/* Fields without an explicit size are just a type character. */
+		if (!field.havesize) {
+			WT_RET(__wt_buf_catfmt(
+			    session, format, "%c", field.type));
+			continue;
+		}
+		WT_RET(__wt_buf_catfmt(
+		    session, format, "%d%c", (int)field.size, field.type));
+	}
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_project.c b/src/third_party/wiredtiger/src/schema/schema_project.c
new file mode 100644
index 00000000000..9aff4c8dded
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_project.c
@@ -0,0 +1,474 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_project_in --
+ *	Given list of cursors and a projection, read columns from the
+ *	application into the dependent cursors.
+ *
+ *	The projection is a string of operations, each an optional decimal
+ *	number followed by one character.  For WT_PROJ_KEY and WT_PROJ_VALUE
+ *	the number selects the cursor in cp whose key or value buffer becomes
+ *	the current target; for WT_PROJ_SKIP, WT_PROJ_NEXT and WT_PROJ_REUSE
+ *	it is a repeat count, where a missing count means one.
+ *
+ *	Every key or value buffer named in the projection is reset before any
+ *	column is written.  WT_PROJ_NEXT takes the next column from ap and
+ *	writes it at the current position; WT_PROJ_REUSE writes the most
+ *	recently taken column again.  Any other operation character is
+ *	rejected with EINVAL.
+ */
+int
+__wt_schema_project_in(WT_SESSION_IMPL *session,
+    WT_CURSOR **cp, const char *proj_arg, va_list ap)
+{
+	WT_CURSOR *c;
+	WT_DECL_ITEM(buf);
+	WT_DECL_PACK_VALUE(pv);
+	WT_DECL_PACK(pack);
+	WT_PACK_VALUE old_pv;
+	size_t len, offset, old_len;
+	u_long arg;
+	char *proj;
+	uint8_t *p, *end;
+	const uint8_t *next;
+
+	p = end = NULL;		/* -Wuninitialized */
+
+	/* Reset any of the buffers we will be setting. */
+	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+		arg = strtoul(proj, &proj, 10);
+		if (*proj == WT_PROJ_KEY) {
+			c = cp[arg];
+			WT_RET(__wt_buf_init(session, &c->key, 0));
+		} else if (*proj == WT_PROJ_VALUE) {
+			c = cp[arg];
+			WT_RET(__wt_buf_init(session, &c->value, 0));
+		}
+	}
+
+	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+		arg = strtoul(proj, &proj, 10);
+
+		switch (*proj) {
+		case WT_PROJ_KEY:
+			c = cp[arg];
+			if (WT_CURSOR_RECNO(c)) {
+				c->key.data = &c->recno;
+				c->key.size = sizeof(c->recno);
+				WT_RET(__pack_init(session, &pack, "R"));
+			} else
+				WT_RET(__pack_init(
+				    session, &pack, c->key_format));
+			buf = &c->key;
+			p = (uint8_t *)buf->data;
+			end = p + buf->size;
+			continue;
+
+		case WT_PROJ_VALUE:
+			c = cp[arg];
+			WT_RET(__pack_init(session, &pack, c->value_format));
+			buf = &c->value;
+			p = (uint8_t *)buf->data;
+			end = p + buf->size;
+			continue;
+		}
+
+		/* We have to get a key or value before any operations. */
+		WT_ASSERT(session, buf != NULL);
+
+		/*
+		 * Otherwise, the argument is a count, where a missing
+		 * count means a count of 1.
+		 */
+		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+			switch (*proj) {
+			case WT_PROJ_SKIP:
+				WT_RET(__pack_next(&pack, &pv));
+				/*
+				 * A nasty case: if we are inserting
+				 * out-of-order, we may reach the end of the
+				 * data.  That's okay: we want to append in
+				 * that case, and we're positioned to do that.
+				 */
+				if (p == end) {
+					/* Set up an empty value. */
+					WT_CLEAR(pv.u);
+					if (pv.type == 'S' || pv.type == 's')
+						pv.u.s = "";
+
+					len = __pack_size(session, &pv);
+					WT_RET(__wt_buf_grow(session,
+					    buf, buf->size + len));
+					p = (uint8_t *)buf->mem + buf->size;
+					WT_RET(__pack_write(
+					    session, &pv, &p, len));
+					buf->size += len;
+					end = (uint8_t *)buf->mem + buf->size;
+				} else
+					WT_RET(__unpack_read(session,
+					    &pv, (const uint8_t **)&p,
+					    (size_t)(end - p)));
+				break;
+
+			case WT_PROJ_NEXT:
+				WT_RET(__pack_next(&pack, &pv));
+				WT_PACK_GET(session, pv, ap);
+				/* FALLTHROUGH */
+
+			case WT_PROJ_REUSE:
+				/* Read the item we're about to overwrite. */
+				next = p;
+				if (p < end) {
+					old_pv = pv;
+					WT_RET(__unpack_read(session, &old_pv,
+					    &next, (size_t)(end - p)));
+				}
+				old_len = (size_t)(next - p);
+
+				len = __pack_size(session, &pv);
+				offset = WT_PTRDIFF(p, buf->mem);
+				WT_RET(__wt_buf_grow(session,
+				    buf, buf->size + len));
+				p = (uint8_t *)buf->mem + offset;
+				/* Make room if we're inserting out-of-order. */
+				if (offset + old_len < buf->size)
+					memmove(p + len, p + old_len,
+					    buf->size - (offset + old_len));
+				WT_RET(__pack_write(session, &pv, &p, len));
+				/*
+				 * The new item replaces the old_len bytes that
+				 * were at this position, so the buffer changes
+				 * by the difference between the two lengths,
+				 * not by the full length of the new item.
+				 */
+				buf->size += len - old_len;
+				end = (uint8_t *)buf->mem + buf->size;
+				break;
+
+			default:
+				WT_RET_MSG(session, EINVAL,
+				    "unexpected projection plan: %c",
+				    (int)*proj);
+			}
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_schema_project_out --
+ * Given list of cursors and a projection, read columns from the
+ * dependent cursors and return them to the application.
+ *
+ * The projection is a string of operations, each an optional decimal
+ * number followed by one character. For WT_PROJ_KEY and WT_PROJ_VALUE
+ * the number selects the cursor in cp whose key or value becomes the
+ * current source; for the other operations it is a repeat count.
+ *
+ * WT_PROJ_NEXT, WT_PROJ_SKIP and WT_PROJ_REUSE each unpack one column
+ * from the current source, but only WT_PROJ_NEXT stores the column
+ * through the next argument in ap. Any other operation character is
+ * passed over without an error.
+ */
+int
+__wt_schema_project_out(WT_SESSION_IMPL *session,
+ WT_CURSOR **cp, const char *proj_arg, va_list ap)
+{
+ WT_CURSOR *c;
+ WT_DECL_PACK(pack);
+ WT_DECL_PACK_VALUE(pv);
+ u_long arg;
+ char *proj;
+ uint8_t *p, *end;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ p = (uint8_t *)c->key.data;
+ end = p + c->key.size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ p = (uint8_t *)c->value.data;
+ end = p + c->value.size;
+ continue;
+ }
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_NEXT:
+ case WT_PROJ_SKIP:
+ case WT_PROJ_REUSE:
+ WT_RET(__pack_next(&pack, &pv));
+ WT_RET(__unpack_read(session, &pv,
+ (const uint8_t **)&p, (size_t)(end - p)));
+ /* Only copy the value out once. */
+ if (*proj != WT_PROJ_NEXT)
+ break;
+ WT_UNPACK_PUT(session, pv, ap);
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_schema_project_slice --
+ * Given list of cursors and a projection, read columns from a raw
+ * buffer into the dependent cursors.
+ *
+ * The source columns are unpacked from value using vformat. Each
+ * WT_PROJ_NEXT consumes the next source column and writes it at the
+ * current position of the current target buffer; WT_PROJ_REUSE writes
+ * the most recently consumed column again. WT_PROJ_SKIP steps over one
+ * column in the target, appending an empty column if the target is
+ * already at its end. Any other operation character is rejected with
+ * EINVAL.
+ *
+ * If key_only is set, value buffers are neither reset nor written:
+ * operations following a WT_PROJ_VALUE are skipped, although
+ * WT_PROJ_NEXT still consumes its source column so that later key
+ * operations stay aligned with the source.
+ */
+int
+__wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp,
+ const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value)
+{
+ WT_CURSOR *c;
+ WT_DECL_ITEM(buf);
+ WT_DECL_PACK(pack);
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_PACK_VALUE(vpv);
+ WT_PACK vpack;
+ u_long arg;
+ char *proj;
+ uint8_t *end, *p;
+ const uint8_t *next, *vp, *vend;
+ size_t len, offset, old_len;
+ int skip;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ WT_RET(__pack_init(session, &vpack, vformat));
+ vp = value->data;
+ vend = vp + value->size;
+
+ /* Reset any of the buffers we will be setting. */
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+ if (*proj == WT_PROJ_KEY) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->key, 0));
+ } else if (*proj == WT_PROJ_VALUE && !key_only) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->value, 0));
+ }
+ }
+
+ skip = key_only;
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ skip = 0;
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ buf = &c->key;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ if ((skip = key_only) != 0)
+ continue;
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ buf = &c->value;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+ }
+
+ /* We have to get a key or value before any operations. */
+ WT_ASSERT(session, skip || buf != NULL);
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_SKIP:
+ if (skip)
+ break;
+ WT_RET(__pack_next(&pack, &pv));
+
+ /*
+ * A nasty case: if we are inserting
+ * out-of-order, append a zero value to keep
+ * the buffer in the correct format.
+ */
+ if (p == end) {
+ /* Set up an empty value. */
+ WT_CLEAR(pv.u);
+ if (pv.type == 'S' || pv.type == 's')
+ pv.u.s = "";
+
+ len = __pack_size(session, &pv);
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len));
+ p = (uint8_t *)buf->data + buf->size;
+ WT_RET(__pack_write(
+ session, &pv, &p, len));
+ end = p;
+ buf->size += len;
+ } else
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&p,
+ (size_t)(end - p)));
+ break;
+
+ case WT_PROJ_NEXT:
+ WT_RET(__pack_next(&vpack, &vpv));
+ WT_RET(__unpack_read(session, &vpv,
+ &vp, (size_t)(vend - vp)));
+ /* FALLTHROUGH */
+
+ case WT_PROJ_REUSE:
+ if (skip)
+ break;
+
+ /*
+ * Read the item we're about to overwrite.
+ *
+ * There is subtlety here: the value format
+ * may not exactly match the cursor's format.
+ * In particular, we need lengths with raw
+ * columns in the middle of a packed struct,
+ * but not if they are at the end of a struct.
+ */
+ WT_RET(__pack_next(&pack, &pv));
+
+ next = p;
+ if (p < end)
+ WT_RET(__unpack_read(session, &pv,
+ &next, (size_t)(end - p)));
+ old_len = (size_t)(next - p);
+
+ /* Make sure the types are compatible. */
+ WT_ASSERT(session,
+ tolower(pv.type) == tolower(vpv.type));
+ pv.u = vpv.u;
+
+ len = __pack_size(session, &pv);
+ offset = WT_PTRDIFF(p, buf->data);
+ /*
+ * Avoid growing the buffer if the value fits.
+ * This is not just a performance issue: it
+ * covers the case of record number keys, which
+ * have to be written to cursor->recno.
+ */
+ if (len > old_len)
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len - old_len));
+ p = (uint8_t *)buf->data + offset;
+ /* Make room if we're inserting out-of-order. */
+ if (offset + old_len < buf->size)
+ memmove(p + len, p + old_len,
+ buf->size - (offset + old_len));
+ WT_RET(__pack_write(session, &pv, &p, len));
+ buf->size += len - old_len;
+ end = (uint8_t *)buf->data + buf->size;
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unexpected projection plan: %c",
+ (int)*proj);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_schema_project_merge --
+ * Given list of cursors and a projection, build a buffer containing the
+ * column values read from the cursors.
+ *
+ * The value buffer is reset first. WT_PROJ_KEY and WT_PROJ_VALUE select
+ * the key or value of the cursor in cp indexed by the operation's
+ * number as the current source. WT_PROJ_NEXT, WT_PROJ_SKIP and
+ * WT_PROJ_REUSE each unpack one column from the current source; only
+ * WT_PROJ_NEXT also appends that column to value, packed with the next
+ * field of vformat. Any other operation character is passed over
+ * without an error.
+ */
+int
+__wt_schema_project_merge(WT_SESSION_IMPL *session,
+ WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value)
+{
+ WT_CURSOR *c;
+ WT_ITEM *buf;
+ WT_DECL_PACK(pack);
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_PACK_VALUE(vpv);
+ WT_PACK vpack;
+ u_long arg;
+ char *proj;
+ const uint8_t *p, *end;
+ uint8_t *vp;
+ size_t len;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ WT_RET(__wt_buf_init(session, value, 0));
+ WT_RET(__pack_init(session, &vpack, vformat));
+
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ buf = &c->key;
+ p = buf->data;
+ end = p + buf->size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ buf = &c->value;
+ p = buf->data;
+ end = p + buf->size;
+ continue;
+ }
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_NEXT:
+ case WT_PROJ_SKIP:
+ case WT_PROJ_REUSE:
+ WT_RET(__pack_next(&pack, &pv));
+ WT_RET(__unpack_read(session, &pv,
+ &p, (size_t)(end - p)));
+ /* Only copy the value out once. */
+ if (*proj != WT_PROJ_NEXT)
+ break;
+
+ WT_RET(__pack_next(&vpack, &vpv));
+ /* Make sure the types are compatible. */
+ WT_ASSERT(session,
+ tolower(pv.type) == tolower(vpv.type));
+ vpv.u = pv.u;
+ len = __pack_size(session, &vpv);
+ WT_RET(__wt_buf_grow(session,
+ value, value->size + len));
+ vp = (uint8_t *)value->mem + value->size;
+ WT_RET(__pack_write(session, &vpv, &vp, len));
+ value->size += len;
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c
new file mode 100644
index 00000000000..8605ea41c80
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_rename.c
@@ -0,0 +1,276 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rename_file --
+ *	WT_SESSION::rename for a file: move the metadata entry to the new URI
+ *	and rename the underlying file.  Both URIs must start with "file:".
+ */
+static int
+__rename_file(
+    WT_SESSION_IMPL *session, const char *uri, const char *newuri)
+{
+	WT_DECL_RET;
+	int in_use;
+	const char *from, *new_meta, *old_meta, *to;
+
+	new_meta = old_meta = NULL;
+
+	from = uri;
+	to = newuri;
+	if (!WT_PREFIX_SKIP(from, "file:") || !WT_PREFIX_SKIP(to, "file:"))
+		return (EINVAL);
+
+	/* Close any btree handles in the file. */
+	WT_ERR(__wt_conn_dhandle_close_all(session, uri, 0));
+
+	/*
+	 * Look up the source first.  A missing source is reported as
+	 * WT_NOTFOUND, which is the same result a table rename gives (the
+	 * session layer subsequently maps it to ENOENT).
+	 */
+	WT_ERR(__wt_metadata_search(session, uri, &old_meta));
+
+	/* The target must not be known to the metadata... */
+	ret = __wt_metadata_search(session, newuri, &new_meta);
+	if (ret == 0)
+		WT_ERR_MSG(session, EEXIST, "%s", newuri);
+	if (ret != WT_NOTFOUND)
+		WT_ERR(ret);
+
+	/* ...nor exist in the filesystem. */
+	WT_ERR(__wt_exist(session, to, &in_use));
+	if (in_use)
+		WT_ERR_MSG(session, EEXIST, "%s", to);
+
+	/* Replace the old file entries with new file entries. */
+	WT_ERR(__wt_metadata_remove(session, uri));
+	WT_ERR(__wt_metadata_insert(session, newuri, old_meta));
+
+	/* Rename the underlying file. */
+	WT_ERR(__wt_rename(session, from, to));
+	if (WT_META_TRACKING(session))
+		WT_ERR(__wt_meta_track_fileop(session, uri, newuri));
+
+err:	__wt_free(session, new_meta);
+	__wt_free(session, old_meta);
+	return (ret);
+}
+
+/*
+ * __rename_tree --
+ * Rename an index or colgroup reference.
+ *
+ * name is the existing metadata key of a column group or index of
+ * table, and newuri is the table's new URI. The new metadata key is
+ * built by substituting the new table name (newuri without any
+ * "table:" prefix) into name, keeping name's suffix if it has one.
+ *
+ * The new data source is computed with table->name temporarily set to
+ * newuri; the original name is restored before returning, on success
+ * and on error alike. The "source" setting in the old metadata value is
+ * replaced with the new source, the metadata entry is moved to the new
+ * key, and __wt_schema_rename is called to rename the old source to the
+ * new one.
+ *
+ * Returns EINVAL if name is not a "colgroup:" or "index:" URI, or if
+ * its metadata value has no "source" setting.
+ */
+static int
+__rename_tree(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *newuri, const char *name, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_ITEM(nn);
+ WT_DECL_ITEM(ns);
+ WT_DECL_ITEM(nv);
+ WT_DECL_ITEM(os);
+ WT_DECL_RET;
+ const char *newname, *olduri, *suffix, *value;
+ int is_colgroup;
+
+ olduri = table->name;
+ value = NULL;
+
+ newname = newuri;
+ (void)WT_PREFIX_SKIP(newname, "table:");
+
+ /*
+ * Create the new data source URI and update the schema value.
+ *
+ * 'name' has the format (colgroup|index):<tablename>[:<suffix>];
+ * we need the suffix.
+ */
+ is_colgroup = WT_PREFIX_MATCH(name, "colgroup:");
+ if (!is_colgroup && !WT_PREFIX_MATCH(name, "index:"))
+ WT_ERR_MSG(session, EINVAL,
+ "expected a 'colgroup:' or 'index:' source: '%s'", name);
+
+ suffix = strchr(name, ':');
+ /* An existing table should have a well formed name. */
+ WT_ASSERT(session, suffix != NULL);
+ suffix = strchr(suffix + 1, ':');
+
+ WT_ERR(__wt_scr_alloc(session, 0, &nn));
+ WT_ERR(__wt_buf_fmt(session, nn, "%s%s%s",
+ is_colgroup ? "colgroup:" : "index:",
+ newname,
+ (suffix == NULL) ? "" : suffix));
+
+ /* Skip the colon, if any. */
+ if (suffix != NULL)
+ ++suffix;
+
+ /* Read the old schema value. */
+ WT_ERR(__wt_metadata_search(session, name, &value));
+
+ /*
+ * Calculate the new data source URI. Use the existing table structure
+ * and substitute the new name temporarily.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &ns));
+ table->name = newuri;
+ if (is_colgroup)
+ WT_ERR(__wt_schema_colgroup_source(
+ session, table, suffix, value, ns));
+ else
+ WT_ERR(__wt_schema_index_source(
+ session, table, suffix, value, ns));
+
+ if ((ret = __wt_config_getones(session, value, "source", &cval)) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "index or column group has no data source: %s", value);
+
+ /* Take a copy of the old data source. */
+ WT_ERR(__wt_scr_alloc(session, 0, &os));
+ WT_ERR(__wt_buf_fmt(session, os, "%.*s", (int)cval.len, cval.str));
+
+ /* Overwrite it with the new data source. */
+ WT_ERR(__wt_scr_alloc(session, 0, &nv));
+ WT_ERR(__wt_buf_fmt(session, nv, "%.*s%s%s",
+ (int)WT_PTRDIFF(cval.str, value), value,
+ (const char *)ns->data,
+ cval.str + cval.len));
+
+ /*
+ * Remove the old metadata entry.
+ * Insert the new metadata entry.
+ */
+ WT_ERR(__wt_metadata_remove(session, name));
+ WT_ERR(__wt_metadata_insert(session, nn->data, nv->data));
+
+ /* Rename the file. */
+ WT_ERR(__wt_schema_rename(session, os->data, ns->data, cfg));
+
+err: __wt_scr_free(&nn);
+ __wt_scr_free(&ns);
+ __wt_scr_free(&nv);
+ __wt_scr_free(&os);
+ __wt_free(session, value);
+ table->name = olduri;
+ return (ret);
+}
+
+/*
+ * __metadata_rename --
+ *	Move a metadata entry from one URI to another, keeping its value.
+ */
+static int
+__metadata_rename(WT_SESSION_IMPL *session, const char *uri, const char *newuri)
+{
+	WT_DECL_RET;
+	const char *config;
+
+	/* Nothing has been allocated yet if the lookup fails. */
+	WT_RET(__wt_metadata_search(session, uri, &config));
+
+	/* Only insert under the new name once the old entry is gone. */
+	if ((ret = __wt_metadata_remove(session, uri)) == 0)
+		ret = __wt_metadata_insert(session, newuri, config);
+
+	__wt_free(session, config);
+	return (ret);
+}
+
+/*
+ * __rename_table --
+ * WT_SESSION::rename for a table.
+ *
+ * Looks up the table named by uri (any "table:" prefix is skipped),
+ * renames each of its column groups and then, after opening them, each
+ * of its indices with __rename_tree. The table handle is then discarded
+ * with __wt_schema_remove_table and the local pointer cleared, so the
+ * error path only releases the handle if it was not already removed.
+ * Finally the table's own metadata entry is moved from uri to newuri.
+ */
+static int
+__rename_table(WT_SESSION_IMPL *session,
+ const char *uri, const char *newuri, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TABLE *table;
+ u_int i;
+ const char *oldname;
+
+ oldname = uri;
+ (void)WT_PREFIX_SKIP(oldname, "table:");
+
+ WT_RET(__wt_schema_get_table(
+ session, oldname, strlen(oldname), 0, &table));
+
+ /* Rename the column groups. */
+ for (i = 0; i < WT_COLGROUPS(table); i++)
+ WT_ERR(__rename_tree(session, table, newuri,
+ table->cgroups[i]->name, cfg));
+
+ /* Rename the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++)
+ WT_ERR(__rename_tree(session, table, newuri,
+ table->indices[i]->name, cfg));
+
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+
+ /* Rename the table. */
+ WT_ERR(__metadata_rename(session, uri, newuri));
+
+err: if (table != NULL)
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __wt_schema_rename --
+ *	WT_SESSION::rename.
+ *
+ *	Renames the object named by uri to newuri.  Both URIs must have the
+ *	same type prefix (the text up to the first colon), otherwise EINVAL is
+ *	returned.  "file:", "lsm:" and "table:" objects are handled directly;
+ *	any other type is passed to its registered data source, and is
+ *	rejected if there is none or the source has no rename method.
+ *
+ *	The operation runs with metadata tracking turned on so that a failure
+ *	part way through can be backed out.  A WT_NOTFOUND result is returned
+ *	to the caller as ENOENT.
+ */
+int
+__wt_schema_rename(WT_SESSION_IMPL *session,
+    const char *uri, const char *newuri, const char *cfg[])
+{
+	WT_DATA_SOURCE *dsrc;
+	WT_DECL_RET;
+	const char *p, *t;
+
+	/*
+	 * The target type must match the source type.  Stop at the end of the
+	 * source URI as well as at the first colon: if neither string contains
+	 * a colon and they are identical, the scan would otherwise run past
+	 * both terminating NUL bytes.
+	 */
+	for (p = uri, t = newuri;
+	    *p != '\0' && *p == *t && *p != ':'; ++p, ++t)
+		;
+	if (*p != ':' || *t != ':')
+		WT_RET_MSG(session, EINVAL,
+		    "rename target type must match URI: %s to %s", uri, newuri);
+
+	/*
+	 * We track rename operations, if we fail in the middle, we want to
+	 * back it all out.
+	 */
+	WT_RET(__wt_meta_track_on(session));
+
+	if (WT_PREFIX_MATCH(uri, "file:"))
+		ret = __rename_file(session, uri, newuri);
+	else if (WT_PREFIX_MATCH(uri, "lsm:"))
+		ret = __wt_lsm_tree_rename(session, uri, newuri, cfg);
+	else if (WT_PREFIX_MATCH(uri, "table:"))
+		ret = __rename_table(session, uri, newuri, cfg);
+	else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+		ret = dsrc->rename == NULL ?
+		    __wt_object_unsupported(session, uri) :
+		    dsrc->rename(dsrc,
+		    &session->iface, uri, newuri, (WT_CONFIG_ARG *)cfg);
+	else
+		ret = __wt_bad_object_type(session, uri);
+
+	/* Bump the schema generation so that stale data is ignored. */
+	++S2C(session)->schema_gen;
+
+	WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+	/* If we didn't find a metadata entry, map that error to ENOENT. */
+	return (ret == WT_NOTFOUND ? ENOENT : ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c
new file mode 100644
index 00000000000..cb8e7f6c418
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_stat.c
@@ -0,0 +1,114 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_curstat_colgroup_init --
+ *	Set up a statistics cursor for a column group by initializing it on
+ *	the column group's underlying data source.
+ */
+int
+__wt_curstat_colgroup_init(WT_SESSION_IMPL *session,
+    const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+	WT_COLGROUP *cg;
+	WT_DECL_ITEM(stat_uri);
+	WT_DECL_RET;
+
+	WT_RET(__wt_schema_get_colgroup(session, uri, NULL, &cg));
+	WT_RET(__wt_scr_alloc(session, 0, &stat_uri));
+
+	/* Statistics come from the source object, not the column group. */
+	if ((ret = __wt_buf_fmt(
+	    session, stat_uri, "statistics:%s", cg->source)) == 0)
+		ret = __wt_curstat_init(session, stat_uri->data, cfg, cst);
+
+	__wt_scr_free(&stat_uri);
+	return (ret);
+}
+
+/*
+ * __wt_curstat_index_init --
+ *	Set up a statistics cursor for an index by initializing it on the
+ *	index's underlying data source.
+ */
+int
+__wt_curstat_index_init(WT_SESSION_IMPL *session,
+    const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+	WT_DECL_ITEM(stat_uri);
+	WT_DECL_RET;
+	WT_INDEX *index;
+
+	WT_RET(__wt_schema_get_index(session, uri, NULL, &index));
+	WT_RET(__wt_scr_alloc(session, 0, &stat_uri));
+
+	/* Statistics come from the source object, not the index itself. */
+	if ((ret = __wt_buf_fmt(
+	    session, stat_uri, "statistics:%s", index->source)) == 0)
+		ret = __wt_curstat_init(session, stat_uri->data, cfg, cst);
+
+	__wt_scr_free(&stat_uri);
+	return (ret);
+}
+
+/*
+ * __wt_curstat_table_init --
+ * Initialize the statistics for a table.
+ *
+ * A statistics cursor is opened, read and closed for each column group
+ * and then, after the indices have been opened, for each index of the
+ * table. The first column group's statistics are copied into the
+ * cursor's data-source statistics and every later set is aggregated
+ * into them; __wt_curstat_dsrc_final is called once all have been
+ * processed.
+ *
+ * NOTE(review): the table name is taken as everything after the first
+ * strlen("table:") bytes of uri without checking the prefix; presumably
+ * the caller has already verified it -- confirm.
+ */
+int
+__wt_curstat_table_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ WT_CURSOR *stat_cursor;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_DSRC_STATS *new, *stats;
+ WT_TABLE *table;
+ u_int i;
+ const char *name;
+
+ name = uri + strlen("table:");
+ WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ /*
+ * Process the column groups.
+ *
+ * Set the cursor to reference the data source statistics; we don't
+ * initialize it, instead we copy (rather than aggregate), the first
+ * column's statistics, which has the same effect.
+ */
+ stats = &cst->u.dsrc_stats;
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ WT_ERR(__wt_buf_fmt(
+ session, buf, "statistics:%s", table->cgroups[i]->name));
+ WT_ERR(__wt_curstat_open(
+ session, buf->data, cfg, &stat_cursor));
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ if (i == 0)
+ *stats = *new;
+ else
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+ }
+
+ /* Process the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ WT_ERR(__wt_buf_fmt(
+ session, buf, "statistics:%s", table->indices[i]->name));
+ WT_ERR(__wt_curstat_open(
+ session, buf->data, cfg, &stat_cursor));
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+ }
+
+ __wt_curstat_dsrc_final(cst);
+
+err: __wt_schema_release_table(session, table);
+
+ __wt_scr_free(&buf);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_truncate.c b/src/third_party/wiredtiger/src/schema/schema_truncate.c
new file mode 100644
index 00000000000..1da3b103f10
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_truncate.c
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __truncate_file --
+ * WT_SESSION::truncate for a file: record the file's allocation size,
+ * close its handles, clear its checkpoint and truncate the underlying
+ * file.
+ */
+static int
+__truncate_file(WT_SESSION_IMPL *session, const char *name)
+{
+ uint32_t allocsize;
+ const char *path;
+
+ /* The block manager wants the name without the "file:" prefix. */
+ path = name;
+ if (!WT_PREFIX_SKIP(path, "file:"))
+ return (EINVAL);
+
+ /*
+ * Open and lock the file, just long enough to read the allocation
+ * size from the btree handle.
+ */
+ WT_RET(__wt_session_get_btree(
+ session, name, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
+ allocsize = S2BT(session)->allocsize;
+ WT_RET(__wt_session_release_btree(session));
+
+ /* Close any btree handles in the file. */
+ WT_RET(__wt_conn_dhandle_close_all(session, name, 0));
+
+ /* Delete the root address, then truncate the file itself. */
+ WT_RET(__wt_meta_checkpoint_clear(session, name));
+ return (__wt_block_manager_truncate(session, path, allocsize));
+}
+
+/*
+ * __truncate_table --
+ * WT_SESSION::truncate for a table: truncate the source of every column
+ * group and every index. The name is passed without the "table:" prefix.
+ */
+static int
+__truncate_table(WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TABLE *table;
+ u_int n;
+
+ WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));
+
+ /* Column groups first. */
+ for (n = 0; n < WT_COLGROUPS(table); ++n) {
+ WT_ERR(__wt_schema_truncate(
+ session, table->cgroups[n]->source, cfg));
+ }
+
+ /* Then the indices, which have to be opened before they're walked. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (n = 0; n < table->nindices; ++n) {
+ WT_ERR(__wt_schema_truncate(
+ session, table->indices[n]->source, cfg));
+ }
+
+ /* Both the success and the failure paths release the table. */
+err: __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __truncate_dsrc --
+ * WT_SESSION::truncate for a data-source without a truncate operation:
+ * walk the object with a cursor and remove the records one at a time.
+ */
+static int
+__truncate_dsrc(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *cfg[2];
+
+ /* Open a cursor using the default open_cursor configuration. */
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = NULL;
+ WT_RET(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
+
+ /* Traverse the object, removing every entry. */
+ for (;;) {
+ if ((ret = cursor->next(cursor)) != 0)
+ break;
+ WT_ERR(cursor->remove(cursor));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* The cursor is closed on every path once it has been opened. */
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_schema_truncate --
+ * WT_SESSION::truncate without a range: dispatch on the URI prefix to
+ * the matching whole-object truncate implementation.
+ */
+int
+__wt_schema_truncate(
+ WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ const char *table_name;
+
+ /* Stepped past the "table:" prefix below if the URI is a table. */
+ table_name = uri;
+
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ ret = __truncate_file(session, uri);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_truncate(session, uri, cfg);
+ else if (WT_PREFIX_SKIP(table_name, "table:"))
+ ret = __truncate_table(session, table_name, cfg);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) {
+ /*
+ * Use the data source's own truncate method if it has one,
+ * otherwise remove the records one by one.
+ */
+ if (dsrc->truncate == NULL)
+ ret = __truncate_dsrc(session, uri);
+ else
+ ret = dsrc->truncate(dsrc,
+ &session->iface, uri, (WT_CONFIG_ARG *)cfg);
+ } else
+ ret = __wt_bad_object_type(session, uri);
+
+ /* If we didn't find a metadata entry, map that error to ENOENT. */
+ if (ret == WT_NOTFOUND)
+ return (ENOENT);
+ return (ret);
+}
+
+/*
+ * __wt_range_truncate --
+ * Truncate of a cursor range, default implementation: remove the records
+ * one at a time using the cursors' remove, next, prev and compare methods.
+ *
+ * If start is NULL, records are removed walking backward from stop. If
+ * start is set, records are removed walking forward from start, up to and
+ * including the record at stop (or to the end of the walk if stop is
+ * NULL). The return of the final next/prev call is passed through
+ * WT_RET_NOTFOUND_OK.
+ */
+int
+__wt_range_truncate(WT_CURSOR *start, WT_CURSOR *stop)
+{
+ WT_DECL_RET;
+ int cmp;
+
+ if (start == NULL) {
+ /* Remove the stop record, then keep stepping backward. */
+ do {
+ WT_RET(stop->remove(stop));
+ } while ((ret = stop->prev(stop)) == 0);
+ WT_RET_NOTFOUND_OK(ret);
+ } else {
+ /*
+ * Start with cmp less than zero so that, without a stop
+ * cursor, the loop only ends when next fails.
+ */
+ cmp = -1;
+ do {
+ /*
+ * Compare before the remove: when start has reached
+ * stop (cmp is not less than zero), the record is
+ * still removed and then the loop exits.
+ */
+ if (stop != NULL)
+ WT_RET(start->compare(start, stop, &cmp));
+ WT_RET(start->remove(start));
+ } while (cmp < 0 && (ret = start->next(start)) == 0);
+ WT_RET_NOTFOUND_OK(ret);
+ }
+ return (0);
+}
+
+/*
+ * __wt_schema_range_truncate --
+ * WT_SESSION::truncate with a range: dispatch on the cursor's internal
+ * URI to the matching range-truncate implementation. Either cursor may be
+ * NULL, but not both: the URI is read from whichever one is set.
+ */
+int
+__wt_schema_range_truncate(
+ WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop)
+{
+ WT_CURSOR *cursor;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ const char *uri;
+
+ /* Use whichever cursor is set to identify the object. */
+ cursor = (start != NULL) ? start : stop;
+ uri = cursor->internal_uri;
+
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)cursor)->btree,
+ ret = __wt_btcur_range_truncate(
+ (WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop));
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __wt_table_range_truncate(
+ (WT_CURSOR_TABLE *)start, (WT_CURSOR_TABLE *)stop);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL &&
+ dsrc->range_truncate != NULL)
+ ret = dsrc->range_truncate(dsrc, &session->iface, start, stop);
+ else
+ /*
+ * Everything else, including data sources without their own
+ * range_truncate method, gets the default implementation.
+ */
+ ret = __wt_range_truncate(start, stop);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c
new file mode 100644
index 00000000000..263f56f1c41
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_util.c
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_get_source --
+ * Find the first registered data source whose prefix matches the name,
+ * returning NULL if there is no match.
+ */
+WT_DATA_SOURCE *
+__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_NAMED_DATA_SOURCE *entry;
+
+ /* Walk the connection's list of named data sources. */
+ TAILQ_FOREACH(entry, &S2C(session)->dsrcqh, q) {
+ if (!WT_PREFIX_MATCH(name, entry->prefix))
+ continue;
+ return (entry->dsrc);
+ }
+ return (NULL);
+}
+
+/*
+ * __wt_str_name_check --
+ * Disallow any use of the WiredTiger name space, and any name containing
+ * configuration grouping or quoting characters.
+ */
+int
+__wt_str_name_check(WT_SESSION_IMPL *session, const char *str)
+{
+ const char *colon, *p;
+ int nskip;
+
+ /*
+ * Check if name is somewhere in the WiredTiger name space: it would be
+ * "bad" if the application truncated the metadata file. Step over at
+ * most two colon-terminated components (a leading URI prefix, then a
+ * table name), checking what follows each of them.
+ */
+ p = str;
+ nskip = 0;
+ while (nskip < 2 && (colon = strchr(p, ':')) != NULL) {
+ p = colon + 1;
+ if (WT_PREFIX_MATCH(p, "WiredTiger")) {
+ WT_RET_MSG(session, EINVAL,
+ "%s: the \"WiredTiger\" name space may not be "
+ "used by applications", p);
+ }
+ ++nskip;
+ }
+
+ /*
+ * Disallow JSON quoting characters -- the config string parsing code
+ * supports quoted strings, but there's no good reason to use them in
+ * names and we're not going to do the testing.
+ */
+ if (strpbrk(p, "{},:[]\\\"'") != NULL) {
+ WT_RET_MSG(session, EINVAL,
+ "%s: WiredTiger objects should not include grouping "
+ "characters in their names",
+ p);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_name_check --
+ * Disallow any use of the WiredTiger name space, for a name given as a
+ * pointer/length pair rather than a nul-terminated string.
+ */
+int
+__wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len)
+{
+ WT_DECL_ITEM(copy);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, len, &copy));
+
+ /* Format the first len bytes into a scratch buffer for the check. */
+ WT_ERR(__wt_buf_fmt(session, copy, "%.*s", (int)len, str));
+ ret = __wt_str_name_check(session, copy->data);
+
+err: __wt_scr_free(&copy);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c
new file mode 100644
index 00000000000..8e7ed3925f6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_worker.c
@@ -0,0 +1,134 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_worker --
+ * Get Btree handles for the object and cycle through calls to an
+ * underlying worker function with each handle.
+ *
+ * The URI is resolved by prefix: "file:" objects are opened and passed to
+ * file_func; "colgroup:", "index:" and "table:" objects recurse on their
+ * underlying source(s); "lsm:" objects are handed to the LSM worker; any
+ * other URI is looked up as a registered data source.
+ *
+ * file_func, if not NULL, is called with the btree handle held by the
+ * session. name_func, if not NULL, is called with each URI visited and
+ * may set its int argument to skip that object. open_flags is passed to
+ * the handle open.
+ */
+int
+__wt_schema_worker(WT_SESSION_IMPL *session,
+ const char *uri,
+ int (*file_func)(WT_SESSION_IMPL *, const char *[]),
+ int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
+ const char *cfg[], uint32_t open_flags)
+{
+ WT_COLGROUP *colgroup;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_SESSION *wt_session;
+ WT_TABLE *table;
+ const char *tablename;
+ u_int i;
+ int skip;
+
+ /* The table is only set, and so only released, for "table:" URIs. */
+ table = NULL;
+ tablename = uri;
+
+ skip = 0;
+ if (name_func != NULL)
+ WT_ERR(name_func(session, uri, &skip));
+
+ /* If the callback said to skip this object, we're done. */
+ if (skip)
+ return (0);
+
+ /* Get the btree handle(s) and call the underlying function. */
+ if (WT_PREFIX_MATCH(uri, "file:")) {
+ if (file_func != NULL) {
+ /*
+ * If the operation requires exclusive access, close
+ * any open file handles, including checkpoints.
+ */
+ if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE))
+ WT_ERR(__wt_conn_dhandle_close_all(
+ session, uri, 0));
+
+ WT_ERR(__wt_session_get_btree_ckpt(
+ session, uri, cfg, open_flags));
+ /*
+ * Release the handle whether or not the worker
+ * function succeeded, keeping the first error.
+ */
+ ret = file_func(session, cfg);
+ WT_TRET(__wt_session_release_btree(session));
+ }
+ } else if (WT_PREFIX_MATCH(uri, "colgroup:")) {
+ WT_ERR(__wt_schema_get_colgroup(session, uri, NULL, &colgroup));
+ WT_ERR(__wt_schema_worker(session, colgroup->source,
+ file_func, name_func, cfg, open_flags));
+ } else if (WT_PREFIX_SKIP(tablename, "index:")) {
+ idx = NULL;
+ WT_ERR(__wt_schema_get_index(session, uri, NULL, &idx));
+ WT_ERR(__wt_schema_worker(session, idx->source,
+ file_func, name_func, cfg, open_flags));
+ } else if (WT_PREFIX_MATCH(uri, "lsm:")) {
+ /*
+ * LSM compaction is handled elsewhere, but if we get here
+ * trying to compact files, don't descend into an LSM tree.
+ */
+ if (file_func != __wt_compact)
+ WT_ERR(__wt_lsm_tree_worker(session,
+ uri, file_func, name_func, cfg, open_flags));
+ } else if (WT_PREFIX_SKIP(tablename, "table:")) {
+ /* The table lookup takes the name without the prefix. */
+ WT_ERR(__wt_schema_get_table(session,
+ tablename, strlen(tablename), 0, &table));
+ WT_ASSERT(session, session->dhandle == NULL);
+
+ /*
+ * We could make a recursive call for each colgroup or index
+ * URI, but since we have already opened the table, we can take
+ * a short cut and skip straight to the sources. If we have a
+ * name function, it needs to know about the intermediate URIs.
+ */
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ colgroup = table->cgroups[i];
+ skip = 0;
+ if (name_func != NULL)
+ WT_ERR(name_func(
+ session, colgroup->name, &skip));
+ if (!skip)
+ WT_ERR(__wt_schema_worker(
+ session, colgroup->source,
+ file_func, name_func, cfg, open_flags));
+ }
+
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ idx = table->indices[i];
+ skip = 0;
+ if (name_func != NULL)
+ WT_ERR(name_func(session, idx->name, &skip));
+ if (!skip)
+ WT_ERR(__wt_schema_worker(session, idx->source,
+ file_func, name_func, cfg, open_flags));
+ }
+ } else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) {
+ /*
+ * Registered data sources: map the worker function onto the
+ * data source's matching method if it has one. The checkpoint
+ * functions are accepted and do nothing here; anything else
+ * is unsupported.
+ */
+ wt_session = (WT_SESSION *)session;
+ if (file_func == __wt_compact && dsrc->compact != NULL)
+ WT_ERR(dsrc->compact(
+ dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
+ else if (file_func == __wt_salvage && dsrc->salvage != NULL)
+ WT_ERR(dsrc->salvage(
+ dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
+ else if (file_func == __wt_verify && dsrc->verify != NULL)
+ WT_ERR(dsrc->verify(
+ dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
+ else if (file_func == __wt_checkpoint)
+ ;
+ else if (file_func == __wt_checkpoint_list)
+ ;
+ else if (file_func == __wt_checkpoint_sync)
+ ;
+ else
+ WT_ERR(__wt_object_unsupported(session, uri));
+ } else
+ WT_ERR(__wt_bad_object_type(session, uri));
+
+err: if (table != NULL)
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
new file mode 100644
index 00000000000..39b9dd0de61
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -0,0 +1,1054 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __session_checkpoint(WT_SESSION *, const char *);
+static int __session_rollback_transaction(WT_SESSION *, const char *);
+
+/*
+ * __wt_session_reset_cursors --
+ * Reset the session's open cursors, stopping early once the session's
+ * cursor count drops to zero. The first error seen is the one returned.
+ */
+int
+__wt_session_reset_cursors(WT_SESSION_IMPL *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ /* Stop when there are no positioned cursors. */
+ for (cursor = TAILQ_FIRST(&session->cursors);
+ cursor != NULL && session->ncursors != 0;
+ cursor = TAILQ_NEXT(cursor, q))
+ WT_TRET(cursor->reset(cursor));
+
+ return (ret);
+}
+
+/*
+ * __wt_session_copy_values --
+ * Copy values into all positioned cursors, so that they don't keep
+ * transaction IDs pinned.
+ */
+int
+__wt_session_copy_values(WT_SESSION_IMPL *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ TAILQ_FOREACH(cursor, &session->cursors, q) {
+ /* Only cursors flagged with an internal value need a copy. */
+ if (!F_ISSET(cursor, WT_CURSTD_VALUE_INT))
+ continue;
+
+ /*
+ * Set the value buffer from its own current data and size,
+ * and switch the cursor's flag from internal to external.
+ */
+ F_CLR(cursor, WT_CURSTD_VALUE_INT);
+ WT_RET(__wt_buf_set(session,
+ &cursor->value, cursor->value.data, cursor->value.size));
+ F_SET(cursor, WT_CURSTD_VALUE_EXT);
+ }
+
+ return (ret);
+}
+
+/*
+ * __session_clear --
+ * Clear a session structure: zero the leading WT_SESSION_CLEAR_SIZE
+ * bytes, then reset the hazard array size and count.
+ */
+static void
+__session_clear(WT_SESSION_IMPL *session)
+{
+ /*
+ * There's no serialization support around the review of the hazard
+ * array, which means threads checking for hazard pointers first check
+ * the active field (which may be 0) and then use the hazard pointer
+ * (which cannot be NULL).
+ *
+ * Additionally, the session structure can include information that
+ * persists past the session's end-of-life, stored as part of page
+ * splits.
+ *
+ * For these reasons, be careful when clearing the session structure.
+ */
+ memset(session, 0, WT_SESSION_CLEAR_SIZE(session));
+ /*
+ * The hazard counters are reset explicitly, separately from the
+ * memset above.
+ */
+ session->hazard_size = 0;
+ session->nhazard = 0;
+}
+
+/*
+ * __session_close --
+ * WT_SESSION->close method.
+ *
+ * Rolls back any running transaction, closes the session's cursors,
+ * discards the session's cached resources and then, holding the
+ * connection's API lock, clears the session structure so that it can be
+ * re-used. Errors from the individual steps are accumulated and teardown
+ * continues.
+ */
+static int
+__session_close(WT_SESSION *wt_session, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_session->connection;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, close, config, cfg);
+ WT_UNUSED(cfg);
+
+ /* Rollback any active transaction. */
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ WT_TRET(__session_rollback_transaction(wt_session, NULL));
+
+ /*
+ * Also release any pinned transaction ID from a non-transactional
+ * operation.
+ */
+ if (conn->txn_global.states != NULL)
+ __wt_txn_release_snapshot(session);
+
+ /* Close all open cursors. */
+ while ((cursor = TAILQ_FIRST(&session->cursors)) != NULL) {
+ /*
+ * Notify the user that we are closing the cursor handle
+ * via the registered close callback.
+ */
+ if (session->event_handler->handle_close != NULL)
+ WT_TRET(session->event_handler->handle_close(
+ session->event_handler, wt_session, cursor));
+ WT_TRET(cursor->close(cursor));
+ }
+
+ WT_ASSERT(session, session->ncursors == 0);
+
+ /* Discard cached handles. */
+ __wt_session_close_cache(session);
+
+ /* Close all tables. */
+ __wt_schema_close_tables(session);
+
+ /* Discard metadata tracking. */
+ __wt_meta_track_discard(session);
+
+ /* Discard scratch buffers. */
+ __wt_scr_discard(session);
+
+ /* Free transaction information. */
+ __wt_txn_destroy(session);
+
+ /* Confirm we're not holding any hazard pointers. */
+ __wt_hazard_close(session);
+
+ /* Cleanup */
+ if (session->block_manager_cleanup != NULL)
+ WT_TRET(session->block_manager_cleanup(session));
+ if (session->reconcile_cleanup != NULL)
+ WT_TRET(session->reconcile_cleanup(session));
+
+ /* Free the eviction exclusive-lock information. */
+ __wt_free(session, session->excl);
+
+ /* Destroy the thread's mutex. */
+ WT_TRET(__wt_cond_destroy(session, &session->cond));
+
+ /* The API lock protects opening and closing of sessions. */
+ __wt_spin_lock(session, &conn->api_lock);
+
+ /* Decrement the count of open sessions. */
+ WT_STAT_FAST_CONN_DECR(session, session_open);
+
+ /*
+ * Sessions are re-used, clear the structure: the clear sets the active
+ * field to 0, which will exclude the hazard array from review by the
+ * eviction thread. Because some session fields are accessed by other
+ * threads, the structure must be cleared carefully.
+ *
+ * We don't need to publish here, because regardless of the active field
+ * being non-zero, the hazard pointer is always valid.
+ */
+ __session_clear(session);
+ /*
+ * The closed session has been cleared: everything from here on,
+ * including the unlock and the API end, uses the connection's default
+ * session instead.
+ */
+ session = conn->default_session;
+
+ /*
+ * Decrement the count of active sessions if that's possible: a session
+ * being closed may or may not be at the end of the array, step toward
+ * the beginning of the array until we reach an active session.
+ */
+ while (conn->sessions[conn->session_cnt - 1].active == 0)
+ if (--conn->session_cnt == 0)
+ break;
+
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_reconfigure --
+ * WT_SESSION->reconfigure method.
+ *
+ * Fails with EINVAL if a transaction is running. Otherwise resets the
+ * session's cursors and, if an "isolation" value is configured, updates
+ * the isolation level of both the session and its transaction.
+ */
+static int
+__session_reconfigure(WT_SESSION *wt_session, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, reconfigure, config, cfg);
+
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL, "transaction in progress");
+
+ /*
+ * NOTE(review): the WT_ERR call that follows assigns ret again, so a
+ * failure recorded by this WT_TRET looks like it may be overwritten --
+ * confirm against the macro definitions.
+ */
+ WT_TRET(__wt_session_reset_cursors(session));
+
+ WT_ERR(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
+ /*
+ * An empty value leaves the isolation level unchanged. Any value
+ * other than "snapshot" or "read-uncommitted" selects read-committed.
+ */
+ if (cval.len != 0)
+ session->isolation = session->txn.isolation =
+ WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
+ TXN_ISO_SNAPSHOT :
+ WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ?
+ TXN_ISO_READ_UNCOMMITTED : TXN_ISO_READ_COMMITTED;
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_open_cursor --
+ * Internal version of WT_SESSION::open_cursor.
+ *
+ * Opens a cursor on the object named by uri, dispatching on the URI
+ * prefix to the matching cursor type, or to a registered data source if
+ * no built-in type matches. The owner and cfg arguments are passed
+ * through to the cursor open functions that take them.
+ *
+ * On success, *cursorp references the new cursor. On failure, *cursorp
+ * is NULL: callers must not close it.
+ */
+int
+__wt_open_cursor(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_COLGROUP *colgroup;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+
+ *cursorp = NULL;
+
+ /*
+ * Open specific cursor types we know about, or call the generic data
+ * source open function.
+ *
+ * Unwind a set of string comparisons into a switch statement hoping
+ * the compiler can make it fast, but list the common choices first
+ * instead of sorting so if/else patterns are still fast.
+ */
+ switch (uri[0]) {
+ /*
+ * Common cursor types.
+ */
+ case 't':
+ if (WT_PREFIX_MATCH(uri, "table:"))
+ WT_RET(__wt_curtable_open(session, uri, cfg, cursorp));
+ break;
+ case 'c':
+ if (WT_PREFIX_MATCH(uri, "colgroup:")) {
+ /*
+ * Column groups are a special case: open a cursor on
+ * the underlying data source.
+ */
+ WT_RET(__wt_schema_get_colgroup(
+ session, uri, NULL, &colgroup));
+ WT_RET(__wt_open_cursor(
+ session, colgroup->source, owner, cfg, cursorp));
+ } else if (WT_PREFIX_MATCH(uri, "config:"))
+ WT_RET(__wt_curconfig_open(
+ session, uri, cfg, cursorp));
+ break;
+ case 'i':
+ if (WT_PREFIX_MATCH(uri, "index:"))
+ WT_RET(__wt_curindex_open(
+ session, uri, owner, cfg, cursorp));
+ break;
+ case 'l':
+ if (WT_PREFIX_MATCH(uri, "lsm:"))
+ WT_RET(__wt_clsm_open(
+ session, uri, owner, cfg, cursorp));
+ else if (WT_PREFIX_MATCH(uri, "log:"))
+ WT_RET(__wt_curlog_open(session, uri, cfg, cursorp));
+ break;
+
+ /*
+ * Less common cursor types.
+ */
+ case 'f':
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ WT_RET(__wt_curfile_open(
+ session, uri, owner, cfg, cursorp));
+ break;
+ case 'm':
+ if (WT_PREFIX_MATCH(uri, WT_METADATA_URI))
+ WT_RET(__wt_curmetadata_open(
+ session, uri, owner, cfg, cursorp));
+ break;
+ case 'b':
+ if (WT_PREFIX_MATCH(uri, "backup:"))
+ WT_RET(__wt_curbackup_open(
+ session, uri, cfg, cursorp));
+ break;
+ case 's':
+ if (WT_PREFIX_MATCH(uri, "statistics:"))
+ WT_RET(__wt_curstat_open(session, uri, cfg, cursorp));
+ break;
+ default:
+ break;
+ }
+
+ /* No built-in type matched: try the registered data sources. */
+ if (*cursorp == NULL &&
+ (dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ WT_RET(dsrc->open_cursor == NULL ?
+ __wt_object_unsupported(session, uri) :
+ __wt_curds_open(session, uri, owner, cfg, dsrc, cursorp));
+
+ if (*cursorp == NULL)
+ return (__wt_bad_object_type(session, uri));
+
+ /*
+ * When opening simple tables, the table code calls this function on the
+ * underlying data source, in which case the application's URI has been
+ * copied.
+ *
+ * If copying the URI fails, close the cursor and clear the caller's
+ * reference: callers close any cursor they're handed back on error,
+ * and must not be left holding a cursor that's already been closed.
+ */
+ if ((*cursorp)->uri == NULL &&
+ (ret = __wt_strdup(session, uri, &(*cursorp)->uri)) != 0) {
+ WT_TRET((*cursorp)->close(*cursorp));
+ *cursorp = NULL;
+ }
+
+ return (ret);
+}
+
+/*
+ * __session_open_cursor --
+ * WT_SESSION->open_cursor method.
+ *
+ * Exactly one of uri and to_dup must be set. With to_dup, a new cursor
+ * is opened on the same URI as the cursor being duplicated and then
+ * positioned by __wt_cursor_dup_position. On failure *cursorp is left
+ * NULL.
+ */
+static int
+__session_open_cursor(WT_SESSION *wt_session,
+ const char *uri, WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cursor = *cursorp = NULL;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, open_cursor, config, cfg);
+
+ if ((to_dup == NULL && uri == NULL) || (to_dup != NULL && uri != NULL))
+ WT_ERR_MSG(session, EINVAL,
+ "should be passed either a URI or a cursor to duplicate, "
+ "but not both");
+
+ if (to_dup != NULL) {
+ /*
+ * Duplication is limited to the cursor types listed here,
+ * plus cursors on registered data sources.
+ */
+ uri = to_dup->uri;
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "lsm:") &&
+ !WT_PREFIX_MATCH(uri, WT_METADATA_URI) &&
+ !WT_PREFIX_MATCH(uri, "table:") &&
+ __wt_schema_get_source(session, uri) == NULL)
+ WT_ERR(__wt_bad_object_type(session, uri));
+ }
+
+ WT_ERR(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
+ if (to_dup != NULL)
+ WT_ERR(__wt_cursor_dup_position(to_dup, cursor));
+
+ *cursorp = cursor;
+
+ /*
+ * The block below is only entered by a jump to the err label: on
+ * failure, close any cursor that was opened.
+ */
+ if (0) {
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ }
+
+ /*
+ * Opening a cursor on a non-existent data source will set ret to
+ * either of ENOENT or WT_NOTFOUND at this point. However,
+ * applications may reasonably do this inside a transaction to check
+ * for the existence of a table or index.
+ *
+ * Prefer WT_NOTFOUND here: that does not force running transactions to
+ * roll back. It will be mapped back to ENOENT.
+ */
+ if (ret == ENOENT)
+ ret = WT_NOTFOUND;
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_session_create_strip --
+ * Discard any configuration information from a schema entry that is not
+ * applicable to a session.create call, here for the wt dump command utility,
+ * which only wants to dump the schema information needed for load.
+ */
+int
+__wt_session_create_strip(WT_SESSION *wt_session,
+ const char *v1, const char *v2, const char **value_ret)
+{
+ WT_SESSION_IMPL *session;
+ const char *cfg[4];
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ /*
+ * Build the configuration stack: the session.create base
+ * configuration, followed by the two caller-supplied strings.
+ */
+ cfg[0] = WT_CONFIG_BASE(session, session_create);
+ cfg[1] = v1;
+ cfg[2] = v2;
+ cfg[3] = NULL;
+
+ return (__wt_config_collapse(session, cfg, value_ret));
+}
+
+/*
+ * __session_create --
+ * WT_SESSION->create method.
+ *
+ * Checks the name and any "type" configuration, then creates the object
+ * while holding the schema lock.
+ */
+static int
+__session_create(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, create, config, cfg);
+ WT_UNUSED(cfg);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ /*
+ * Type configuration only applies to tables, column groups and indexes.
+ * We don't want applications to attempt to layer LSM on top of their
+ * extended data-sources, and the fact we allow LSM as a valid URI is an
+ * invitation to that mistake: nip it in the bud.
+ */
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:")) {
+ /*
+ * We can't disallow type entirely, a configuration string might
+ * innocently include it, for example, a dump/load pair. If the
+ * URI type prefix and the type are the same, let it go.
+ *
+ * That is: fail unless the URI starts with the type value,
+ * immediately followed by a colon. A missing type entry
+ * (WT_NOTFOUND) is not an error.
+ */
+ if ((ret =
+ __wt_config_getones(session, config, "type", &cval)) == 0 &&
+ (strncmp(uri, cval.str, cval.len) != 0 ||
+ uri[cval.len] != ':'))
+ WT_ERR_MSG(session, EINVAL,
+ "%s: unsupported type configuration", uri);
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_create(session, uri, config));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_log_printf --
+ * WT_SESSION->log_printf method: pass a printf-style format and its
+ * arguments to __wt_log_vprintf.
+ */
+static int
+__session_log_printf(WT_SESSION *wt_session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+ WT_SESSION_IMPL *session;
+ WT_DECL_RET;
+ va_list ap;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL_NOCONF(session, log_printf);
+
+ /* Hand the variable arguments to the va_list version of the call. */
+ va_start(ap, fmt);
+ ret = __wt_log_vprintf(session, fmt, ap);
+ va_end(ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_rename --
+ * WT_SESSION->rename method: check both the old and the new names, then
+ * rename the object while holding the schema lock.
+ */
+static int
+__session_rename(WT_SESSION *wt_session,
+ const char *uri, const char *newuri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, rename, config, cfg);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+ WT_ERR(__wt_str_name_check(session, newuri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_rename(session, uri, newuri, cfg));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_compact --
+ * WT_SESSION->compact method: check the name and the object type, then
+ * hand the call to __wt_session_compact.
+ */
+static int
+__session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_RET(__wt_str_name_check(session, uri));
+
+ /* Only these object types can be compacted. */
+ if (WT_PREFIX_MATCH(uri, "colgroup:") ||
+ WT_PREFIX_MATCH(uri, "file:") ||
+ WT_PREFIX_MATCH(uri, "index:") ||
+ WT_PREFIX_MATCH(uri, "lsm:") ||
+ WT_PREFIX_MATCH(uri, "table:"))
+ return (__wt_session_compact(wt_session, uri, config));
+
+ return (__wt_bad_object_type(session, uri));
+}
+
+/*
+ * __session_drop --
+ * WT_SESSION->drop method: check the name, then drop the object while
+ * holding the schema lock.
+ */
+static int
+__session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, drop, config, cfg);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_drop(session, uri, cfg));
+
+err: /* Note: drop operations cannot be unrolled (yet?). */
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_salvage --
+ * WT_SESSION->salvage method: run __wt_salvage over the object's files
+ * via the schema worker, holding the schema lock.
+ */
+static int
+__session_salvage(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, salvage, config, cfg);
+ /* Handles are opened exclusively, flagged as open for salvage. */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_salvage,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_truncate --
+ * WT_SESSION->truncate method.
+ *
+ * Called with either a URI, to truncate a whole object, or with start
+ * and/or stop cursors, to truncate a range; passing both forms, or
+ * neither, fails with EINVAL. A final WT_NOTFOUND is returned to the
+ * caller as ENOENT.
+ */
+static int
+__session_truncate(WT_SESSION *wt_session,
+ const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_CURSOR *cursor;
+ int cmp;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_TXN_API_CALL(session, truncate, config, cfg);
+
+ /*
+ * If the URI is specified, we don't need a start/stop, if start/stop
+ * is specified, we don't need a URI.
+ *
+ * If no URI is specified, and both cursors are specified, start/stop
+ * must reference the same object.
+ *
+ * Any specified cursor must have been initialized.
+ */
+ if ((uri == NULL && start == NULL && stop == NULL) ||
+ (uri != NULL && (start != NULL || stop != NULL)))
+ WT_ERR_MSG(session, EINVAL,
+ "the truncate method should be passed either a URI or "
+ "start/stop cursors, but not both");
+
+ if (uri != NULL) {
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ /* Whole-object truncate, done under the schema lock. */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_truncate(session, uri, cfg));
+ goto done;
+ }
+
+ /*
+ * Cursor truncate is only supported for some objects, check for the
+ * supporting methods we need, range_truncate and compare.
+ */
+ cursor = start == NULL ? stop : start;
+ if (cursor->compare == NULL)
+ WT_ERR(__wt_bad_object_type(session, cursor->uri));
+
+ /*
+ * If both cursors set, check they're correctly ordered with respect to
+ * each other. We have to test this before any search, the search can
+ * change the initial cursor position.
+ *
+ * Rather happily, the compare routine will also confirm the cursors
+ * reference the same object and the keys are set.
+ */
+ if (start != NULL && stop != NULL) {
+ WT_ERR(start->compare(start, stop, &cmp));
+ if (cmp > 0)
+ WT_ERR_MSG(session, EINVAL,
+ "the start cursor position is after the stop "
+ "cursor position");
+ }
+
+ /*
+ * Truncate does not require keys actually exist so that applications
+ * can discard parts of the object's name space without knowing exactly
+ * what records currently appear in the object. For this reason, do a
+ * search-near, rather than a search. Additionally, we have to correct
+ * after calling search-near, to position the start/stop cursors on the
+ * next record greater than/less than the original key. If the cursors
+ * hit the beginning/end of the object, or the start/stop keys cross,
+ * we're done, the range must be empty.
+ */
+ if (start != NULL) {
+ WT_ERR(start->search_near(start, &cmp));
+ /* Landed before the key: step forward onto the range. */
+ if (cmp < 0 && (ret = start->next(start)) != 0) {
+ WT_ERR_NOTFOUND_OK(ret);
+ goto done;
+ }
+ }
+ if (stop != NULL) {
+ WT_ERR(stop->search_near(stop, &cmp));
+ /* Landed after the key: step backward onto the range. */
+ if (cmp > 0 && (ret = stop->prev(stop)) != 0) {
+ WT_ERR_NOTFOUND_OK(ret);
+ goto done;
+ }
+
+ /* The corrected cursors crossed: the range is empty. */
+ if (start != NULL) {
+ WT_ERR(start->compare(start, stop, &cmp));
+ if (cmp > 0)
+ goto done;
+ }
+ }
+
+ WT_ERR(__wt_schema_range_truncate(session, start, stop));
+
+ /* Success and failure share the same exit path. */
+done:
+err: TXN_API_END_RETRY(session, ret, 0);
+ return ((ret) == WT_NOTFOUND ? ENOENT : (ret));
+}
+
+/*
+ * __session_upgrade --
+ * WT_SESSION->upgrade method: run __wt_upgrade over the object's files
+ * via the schema worker, holding the schema lock.
+ */
+static int
+__session_upgrade(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, upgrade, config, cfg);
+ /* Handles are opened exclusively, flagged as open for upgrade. */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_upgrade,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_verify --
+ * WT_SESSION->verify method: run __wt_verify over the object's files via
+ * the schema worker, holding the schema lock.
+ */
+static int
+__session_verify(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, verify, config, cfg);
+ /* Handles are opened exclusively, flagged as open for verify. */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_verify,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_begin_transaction --
+ * WT_SESSION->begin_transaction method. Fails with EINVAL if the session
+ * already has a transaction running.
+ */
+static int
+__session_begin_transaction(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, begin_transaction, config, cfg);
+ /*
+ * The statistic is incremented before the running-transaction check,
+ * so it also counts calls that fail that check.
+ */
+ WT_STAT_FAST_CONN_INCR(session, txn_begin);
+
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL, "Transaction already running");
+
+ /*
+ * There is no transaction active in this thread; check if the cache is
+ * full, if we have to block for eviction, this is the best time to do
+ * it.
+ */
+ WT_ERR(__wt_cache_full_check(session));
+
+ ret = __wt_txn_begin(session, cfg);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_commit_transaction --
+ * WT_SESSION->commit_transaction method.
+ *
+ * If the transaction has been flagged with an error it can't be
+ * committed: the cursors are reset, the transaction is rolled back and
+ * EINVAL is returned instead.
+ */
+static int
+__session_commit_transaction(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, commit_transaction, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, txn_commit);
+
+ txn = &session->txn;
+ if (F_ISSET(txn, TXN_ERROR)) {
+ __wt_errx(session, "failed transaction requires rollback");
+ ret = EINVAL;
+ }
+
+ if (ret == 0)
+ ret = __wt_txn_commit(session, cfg);
+ else {
+ /*
+ * Roll back rather than commit; errors from the rollback are
+ * accumulated with WT_TRET on top of the EINVAL set above.
+ */
+ WT_TRET(__wt_session_reset_cursors(session));
+ WT_TRET(__wt_txn_rollback(session, cfg));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_rollback_transaction --
+ * WT_SESSION->rollback_transaction method.
+ *
+ * Resets the session's cursors and then rolls the transaction back. Both
+ * steps go through WT_TRET, so the rollback is attempted even when the
+ * cursor reset reports an error.
+ */
+static int
+__session_rollback_transaction(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, rollback_transaction, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, txn_rollback);
+
+ WT_TRET(__wt_session_reset_cursors(session));
+
+ WT_TRET(__wt_txn_rollback(session, cfg));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_transaction_pinned_range --
+ *	WT_SESSION->transaction_pinned_range method.
+ *
+ *	Stores in *prange the distance between the global current transaction
+ *	ID and the oldest ID this session pins (the lesser of its transaction
+ *	ID and its snap_min), or 0 when the session pins nothing.
+ */
+static int
+__session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange)
+{
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	WT_TXN_STATE *state;
+	uint64_t oldest;
+
+	session = (WT_SESSION_IMPL *)wt_session;
+	SESSION_API_CALL_NOCONF(session, pinned_range);
+
+	state = WT_SESSION_TXN_STATE(session);
+
+	/* Start from snap_min, prefer the transaction ID if it is older. */
+	oldest = state->snap_min;
+	if (state->id != WT_TXN_NONE && TXNID_LT(state->id, state->snap_min))
+		oldest = state->id;
+
+	*prange = (oldest == WT_TXN_NONE) ?
+	    0 : S2C(session)->txn_global.current - oldest;
+
+err:	API_END_RET(session, ret);
+}
+
+/*
+ * __session_checkpoint --
+ *	WT_SESSION->checkpoint method.
+ *
+ *	Rejects the call with EINVAL when the session has a running
+ *	transaction, resets the session's cursors, and then runs
+ *	__wt_txn_checkpoint while holding the connection's checkpoint lock.
+ *	WT_SESSION_CAN_WAIT and WT_SESSION_NO_CACHE_CHECK are set for the
+ *	duration of the checkpoint and cleared on every exit path.
+ */
+static int
+__session_checkpoint(WT_SESSION *wt_session, const char *config)
+{
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	WT_TXN *txn;
+
+	session = (WT_SESSION_IMPL *)wt_session;
+
+	txn = &session->txn;
+
+	WT_STAT_FAST_CONN_INCR(session, txn_checkpoint);
+	SESSION_API_CALL(session, checkpoint, config, cfg);
+
+	/*
+	 * Checkpoints require a snapshot to write a transactionally consistent
+	 * snapshot of the data.
+	 *
+	 * We can't use an application's transaction: if it has uncommitted
+	 * changes, they will be written in the checkpoint and may appear after
+	 * a crash.
+	 *
+	 * Use a real snapshot transaction: we don't want any chance of the
+	 * snapshot being updated during the checkpoint.  Eviction is prevented
+	 * from evicting anything newer than this because we track the oldest
+	 * transaction ID in the system that is not visible to all readers.
+	 */
+	if (F_ISSET(txn, TXN_RUNNING))
+		WT_ERR_MSG(session, EINVAL,
+		    "Checkpoint not permitted in a transaction");
+
+	/*
+	 * Reset open cursors.  Do this explicitly, even though it will happen
+	 * implicitly in the call to begin_transaction for the checkpoint, the
+	 * checkpoint code will acquire the schema lock before we do that, and
+	 * some implementation of WT_CURSOR::reset might need the schema lock.
+	 */
+	WT_ERR(__wt_session_reset_cursors(session));
+
+	/*
+	 * Don't hijack the session checkpoint thread for eviction.
+	 *
+	 * Application threads are not generally available for potentially slow
+	 * operations, but checkpoint does enough I/O it may be called upon to
+	 * perform slow operations for the block manager.
+	 */
+	F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+
+	/*
+	 * Only one checkpoint can be active at a time, and checkpoints must run
+	 * in the same order as they update the metadata.  It's probably a bad
+	 * idea to run checkpoints out of multiple threads, but serialize them
+	 * here to ensure we don't get into trouble.
+	 *
+	 * The "checkpoint running" statistic is only changed while the lock is
+	 * held: otherwise a thread finishing its checkpoint could reset the
+	 * value set by a thread still waiting for the lock, and the statistic
+	 * would read 0 while the second checkpoint runs.
+	 */
+	__wt_spin_lock(session, &S2C(session)->checkpoint_lock);
+	WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1);
+
+	ret = __wt_txn_checkpoint(session, cfg);
+
+	WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
+	__wt_spin_unlock(session, &S2C(session)->checkpoint_lock);
+
+err:	F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+
+	API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_open_internal_session --
+ *	Allocate a session for WiredTiger's use.
+ *
+ *	conn: the connection that owns the session.
+ *	name: stored in the new session's name field.
+ *	uses_dhandles: if zero, the session is flagged
+ *	    WT_SESSION_NO_DATA_HANDLES.
+ *	open_metadata: if non-zero, the metadata handle is opened now.
+ *	sessionp: set to the new session on success, NULL on failure.
+ *
+ *	Returns 0 on success or the error from opening the session or the
+ *	metadata. If opening the metadata fails, the new session is closed
+ *	again so its slot is not leaked.
+ */
+int
+__wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
+    int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp)
+{
+	WT_DECL_RET;
+	WT_SESSION *wt_session;
+	WT_SESSION_IMPL *session;
+
+	*sessionp = NULL;
+
+	WT_RET(__wt_open_session(conn, NULL, NULL, &session));
+	session->name = name;
+
+	/*
+	 * Public sessions are automatically closed during WT_CONNECTION->close.
+	 * If the session handles for internal threads were to go on the public
+	 * list, there would be complex ordering issues during close.  Set a
+	 * flag to avoid this: internal sessions are not closed automatically.
+	 */
+	F_SET(session, WT_SESSION_INTERNAL);
+
+	/*
+	 * Some internal threads must keep running after we close all data
+	 * handles.  Make sure these threads don't open their own handles.
+	 */
+	if (!uses_dhandles)
+		F_SET(session, WT_SESSION_NO_DATA_HANDLES);
+
+	/*
+	 * Acquiring the metadata handle requires the schema lock; we've seen
+	 * problems in the past where a worker thread has acquired the schema
+	 * lock unexpectedly, relatively late in the run, and deadlocked.  Be
+	 * defensive, get it now.  The metadata file may not exist when the
+	 * connection first creates its default session or the shared cache
+	 * pool creates its sessions, let our caller decline this work.
+	 */
+	if (open_metadata) {
+		WT_ASSERT(session, !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+		if ((ret = __wt_metadata_open(session)) != 0) {
+			/*
+			 * Internal sessions are not closed automatically, so
+			 * nobody else will release this one: close it here,
+			 * keeping the original error as the return value.
+			 */
+			wt_session = &session->iface;
+			WT_TRET(wt_session->close(wt_session, NULL));
+			return (ret);
+		}
+	}
+
+	*sessionp = session;
+	return (0);
+}
+
+/*
+ * __wt_open_session --
+ * Allocate a session handle. Sessions opened by WiredTiger for its own
+ * use go through __wt_open_internal_session, which calls this function.
+ *
+ * conn: connection whose session array is searched for a free slot.
+ * event_handler: handler for the new session; when NULL, the handler of
+ * the connection's default session is used.
+ * config: optional configuration, applied through __session_reconfigure.
+ * sessionp: set to the new session on success, NULL on failure.
+ *
+ * The whole operation runs under the connection's api_lock. Fails with
+ * ENOMEM when every slot in the session array is active.
+ */
+int
+__wt_open_session(WT_CONNECTION_IMPL *conn,
+ WT_EVENT_HANDLER *event_handler, const char *config,
+ WT_SESSION_IMPL **sessionp)
+{
+ static const WT_SESSION stds = {
+ NULL,
+ __session_close,
+ __session_reconfigure,
+ __session_open_cursor,
+ __session_create,
+ __session_compact,
+ __session_drop,
+ __session_log_printf,
+ __session_rename,
+ __session_salvage,
+ __session_truncate,
+ __session_upgrade,
+ __session_verify,
+ __session_begin_transaction,
+ __session_commit_transaction,
+ __session_rollback_transaction,
+ __session_checkpoint,
+ __session_transaction_pinned_range
+ };
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session, *session_ret;
+ uint32_t i;
+
+ *sessionp = NULL;
+
+ session = conn->default_session;
+ session_ret = NULL;
+
+ __wt_spin_lock(session, &conn->api_lock);
+
+ /*
+ * Make sure we don't try to open a new session after the application
+ * closes the connection. This is particularly intended to catch
+ * cases where server threads open sessions.
+ */
+ WT_ASSERT(session, F_ISSET(conn, WT_CONN_SERVER_RUN));
+
+ /* Find the first inactive session slot. */
+ for (session_ret = conn->sessions,
+ i = 0; i < conn->session_size; ++session_ret, ++i)
+ if (!session_ret->active)
+ break;
+ if (i == conn->session_size)
+ WT_ERR_MSG(session, ENOMEM,
+ "only configured to support %" PRIu32 " sessions"
+ " (including %" PRIu32 " internal)",
+ conn->session_size, WT_NUM_INTERNAL_SESSIONS);
+
+ /*
+ * If the active session count is increasing, update it. We don't worry
+ * about correcting the session count on error, as long as we don't mark
+ * this session as active, we'll clean it up on close.
+ */
+ if (i >= conn->session_cnt) /* Defend against off-by-one errors. */
+ conn->session_cnt = i + 1;
+
+ session_ret->id = i;
+ session_ret->iface = stds;
+ session_ret->iface.connection = &conn->iface;
+
+ WT_ERR(__wt_cond_alloc(session, "session", 0, &session_ret->cond));
+
+ __wt_random_init(session_ret->rnd);
+
+ __wt_event_handler_set(session_ret,
+ event_handler == NULL ? session->event_handler : event_handler);
+
+ TAILQ_INIT(&session_ret->cursors);
+ SLIST_INIT(&session_ret->dhandles);
+
+ /* Initialize transaction support: default to read-committed. */
+ session_ret->isolation = TXN_ISO_READ_COMMITTED;
+ WT_ERR(__wt_txn_init(session_ret));
+
+ /*
+ * The session's hazard pointer memory isn't discarded during normal
+ * session close because access to it isn't serialized. Allocate the
+ * first time we open this session.
+ */
+ if (session_ret->hazard == NULL)
+ WT_ERR(__wt_calloc_def(
+ session, conn->hazard_max, &session_ret->hazard));
+
+ /*
+ * Set an initial size for the hazard array. It will be grown as
+ * required up to hazard_max. The hazard_size is reset on close, since
+ * __wt_hazard_close ensures the array is cleared - so it is safe to
+ * reset the starting size on each open.
+ */
+ session_ret->hazard_size = WT_HAZARD_INCR;
+
+ /*
+ * Configuration: currently, the configuration for open_session is the
+ * same as session.reconfigure, so use that function.
+ */
+ if (config != NULL)
+ WT_ERR(
+ __session_reconfigure((WT_SESSION *)session_ret, config));
+
+ session_ret->name = NULL;
+
+ /*
+ * Publish: make the entry visible to server threads. There must be a
+ * barrier for two reasons, to ensure structure fields are set before
+ * any other thread will consider the session, and to push the session
+ * count to ensure the eviction thread can't review too few slots.
+ */
+ WT_PUBLISH(session_ret->active, 1);
+
+ WT_STATIC_ASSERT(offsetof(WT_SESSION_IMPL, iface) == 0);
+ *sessionp = session_ret;
+
+ WT_STAT_FAST_CONN_INCR(session, session_open);
+
+err: __wt_spin_unlock(session, &conn->api_lock);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c
new file mode 100644
index 00000000000..6eca8a58d13
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_compact.c
@@ -0,0 +1,236 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Compaction is the place where the underlying block manager becomes visible
+ * in the higher engine btree and API layers. As there is currently only one
+ * block manager, this code is written with it in mind: other block managers
+ * may need changes to support compaction, and a smart block manager might need
+ * far less support from the engine.
+ *
+ * First, the default block manager cannot entirely own compaction because it
+ * has no way to find a block after it moves other than a request from the
+ * btree layer with the new address. In other words, if internal page X points
+ * to leaf page Y, and page Y moves, the address of page Y has to be updated in
+ * page X. Generally, this is solved by building a translation layer in the
+ * block manager so internal pages don't require updates to relocate blocks:
+ * however, the translation table must be durable, has its own garbage
+ * collection issues and might be slower, all of which have their own problems.
+ *
+ * Second, the btree layer cannot entirely own compaction because page
+ * addresses are opaque, it cannot know where a page is in the file from the
+ * address cookie.
+ *
+ * For these reasons, compaction is a cooperative process between the btree
+ * layer and the block manager. The btree layer walks files, and asks the
+ * block manager if rewriting a particular block would reduce the file
+ * footprint: if writing the page will help, the page is marked dirty so it
+ * will eventually be written. As pages are written, the original page
+ * potentially becomes available for reuse and if enough pages at the end of
+ * the file are available for reuse, the file can be truncated, and compaction
+ * succeeds.
+ *
+ * However, writing a page is not by itself sufficient to make a page available
+ * for reuse. The original version of the page is still referenced by at least
+ * the most recent checkpoint in the file. To make a page available for reuse,
+ * we have to checkpoint the file so we can discard the checkpoint referencing
+ * the original version of the block; once no checkpoint references a block, it
+ * becomes available for reuse.
+ *
+ * Compaction is not necessarily possible in WiredTiger, even in a file with
+ * lots of available space. If a block at the end of the file is referenced by
+ * a named checkpoint, there is nothing we can do to compact the file, no
+ * matter how many times we rewrite the block, the named checkpoint can't be
+ * discarded and so the reference count on the original block will never go to
+ * zero. What's worse, because the block manager doesn't reference count
+ * blocks, it can't easily know this is the case, and so we'll waste a lot of
+ * effort trying to compact files that can't be compacted.
+ *
+ * Now, to the actual process. First, we checkpoint the high-level object
+ * (which is potentially composed of multiple files): there are potentially
+ * many dirty blocks in the cache, and we want to write them out and then
+ * discard previous checkpoints so we have as many blocks as possible on the
+ * file's "available for reuse" list when we start compaction.
+ *
+ * Then, we compact the high-level object.
+ *
+ * Compacting the object is done 10% at a time, that is, we try and move blocks
+ * from the last 10% of the file into the beginning of the file (the 10% is
+ * hard coded in the block manager). The reason for this is because we are
+ * walking the file in logical order, not block offset order, and we can fail
+ * to compact a file if we write the wrong blocks first.
+ *
+ * For example, imagine a file with 10 blocks in the first 10% of a file, 1,000
+ * blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the
+ * file. If we were to rewrite blocks from more than the last 10% of the file,
+ * and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy
+ * 10 of them without ever rewriting the blocks from the end of the file which
+ * would allow us to compact the file. So, we compact the last 10% of the
+ * file, and if that works, we compact the last 10% of the file again, and so
+ * on. Note the block manager uses a first-fit block selection algorithm
+ * during compaction to maximize block movement.
+ *
+ * After each 10% compaction, we checkpoint two more times (seriously, twice).
+ * The second and third checkpoints are because the block manager checkpoints
+ * in two steps: blocks made available for reuse during a checkpoint are put on
+ * a special checkpoint-available list and only moved to the real available
+ * list after the metadata has been updated with the new checkpoint's
+ * information. (Otherwise it is possible to allocate a rewritten block, crash
+ * before the metadata is updated, and see corruption.) For this reason,
+ * blocks allocated to write the checkpoint itself cannot be taken from the
+ * blocks made available by the checkpoint.
+ *
+ * To say it another way, the second checkpoint puts the blocks from the end of
+ * the file that were made available by compaction onto the checkpoint-available
+ * list, but then potentially writes the checkpoint itself at the end of the
+ * file, which would prevent any file truncation. When the metadata is updated
+ * for the second checkpoint, the blocks freed by compaction become available
+ * for the third checkpoint, so the third checkpoint's blocks are written
+ * towards the beginning of the file, and then the file can be truncated.
+ */
+
+/*
+ * __wt_compact_uri_analyze --
+ *	Schema-worker callback that classifies one URI of a table schema so
+ *	compact can decide what work is needed: "lsm:" and "file:" URIs are
+ *	counted in the session's compact structure.
+ */
+int
+__wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip)
+{
+	/*
+	 * LSM trees are counted and then skipped, otherwise the "file:" URIs
+	 * of their chunks would show up as false positives.
+	 */
+	if (WT_PREFIX_MATCH(uri, "lsm:")) {
+		++session->compact->lsm_count;
+		*skip = 1;
+		return (0);
+	}
+
+	/* Anything else only matters if it is a plain file. */
+	if (WT_PREFIX_MATCH(uri, "file:"))
+		++session->compact->file_count;
+
+	return (0);
+}
+
+/*
+ * __session_compact_check_timeout --
+ *	Return ETIMEDOUT once more than the configured number of seconds has
+ *	passed since begin; a max_time of zero disables the check.
+ */
+static int
+__session_compact_check_timeout(
+    WT_SESSION_IMPL *session, struct timespec begin)
+{
+	struct timespec now;
+
+	/* No limit configured, nothing to check. */
+	if (session->compact->max_time == 0)
+		return (0);
+
+	WT_RET(__wt_epoch(session, &now));
+
+	return (WT_TIMEDIFF(now, begin) / WT_BILLION >
+	    session->compact->max_time ? ETIMEDOUT : 0);
+}
+
+/*
+ * __compact_file --
+ *	Function to alternate between checkpoints and compaction calls.
+ *
+ *	session: the session running compaction; session->compact must be
+ *	    set up by the caller.
+ *	uri: the object to compact, also used as the checkpoint target.
+ *	cfg: configuration stack passed on to the schema worker.
+ *
+ *	Returns 0 on success, EINVAL when files would be compacted inside a
+ *	running transaction, ETIMEDOUT from the timeout check, or any error
+ *	from the checkpoint and compaction calls.
+ */
+static int
+__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+	WT_DECL_RET;
+	WT_DECL_ITEM(t);
+	WT_SESSION *wt_session;
+	WT_TXN *txn;
+	int i;
+	struct timespec start_time;
+
+	txn = &session->txn;
+	wt_session = &session->iface;
+
+	/*
+	 * File compaction requires checkpoints, which will fail in a
+	 * transactional context.  Check now so the error message isn't
+	 * confusing.
+	 */
+	if (session->compact->file_count != 0 && F_ISSET(txn, TXN_RUNNING))
+		WT_ERR_MSG(session, EINVAL,
+		    "File compaction not permitted in a transaction");
+
+	/*
+	 * Force the checkpoint: we don't want to skip it because the work we
+	 * need to have done is done in the underlying block manager.
+	 */
+	WT_ERR(__wt_scr_alloc(session, 128, &t));
+	WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri));
+
+	WT_ERR(__wt_epoch(session, &start_time));
+
+	/*
+	 * We compact 10% of the file on each pass, try 10 times (which is
+	 * probably overkill), and quit if we make no progress.  Check for a
+	 * timeout each time through the loop.
+	 */
+	for (i = 0; i < 10; ++i) {
+		WT_ERR(wt_session->checkpoint(wt_session, t->data));
+
+		/* The compaction pass sets this flag if it made progress. */
+		session->compaction = 0;
+		WT_WITH_SCHEMA_LOCK(session,
+		    ret = __wt_schema_worker(
+		    session, uri, __wt_compact, NULL, cfg, 0));
+		WT_ERR(ret);
+		if (!session->compaction)
+			break;
+
+		/* Two more checkpoints, see the comment at the top of file. */
+		WT_ERR(wt_session->checkpoint(wt_session, t->data));
+		WT_ERR(wt_session->checkpoint(wt_session, t->data));
+		WT_ERR(__session_compact_check_timeout(session, start_time));
+	}
+
+err:	__wt_scr_free(&t);
+	return (ret);
+}
+
+/*
+ * __wt_session_compact --
+ * Compact the object named by uri.
+ *
+ * A WT_COMPACT structure on this function's stack is attached to the
+ * session for the duration of the call and detached again on every exit
+ * path. The "timeout" configuration value becomes its max_time. The
+ * schema is walked once with __wt_compact_uri_analyze to count LSM
+ * trees and files; LSM trees are then handed to __wt_lsm_compact and
+ * files to __compact_file.
+ */
+int
+__wt_session_compact(
+ WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_COMPACT compact;
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, compact, config, cfg);
+
+ /* Setup the structure in the session handle */
+ memset(&compact, 0, sizeof(WT_COMPACT));
+ session->compact = &compact;
+
+ WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval));
+ session->compact->max_time = (uint64_t)cval.val;
+
+ /* Find the types of data sources being compacted. */
+ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(
+ session, uri, NULL, __wt_compact_uri_analyze, cfg, 0));
+ WT_ERR(ret);
+
+ if (session->compact->lsm_count != 0)
+ WT_ERR(__wt_schema_worker(
+ session, uri, NULL, __wt_lsm_compact, cfg, 0));
+ if (session->compact->file_count != 0)
+ WT_ERR(__compact_file(session, uri, cfg));
+
+err: session->compact = NULL;
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
new file mode 100644
index 00000000000..0c07e5fa259
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -0,0 +1,478 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_session_dhandle_incr_use --
+ *	Atomically add one to the in-use counter of the session's current
+ *	data handle.
+ */
+void
+__wt_session_dhandle_incr_use(WT_SESSION_IMPL *session)
+{
+	(void)WT_ATOMIC_ADD4(session->dhandle->session_inuse, 1);
+}
+
+/*
+ * __session_dhandle_decr_use --
+ *	Decrement the session data source's in-use counter.
+ *
+ *	Returns 0, or the error from reading the clock when this was the last
+ *	reference and the time-of-death timestamp could not be set.
+ */
+static int
+__session_dhandle_decr_use(WT_SESSION_IMPL *session)
+{
+	WT_DATA_HANDLE *dhandle;
+	WT_DECL_RET;
+
+	dhandle = session->dhandle;
+
+	/*
+	 * Decrement the in-use count on the underlying data-source -- if we're
+	 * the last reference, set the time-of-death timestamp.
+	 */
+	WT_ASSERT(session, dhandle->session_inuse > 0);
+	if (WT_ATOMIC_SUB4(dhandle->session_inuse, 1) == 0)
+		ret = __wt_seconds(session, &dhandle->timeofdeath);
+
+	/* Report the failure: callers that don't care can ignore it. */
+	return (ret);
+}
+
+/*
+ * __session_add_btree --
+ *	Allocate a cache entry for the session's current data handle and put
+ *	it at the head of the session's handle list. The new entry is
+ *	returned through dhandle_cachep when that pointer is not NULL.
+ */
+static int
+__session_add_btree(
+    WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE **dhandle_cachep)
+{
+	WT_DATA_HANDLE_CACHE *entry;
+
+	WT_RET(__wt_calloc_def(session, 1, &entry));
+
+	entry->dhandle = session->dhandle;
+	SLIST_INSERT_HEAD(&session->dhandles, entry, l);
+
+	/* The caller may not be interested in the entry itself. */
+	if (dhandle_cachep == NULL)
+		return (0);
+
+	*dhandle_cachep = entry;
+	return (0);
+}
+
+/*
+ * __wt_session_lock_btree --
+ * Lock a btree handle.
+ *
+ * Operates on session->dhandle. With WT_DHANDLE_EXCLUSIVE, a write lock
+ * is attempted without blocking (only when WT_DHANDLE_LOCK_ONLY is set
+ * or no special flags were requested). Without it, EBUSY is returned if
+ * the btree has special flags set, otherwise a read lock is taken.
+ *
+ * Returns 0 with the lock held when only the lock was requested or the
+ * handle is open and no special flags were requested. In every other
+ * case any lock taken here is dropped again and WT_NOTFOUND is
+ * returned, telling the caller the handle has to be (re)opened.
+ */
+int
+__wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags)
+{
+ enum { NOLOCK, READLOCK, WRITELOCK } locked;
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ uint32_t special_flags;
+
+ btree = S2BT(session);
+ dhandle = session->dhandle;
+ locked = NOLOCK;
+
+ /*
+ * Special operation flags will cause the handle to be reopened.
+ * For example, a handle opened with WT_BTREE_BULK cannot use the same
+ * internal data structures as a handle opened for ordinary access.
+ */
+ special_flags = LF_ISSET(WT_BTREE_SPECIAL_FLAGS);
+ WT_ASSERT(session,
+ special_flags == 0 || LF_ISSET(WT_DHANDLE_EXCLUSIVE));
+
+ if (LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ /*
+ * Try to get an exclusive handle lock and fail immediately if
+ * it's unavailable. We don't expect exclusive operations on
+ * trees to be mixed with ordinary cursor access, but if there
+ * is a use case in the future, we could make blocking here
+ * configurable.
+ *
+ * Special flags will cause the handle to be reopened, which
+ * will get the necessary lock, so don't bother here.
+ */
+ if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) || special_flags == 0) {
+ WT_RET(__wt_try_writelock(session, dhandle->rwlock));
+ F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+ locked = WRITELOCK;
+ }
+ } else if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
+ return (EBUSY);
+ else {
+ WT_RET(__wt_readlock(session, dhandle->rwlock));
+ locked = READLOCK;
+ }
+
+ /*
+ * At this point, we have the requested lock -- if that is all that was
+ * required, we're done. Otherwise, check that the handle is open and
+ * that no special flags are required.
+ */
+ if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
+ (F_ISSET(dhandle, WT_DHANDLE_OPEN) && special_flags == 0))
+ return (0);
+
+ /*
+ * The handle needs to be opened. If we locked the handle above,
+ * unlock it before returning.
+ */
+ switch (locked) {
+ case NOLOCK:
+ break;
+ case READLOCK:
+ WT_RET(__wt_readunlock(session, dhandle->rwlock));
+ break;
+ case WRITELOCK:
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_RET(__wt_writeunlock(session, dhandle->rwlock));
+ break;
+ }
+
+ /* Treat an unopened handle just like a non-existent handle. */
+ return (WT_NOTFOUND);
+}
+
+/*
+ * __wt_session_release_btree --
+ * Unlock a btree handle.
+ *
+ * Drops the session's use of session->dhandle: the in-use counter is
+ * decremented, the handle is closed if it is flagged for discard or its
+ * btree has special flags, the exclusive flag is cleared and the read or
+ * write lock is released. session->dhandle is set to NULL on every
+ * return path.
+ *
+ * When WT_DHANDLE_DISCARD_CLOSE is set and only a read lock is held, the
+ * read lock is given up and a write lock is attempted without blocking;
+ * EBUSY from that attempt is not an error, the function returns 0 with
+ * no lock held and the handle left open.
+ */
+int
+__wt_session_release_btree(WT_SESSION_IMPL *session)
+{
+ enum { NOLOCK, READLOCK, WRITELOCK } locked;
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+ dhandle = session->dhandle;
+
+ /*
+ * Decrement the data-source's in-use counter. We ignore errors because
+ * they're insignificant and handling them complicates error handling in
+ * this function more than I'm willing to live with.
+ */
+ (void)__session_dhandle_decr_use(session);
+
+ locked = F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ? WRITELOCK : READLOCK;
+ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_CLOSE)) {
+ /*
+ * If configured to discard on last close, trade any read lock
+ * for an exclusive lock. If the exchange succeeds, setup for
+ * discard. It is expected acquiring an exclusive lock will fail
+ * sometimes since the handle may still be in use: in that case
+ * we're done.
+ *
+ * The lock state is set to NOLOCK before the read lock is
+ * released so the cleanup code at the err label never unlocks
+ * a lock we no longer hold.
+ */
+ if (locked == READLOCK) {
+ locked = NOLOCK;
+ WT_ERR(__wt_readunlock(session, dhandle->rwlock));
+ ret = __wt_try_writelock(session, dhandle->rwlock);
+ if (ret != 0) {
+ if (ret == EBUSY)
+ ret = 0;
+ goto err;
+ }
+ locked = WRITELOCK;
+ F_CLR(dhandle, WT_DHANDLE_DISCARD_CLOSE);
+ F_SET(dhandle,
+ WT_DHANDLE_DISCARD | WT_DHANDLE_EXCLUSIVE);
+ }
+ }
+
+ /*
+ * If we had special flags set, close the handle so that future access
+ * can get a handle without special flags.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD) ||
+ F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
+ WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
+ F_CLR(dhandle, WT_DHANDLE_DISCARD);
+
+ WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
+ }
+
+ if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE))
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+
+err: switch (locked) {
+ case NOLOCK:
+ break;
+ case READLOCK:
+ WT_TRET(__wt_readunlock(session, dhandle->rwlock));
+ break;
+ case WRITELOCK:
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ break;
+ }
+
+ session->dhandle = NULL;
+ return (ret);
+}
+
+/*
+ * __wt_session_get_btree_ckpt --
+ * Check the configuration strings for a checkpoint name, get a btree
+ * handle for the given name, set session->dhandle.
+ *
+ * With no "checkpoint" configuration the handle is requested with a NULL
+ * checkpoint name. The internal name WT_CHECKPOINT is translated into
+ * the name of the object's last checkpoint; any other value is used as
+ * given. The name string is allocated here and freed after the handle
+ * lookup.
+ *
+ * When the last-checkpoint name was looked up and the handle lookup
+ * fails with WT_NOTFOUND or EBUSY, the name lookup and the handle lookup
+ * are repeated; there is no bound on the number of retries.
+ */
+int
+__wt_session_get_btree_ckpt(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], uint32_t flags)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ int last_ckpt;
+ const char *checkpoint;
+
+ last_ckpt = 0;
+ checkpoint = NULL;
+
+ /*
+ * This function exists to handle checkpoint configuration. Callers
+ * that never open a checkpoint call the underlying function directly.
+ */
+ WT_RET_NOTFOUND_OK(
+ __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0) {
+ /*
+ * The internal checkpoint name is special, find the last
+ * unnamed checkpoint of the object.
+ *
+ * The retry label sits inside this branch: a retry jumps
+ * straight back to the name lookup, skipping the configuration
+ * parsing above.
+ */
+ if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
+ last_ckpt = 1;
+retry: WT_RET(__wt_meta_checkpoint_last_name(
+ session, uri, &checkpoint));
+ } else
+ WT_RET(__wt_strndup(
+ session, cval.str, cval.len, &checkpoint));
+ }
+
+ ret = __wt_session_get_btree(session, uri, checkpoint, cfg, flags);
+
+ __wt_free(session, checkpoint);
+
+ /*
+ * There's a potential race: we get the name of the most recent unnamed
+ * checkpoint, but if it's discarded (or locked so it can be discarded)
+ * by the time we try to open it, we'll fail the open. Retry in those
+ * cases, a new "last" checkpoint should surface, and we can't return an
+ * error, the application will be justifiably upset if we can't open the
+ * last checkpoint instance of an object.
+ *
+ * The check against WT_NOTFOUND is correct: if there was no checkpoint
+ * for the object (that is, the object has never been in a checkpoint),
+ * we returned immediately after the call to search for that name.
+ */
+ if (last_ckpt && (ret == WT_NOTFOUND || ret == EBUSY))
+ goto retry;
+ return (ret);
+}
+
+/*
+ * __session_discard_btree --
+ *	Drop one entry from the session's handle cache: the entry is unlinked
+ *	and freed, and the data handle it referred to is closed on behalf of
+ *	the session. The session's current handle is left as it was.
+ */
+static void
+__session_discard_btree(
+    WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache)
+{
+	WT_DATA_HANDLE *current, *victim;
+
+	/* Remember both handles before the cache entry goes away. */
+	current = session->dhandle;
+	victim = dhandle_cache->dhandle;
+
+	SLIST_REMOVE(
+	    &session->dhandles, dhandle_cache, __wt_data_handle_cache, l);
+	__wt_overwrite_and_free(session, dhandle_cache);
+
+	/* The close call works on the session's current handle. */
+	session->dhandle = victim;
+	__wt_conn_btree_close(session);
+	session->dhandle = current;
+}
+
+/*
+ * __wt_session_close_cache --
+ *	Discard every handle cached in the session, emptying the list.
+ */
+void
+__wt_session_close_cache(WT_SESSION_IMPL *session)
+{
+	WT_DATA_HANDLE_CACHE *head;
+
+	/* Each discard unlinks the head, so keep taking it until none left. */
+	for (;;) {
+		head = SLIST_FIRST(&session->dhandles);
+		if (head == NULL)
+			break;
+		__session_discard_btree(session, head);
+	}
+}
+
+/*
+ * __session_dhandle_sweep --
+ *	Drop cached handles the session has no further use for: handles other
+ *	than the current one, with no users, whose time of death is more than
+ *	WT_DHANDLE_SWEEP_WAIT in the past. Sweeps are rate limited by
+ *	WT_DHANDLE_SWEEP_PERIOD and skipped for WT_DHANDLE_LOCK_ONLY calls.
+ */
+static int
+__session_dhandle_sweep(WT_SESSION_IMPL *session, uint32_t flags)
+{
+	WT_DATA_HANDLE *dhandle;
+	WT_DATA_HANDLE_CACHE *entry, *next;
+	time_t now;
+
+	/*
+	 * A common caller passing WT_DHANDLE_LOCK_ONLY is on its way to
+	 * discard the handle, sweeping would only get in the way.
+	 */
+	if (LF_ISSET(WT_DHANDLE_LOCK_ONLY))
+		return (0);
+
+	/* Nothing to do if the last sweep was recent enough. */
+	WT_RET(__wt_seconds(session, &now));
+	if (now - session->last_sweep < WT_DHANDLE_SWEEP_PERIOD)
+		return (0);
+	session->last_sweep = now;
+
+	WT_STAT_FAST_CONN_INCR(session, dh_session_sweeps);
+
+	/* Entries are freed as we go: fetch the successor before discarding. */
+	for (entry = SLIST_FIRST(&session->dhandles);
+	    entry != NULL; entry = next) {
+		next = SLIST_NEXT(entry, l);
+		dhandle = entry->dhandle;
+
+		if (dhandle == session->dhandle ||
+		    dhandle->session_inuse != 0 ||
+		    now - dhandle->timeofdeath <= WT_DHANDLE_SWEEP_WAIT)
+			continue;
+
+		WT_STAT_FAST_CONN_INCR(session, dh_session_handles);
+		__session_discard_btree(session, entry);
+	}
+	return (0);
+}
+
+/*
+ * __wt_session_get_btree --
+ * Get a btree handle for the given name, set session->dhandle.
+ *
+ * The session's handle cache is searched first, matching on the hash of
+ * the URI, the URI itself and the checkpoint name (both NULL, or both
+ * set and equal). A cached handle is locked with
+ * __wt_session_lock_btree; if that reports WT_NOTFOUND, or nothing was
+ * cached, the handle is found and/or opened through
+ * __wt_conn_btree_get under the schema lock, after a sweep of dead
+ * handles. A handle that was not already cached is added to the
+ * session's cache.
+ *
+ * On success the handle's in-use counter has been incremented and, when
+ * requested in flags, WT_DHANDLE_DISCARD_CLOSE is set on the handle.
+ */
+int
+__wt_session_get_btree(WT_SESSION_IMPL *session,
+ const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
+ WT_DECL_RET;
+ uint64_t hash;
+ int candidate;
+
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
+
+ dhandle = NULL;
+ candidate = 0;
+
+ hash = __wt_hash_city64(uri, strlen(uri));
+ SLIST_FOREACH(dhandle_cache, &session->dhandles, l) {
+ dhandle = dhandle_cache->dhandle;
+ if (hash != dhandle->name_hash ||
+ strcmp(uri, dhandle->name) != 0)
+ continue;
+ if (checkpoint == NULL && dhandle->checkpoint == NULL)
+ break;
+ if (checkpoint != NULL && dhandle->checkpoint != NULL &&
+ strcmp(checkpoint, dhandle->checkpoint) == 0)
+ break;
+ }
+
+ if (dhandle_cache != NULL) {
+ candidate = 1;
+ /* We found the data handle, don't try to get it again. */
+ LF_SET(WT_DHANDLE_HAVE_REF);
+ session->dhandle = dhandle;
+
+ /*
+ * Try to lock the file; if we succeed, our "exclusive" state
+ * must match.
+ *
+ * WT_NOTFOUND means the cached handle is not usable as is:
+ * fall through to the open path below. The candidate flag
+ * stays set so the handle is not added to the cache twice.
+ */
+ ret = __wt_session_lock_btree(session, flags);
+ if (ret == WT_NOTFOUND)
+ dhandle_cache = NULL;
+ else
+ WT_RET(ret);
+ }
+
+ if (dhandle_cache == NULL) {
+ /* Sweep the handle list to remove any dead handles. */
+ WT_RET(__session_dhandle_sweep(session, flags));
+
+ /*
+ * Acquire the schema lock if we don't already hold it, find
+ * and/or open the handle.
+ */
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_conn_btree_get(session, uri, checkpoint, cfg, flags));
+ WT_RET(ret);
+
+ if (!candidate)
+ WT_RET(__session_add_btree(session, NULL));
+ WT_ASSERT(session, LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
+ F_ISSET(session->dhandle, WT_DHANDLE_OPEN));
+ }
+
+ /* Increment the data-source's in-use counter. */
+ __wt_session_dhandle_incr_use(session);
+
+ WT_ASSERT(session, LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
+ F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE));
+ F_SET(session->dhandle, LF_ISSET(WT_DHANDLE_DISCARD_CLOSE));
+
+ return (0);
+}
+
+/*
+ * __wt_session_lock_checkpoint --
+ * Lock the btree handle for the given checkpoint name.
+ *
+ * The checkpoint handle of the session's current object is obtained
+ * exclusively (lock only), its pages are discarded from the cache, it is
+ * flagged WT_DHANDLE_DISCARD and it is registered with metadata tracking
+ * through __wt_meta_track_handle_lock. The session's original handle is
+ * restored before returning, on success and on error.
+ */
+int
+__wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
+{
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
+ WT_DECL_RET;
+
+ saved_dhandle = session->dhandle;
+
+ /*
+ * Get the checkpoint handle exclusive, so no one else can access it
+ * while we are creating the new checkpoint.
+ */
+ WT_ERR(__wt_session_get_btree(session, saved_dhandle->name,
+ checkpoint, NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
+
+ /*
+ * Flush any pages in this checkpoint from the cache (we are about to
+ * re-write the checkpoint which will mean cached pages no longer have
+ * valid contents). This is especially noticeable with memory mapped
+ * files, since changes to the underlying file are visible to the in
+ * memory pages.
+ */
+ WT_ERR(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+
+ /*
+ * We lock checkpoint handles that we are overwriting, so the handle
+ * must be closed when we release it.
+ */
+ dhandle = session->dhandle;
+ F_SET(dhandle, WT_DHANDLE_DISCARD);
+
+ WT_ASSERT(session, WT_META_TRACKING(session));
+ WT_ERR(__wt_meta_track_handle_lock(session, 0));
+
+ /* Restore the original btree in the session. */
+err: session->dhandle = saved_dhandle;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_salvage.c b/src/third_party/wiredtiger/src/session/session_salvage.c
new file mode 100644
index 00000000000..1512c6515ec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_salvage.c
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_salvage --
+ * Salvage a single file; metadata is updated under session->dhandle's name.
+ */
+int
+__wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CKPT *ckptbase;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+
+ /*
+ * XXX
+ * The salvage process reads and discards previous checkpoints, so the
+ * underlying block manager has to ignore any previous checkpoint
+ * entries when creating a new checkpoint, in other words, we can't use
+ * the metadata checkpoint list, it has all of those checkpoints listed
+ * and we don't care about them. Build a clean checkpoint list and use
+ * it instead.
+ *
+ * Don't first clear the metadata checkpoint list and call the function
+ * to get a list of checkpoints: a crash between clearing the metadata
+ * checkpoint list and creating a new checkpoint list would look like a
+ * create or open of a file without a checkpoint to roll-forward from,
+ * and the contents of the file would be discarded.
+ */
+ WT_RET(__wt_calloc_def(session, 2, &ckptbase)); /* nothing to free yet */
+ WT_ERR(__wt_strdup(session, WT_CHECKPOINT, &ckptbase[0].name));
+ F_SET(&ckptbase[0], WT_CKPT_ADD); /* only slot 0 is filled in */
+
+ WT_ERR(__wt_bt_salvage(session, ckptbase, cfg));
+
+ /*
+ * If no checkpoint was created (raw.data still NULL), it's probably bad
+ * news, but there is nothing to do but clear any recorded checkpoints for
+ * the file. If a checkpoint was created, life is good, replace any
+ * existing list of checkpoints with the single new one.
+ */
+ if (ckptbase[0].raw.data == NULL)
+ WT_ERR(__wt_meta_checkpoint_clear(session, dhandle->name));
+ else
+ WT_ERR(__wt_meta_ckptlist_set(
+ session, dhandle->name, ckptbase, NULL));
+
+err: __wt_meta_ckptlist_free(session, ckptbase); /* success path too */
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/support/cksum.c b/src/third_party/wiredtiger/src/support/cksum.c
new file mode 100644
index 00000000000..1eaa345d1fe
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/cksum.c
@@ -0,0 +1,1306 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * This file contains two implementations for computing CRC: one that uses
+ * hardware CRC instructions, available on newer x86_64/amd64, and one that uses
+ * a fast software algorithm. __wt_cksum() provides a common entry point that
+ * indirects to one of these two methods (presumably via the pointer below).
+ */
+static uint32_t (*__wt_cksum_func)(const void *chunk, size_t len);
+
+/*
+ * The CRC slicing tables are used by __wt_cksum_sw.
+ */
+static const uint32_t g_crc_slicing[8][256] = {
+#ifdef WORDS_BIGENDIAN
+ /*
+ * Big endian tables have entries that are byte reversed from little
+ * endian tables.
+ */
+ {
+ 0x00000000, 0x03836bf2, 0xf7703be1, 0xf4f35013,
+ 0x1f979ac7, 0x1c14f135, 0xe8e7a126, 0xeb64cad4,
+ 0xcf58d98a, 0xccdbb278, 0x3828e26b, 0x3bab8999,
+ 0xd0cf434d, 0xd34c28bf, 0x27bf78ac, 0x243c135e,
+ 0x6fc75e10, 0x6c4435e2, 0x98b765f1, 0x9b340e03,
+ 0x7050c4d7, 0x73d3af25, 0x8720ff36, 0x84a394c4,
+ 0xa09f879a, 0xa31cec68, 0x57efbc7b, 0x546cd789,
+ 0xbf081d5d, 0xbc8b76af, 0x487826bc, 0x4bfb4d4e,
+ 0xde8ebd20, 0xdd0dd6d2, 0x29fe86c1, 0x2a7ded33,
+ 0xc11927e7, 0xc29a4c15, 0x36691c06, 0x35ea77f4,
+ 0x11d664aa, 0x12550f58, 0xe6a65f4b, 0xe52534b9,
+ 0x0e41fe6d, 0x0dc2959f, 0xf931c58c, 0xfab2ae7e,
+ 0xb149e330, 0xb2ca88c2, 0x4639d8d1, 0x45bab323,
+ 0xaede79f7, 0xad5d1205, 0x59ae4216, 0x5a2d29e4,
+ 0x7e113aba, 0x7d925148, 0x8961015b, 0x8ae26aa9,
+ 0x6186a07d, 0x6205cb8f, 0x96f69b9c, 0x9575f06e,
+ 0xbc1d7b41, 0xbf9e10b3, 0x4b6d40a0, 0x48ee2b52,
+ 0xa38ae186, 0xa0098a74, 0x54fada67, 0x5779b195,
+ 0x7345a2cb, 0x70c6c939, 0x8435992a, 0x87b6f2d8,
+ 0x6cd2380c, 0x6f5153fe, 0x9ba203ed, 0x9821681f,
+ 0xd3da2551, 0xd0594ea3, 0x24aa1eb0, 0x27297542,
+ 0xcc4dbf96, 0xcfced464, 0x3b3d8477, 0x38beef85,
+ 0x1c82fcdb, 0x1f019729, 0xebf2c73a, 0xe871acc8,
+ 0x0315661c, 0x00960dee, 0xf4655dfd, 0xf7e6360f,
+ 0x6293c661, 0x6110ad93, 0x95e3fd80, 0x96609672,
+ 0x7d045ca6, 0x7e873754, 0x8a746747, 0x89f70cb5,
+ 0xadcb1feb, 0xae487419, 0x5abb240a, 0x59384ff8,
+ 0xb25c852c, 0xb1dfeede, 0x452cbecd, 0x46afd53f,
+ 0x0d549871, 0x0ed7f383, 0xfa24a390, 0xf9a7c862,
+ 0x12c302b6, 0x11406944, 0xe5b33957, 0xe63052a5,
+ 0xc20c41fb, 0xc18f2a09, 0x357c7a1a, 0x36ff11e8,
+ 0xdd9bdb3c, 0xde18b0ce, 0x2aebe0dd, 0x29688b2f,
+ 0x783bf682, 0x7bb89d70, 0x8f4bcd63, 0x8cc8a691,
+ 0x67ac6c45, 0x642f07b7, 0x90dc57a4, 0x935f3c56,
+ 0xb7632f08, 0xb4e044fa, 0x401314e9, 0x43907f1b,
+ 0xa8f4b5cf, 0xab77de3d, 0x5f848e2e, 0x5c07e5dc,
+ 0x17fca892, 0x147fc360, 0xe08c9373, 0xe30ff881,
+ 0x086b3255, 0x0be859a7, 0xff1b09b4, 0xfc986246,
+ 0xd8a47118, 0xdb271aea, 0x2fd44af9, 0x2c57210b,
+ 0xc733ebdf, 0xc4b0802d, 0x3043d03e, 0x33c0bbcc,
+ 0xa6b54ba2, 0xa5362050, 0x51c57043, 0x52461bb1,
+ 0xb922d165, 0xbaa1ba97, 0x4e52ea84, 0x4dd18176,
+ 0x69ed9228, 0x6a6ef9da, 0x9e9da9c9, 0x9d1ec23b,
+ 0x767a08ef, 0x75f9631d, 0x810a330e, 0x828958fc,
+ 0xc97215b2, 0xcaf17e40, 0x3e022e53, 0x3d8145a1,
+ 0xd6e58f75, 0xd566e487, 0x2195b494, 0x2216df66,
+ 0x062acc38, 0x05a9a7ca, 0xf15af7d9, 0xf2d99c2b,
+ 0x19bd56ff, 0x1a3e3d0d, 0xeecd6d1e, 0xed4e06ec,
+ 0xc4268dc3, 0xc7a5e631, 0x3356b622, 0x30d5ddd0,
+ 0xdbb11704, 0xd8327cf6, 0x2cc12ce5, 0x2f424717,
+ 0x0b7e5449, 0x08fd3fbb, 0xfc0e6fa8, 0xff8d045a,
+ 0x14e9ce8e, 0x176aa57c, 0xe399f56f, 0xe01a9e9d,
+ 0xabe1d3d3, 0xa862b821, 0x5c91e832, 0x5f1283c0,
+ 0xb4764914, 0xb7f522e6, 0x430672f5, 0x40851907,
+ 0x64b90a59, 0x673a61ab, 0x93c931b8, 0x904a5a4a,
+ 0x7b2e909e, 0x78adfb6c, 0x8c5eab7f, 0x8fddc08d,
+ 0x1aa830e3, 0x192b5b11, 0xedd80b02, 0xee5b60f0,
+ 0x053faa24, 0x06bcc1d6, 0xf24f91c5, 0xf1ccfa37,
+ 0xd5f0e969, 0xd673829b, 0x2280d288, 0x2103b97a,
+ 0xca6773ae, 0xc9e4185c, 0x3d17484f, 0x3e9423bd,
+ 0x756f6ef3, 0x76ec0501, 0x821f5512, 0x819c3ee0,
+ 0x6af8f434, 0x697b9fc6, 0x9d88cfd5, 0x9e0ba427,
+ 0xba37b779, 0xb9b4dc8b, 0x4d478c98, 0x4ec4e76a,
+ 0xa5a02dbe, 0xa623464c, 0x52d0165f, 0x51537dad
+ },{
+ 0x00000000, 0x7798a213, 0xee304527, 0x99a8e734,
+ 0xdc618a4e, 0xabf9285d, 0x3251cf69, 0x45c96d7a,
+ 0xb8c3149d, 0xcf5bb68e, 0x56f351ba, 0x216bf3a9,
+ 0x64a29ed3, 0x133a3cc0, 0x8a92dbf4, 0xfd0a79e7,
+ 0x81f1c53f, 0xf669672c, 0x6fc18018, 0x1859220b,
+ 0x5d904f71, 0x2a08ed62, 0xb3a00a56, 0xc438a845,
+ 0x3932d1a2, 0x4eaa73b1, 0xd7029485, 0xa09a3696,
+ 0xe5535bec, 0x92cbf9ff, 0x0b631ecb, 0x7cfbbcd8,
+ 0x02e38b7f, 0x757b296c, 0xecd3ce58, 0x9b4b6c4b,
+ 0xde820131, 0xa91aa322, 0x30b24416, 0x472ae605,
+ 0xba209fe2, 0xcdb83df1, 0x5410dac5, 0x238878d6,
+ 0x664115ac, 0x11d9b7bf, 0x8871508b, 0xffe9f298,
+ 0x83124e40, 0xf48aec53, 0x6d220b67, 0x1abaa974,
+ 0x5f73c40e, 0x28eb661d, 0xb1438129, 0xc6db233a,
+ 0x3bd15add, 0x4c49f8ce, 0xd5e11ffa, 0xa279bde9,
+ 0xe7b0d093, 0x90287280, 0x098095b4, 0x7e1837a7,
+ 0x04c617ff, 0x735eb5ec, 0xeaf652d8, 0x9d6ef0cb,
+ 0xd8a79db1, 0xaf3f3fa2, 0x3697d896, 0x410f7a85,
+ 0xbc050362, 0xcb9da171, 0x52354645, 0x25ade456,
+ 0x6064892c, 0x17fc2b3f, 0x8e54cc0b, 0xf9cc6e18,
+ 0x8537d2c0, 0xf2af70d3, 0x6b0797e7, 0x1c9f35f4,
+ 0x5956588e, 0x2ecefa9d, 0xb7661da9, 0xc0febfba,
+ 0x3df4c65d, 0x4a6c644e, 0xd3c4837a, 0xa45c2169,
+ 0xe1954c13, 0x960dee00, 0x0fa50934, 0x783dab27,
+ 0x06259c80, 0x71bd3e93, 0xe815d9a7, 0x9f8d7bb4,
+ 0xda4416ce, 0xaddcb4dd, 0x347453e9, 0x43ecf1fa,
+ 0xbee6881d, 0xc97e2a0e, 0x50d6cd3a, 0x274e6f29,
+ 0x62870253, 0x151fa040, 0x8cb74774, 0xfb2fe567,
+ 0x87d459bf, 0xf04cfbac, 0x69e41c98, 0x1e7cbe8b,
+ 0x5bb5d3f1, 0x2c2d71e2, 0xb58596d6, 0xc21d34c5,
+ 0x3f174d22, 0x488fef31, 0xd1270805, 0xa6bfaa16,
+ 0xe376c76c, 0x94ee657f, 0x0d46824b, 0x7ade2058,
+ 0xf9fac3fb, 0x8e6261e8, 0x17ca86dc, 0x605224cf,
+ 0x259b49b5, 0x5203eba6, 0xcbab0c92, 0xbc33ae81,
+ 0x4139d766, 0x36a17575, 0xaf099241, 0xd8913052,
+ 0x9d585d28, 0xeac0ff3b, 0x7368180f, 0x04f0ba1c,
+ 0x780b06c4, 0x0f93a4d7, 0x963b43e3, 0xe1a3e1f0,
+ 0xa46a8c8a, 0xd3f22e99, 0x4a5ac9ad, 0x3dc26bbe,
+ 0xc0c81259, 0xb750b04a, 0x2ef8577e, 0x5960f56d,
+ 0x1ca99817, 0x6b313a04, 0xf299dd30, 0x85017f23,
+ 0xfb194884, 0x8c81ea97, 0x15290da3, 0x62b1afb0,
+ 0x2778c2ca, 0x50e060d9, 0xc94887ed, 0xbed025fe,
+ 0x43da5c19, 0x3442fe0a, 0xadea193e, 0xda72bb2d,
+ 0x9fbbd657, 0xe8237444, 0x718b9370, 0x06133163,
+ 0x7ae88dbb, 0x0d702fa8, 0x94d8c89c, 0xe3406a8f,
+ 0xa68907f5, 0xd111a5e6, 0x48b942d2, 0x3f21e0c1,
+ 0xc22b9926, 0xb5b33b35, 0x2c1bdc01, 0x5b837e12,
+ 0x1e4a1368, 0x69d2b17b, 0xf07a564f, 0x87e2f45c,
+ 0xfd3cd404, 0x8aa47617, 0x130c9123, 0x64943330,
+ 0x215d5e4a, 0x56c5fc59, 0xcf6d1b6d, 0xb8f5b97e,
+ 0x45ffc099, 0x3267628a, 0xabcf85be, 0xdc5727ad,
+ 0x999e4ad7, 0xee06e8c4, 0x77ae0ff0, 0x0036ade3,
+ 0x7ccd113b, 0x0b55b328, 0x92fd541c, 0xe565f60f,
+ 0xa0ac9b75, 0xd7343966, 0x4e9cde52, 0x39047c41,
+ 0xc40e05a6, 0xb396a7b5, 0x2a3e4081, 0x5da6e292,
+ 0x186f8fe8, 0x6ff72dfb, 0xf65fcacf, 0x81c768dc,
+ 0xffdf5f7b, 0x8847fd68, 0x11ef1a5c, 0x6677b84f,
+ 0x23bed535, 0x54267726, 0xcd8e9012, 0xba163201,
+ 0x471c4be6, 0x3084e9f5, 0xa92c0ec1, 0xdeb4acd2,
+ 0x9b7dc1a8, 0xece563bb, 0x754d848f, 0x02d5269c,
+ 0x7e2e9a44, 0x09b63857, 0x901edf63, 0xe7867d70,
+ 0xa24f100a, 0xd5d7b219, 0x4c7f552d, 0x3be7f73e,
+ 0xc6ed8ed9, 0xb1752cca, 0x28ddcbfe, 0x5f4569ed,
+ 0x1a8c0497, 0x6d14a684, 0xf4bc41b0, 0x8324e3a3
+ },{
+ 0x00000000, 0x7e9241a5, 0x0d526f4f, 0x73c02eea,
+ 0x1aa4de9e, 0x64369f3b, 0x17f6b1d1, 0x6964f074,
+ 0xc53e5138, 0xbbac109d, 0xc86c3e77, 0xb6fe7fd2,
+ 0xdf9a8fa6, 0xa108ce03, 0xd2c8e0e9, 0xac5aa14c,
+ 0x8a7da270, 0xf4efe3d5, 0x872fcd3f, 0xf9bd8c9a,
+ 0x90d97cee, 0xee4b3d4b, 0x9d8b13a1, 0xe3195204,
+ 0x4f43f348, 0x31d1b2ed, 0x42119c07, 0x3c83dda2,
+ 0x55e72dd6, 0x2b756c73, 0x58b54299, 0x2627033c,
+ 0x14fb44e1, 0x6a690544, 0x19a92bae, 0x673b6a0b,
+ 0x0e5f9a7f, 0x70cddbda, 0x030df530, 0x7d9fb495,
+ 0xd1c515d9, 0xaf57547c, 0xdc977a96, 0xa2053b33,
+ 0xcb61cb47, 0xb5f38ae2, 0xc633a408, 0xb8a1e5ad,
+ 0x9e86e691, 0xe014a734, 0x93d489de, 0xed46c87b,
+ 0x8422380f, 0xfab079aa, 0x89705740, 0xf7e216e5,
+ 0x5bb8b7a9, 0x252af60c, 0x56ead8e6, 0x28789943,
+ 0x411c6937, 0x3f8e2892, 0x4c4e0678, 0x32dc47dd,
+ 0xd98065c7, 0xa7122462, 0xd4d20a88, 0xaa404b2d,
+ 0xc324bb59, 0xbdb6fafc, 0xce76d416, 0xb0e495b3,
+ 0x1cbe34ff, 0x622c755a, 0x11ec5bb0, 0x6f7e1a15,
+ 0x061aea61, 0x7888abc4, 0x0b48852e, 0x75dac48b,
+ 0x53fdc7b7, 0x2d6f8612, 0x5eafa8f8, 0x203de95d,
+ 0x49591929, 0x37cb588c, 0x440b7666, 0x3a9937c3,
+ 0x96c3968f, 0xe851d72a, 0x9b91f9c0, 0xe503b865,
+ 0x8c674811, 0xf2f509b4, 0x8135275e, 0xffa766fb,
+ 0xcd7b2126, 0xb3e96083, 0xc0294e69, 0xbebb0fcc,
+ 0xd7dfffb8, 0xa94dbe1d, 0xda8d90f7, 0xa41fd152,
+ 0x0845701e, 0x76d731bb, 0x05171f51, 0x7b855ef4,
+ 0x12e1ae80, 0x6c73ef25, 0x1fb3c1cf, 0x6121806a,
+ 0x47068356, 0x3994c2f3, 0x4a54ec19, 0x34c6adbc,
+ 0x5da25dc8, 0x23301c6d, 0x50f03287, 0x2e627322,
+ 0x8238d26e, 0xfcaa93cb, 0x8f6abd21, 0xf1f8fc84,
+ 0x989c0cf0, 0xe60e4d55, 0x95ce63bf, 0xeb5c221a,
+ 0x4377278b, 0x3de5662e, 0x4e2548c4, 0x30b70961,
+ 0x59d3f915, 0x2741b8b0, 0x5481965a, 0x2a13d7ff,
+ 0x864976b3, 0xf8db3716, 0x8b1b19fc, 0xf5895859,
+ 0x9ceda82d, 0xe27fe988, 0x91bfc762, 0xef2d86c7,
+ 0xc90a85fb, 0xb798c45e, 0xc458eab4, 0xbacaab11,
+ 0xd3ae5b65, 0xad3c1ac0, 0xdefc342a, 0xa06e758f,
+ 0x0c34d4c3, 0x72a69566, 0x0166bb8c, 0x7ff4fa29,
+ 0x16900a5d, 0x68024bf8, 0x1bc26512, 0x655024b7,
+ 0x578c636a, 0x291e22cf, 0x5ade0c25, 0x244c4d80,
+ 0x4d28bdf4, 0x33bafc51, 0x407ad2bb, 0x3ee8931e,
+ 0x92b23252, 0xec2073f7, 0x9fe05d1d, 0xe1721cb8,
+ 0x8816eccc, 0xf684ad69, 0x85448383, 0xfbd6c226,
+ 0xddf1c11a, 0xa36380bf, 0xd0a3ae55, 0xae31eff0,
+ 0xc7551f84, 0xb9c75e21, 0xca0770cb, 0xb495316e,
+ 0x18cf9022, 0x665dd187, 0x159dff6d, 0x6b0fbec8,
+ 0x026b4ebc, 0x7cf90f19, 0x0f3921f3, 0x71ab6056,
+ 0x9af7424c, 0xe46503e9, 0x97a52d03, 0xe9376ca6,
+ 0x80539cd2, 0xfec1dd77, 0x8d01f39d, 0xf393b238,
+ 0x5fc91374, 0x215b52d1, 0x529b7c3b, 0x2c093d9e,
+ 0x456dcdea, 0x3bff8c4f, 0x483fa2a5, 0x36ade300,
+ 0x108ae03c, 0x6e18a199, 0x1dd88f73, 0x634aced6,
+ 0x0a2e3ea2, 0x74bc7f07, 0x077c51ed, 0x79ee1048,
+ 0xd5b4b104, 0xab26f0a1, 0xd8e6de4b, 0xa6749fee,
+ 0xcf106f9a, 0xb1822e3f, 0xc24200d5, 0xbcd04170,
+ 0x8e0c06ad, 0xf09e4708, 0x835e69e2, 0xfdcc2847,
+ 0x94a8d833, 0xea3a9996, 0x99fab77c, 0xe768f6d9,
+ 0x4b325795, 0x35a01630, 0x466038da, 0x38f2797f,
+ 0x5196890b, 0x2f04c8ae, 0x5cc4e644, 0x2256a7e1,
+ 0x0471a4dd, 0x7ae3e578, 0x0923cb92, 0x77b18a37,
+ 0x1ed57a43, 0x60473be6, 0x1387150c, 0x6d1554a9,
+ 0xc14ff5e5, 0xbfddb440, 0xcc1d9aaa, 0xb28fdb0f,
+ 0xdbeb2b7b, 0xa5796ade, 0xd6b94434, 0xa82b0591
+ },{
+ 0x00000000, 0xb8aa45dd, 0x812367bf, 0x39892262,
+ 0xf331227b, 0x4b9b67a6, 0x721245c4, 0xcab80019,
+ 0xe66344f6, 0x5ec9012b, 0x67402349, 0xdfea6694,
+ 0x1552668d, 0xadf82350, 0x94710132, 0x2cdb44ef,
+ 0x3db164e9, 0x851b2134, 0xbc920356, 0x0438468b,
+ 0xce804692, 0x762a034f, 0x4fa3212d, 0xf70964f0,
+ 0xdbd2201f, 0x637865c2, 0x5af147a0, 0xe25b027d,
+ 0x28e30264, 0x904947b9, 0xa9c065db, 0x116a2006,
+ 0x8b1425d7, 0x33be600a, 0x0a374268, 0xb29d07b5,
+ 0x782507ac, 0xc08f4271, 0xf9066013, 0x41ac25ce,
+ 0x6d776121, 0xd5dd24fc, 0xec54069e, 0x54fe4343,
+ 0x9e46435a, 0x26ec0687, 0x1f6524e5, 0xa7cf6138,
+ 0xb6a5413e, 0x0e0f04e3, 0x37862681, 0x8f2c635c,
+ 0x45946345, 0xfd3e2698, 0xc4b704fa, 0x7c1d4127,
+ 0x50c605c8, 0xe86c4015, 0xd1e56277, 0x694f27aa,
+ 0xa3f727b3, 0x1b5d626e, 0x22d4400c, 0x9a7e05d1,
+ 0xe75fa6ab, 0x5ff5e376, 0x667cc114, 0xded684c9,
+ 0x146e84d0, 0xacc4c10d, 0x954de36f, 0x2de7a6b2,
+ 0x013ce25d, 0xb996a780, 0x801f85e2, 0x38b5c03f,
+ 0xf20dc026, 0x4aa785fb, 0x732ea799, 0xcb84e244,
+ 0xdaeec242, 0x6244879f, 0x5bcda5fd, 0xe367e020,
+ 0x29dfe039, 0x9175a5e4, 0xa8fc8786, 0x1056c25b,
+ 0x3c8d86b4, 0x8427c369, 0xbdaee10b, 0x0504a4d6,
+ 0xcfbca4cf, 0x7716e112, 0x4e9fc370, 0xf63586ad,
+ 0x6c4b837c, 0xd4e1c6a1, 0xed68e4c3, 0x55c2a11e,
+ 0x9f7aa107, 0x27d0e4da, 0x1e59c6b8, 0xa6f38365,
+ 0x8a28c78a, 0x32828257, 0x0b0ba035, 0xb3a1e5e8,
+ 0x7919e5f1, 0xc1b3a02c, 0xf83a824e, 0x4090c793,
+ 0x51fae795, 0xe950a248, 0xd0d9802a, 0x6873c5f7,
+ 0xa2cbc5ee, 0x1a618033, 0x23e8a251, 0x9b42e78c,
+ 0xb799a363, 0x0f33e6be, 0x36bac4dc, 0x8e108101,
+ 0x44a88118, 0xfc02c4c5, 0xc58be6a7, 0x7d21a37a,
+ 0x3fc9a052, 0x8763e58f, 0xbeeac7ed, 0x06408230,
+ 0xccf88229, 0x7452c7f4, 0x4ddbe596, 0xf571a04b,
+ 0xd9aae4a4, 0x6100a179, 0x5889831b, 0xe023c6c6,
+ 0x2a9bc6df, 0x92318302, 0xabb8a160, 0x1312e4bd,
+ 0x0278c4bb, 0xbad28166, 0x835ba304, 0x3bf1e6d9,
+ 0xf149e6c0, 0x49e3a31d, 0x706a817f, 0xc8c0c4a2,
+ 0xe41b804d, 0x5cb1c590, 0x6538e7f2, 0xdd92a22f,
+ 0x172aa236, 0xaf80e7eb, 0x9609c589, 0x2ea38054,
+ 0xb4dd8585, 0x0c77c058, 0x35fee23a, 0x8d54a7e7,
+ 0x47eca7fe, 0xff46e223, 0xc6cfc041, 0x7e65859c,
+ 0x52bec173, 0xea1484ae, 0xd39da6cc, 0x6b37e311,
+ 0xa18fe308, 0x1925a6d5, 0x20ac84b7, 0x9806c16a,
+ 0x896ce16c, 0x31c6a4b1, 0x084f86d3, 0xb0e5c30e,
+ 0x7a5dc317, 0xc2f786ca, 0xfb7ea4a8, 0x43d4e175,
+ 0x6f0fa59a, 0xd7a5e047, 0xee2cc225, 0x568687f8,
+ 0x9c3e87e1, 0x2494c23c, 0x1d1de05e, 0xa5b7a583,
+ 0xd89606f9, 0x603c4324, 0x59b56146, 0xe11f249b,
+ 0x2ba72482, 0x930d615f, 0xaa84433d, 0x122e06e0,
+ 0x3ef5420f, 0x865f07d2, 0xbfd625b0, 0x077c606d,
+ 0xcdc46074, 0x756e25a9, 0x4ce707cb, 0xf44d4216,
+ 0xe5276210, 0x5d8d27cd, 0x640405af, 0xdcae4072,
+ 0x1616406b, 0xaebc05b6, 0x973527d4, 0x2f9f6209,
+ 0x034426e6, 0xbbee633b, 0x82674159, 0x3acd0484,
+ 0xf075049d, 0x48df4140, 0x71566322, 0xc9fc26ff,
+ 0x5382232e, 0xeb2866f3, 0xd2a14491, 0x6a0b014c,
+ 0xa0b30155, 0x18194488, 0x219066ea, 0x993a2337,
+ 0xb5e167d8, 0x0d4b2205, 0x34c20067, 0x8c6845ba,
+ 0x46d045a3, 0xfe7a007e, 0xc7f3221c, 0x7f5967c1,
+ 0x6e3347c7, 0xd699021a, 0xef102078, 0x57ba65a5,
+ 0x9d0265bc, 0x25a82061, 0x1c210203, 0xa48b47de,
+ 0x88500331, 0x30fa46ec, 0x0973648e, 0xb1d92153,
+ 0x7b61214a, 0xc3cb6497, 0xfa4246f5, 0x42e80328
+ },{
+ 0x00000000, 0xac6f1138, 0x58df2270, 0xf4b03348,
+ 0xb0be45e0, 0x1cd154d8, 0xe8616790, 0x440e76a8,
+ 0x910b67c5, 0x3d6476fd, 0xc9d445b5, 0x65bb548d,
+ 0x21b52225, 0x8dda331d, 0x796a0055, 0xd505116d,
+ 0xd361228f, 0x7f0e33b7, 0x8bbe00ff, 0x27d111c7,
+ 0x63df676f, 0xcfb07657, 0x3b00451f, 0x976f5427,
+ 0x426a454a, 0xee055472, 0x1ab5673a, 0xb6da7602,
+ 0xf2d400aa, 0x5ebb1192, 0xaa0b22da, 0x066433e2,
+ 0x57b5a81b, 0xfbdab923, 0x0f6a8a6b, 0xa3059b53,
+ 0xe70bedfb, 0x4b64fcc3, 0xbfd4cf8b, 0x13bbdeb3,
+ 0xc6becfde, 0x6ad1dee6, 0x9e61edae, 0x320efc96,
+ 0x76008a3e, 0xda6f9b06, 0x2edfa84e, 0x82b0b976,
+ 0x84d48a94, 0x28bb9bac, 0xdc0ba8e4, 0x7064b9dc,
+ 0x346acf74, 0x9805de4c, 0x6cb5ed04, 0xc0dafc3c,
+ 0x15dfed51, 0xb9b0fc69, 0x4d00cf21, 0xe16fde19,
+ 0xa561a8b1, 0x090eb989, 0xfdbe8ac1, 0x51d19bf9,
+ 0xae6a5137, 0x0205400f, 0xf6b57347, 0x5ada627f,
+ 0x1ed414d7, 0xb2bb05ef, 0x460b36a7, 0xea64279f,
+ 0x3f6136f2, 0x930e27ca, 0x67be1482, 0xcbd105ba,
+ 0x8fdf7312, 0x23b0622a, 0xd7005162, 0x7b6f405a,
+ 0x7d0b73b8, 0xd1646280, 0x25d451c8, 0x89bb40f0,
+ 0xcdb53658, 0x61da2760, 0x956a1428, 0x39050510,
+ 0xec00147d, 0x406f0545, 0xb4df360d, 0x18b02735,
+ 0x5cbe519d, 0xf0d140a5, 0x046173ed, 0xa80e62d5,
+ 0xf9dff92c, 0x55b0e814, 0xa100db5c, 0x0d6fca64,
+ 0x4961bccc, 0xe50eadf4, 0x11be9ebc, 0xbdd18f84,
+ 0x68d49ee9, 0xc4bb8fd1, 0x300bbc99, 0x9c64ada1,
+ 0xd86adb09, 0x7405ca31, 0x80b5f979, 0x2cdae841,
+ 0x2abedba3, 0x86d1ca9b, 0x7261f9d3, 0xde0ee8eb,
+ 0x9a009e43, 0x366f8f7b, 0xc2dfbc33, 0x6eb0ad0b,
+ 0xbbb5bc66, 0x17daad5e, 0xe36a9e16, 0x4f058f2e,
+ 0x0b0bf986, 0xa764e8be, 0x53d4dbf6, 0xffbbcace,
+ 0x5cd5a26e, 0xf0bab356, 0x040a801e, 0xa8659126,
+ 0xec6be78e, 0x4004f6b6, 0xb4b4c5fe, 0x18dbd4c6,
+ 0xcddec5ab, 0x61b1d493, 0x9501e7db, 0x396ef6e3,
+ 0x7d60804b, 0xd10f9173, 0x25bfa23b, 0x89d0b303,
+ 0x8fb480e1, 0x23db91d9, 0xd76ba291, 0x7b04b3a9,
+ 0x3f0ac501, 0x9365d439, 0x67d5e771, 0xcbbaf649,
+ 0x1ebfe724, 0xb2d0f61c, 0x4660c554, 0xea0fd46c,
+ 0xae01a2c4, 0x026eb3fc, 0xf6de80b4, 0x5ab1918c,
+ 0x0b600a75, 0xa70f1b4d, 0x53bf2805, 0xffd0393d,
+ 0xbbde4f95, 0x17b15ead, 0xe3016de5, 0x4f6e7cdd,
+ 0x9a6b6db0, 0x36047c88, 0xc2b44fc0, 0x6edb5ef8,
+ 0x2ad52850, 0x86ba3968, 0x720a0a20, 0xde651b18,
+ 0xd80128fa, 0x746e39c2, 0x80de0a8a, 0x2cb11bb2,
+ 0x68bf6d1a, 0xc4d07c22, 0x30604f6a, 0x9c0f5e52,
+ 0x490a4f3f, 0xe5655e07, 0x11d56d4f, 0xbdba7c77,
+ 0xf9b40adf, 0x55db1be7, 0xa16b28af, 0x0d043997,
+ 0xf2bff359, 0x5ed0e261, 0xaa60d129, 0x060fc011,
+ 0x4201b6b9, 0xee6ea781, 0x1ade94c9, 0xb6b185f1,
+ 0x63b4949c, 0xcfdb85a4, 0x3b6bb6ec, 0x9704a7d4,
+ 0xd30ad17c, 0x7f65c044, 0x8bd5f30c, 0x27bae234,
+ 0x21ded1d6, 0x8db1c0ee, 0x7901f3a6, 0xd56ee29e,
+ 0x91609436, 0x3d0f850e, 0xc9bfb646, 0x65d0a77e,
+ 0xb0d5b613, 0x1cbaa72b, 0xe80a9463, 0x4465855b,
+ 0x006bf3f3, 0xac04e2cb, 0x58b4d183, 0xf4dbc0bb,
+ 0xa50a5b42, 0x09654a7a, 0xfdd57932, 0x51ba680a,
+ 0x15b41ea2, 0xb9db0f9a, 0x4d6b3cd2, 0xe1042dea,
+ 0x34013c87, 0x986e2dbf, 0x6cde1ef7, 0xc0b10fcf,
+ 0x84bf7967, 0x28d0685f, 0xdc605b17, 0x700f4a2f,
+ 0x766b79cd, 0xda0468f5, 0x2eb45bbd, 0x82db4a85,
+ 0xc6d53c2d, 0x6aba2d15, 0x9e0a1e5d, 0x32650f65,
+ 0xe7601e08, 0x4b0f0f30, 0xbfbf3c78, 0x13d02d40,
+ 0x57de5be8, 0xfbb14ad0, 0x0f017998, 0xa36e68a0
+ },{
+ 0x00000000, 0x196b30ef, 0xc3a08cdb, 0xdacbbc34,
+ 0x7737f5b2, 0x6e5cc55d, 0xb4977969, 0xadfc4986,
+ 0x1f180660, 0x0673368f, 0xdcb88abb, 0xc5d3ba54,
+ 0x682ff3d2, 0x7144c33d, 0xab8f7f09, 0xb2e44fe6,
+ 0x3e300cc0, 0x275b3c2f, 0xfd90801b, 0xe4fbb0f4,
+ 0x4907f972, 0x506cc99d, 0x8aa775a9, 0x93cc4546,
+ 0x21280aa0, 0x38433a4f, 0xe288867b, 0xfbe3b694,
+ 0x561fff12, 0x4f74cffd, 0x95bf73c9, 0x8cd44326,
+ 0x8d16f485, 0x947dc46a, 0x4eb6785e, 0x57dd48b1,
+ 0xfa210137, 0xe34a31d8, 0x39818dec, 0x20eabd03,
+ 0x920ef2e5, 0x8b65c20a, 0x51ae7e3e, 0x48c54ed1,
+ 0xe5390757, 0xfc5237b8, 0x26998b8c, 0x3ff2bb63,
+ 0xb326f845, 0xaa4dc8aa, 0x7086749e, 0x69ed4471,
+ 0xc4110df7, 0xdd7a3d18, 0x07b1812c, 0x1edab1c3,
+ 0xac3efe25, 0xb555ceca, 0x6f9e72fe, 0x76f54211,
+ 0xdb090b97, 0xc2623b78, 0x18a9874c, 0x01c2b7a3,
+ 0xeb5b040e, 0xf23034e1, 0x28fb88d5, 0x3190b83a,
+ 0x9c6cf1bc, 0x8507c153, 0x5fcc7d67, 0x46a74d88,
+ 0xf443026e, 0xed283281, 0x37e38eb5, 0x2e88be5a,
+ 0x8374f7dc, 0x9a1fc733, 0x40d47b07, 0x59bf4be8,
+ 0xd56b08ce, 0xcc003821, 0x16cb8415, 0x0fa0b4fa,
+ 0xa25cfd7c, 0xbb37cd93, 0x61fc71a7, 0x78974148,
+ 0xca730eae, 0xd3183e41, 0x09d38275, 0x10b8b29a,
+ 0xbd44fb1c, 0xa42fcbf3, 0x7ee477c7, 0x678f4728,
+ 0x664df08b, 0x7f26c064, 0xa5ed7c50, 0xbc864cbf,
+ 0x117a0539, 0x081135d6, 0xd2da89e2, 0xcbb1b90d,
+ 0x7955f6eb, 0x603ec604, 0xbaf57a30, 0xa39e4adf,
+ 0x0e620359, 0x170933b6, 0xcdc28f82, 0xd4a9bf6d,
+ 0x587dfc4b, 0x4116cca4, 0x9bdd7090, 0x82b6407f,
+ 0x2f4a09f9, 0x36213916, 0xecea8522, 0xf581b5cd,
+ 0x4765fa2b, 0x5e0ecac4, 0x84c576f0, 0x9dae461f,
+ 0x30520f99, 0x29393f76, 0xf3f28342, 0xea99b3ad,
+ 0xd6b7081c, 0xcfdc38f3, 0x151784c7, 0x0c7cb428,
+ 0xa180fdae, 0xb8ebcd41, 0x62207175, 0x7b4b419a,
+ 0xc9af0e7c, 0xd0c43e93, 0x0a0f82a7, 0x1364b248,
+ 0xbe98fbce, 0xa7f3cb21, 0x7d387715, 0x645347fa,
+ 0xe88704dc, 0xf1ec3433, 0x2b278807, 0x324cb8e8,
+ 0x9fb0f16e, 0x86dbc181, 0x5c107db5, 0x457b4d5a,
+ 0xf79f02bc, 0xeef43253, 0x343f8e67, 0x2d54be88,
+ 0x80a8f70e, 0x99c3c7e1, 0x43087bd5, 0x5a634b3a,
+ 0x5ba1fc99, 0x42cacc76, 0x98017042, 0x816a40ad,
+ 0x2c96092b, 0x35fd39c4, 0xef3685f0, 0xf65db51f,
+ 0x44b9faf9, 0x5dd2ca16, 0x87197622, 0x9e7246cd,
+ 0x338e0f4b, 0x2ae53fa4, 0xf02e8390, 0xe945b37f,
+ 0x6591f059, 0x7cfac0b6, 0xa6317c82, 0xbf5a4c6d,
+ 0x12a605eb, 0x0bcd3504, 0xd1068930, 0xc86db9df,
+ 0x7a89f639, 0x63e2c6d6, 0xb9297ae2, 0xa0424a0d,
+ 0x0dbe038b, 0x14d53364, 0xce1e8f50, 0xd775bfbf,
+ 0x3dec0c12, 0x24873cfd, 0xfe4c80c9, 0xe727b026,
+ 0x4adbf9a0, 0x53b0c94f, 0x897b757b, 0x90104594,
+ 0x22f40a72, 0x3b9f3a9d, 0xe15486a9, 0xf83fb646,
+ 0x55c3ffc0, 0x4ca8cf2f, 0x9663731b, 0x8f0843f4,
+ 0x03dc00d2, 0x1ab7303d, 0xc07c8c09, 0xd917bce6,
+ 0x74ebf560, 0x6d80c58f, 0xb74b79bb, 0xae204954,
+ 0x1cc406b2, 0x05af365d, 0xdf648a69, 0xc60fba86,
+ 0x6bf3f300, 0x7298c3ef, 0xa8537fdb, 0xb1384f34,
+ 0xb0faf897, 0xa991c878, 0x735a744c, 0x6a3144a3,
+ 0xc7cd0d25, 0xdea63dca, 0x046d81fe, 0x1d06b111,
+ 0xafe2fef7, 0xb689ce18, 0x6c42722c, 0x752942c3,
+ 0xd8d50b45, 0xc1be3baa, 0x1b75879e, 0x021eb771,
+ 0x8ecaf457, 0x97a1c4b8, 0x4d6a788c, 0x54014863,
+ 0xf9fd01e5, 0xe096310a, 0x3a5d8d3e, 0x2336bdd1,
+ 0x91d2f237, 0x88b9c2d8, 0x52727eec, 0x4b194e03,
+ 0xe6e50785, 0xff8e376a, 0x25458b5e, 0x3c2ebbb1
+ },{
+ 0x00000000, 0xc82c0368, 0x905906d0, 0x587505b8,
+ 0xd1c5e0a5, 0x19e9e3cd, 0x419ce675, 0x89b0e51d,
+ 0x53fd2d4e, 0x9bd12e26, 0xc3a42b9e, 0x0b8828f6,
+ 0x8238cdeb, 0x4a14ce83, 0x1261cb3b, 0xda4dc853,
+ 0xa6fa5b9c, 0x6ed658f4, 0x36a35d4c, 0xfe8f5e24,
+ 0x773fbb39, 0xbf13b851, 0xe766bde9, 0x2f4abe81,
+ 0xf50776d2, 0x3d2b75ba, 0x655e7002, 0xad72736a,
+ 0x24c29677, 0xecee951f, 0xb49b90a7, 0x7cb793cf,
+ 0xbd835b3d, 0x75af5855, 0x2dda5ded, 0xe5f65e85,
+ 0x6c46bb98, 0xa46ab8f0, 0xfc1fbd48, 0x3433be20,
+ 0xee7e7673, 0x2652751b, 0x7e2770a3, 0xb60b73cb,
+ 0x3fbb96d6, 0xf79795be, 0xafe29006, 0x67ce936e,
+ 0x1b7900a1, 0xd35503c9, 0x8b200671, 0x430c0519,
+ 0xcabce004, 0x0290e36c, 0x5ae5e6d4, 0x92c9e5bc,
+ 0x48842def, 0x80a82e87, 0xd8dd2b3f, 0x10f12857,
+ 0x9941cd4a, 0x516dce22, 0x0918cb9a, 0xc134c8f2,
+ 0x7a07b77a, 0xb22bb412, 0xea5eb1aa, 0x2272b2c2,
+ 0xabc257df, 0x63ee54b7, 0x3b9b510f, 0xf3b75267,
+ 0x29fa9a34, 0xe1d6995c, 0xb9a39ce4, 0x718f9f8c,
+ 0xf83f7a91, 0x301379f9, 0x68667c41, 0xa04a7f29,
+ 0xdcfdece6, 0x14d1ef8e, 0x4ca4ea36, 0x8488e95e,
+ 0x0d380c43, 0xc5140f2b, 0x9d610a93, 0x554d09fb,
+ 0x8f00c1a8, 0x472cc2c0, 0x1f59c778, 0xd775c410,
+ 0x5ec5210d, 0x96e92265, 0xce9c27dd, 0x06b024b5,
+ 0xc784ec47, 0x0fa8ef2f, 0x57ddea97, 0x9ff1e9ff,
+ 0x16410ce2, 0xde6d0f8a, 0x86180a32, 0x4e34095a,
+ 0x9479c109, 0x5c55c261, 0x0420c7d9, 0xcc0cc4b1,
+ 0x45bc21ac, 0x8d9022c4, 0xd5e5277c, 0x1dc92414,
+ 0x617eb7db, 0xa952b4b3, 0xf127b10b, 0x390bb263,
+ 0xb0bb577e, 0x78975416, 0x20e251ae, 0xe8ce52c6,
+ 0x32839a95, 0xfaaf99fd, 0xa2da9c45, 0x6af69f2d,
+ 0xe3467a30, 0x2b6a7958, 0x731f7ce0, 0xbb337f88,
+ 0xf40e6ef5, 0x3c226d9d, 0x64576825, 0xac7b6b4d,
+ 0x25cb8e50, 0xede78d38, 0xb5928880, 0x7dbe8be8,
+ 0xa7f343bb, 0x6fdf40d3, 0x37aa456b, 0xff864603,
+ 0x7636a31e, 0xbe1aa076, 0xe66fa5ce, 0x2e43a6a6,
+ 0x52f43569, 0x9ad83601, 0xc2ad33b9, 0x0a8130d1,
+ 0x8331d5cc, 0x4b1dd6a4, 0x1368d31c, 0xdb44d074,
+ 0x01091827, 0xc9251b4f, 0x91501ef7, 0x597c1d9f,
+ 0xd0ccf882, 0x18e0fbea, 0x4095fe52, 0x88b9fd3a,
+ 0x498d35c8, 0x81a136a0, 0xd9d43318, 0x11f83070,
+ 0x9848d56d, 0x5064d605, 0x0811d3bd, 0xc03dd0d5,
+ 0x1a701886, 0xd25c1bee, 0x8a291e56, 0x42051d3e,
+ 0xcbb5f823, 0x0399fb4b, 0x5becfef3, 0x93c0fd9b,
+ 0xef776e54, 0x275b6d3c, 0x7f2e6884, 0xb7026bec,
+ 0x3eb28ef1, 0xf69e8d99, 0xaeeb8821, 0x66c78b49,
+ 0xbc8a431a, 0x74a64072, 0x2cd345ca, 0xe4ff46a2,
+ 0x6d4fa3bf, 0xa563a0d7, 0xfd16a56f, 0x353aa607,
+ 0x8e09d98f, 0x4625dae7, 0x1e50df5f, 0xd67cdc37,
+ 0x5fcc392a, 0x97e03a42, 0xcf953ffa, 0x07b93c92,
+ 0xddf4f4c1, 0x15d8f7a9, 0x4dadf211, 0x8581f179,
+ 0x0c311464, 0xc41d170c, 0x9c6812b4, 0x544411dc,
+ 0x28f38213, 0xe0df817b, 0xb8aa84c3, 0x708687ab,
+ 0xf93662b6, 0x311a61de, 0x696f6466, 0xa143670e,
+ 0x7b0eaf5d, 0xb322ac35, 0xeb57a98d, 0x237baae5,
+ 0xaacb4ff8, 0x62e74c90, 0x3a924928, 0xf2be4a40,
+ 0x338a82b2, 0xfba681da, 0xa3d38462, 0x6bff870a,
+ 0xe24f6217, 0x2a63617f, 0x721664c7, 0xba3a67af,
+ 0x6077affc, 0xa85bac94, 0xf02ea92c, 0x3802aa44,
+ 0xb1b24f59, 0x799e4c31, 0x21eb4989, 0xe9c74ae1,
+ 0x9570d92e, 0x5d5cda46, 0x0529dffe, 0xcd05dc96,
+ 0x44b5398b, 0x8c993ae3, 0xd4ec3f5b, 0x1cc03c33,
+ 0xc68df460, 0x0ea1f708, 0x56d4f2b0, 0x9ef8f1d8,
+ 0x174814c5, 0xdf6417ad, 0x87111215, 0x4f3d117d
+ },{
+ 0x00000000, 0x277d3c49, 0x4efa7892, 0x698744db,
+ 0x6d821d21, 0x4aff2168, 0x237865b3, 0x040559fa,
+ 0xda043b42, 0xfd79070b, 0x94fe43d0, 0xb3837f99,
+ 0xb7862663, 0x90fb1a2a, 0xf97c5ef1, 0xde0162b8,
+ 0xb4097684, 0x93744acd, 0xfaf30e16, 0xdd8e325f,
+ 0xd98b6ba5, 0xfef657ec, 0x97711337, 0xb00c2f7e,
+ 0x6e0d4dc6, 0x4970718f, 0x20f73554, 0x078a091d,
+ 0x038f50e7, 0x24f26cae, 0x4d752875, 0x6a08143c,
+ 0x9965000d, 0xbe183c44, 0xd79f789f, 0xf0e244d6,
+ 0xf4e71d2c, 0xd39a2165, 0xba1d65be, 0x9d6059f7,
+ 0x43613b4f, 0x641c0706, 0x0d9b43dd, 0x2ae67f94,
+ 0x2ee3266e, 0x099e1a27, 0x60195efc, 0x476462b5,
+ 0x2d6c7689, 0x0a114ac0, 0x63960e1b, 0x44eb3252,
+ 0x40ee6ba8, 0x679357e1, 0x0e14133a, 0x29692f73,
+ 0xf7684dcb, 0xd0157182, 0xb9923559, 0x9eef0910,
+ 0x9aea50ea, 0xbd976ca3, 0xd4102878, 0xf36d1431,
+ 0x32cb001a, 0x15b63c53, 0x7c317888, 0x5b4c44c1,
+ 0x5f491d3b, 0x78342172, 0x11b365a9, 0x36ce59e0,
+ 0xe8cf3b58, 0xcfb20711, 0xa63543ca, 0x81487f83,
+ 0x854d2679, 0xa2301a30, 0xcbb75eeb, 0xecca62a2,
+ 0x86c2769e, 0xa1bf4ad7, 0xc8380e0c, 0xef453245,
+ 0xeb406bbf, 0xcc3d57f6, 0xa5ba132d, 0x82c72f64,
+ 0x5cc64ddc, 0x7bbb7195, 0x123c354e, 0x35410907,
+ 0x314450fd, 0x16396cb4, 0x7fbe286f, 0x58c31426,
+ 0xabae0017, 0x8cd33c5e, 0xe5547885, 0xc22944cc,
+ 0xc62c1d36, 0xe151217f, 0x88d665a4, 0xafab59ed,
+ 0x71aa3b55, 0x56d7071c, 0x3f5043c7, 0x182d7f8e,
+ 0x1c282674, 0x3b551a3d, 0x52d25ee6, 0x75af62af,
+ 0x1fa77693, 0x38da4ada, 0x515d0e01, 0x76203248,
+ 0x72256bb2, 0x555857fb, 0x3cdf1320, 0x1ba22f69,
+ 0xc5a34dd1, 0xe2de7198, 0x8b593543, 0xac24090a,
+ 0xa82150f0, 0x8f5c6cb9, 0xe6db2862, 0xc1a6142b,
+ 0x64960134, 0x43eb3d7d, 0x2a6c79a6, 0x0d1145ef,
+ 0x09141c15, 0x2e69205c, 0x47ee6487, 0x609358ce,
+ 0xbe923a76, 0x99ef063f, 0xf06842e4, 0xd7157ead,
+ 0xd3102757, 0xf46d1b1e, 0x9dea5fc5, 0xba97638c,
+ 0xd09f77b0, 0xf7e24bf9, 0x9e650f22, 0xb918336b,
+ 0xbd1d6a91, 0x9a6056d8, 0xf3e71203, 0xd49a2e4a,
+ 0x0a9b4cf2, 0x2de670bb, 0x44613460, 0x631c0829,
+ 0x671951d3, 0x40646d9a, 0x29e32941, 0x0e9e1508,
+ 0xfdf30139, 0xda8e3d70, 0xb30979ab, 0x947445e2,
+ 0x90711c18, 0xb70c2051, 0xde8b648a, 0xf9f658c3,
+ 0x27f73a7b, 0x008a0632, 0x690d42e9, 0x4e707ea0,
+ 0x4a75275a, 0x6d081b13, 0x048f5fc8, 0x23f26381,
+ 0x49fa77bd, 0x6e874bf4, 0x07000f2f, 0x207d3366,
+ 0x24786a9c, 0x030556d5, 0x6a82120e, 0x4dff2e47,
+ 0x93fe4cff, 0xb48370b6, 0xdd04346d, 0xfa790824,
+ 0xfe7c51de, 0xd9016d97, 0xb086294c, 0x97fb1505,
+ 0x565d012e, 0x71203d67, 0x18a779bc, 0x3fda45f5,
+ 0x3bdf1c0f, 0x1ca22046, 0x7525649d, 0x525858d4,
+ 0x8c593a6c, 0xab240625, 0xc2a342fe, 0xe5de7eb7,
+ 0xe1db274d, 0xc6a61b04, 0xaf215fdf, 0x885c6396,
+ 0xe25477aa, 0xc5294be3, 0xacae0f38, 0x8bd33371,
+ 0x8fd66a8b, 0xa8ab56c2, 0xc12c1219, 0xe6512e50,
+ 0x38504ce8, 0x1f2d70a1, 0x76aa347a, 0x51d70833,
+ 0x55d251c9, 0x72af6d80, 0x1b28295b, 0x3c551512,
+ 0xcf380123, 0xe8453d6a, 0x81c279b1, 0xa6bf45f8,
+ 0xa2ba1c02, 0x85c7204b, 0xec406490, 0xcb3d58d9,
+ 0x153c3a61, 0x32410628, 0x5bc642f3, 0x7cbb7eba,
+ 0x78be2740, 0x5fc31b09, 0x36445fd2, 0x1139639b,
+ 0x7b3177a7, 0x5c4c4bee, 0x35cb0f35, 0x12b6337c,
+ 0x16b36a86, 0x31ce56cf, 0x58491214, 0x7f342e5d,
+ 0xa1354ce5, 0x864870ac, 0xefcf3477, 0xc8b2083e,
+ 0xccb751c4, 0xebca6d8d, 0x824d2956, 0xa530151f
+ }
+#else
+ {
+ 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+ 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+ 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+ 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+ 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+ 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+ 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+ 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+ 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+ 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+ 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+ 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+ 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+ 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+ 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+ 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+ 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+ 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+ 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+ 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+ 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+ 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+ 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+ 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+ 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+ 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+ 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+ 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+ 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+ 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+ 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+ 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+ 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+ 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+ 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+ 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+ 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+ 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+ 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+ 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+ 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+ 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+ 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+ 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+ 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+ 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+ 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+ 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+ 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+ 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+ 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+ 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+ 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+ 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+ 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+ 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+ 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+ 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+ 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+ 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+ 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+ 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+ 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+ 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
+ },{
+ 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899,
+ 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
+ 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
+ 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
+ 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918,
+ 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+ 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0,
+ 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
+ 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
+ 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
+ 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823,
+ 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+ 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a,
+ 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
+ 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
+ 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
+ 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d,
+ 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+ 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25,
+ 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
+ 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
+ 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
+ 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4,
+ 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+ 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f,
+ 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
+ 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
+ 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
+ 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e,
+ 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+ 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6,
+ 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
+ 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
+ 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
+ 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8,
+ 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+ 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1,
+ 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
+ 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
+ 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
+ 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162,
+ 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+ 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da,
+ 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
+ 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
+ 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
+ 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b,
+ 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+ 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464,
+ 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
+ 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
+ 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
+ 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5,
+ 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+ 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d,
+ 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
+ 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
+ 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
+ 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de,
+ 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+ 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7,
+ 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
+ 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
+ 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
+ },{
+ 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073,
+ 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
+ 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
+ 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
+ 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9,
+ 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+ 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c,
+ 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
+ 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
+ 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
+ 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2,
+ 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+ 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed,
+ 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
+ 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
+ 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
+ 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa,
+ 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+ 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f,
+ 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
+ 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
+ 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
+ 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5,
+ 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+ 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe,
+ 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
+ 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
+ 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
+ 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634,
+ 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+ 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1,
+ 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
+ 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
+ 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
+ 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5,
+ 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+ 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba,
+ 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
+ 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
+ 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
+ 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24,
+ 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+ 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1,
+ 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
+ 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
+ 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
+ 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b,
+ 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+ 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9,
+ 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
+ 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
+ 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
+ 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63,
+ 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+ 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6,
+ 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
+ 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
+ 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
+ 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238,
+ 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+ 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177,
+ 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
+ 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
+ 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
+ },{
+ 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939,
+ 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
+ 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
+ 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
+ 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804,
+ 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+ 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2,
+ 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
+ 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
+ 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
+ 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54,
+ 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+ 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f,
+ 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
+ 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
+ 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
+ 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de,
+ 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+ 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538,
+ 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
+ 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
+ 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
+ 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405,
+ 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+ 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255,
+ 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
+ 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
+ 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
+ 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368,
+ 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+ 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e,
+ 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
+ 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
+ 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
+ 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0,
+ 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+ 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b,
+ 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
+ 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
+ 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
+ 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d,
+ 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+ 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b,
+ 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
+ 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
+ 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
+ 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656,
+ 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+ 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1,
+ 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
+ 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
+ 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
+ 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc,
+ 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+ 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a,
+ 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
+ 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
+ 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
+ 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c,
+ 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+ 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57,
+ 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
+ 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
+ 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
+ },{
+ 0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4,
+ 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44,
+ 0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65,
+ 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5,
+ 0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127,
+ 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97,
+ 0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6,
+ 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406,
+ 0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3,
+ 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13,
+ 0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32,
+ 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 0x76b9b082,
+ 0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470,
+ 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0,
+ 0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1,
+ 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151,
+ 0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a,
+ 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea,
+ 0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb,
+ 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b,
+ 0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89,
+ 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539,
+ 0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018,
+ 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8,
+ 0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d,
+ 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd,
+ 0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c,
+ 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c,
+ 0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede,
+ 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e,
+ 0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f,
+ 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff,
+ 0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8,
+ 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18,
+ 0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39,
+ 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089,
+ 0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b,
+ 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb,
+ 0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea,
+ 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a,
+ 0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff,
+ 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f,
+ 0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e,
+ 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de,
+ 0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c,
+ 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c,
+ 0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd,
+ 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d,
+ 0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06,
+ 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6,
+ 0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497,
+ 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27,
+ 0xd6d1de21, 0xeec0b18d, 0xa6f30179, 0x9ee26ed5,
+ 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065,
+ 0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544,
+ 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4,
+ 0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51,
+ 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1,
+ 0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0,
+ 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70,
+ 0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82,
+ 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532,
+ 0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013,
+ 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3
+ },{
+ 0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda,
+ 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad,
+ 0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5,
+ 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2,
+ 0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4,
+ 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93,
+ 0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb,
+ 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c,
+ 0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57,
+ 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20,
+ 0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548,
+ 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f,
+ 0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69,
+ 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e,
+ 0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576,
+ 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201,
+ 0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031,
+ 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746,
+ 0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e,
+ 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59,
+ 0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f,
+ 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778,
+ 0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810,
+ 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67,
+ 0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc,
+ 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb,
+ 0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3,
+ 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4,
+ 0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682,
+ 0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5,
+ 0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d,
+ 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea,
+ 0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c,
+ 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b,
+ 0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413,
+ 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364,
+ 0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32,
+ 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45,
+ 0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d,
+ 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a,
+ 0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81,
+ 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6,
+ 0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e,
+ 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9,
+ 0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf,
+ 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8,
+ 0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0,
+ 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7,
+ 0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7,
+ 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090,
+ 0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8,
+ 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f,
+ 0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9,
+ 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae,
+ 0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6,
+ 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1,
+ 0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a,
+ 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d,
+ 0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975,
+ 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02,
+ 0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154,
+ 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623,
+ 0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b,
+ 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c
+ },{
+ 0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558,
+ 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089,
+ 0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b,
+ 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda,
+ 0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe,
+ 0x39bb3f77, 0x51b813bf, 0xe9bd66e7, 0x81be4a2f,
+ 0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad,
+ 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c,
+ 0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5,
+ 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334,
+ 0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6,
+ 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67,
+ 0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43,
+ 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992,
+ 0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110,
+ 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1,
+ 0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222,
+ 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3,
+ 0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71,
+ 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0,
+ 0xe6ecfddc, 0x8eefd114, 0x36eaa44c, 0x5ee98884,
+ 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55,
+ 0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7,
+ 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006,
+ 0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f,
+ 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e,
+ 0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc,
+ 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d,
+ 0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39,
+ 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8,
+ 0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a,
+ 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb,
+ 0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac,
+ 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d,
+ 0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff,
+ 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e,
+ 0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a,
+ 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db,
+ 0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59,
+ 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988,
+ 0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811,
+ 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0,
+ 0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542,
+ 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093,
+ 0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7,
+ 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766,
+ 0x1a438abc, 0x7240a674, 0xca45d32c, 0xa246ffe4,
+ 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35,
+ 0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6,
+ 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907,
+ 0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185,
+ 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454,
+ 0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670,
+ 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1,
+ 0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23,
+ 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2,
+ 0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b,
+ 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba,
+ 0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238,
+ 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9,
+ 0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd,
+ 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c,
+ 0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e,
+ 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f
+ },{
+ 0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769,
+ 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504,
+ 0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3,
+ 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de,
+ 0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd,
+ 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0,
+ 0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07,
+ 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a,
+ 0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0,
+ 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d,
+ 0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a,
+ 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447,
+ 0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44,
+ 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929,
+ 0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e,
+ 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3,
+ 0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b,
+ 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36,
+ 0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881,
+ 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec,
+ 0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef,
+ 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782,
+ 0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 0x07094135,
+ 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358,
+ 0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2,
+ 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf,
+ 0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18,
+ 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75,
+ 0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076,
+ 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b,
+ 0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac,
+ 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1,
+ 0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d,
+ 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360,
+ 0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7,
+ 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba,
+ 0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9,
+ 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4,
+ 0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63,
+ 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e,
+ 0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494,
+ 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9,
+ 0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e,
+ 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223,
+ 0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20,
+ 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d,
+ 0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa,
+ 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97,
+ 0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f,
+ 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852,
+ 0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5,
+ 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88,
+ 0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b,
+ 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6,
+ 0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751,
+ 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c,
+ 0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6,
+ 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb,
+ 0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c,
+ 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911,
+ 0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612,
+ 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f,
+ 0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8,
+ 0xc451b7cc, 0x8d6dcaeb, 0x56294d82, 0x1f1530a5
+ }
+#endif
+};
+
+/*
+ * __wt_cksum_sw --
+ * Return a checksum for len bytes of memory at chunk, computed in software.
+ *
+ * Slicing-by-8 algorithm by Michael E. Kounavis and Frank L. Berry (Intel):
+ * http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
+ *
+ * Based on Peter Kankowski's posting:
+ * http://www.strchr.com/crc32_popcnt
+ *
+ * The big endian version calculates the same result at each step, except the
+ * value of the crc is byte reversed from what it would be at that step for
+ * little endian; it is swapped back before returning so both agree.  A zero
+ * length chunk returns 0 (the inverted initial value).
+ */
+static uint32_t
+__wt_cksum_sw(const void *chunk, size_t len)
+{
+    uint32_t crc, next;
+    size_t nqwords;
+    const uint8_t *p;
+
+    crc = 0xffffffff;
+
+    /* Checksum one byte at a time to the first 4B boundary. */
+    for (p = chunk;
+        ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
+        len > 0; ++p, --len)
+#ifdef WORDS_BIGENDIAN
+        crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8);
+#else
+        crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
+#endif
+
+    /* Checksum in 8B chunks, read as two (now aligned) 4B words. */
+    for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
+        crc ^= *(const uint32_t *)p;    /* keep the const qualifier */
+        p += sizeof(uint32_t);
+        next = *(const uint32_t *)p;
+        p += sizeof(uint32_t);
+        crc =
+#ifdef WORDS_BIGENDIAN
+            g_crc_slicing[4][(crc      ) & 0xFF] ^
+            g_crc_slicing[5][(crc >>  8) & 0xFF] ^
+            g_crc_slicing[6][(crc >> 16) & 0xFF] ^
+            g_crc_slicing[7][(crc >> 24)] ^
+            g_crc_slicing[0][(next      ) & 0xFF] ^
+            g_crc_slicing[1][(next >>  8) & 0xFF] ^
+            g_crc_slicing[2][(next >> 16) & 0xFF] ^
+            g_crc_slicing[3][(next >> 24)];
+#else
+            g_crc_slicing[7][(crc      ) & 0xFF] ^
+            g_crc_slicing[6][(crc >>  8) & 0xFF] ^
+            g_crc_slicing[5][(crc >> 16) & 0xFF] ^
+            g_crc_slicing[4][(crc >> 24)] ^
+            g_crc_slicing[3][(next      ) & 0xFF] ^
+            g_crc_slicing[2][(next >>  8) & 0xFF] ^
+            g_crc_slicing[1][(next >> 16) & 0xFF] ^
+            g_crc_slicing[0][(next >> 24)];
+#endif
+    }
+
+    /* Checksum trailing bytes (len modulo 8) one byte at a time. */
+#ifdef WORDS_BIGENDIAN
+    for (len &= 0x7; len > 0; ++p, len--)
+        crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8);
+
+    /* Do final byte swap to produce a result identical to little endian */
+    crc =
+        ((crc << 24) & 0xFF000000) |
+        ((crc <<  8) & 0x00FF0000) |
+        ((crc >>  8) & 0x0000FF00) |
+        ((crc >> 24) & 0x000000FF);
+#else
+    for (len &= 0x7; len > 0; ++p, len--)
+        crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
+#endif
+    return (~crc);
+}
+
+#if (defined(__amd64) || defined(__x86_64))
+/*
+ * __wt_cksum_hw --
+ * Return a checksum for len bytes at chunk, computed in hardware with the
+ * x86 crc32 instruction (hand-encoded as .byte sequences), in 8 byte steps.
+ */
+static uint32_t
+__wt_cksum_hw(const void *chunk, size_t len)
+{
+ uint32_t crc;
+ size_t nqwords;
+ const uint8_t *p;
+ const uint64_t *p64;
+
+ crc = 0xffffffff;
+
+ /* Checksum one byte at a time to the first 4B boundary. */
+ for (p = chunk;
+ ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
+ len > 0; ++p, --len) {
+ __asm__ __volatile__(
+ ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1" /* crc32b %cl, %esi */
+ : "=S" (crc)
+ : "0" (crc), "c" (*p));
+ }
+
+ p64 = (const uint64_t *)p;
+ /* Checksum in 8B chunks; p64 is only known to be 4B aligned here. */
+ for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
+ __asm__ __volatile__ (
+ ".byte 0xF2, 0x48, 0x0F, 0x38, 0xF1, 0xF1" /* crc32q %rcx, %rsi */
+ : "=S"(crc)
+ : "0"(crc), "c" (*p64));
+ p64++;
+ }
+
+ /* Checksum trailing bytes (len modulo 8) one byte at a time. */
+ p = (const uint8_t *)p64;
+ for (len &= 0x7; len > 0; ++p, len--) {
+ __asm__ __volatile__(
+ ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1" /* crc32b %cl, %esi */
+ : "=S" (crc)
+ : "0" (crc), "c" (*p));
+ }
+ return (~crc); /* final inversion; a zero-length chunk yields 0 */
+}
+#endif
+
+#if defined(_M_AMD64)
+/*
+ * __wt_cksum_hw --
+ * Return a checksum for len bytes at chunk, computed in hardware with the
+ * _mm_crc32_u8 and _mm_crc32_u64 intrinsics, mostly in 8 byte steps.
+ */
+static uint32_t
+__wt_cksum_hw(const void *chunk, size_t len)
+{
+    const uint64_t *qword;
+    const uint8_t *byte;
+    size_t remain;
+    uint32_t sum;
+
+    sum = 0xffffffff;
+    byte = chunk;
+
+    /* Consume leading bytes until the pointer reaches a 4B boundary. */
+    while (len > 0 && ((uintptr_t)byte & (sizeof(uint32_t) - 1)) != 0) {
+        sum = _mm_crc32_u8(sum, *byte);
+        ++byte;
+        --len;
+    }
+
+    /* Consume the bulk of the chunk 8B at a time. */
+    qword = (const uint64_t *)byte;
+    for (remain = len / sizeof(uint64_t); remain != 0; --remain) {
+        sum = (uint32_t)_mm_crc32_u64(sum, *qword);
+        ++qword;
+    }
+
+    /* Consume what is left (fewer than 8 bytes) one byte at a time. */
+    byte = (const uint8_t *)qword;
+    for (remain = len & 0x7; remain != 0; --remain, ++byte)
+        sum = _mm_crc32_u8(sum, *byte);
+
+    return (~sum);
+}
+#endif
+
+/*
+ * __wt_cksum --
+ * Return the checksum of len bytes at chunk, dispatching through the
+ * __wt_cksum_func pointer (presumably set by __wt_cksum_init -- confirm).
+ */
+uint32_t
+__wt_cksum(const void *chunk, size_t len)
+{
+    return (__wt_cksum_func(chunk, len));
+}
+
+/*
+ * __wt_cksum_init --
+ * Use the hardware checksum if CPUID reports SSE4.2, else the software one.
+ */
+void
+__wt_cksum_init(void)
+{
+#define CPUID_ECX_HAS_SSE42 (1 << 20)
+
+#if (defined(__amd64) || defined(__x86_64))
+    unsigned int eax, ebx, ecx, edx;
+
+    /* Query CPUID leaf 1; only the ecx result is examined. */
+    __asm__ __volatile__ (
+        "cpuid"
+        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+        : "a" (1));
+
+    __wt_cksum_func =
+        (ecx & CPUID_ECX_HAS_SSE42) ? __wt_cksum_hw : __wt_cksum_sw;
+
+#elif defined(_M_AMD64)
+    int regs[4];
+
+    /* Query CPUID leaf 1; regs[2] is tested as the ecx result. */
+    __cpuid(regs, 1);
+
+    __wt_cksum_func =
+        (regs[2] & CPUID_ECX_HAS_SSE42) ? __wt_cksum_hw : __wt_cksum_sw;
+
+#else
+    /* No hardware detection on this platform: always use software. */
+    __wt_cksum_func = __wt_cksum_sw;
+#endif
+}
diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c
new file mode 100644
index 00000000000..3e874078fbf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/err.c
@@ -0,0 +1,527 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __handle_error_default --
+ *	Fallback WT_EVENT_HANDLER->handle_error method: write the message
+ *	and a newline to stderr, then flush.  Returns 0 on success, otherwise
+ *	the value of __wt_errno().
+ */
+static int
+__handle_error_default(WT_EVENT_HANDLER *handler,
+    WT_SESSION *session, int error, const char *errmsg)
+{
+	WT_UNUSED(handler);
+	WT_UNUSED(session);
+	WT_UNUSED(error);
+
+	/* The flush is only attempted when the write itself succeeded. */
+	if (fprintf(stderr, "%s\n", errmsg) < 0)
+		return (__wt_errno());
+	if (fflush(stderr) != 0)
+		return (__wt_errno());
+	return (0);
+}
+
+/*
+ * __handle_message_default --
+ *	Fallback WT_EVENT_HANDLER->handle_message method: write the message
+ *	and a newline to stdout, then flush.  Returns 0 on success, otherwise
+ *	the value of __wt_errno().
+ */
+static int
+__handle_message_default(WT_EVENT_HANDLER *handler,
+    WT_SESSION *session, const char *message)
+{
+	WT_UNUSED(handler);
+	WT_UNUSED(session);
+
+	/* The flush is only attempted when the write itself succeeded. */
+	if (printf("%s\n", message) < 0)
+		return (__wt_errno());
+	if (fflush(stdout) != 0)
+		return (__wt_errno());
+	return (0);
+}
+
+/*
+ * __handle_progress_default --
+ *	Fallback WT_EVENT_HANDLER->handle_progress method: progress reports
+ *	are dropped and success (0) is returned.
+ */
+static int
+__handle_progress_default(WT_EVENT_HANDLER *handler,
+    WT_SESSION *session, const char *operation, uint64_t progress)
+{
+	/* Nothing is used; the macros only quiet unused-argument warnings. */
+	WT_UNUSED(progress);
+	WT_UNUSED(operation);
+	WT_UNUSED(session);
+	WT_UNUSED(handler);
+
+	return (0);
+}
+
+/*
+ * __handle_close_default --
+ *	Fallback WT_EVENT_HANDLER->handle_close method: close notifications
+ *	are dropped and success (0) is returned.
+ */
+static int
+__handle_close_default(WT_EVENT_HANDLER *handler,
+    WT_SESSION *session, WT_CURSOR *cursor)
+{
+	/* Nothing is used; the macros only quiet unused-argument warnings. */
+	WT_UNUSED(cursor);
+	WT_UNUSED(session);
+	WT_UNUSED(handler);
+
+	return (0);
+}
+
+/*
+ * The handler table __wt_event_handler_set installs when it is passed a NULL
+ * handler.  The initializers are positional, so their order has to match the
+ * member order of WT_EVENT_HANDLER (declared elsewhere).
+ */
+static WT_EVENT_HANDLER __event_handler_default = {
+ __handle_error_default,
+ __handle_message_default,
+ __handle_progress_default,
+ __handle_close_default
+};
+
+/*
+ * __handler_failure --
+ *	Report that an application-configured event handler returned an
+ *	error.
+ *
+ *	error			- the value the failing handler returned
+ *	which			- handler name used in the report text
+ *	error_handler_failed	- non-zero if the failing handler was the
+ *				  error handler itself
+ */
+static void
+__handler_failure(WT_SESSION_IMPL *session,
+    int error, const char *which, int error_handler_failed)
+{
+	WT_EVENT_HANDLER *handler;
+	WT_SESSION *wt_session;
+	int reported;
+
+	/*
+	 * !!!
+	 * SECURITY:
+	 * Buffer placed at the end of the stack in case snprintf overflows.
+	 */
+	char s[256];
+
+	(void)snprintf(s, sizeof(s),
+	    "application %s event handler failed: %s",
+	    which, wiredtiger_strerror(error));
+
+	handler = session->event_handler;
+	wt_session = (WT_SESSION *)session;
+
+	/*
+	 * Try the configured error handler first, unless it is the handler
+	 * that just failed or it already is the default one.
+	 */
+	reported = 0;
+	if (!error_handler_failed &&
+	    handler->handle_error != __handle_error_default)
+		reported =
+		    handler->handle_error(handler, wt_session, error, s) == 0;
+
+	/* Otherwise, or if that call failed too, use the default handler. */
+	if (!reported)
+		(void)__handle_error_default(NULL, wt_session, error, s);
+}
+
+/*
+ * __wt_event_handler_set --
+ *	Set an event handler, fill in any NULL methods with the defaults.
+ *
+ *	session	- session whose event_handler field is set
+ *	handler	- application handler table, or NULL to install the default
+ *		  table
+ *
+ *	An application-supplied table is completed in place: every NULL
+ *	method, including handle_close, is replaced by the matching default,
+ *	so all four methods are callable through session->event_handler.
+ */
+void
+__wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler)
+{
+	if (handler == NULL)
+		handler = &__event_handler_default;
+	else {
+		if (handler->handle_error == NULL)
+			handler->handle_error = __handle_error_default;
+		if (handler->handle_message == NULL)
+			handler->handle_message = __handle_message_default;
+		if (handler->handle_progress == NULL)
+			handler->handle_progress = __handle_progress_default;
+		if (handler->handle_close == NULL)
+			handler->handle_close = __handle_close_default;
+	}
+
+	session->event_handler = handler;
+}
+
+/*
+ * __wt_eventv --
+ * Report a message to an event handler.
+ *
+ * session - may be NULL, in which case a fixed message goes to stderr
+ * msg_event - non-zero: deliver through handle_message; zero: deliver
+ * through handle_error
+ * error - if non-zero, its wiredtiger_strerror text is appended unless the
+ * formatted message already ends with it
+ * file_name, line_number - if file_name is non-NULL, "file, line: " is
+ * inserted ahead of the formatted message
+ * fmt, ap - printf-style format and arguments
+ *
+ * Returns the value returned by the handler that was called (or the result
+ * of the stderr write when session is NULL).
+ */
+int
+__wt_eventv(WT_SESSION_IMPL *session, int msg_event, int error,
+ const char *file_name, int line_number, const char *fmt, va_list ap)
+{
+ WT_EVENT_HANDLER *handler;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ struct timespec ts;
+ size_t len, remain, wlen;
+ int prefix_cnt;
+ const char *err, *prefix;
+ char *end, *p, tid[128];
+
+ /*
+ * We're using a stack buffer because we want error messages no matter
+ * what, and allocating a WT_ITEM, or the memory it needs, might fail.
+ *
+ * !!!
+ * SECURITY:
+ * Buffer placed at the end of the stack in case snprintf overflows.
+ */
+ char s[2048];
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ *
+ * Without a session, we don't have event handlers or prefixes for the
+ * error message. Write the error to stderr and call it a day. (It's
+ * almost impossible for that to happen given how early we allocate the
+ * first session, but if the allocation of the first session fails, for
+ * example, we can end up here without a session.)
+ */
+ if (session == NULL)
+ return (fprintf(stderr, "WiredTiger Error%s%s\n",
+ error == 0 ? "" : ": ",
+ error == 0 ? "" : wiredtiger_strerror(error)) >= 0 &&
+ fflush(stderr) == 0 ? 0 : __wt_errno());
+
+ /* p is the write position in s, end is one past the buffer. */
+ p = s;
+ end = s + sizeof(s);
+
+ /*
+ * We have several prefixes for the error message:
+ * a timestamp and the process and thread ids, the database error
+ * prefix, the data-source's name, and the session's name. Write them
+ * as a comma-separated list, followed by a colon.
+ *
+ * Every step below follows the same pattern: format into what is left
+ * of the buffer, then advance p by the length written, or park it at
+ * end if the output did not fit.
+ */
+ prefix_cnt = 0;
+ if (__wt_epoch(session, &ts) == 0) {
+ __wt_thread_id(tid, sizeof(tid));
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "[%" PRIuMAX ":%" PRIuMAX "][%s]",
+ (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / 1000, tid);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ if ((prefix = S2C(session)->error_prefix) != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ prefix = session->dhandle == NULL ? NULL : session->dhandle->name;
+ if (prefix != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ if ((prefix = session->name) != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ if (prefix_cnt != 0) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain, ": ");
+ p = wlen >= remain ? end : p + wlen;
+ }
+
+ /* Optional source location, supplied by __wt_assert. */
+ if (file_name != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)
+ snprintf(p, remain, "%s, %d: ", file_name, line_number);
+ p = wlen >= remain ? end : p + wlen;
+ }
+
+ /* The caller's formatted message. */
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)vsnprintf(p, remain, fmt, ap);
+ p = wlen >= remain ? end : p + wlen;
+
+ if (error != 0) {
+ /*
+ * When the engine calls __wt_err on error, it often outputs an
+ * error message including the string associated with the error
+ * it's returning. We could change the calls to call __wt_errx,
+ * but it's simpler to not append an error string if all we are
+ * doing is duplicating an existing error string.
+ *
+ * Use strcmp to compare: both strings are nul-terminated, and
+ * we don't want to run past the end of the buffer.
+ */
+ err = wiredtiger_strerror(error);
+ len = strlen(err);
+ if (WT_PTRDIFF(p, s) < len || strcmp(p - len, err) != 0) {
+ remain = WT_PTRDIFF(end, p);
+ (void)snprintf(p, remain, ": %s", err);
+ }
+ }
+
+ /*
+ * If a handler fails, return the error status: if we're in the process
+ * of handling an error, any return value we provide will be ignored by
+ * our caller, our caller presumably already has an error value it will
+ * be returning.
+ *
+ * If an application-specified or default informational message handler
+ * fails, complain using the application-specified or default error
+ * handler.
+ *
+ * If an application-specified error message handler fails, complain
+ * using the default error handler. If the default error handler fails,
+ * there's nothing to do.
+ */
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ if (msg_event) {
+ ret = handler->handle_message(handler, wt_session, s);
+ if (ret != 0)
+ __handler_failure(session, ret, "message", 0);
+ } else {
+ ret = handler->handle_error(handler, wt_session, error, s);
+ if (ret != 0 && handler->handle_error != __handle_error_default)
+ __handler_failure(session, ret, "error", 1);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_err --
+ * Report an error.
+ *
+ * Passes fmt and its arguments to __wt_eventv as an error event
+ * (msg_event == 0) carrying the given error code, with no file/line
+ * information.
+ */
+void
+__wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ va_list ap;
+
+ /*
+ * Ignore error returns from underlying event handlers, we already have
+ * an error value to return.
+ */
+ va_start(ap, fmt);
+ (void)__wt_eventv(session, 0, error, NULL, 0, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_errx --
+ * Report an error with no error code.
+ *
+ * Same as __wt_err except that 0 is passed as the error, so __wt_eventv
+ * does not append an error string to the message.
+ */
+void
+__wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+ va_list ap;
+
+ /*
+ * Ignore error returns from underlying event handlers, we already have
+ * an error value to return.
+ */
+ va_start(ap, fmt);
+ (void)__wt_eventv(session, 0, 0, NULL, 0, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_ext_err_printf --
+ *	Extension API call to print to the error stream.  When no session
+ *	is supplied, the connection's default session is used.  Returns the
+ *	value returned by __wt_eventv.
+ */
+int
+__wt_ext_err_printf(
+    WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...)
+    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	va_list ap;
+
+	session = (WT_SESSION_IMPL *)wt_session;
+	if (session == NULL) {
+		conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+		session = conn->default_session;
+	}
+
+	va_start(ap, fmt);
+	ret = __wt_eventv(session, 0, 0, NULL, 0, fmt, ap);
+	va_end(ap);
+
+	return (ret);
+}
+
+/*
+ * info_msg --
+ *	Format an informational message into a stack buffer and deliver it
+ *	through the session's handle_message method, returning that method's
+ *	result.  Output longer than the buffer is truncated.
+ */
+static int
+info_msg(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+{
+	WT_EVENT_HANDLER *handler;
+
+	/*
+	 * !!!
+	 * SECURITY:
+	 * Buffer placed at the end of the stack in case snprintf overflows.
+	 */
+	char s[2048];
+
+	handler = session->event_handler;
+
+	(void)vsnprintf(s, sizeof(s), fmt, ap);
+
+	return (handler->handle_message(handler, (WT_SESSION *)session, s));
+}
+
+/*
+ * __wt_msg --
+ * Informational message.
+ *
+ * printf-style front end to info_msg; returns whatever info_msg returns.
+ * session is dereferenced by info_msg, so it must not be NULL.
+ */
+int
+__wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = info_msg(session, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_ext_msg_printf --
+ *	Extension API call to print to the message stream.  When no session
+ *	is supplied, the connection's default session is used.  Returns the
+ *	value returned by info_msg.
+ */
+int
+__wt_ext_msg_printf(
+    WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...)
+    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	va_list ap;
+
+	session = (WT_SESSION_IMPL *)wt_session;
+	if (session == NULL) {
+		conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+		session = conn->default_session;
+	}
+
+	va_start(ap, fmt);
+	ret = info_msg(session, fmt, ap);
+	va_end(ap);
+
+	return (ret);
+}
+
+/*
+ * __wt_progress --
+ *	Deliver a progress report (operation name s, counter v) to the
+ *	session's handle_progress method, if there is one.  A NULL s is
+ *	replaced by the session's name.  A failing handler is reported via
+ *	__handler_failure; this function itself always returns 0.
+ */
+int
+__wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v)
+{
+	WT_DECL_RET;
+	WT_EVENT_HANDLER *handler;
+
+	handler = session->event_handler;
+	if (handler == NULL || handler->handle_progress == NULL)
+		return (0);
+
+	ret = handler->handle_progress(handler,
+	    (WT_SESSION *)session, s == NULL ? session->name : s, v);
+	if (ret != 0)
+		__handler_failure(session, ret, "progress", 0);
+
+	return (0);
+}
+
+/*
+ * __wt_assert --
+ * Assert and other unexpected failures, includes file/line information
+ * for debugging.
+ *
+ * The message is reported through __wt_eventv as an error event with the
+ * file name and line number attached. In HAVE_DIAGNOSTIC builds the
+ * process is then aborted; otherwise the function returns to the caller.
+ */
+void
+__wt_assert(WT_SESSION_IMPL *session,
+ int error, const char *file_name, int line_number, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 5, 6)))
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ (void)__wt_eventv(session, 0, error, file_name, line_number, fmt, ap);
+ va_end(ap);
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_abort(session); /* Drop core if testing. */
+ /* NOTREACHED */
+#endif
+}
+
+/*
+ * __wt_panic --
+ *	Flag the connection as panicked and report the standard panic
+ *	message.  Diagnostic builds then abort; other builds return WT_PANIC.
+ */
+int
+__wt_panic(WT_SESSION_IMPL *session)
+{
+	F_SET(S2C(session), WT_CONN_PANIC);
+	__wt_errx(session, "%s",
+	    "the WiredTiger library cannot continue; the process must exit "
+	    "and restart");
+
+#if defined(HAVE_DIAGNOSTIC)
+	__wt_abort(session);			/* Drop core if testing. */
+	/* NOTREACHED */
+#else
+	/*
+	 * Chaos reigns within.
+	 * Reflect, repent, and reboot.
+	 * Order shall return.
+	 */
+	return (WT_PANIC);
+#endif
+}
+
+/*
+ * __wt_illegal_value --
+ *	Report the standard "illegal value" message, prefixed with "name: "
+ *	when a name is given.  Diagnostic builds then abort; other builds
+ *	return the result of __wt_panic.
+ */
+int
+__wt_illegal_value(WT_SESSION_IMPL *session, const char *name)
+{
+	const char *label, *separator;
+
+	label = name == NULL ? "" : name;
+	separator = name == NULL ? "" : ": ";
+	__wt_errx(session, "%s%s%s", label, separator,
+	    "encountered an illegal file format or internal value");
+
+#if defined(HAVE_DIAGNOSTIC)
+	__wt_abort(session);			/* Drop core if testing. */
+	/* NOTREACHED */
+#else
+	return (__wt_panic(session));
+#endif
+}
+
+/*
+ * __wt_object_unsupported --
+ * Print a standard error message for an object that doesn't support a
+ * particular operation.
+ *
+ * The whole body is the WT_RET_MSG macro (defined elsewhere), which is
+ * presumably what reports the message and returns ENOTSUP -- confirm
+ * against the macro definition.
+ */
+int
+__wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_RET_MSG(session, ENOTSUP, "unsupported object operation: %s", uri);
+}
+
+/*
+ * __wt_bad_object_type --
+ * Print a standard error message when given an unknown or unsupported
+ * object type.
+ *
+ * A uri that starts with one of the prefixes listed below is reported as an
+ * unsupported operation on that object; any other uri is reported as an
+ * unknown object type.
+ */
+int
+__wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri)
+{
+ if (WT_PREFIX_MATCH(uri, "backup:") ||
+ WT_PREFIX_MATCH(uri, "colgroup:") ||
+ WT_PREFIX_MATCH(uri, "config:") ||
+ WT_PREFIX_MATCH(uri, "file:") ||
+ WT_PREFIX_MATCH(uri, "index:") ||
+ WT_PREFIX_MATCH(uri, "log:") ||
+ WT_PREFIX_MATCH(uri, "lsm:") ||
+ WT_PREFIX_MATCH(uri, "statistics:") ||
+ WT_PREFIX_MATCH(uri, "table:"))
+ return (__wt_object_unsupported(session, uri));
+
+ /* NOTE(review): WT_RET_MSG is expected to return ENOTSUP; confirm. */
+ WT_RET_MSG(session, ENOTSUP, "unknown object type: %s", uri);
+}
diff --git a/src/third_party/wiredtiger/src/support/filename.c b/src/third_party/wiredtiger/src/support/filename.c
new file mode 100644
index 00000000000..bd5d03fa633
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/filename.c
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_filename --
+ *	Build a file name from a nul-terminated name: measure the name and
+ *	hand the work to __wt_nfilename.
+ */
+int
+__wt_filename(WT_SESSION_IMPL *session, const char *name, char **path)
+{
+	size_t namelen;
+
+	namelen = strlen(name);
+	return (__wt_nfilename(session, name, namelen, path));
+}
+
+/*
+ * __wt_nfilename --
+ *	Build a file name from the first namelen bytes of name and return
+ *	it in *path.  An absolute name is duplicated as is; any other name
+ *	is placed under the connection home directory.  *path is NULL on
+ *	failure.
+ */
+int
+__wt_nfilename(
+    WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path)
+{
+	WT_CONNECTION_IMPL *conn;
+	size_t len;
+	char *buf;
+
+	conn = S2C(session);
+	*path = NULL;
+
+	/* Absolute path: a plain copy is all that is needed. */
+	if (__wt_absolute_path(name)) {
+		WT_RET(__wt_strndup(session, name, namelen, path));
+		return (0);
+	}
+
+	/* home + one byte of separator + name + trailing nul. */
+	len = strlen(conn->home) + 1 + namelen + 1;
+	WT_RET(__wt_calloc(session, 1, len, &buf));
+	snprintf(buf, len, "%s%s%.*s",
+	    conn->home, __wt_path_separator(), (int)namelen, name);
+	*path = buf;
+
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c
new file mode 100644
index 00000000000..10f718d57f7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/global.c
@@ -0,0 +1,118 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+WT_PROCESS __wt_process; /* Per-process structure */
+/*
+ * Error code recorded when global initialization fails; it stays 0 otherwise
+ * and is the value __wt_library_init returns.
+ */
+static int __wt_pthread_once_failed; /* If initialization failed */
+
+/*
+ * __system_is_little_endian --
+ *	Check the host byte order.  Returns 0 on a little-endian system; on
+ *	a big-endian system prints an explanation to stderr and returns
+ *	EINVAL.
+ */
+static int
+__system_is_little_endian(void)
+{
+	uint64_t v;
+
+	/* On a little-endian host the lowest-addressed byte of 1 is 1. */
+	v = 1;
+	if (*((uint8_t *)&v) != 0)
+		return (0);
+
+	fprintf(stderr,
+	    "This release of the WiredTiger data engine does not support "
+	    "big-endian systems; contact WiredTiger for more information.\n");
+	return (EINVAL);
+}
+
+/*
+ * __wt_global_once --
+ *	Process-wide initialization, handed to __wt_once by
+ *	__wt_library_init.  A failure is recorded in
+ *	__wt_pthread_once_failed and the remaining steps are skipped.
+ */
+static void
+__wt_global_once(void)
+{
+	WT_DECL_RET;
+
+	/* The byte-order check comes first; the spinlock only if it passes. */
+	ret = __system_is_little_endian();
+	if (ret == 0)
+		ret = __wt_spin_init(NULL, &__wt_process.spinlock, "global");
+	if (ret != 0) {
+		__wt_pthread_once_failed = ret;
+		return;
+	}
+
+	__wt_cksum_init();
+
+	TAILQ_INIT(&__wt_process.connqh);
+
+#ifdef HAVE_DIAGNOSTIC
+	/* Load debugging code the compiler might optimize out. */
+	(void)__wt_breakpoint();
+#endif
+}
+
+/*
+ * __wt_library_init --
+ *	Run the per-process initialization if this is the first call, and
+ *	return its recorded status (0 on success).
+ */
+int
+__wt_library_init(void)
+{
+	static int first = 1;
+	WT_DECL_RET;
+
+	/*
+	 * The local static is only a cheap front end: once it has been
+	 * cleared, later calls skip the once-function (pthread_once, in the
+	 * POSIX world) entirely.  The once-function is still what guards
+	 * against two callers racing through the first call.
+	 */
+	if (!first)
+		return (__wt_pthread_once_failed);
+
+	ret = __wt_once(__wt_global_once);
+	if (ret != 0)
+		__wt_pthread_once_failed = ret;
+	first = 0;
+
+	return (__wt_pthread_once_failed);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_breakpoint --
+ * A simple place to put a breakpoint, if you need one.
+ *
+ * Does nothing and returns 0. __wt_global_once calls it in diagnostic
+ * builds so that the function is kept in the binary.
+ */
+int
+__wt_breakpoint(void)
+{
+ return (0);
+}
+
+/*
+ * __wt_attach --
+ * A routine to wait for the debugger to attach.
+ *
+ * With HAVE_ATTACH defined, reports the process ID and then sleeps in an
+ * endless loop; without it, the function does nothing.
+ */
+void
+__wt_attach(WT_SESSION_IMPL *session)
+{
+#ifdef HAVE_ATTACH
+ __wt_errx(session, "process ID %" PRIdMAX
+ ": waiting for debugger...", (intmax_t)getpid());
+
+ /* Sleep forever, the debugger will interrupt us when it attaches. */
+ for (;;)
+ __wt_sleep(100, 0);
+#else
+ WT_UNUSED(session);
+#endif
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/support/hash_city.c b/src/third_party/wiredtiger/src/support/hash_city.c
new file mode 100644
index 00000000000..c6978f6bfe6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hash_city.c
@@ -0,0 +1,323 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2011 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * CityHash, by Geoff Pike and Jyrki Alakuijala
+ *
+ * This file provides CityHash64() and related functions.
+ *
+ * It's probably possible to create even faster hash functions by
+ * writing a program that systematically explores some of the space of
+ * possible hash functions, by using SIMD instructions, or by
+ * compromising on hash quality.
+ */
+
+#include <string.h>
+#include "wt_internal.h"
+
+/*
+ * Google City Hash implementation. Based on source code from:
+ * http://code.google.com/p/cityhash/
+ */
+
+/* A 128-bit value carried as two 64-bit halves. */
+typedef struct _uint128 uint128;
+struct _uint128 {
+ uint64_t first;
+ uint64_t second;
+};
+
+/* Accessors: "first" is used as the low 64 bits, "second" as the high. */
+#define Uint128Low64(x) (x).first
+#define Uint128High64(x) (x).second
+
+/*
+ * UNALIGNED_LOAD64 --
+ *	Read 8 bytes starting at p, in host byte order.  Copying through
+ *	memcpy keeps the read valid whatever the alignment of p.
+ */
+static uint64_t UNALIGNED_LOAD64(const char *p) {
+	uint64_t word;
+
+	(void)memcpy(&word, p, sizeof(uint64_t));
+	return (word);
+}
+
+/*
+ * UNALIGNED_LOAD32 --
+ *	Read 4 bytes starting at p, in host byte order.  Copying through
+ *	memcpy keeps the read valid whatever the alignment of p.
+ */
+static uint32_t UNALIGNED_LOAD32(const char *p) {
+	uint32_t word;
+
+	(void)memcpy(&word, p, sizeof(uint32_t));
+	return (word);
+}
+
+/*
+ * Byte-order adjustment applied to every fetched word: nothing on hosts
+ * without WORDS_BIGENDIAN, a byte swap on hosts with it, so the hash sees
+ * the same word values on both.
+ */
+#if !defined(WORDS_BIGENDIAN)
+
+#define uint32_in_expected_order(x) (x)
+#define uint64_in_expected_order(x) (x)
+
+#else
+
+#ifdef __APPLE__
+/* Mac OS X / Darwin features */
+#include <libkern/OSByteOrder.h>
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#else
+/* Elsewhere bswap_32/bswap_64 are taken from <byteswap.h>. */
+#include <byteswap.h>
+#endif
+
+#define uint32_in_expected_order(x) (bswap_32(x))
+#define uint64_in_expected_order(x) (bswap_64(x))
+
+#endif /* WORDS_BIGENDIAN */
+
+/*
+ * Fetch64 --
+ *	Load 8 bytes from p (any alignment) and put them in the byte order
+ *	the hash expects.
+ */
+static uint64_t Fetch64(const char *p) {
+	uint64_t raw;
+
+	raw = UNALIGNED_LOAD64(p);
+	return (uint64_in_expected_order(raw));
+}
+
+/*
+ * Fetch32 --
+ *	Load 4 bytes from p (any alignment) and put them in the byte order
+ *	the hash expects.
+ */
+static uint32_t Fetch32(const char *p) {
+	uint32_t raw;
+
+	raw = UNALIGNED_LOAD32(p);
+	return (uint32_in_expected_order(raw));
+}
+
+/*
+ * Some primes between 2^63 and 2^64 for various uses.
+ * They are the multipliers of the hash functions below, so changing any of
+ * them changes the hash values produced.
+ */
+static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
+static const uint64_t k1 = 0xb492b66fbe98f273ULL;
+static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
+static const uint64_t k3 = 0xc949d7c7509e6557ULL;
+
+/*
+ * Hash128to64 --
+ *	Mix the two 64-bit halves of x down to a single 64-bit value, using
+ *	Murmur-inspired multiply and xor-shift rounds.
+ */
+static inline uint64_t Hash128to64(const uint128 x) {
+	const uint64_t kMul = 0x9ddfea08eb382d69ULL;
+	uint64_t hi, lo, mixed;
+
+	lo = Uint128Low64(x);
+	hi = Uint128High64(x);
+
+	mixed = (lo ^ hi) * kMul;
+	mixed ^= (mixed >> 47);
+	mixed = (hi ^ mixed) * kMul;
+	mixed ^= (mixed >> 47);
+	return (mixed * kMul);
+}
+
+/*
+ * Rotate --
+ *	Rotate val right by shift bits.  A shift of 0 is answered directly,
+ *	because the general expression would then shift left by 64, which
+ *	is undefined.
+ */
+static uint64_t Rotate(uint64_t val, int shift) {
+	if (shift == 0)
+		return (val);
+	return ((val >> shift) | (val << (64 - shift)));
+}
+
+/*
+ * RotateByAtLeast1 --
+ *	Rotate val right by shift bits.  Unlike Rotate() there is no check
+ *	for a zero shift, so the caller must pass a non-zero shift.
+ */
+static uint64_t RotateByAtLeast1(uint64_t val, int shift) {
+	uint64_t high, low;
+
+	low = val >> shift;
+	high = val << (64 - shift);
+	return (low | high);
+}
+
+/*
+ * ShiftMix --
+ *	Fold the top 17 bits of val into its low bits (val ^ (val >> 47)).
+ */
+static uint64_t ShiftMix(uint64_t val) {
+	uint64_t top;
+
+	top = val >> 47;
+	return (val ^ top);
+}
+
+/*
+ * HashLen16 --
+ *	Hash two 64-bit values to one: u becomes the low half and v the
+ *	high half of the 128-bit input to Hash128to64.
+ */
+static uint64_t HashLen16(uint64_t u, uint64_t v) {
+	uint128 pair;
+
+	pair.second = v;
+	pair.first = u;
+	return (Hash128to64(pair));
+}
+
+/*
+ * HashLen0to16 --
+ * Hash a string of 0 to 16 bytes. The input is read differently depending
+ * on its length: 9-16 bytes, 4-8 bytes, 1-3 bytes, or empty.
+ */
+static uint64_t HashLen0to16(const char *s, size_t len) {
+ uint64_t a64, b64;
+ uint32_t y, z;
+ uint8_t a8, b8, c8;
+ /* 9-16 bytes: the first and last 8 bytes (they may overlap). */
+ if (len > 8) {
+ a64 = Fetch64(s);
+ b64 = Fetch64(s + len - 8);
+ return HashLen16(
+ a64, RotateByAtLeast1(b64 + len, (int)len)) ^ b64;
+ }
+ /* 4-8 bytes: the first and last 4 bytes (they may overlap). */
+ if (len >= 4) {
+ a64 = Fetch32(s);
+ return HashLen16(len + (a64 << 3), Fetch32(s + len - 4));
+ }
+ /* 1-3 bytes: the first, middle and last byte. */
+ if (len > 0) {
+ a8 = (uint8_t)s[0];
+ b8 = (uint8_t)s[len >> 1];
+ c8 = (uint8_t)s[len - 1];
+ y = (uint32_t)(a8) + ((uint32_t)(b8) << 8);
+ z = (uint32_t)len + ((uint32_t)(c8) << 2);
+ return ShiftMix(y * k2 ^ z * k3) * k2;
+ }
+ /* Empty input hashes to the constant k2. */
+ return (k2);
+}
+
+/*
+ * HashLen17to32 --
+ * Hash a string of 17 to 32 bytes from its first 16 and last 16 bytes (the
+ * two ranges overlap when len is less than 32).
+ *
+ * This probably works well for 16-byte strings as well, but it may be overkill
+ * in that case.
+ */
+static uint64_t HashLen17to32(const char *s, size_t len) {
+ uint64_t a = Fetch64(s) * k1;
+ uint64_t b = Fetch64(s + 8);
+ uint64_t c = Fetch64(s + len - 8) * k2;
+ uint64_t d = Fetch64(s + len - 16) * k0;
+ return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
+ a + Rotate(b ^ k3, 20) + len - c);
+}
+
+/*
+ * WeakHashLen32WithSeeds6 --
+ * Return a 16-byte hash for 48 bytes. Quick and dirty.
+ * Callers do best to use "random-looking" values for a and b.
+ *
+ * The 48 bytes are the six 64-bit arguments: four data words (w, x, y, z)
+ * and two seeds (a, b). The result is written to *ret.
+ */
+static void WeakHashLen32WithSeeds6(uint64_t w, uint64_t x,
+ uint64_t y, uint64_t z, uint64_t a, uint64_t b, uint128 *ret) {
+ uint64_t c;
+
+ a += w;
+ b = Rotate(b + a + z, 21);
+ c = a;
+ a += x;
+ a += y;
+ b += Rotate(a, 44);
+
+ ret->first = (uint64_t) (a + z);
+ ret->second = (uint64_t) (b + c);
+}
+
+/*
+ * WeakHashLen32WithSeeds --
+ *	Return, in *ret, a 16-byte hash for s[0] ... s[31] and the seeds a
+ *	and b.  Quick and dirty: the 32 bytes are read as four 64-bit words
+ *	and handed to WeakHashLen32WithSeeds6.
+ */
+static void WeakHashLen32WithSeeds(
+    const char* s, uint64_t a, uint64_t b, uint128 *ret) {
+	uint64_t w, x, y, z;
+
+	w = Fetch64(s);
+	x = Fetch64(s + 8);
+	y = Fetch64(s + 16);
+	z = Fetch64(s + 24);
+
+	WeakHashLen32WithSeeds6(w, x, y, z, a, b, ret);
+}
+
+/*
+ * Return an 8-byte hash for 33 to 64 bytes.
+ *
+ * The first half of the body mixes the first 32 bytes into (vf, vs), the
+ * second half mixes the last 32 bytes into (wf, ws); the two pairs are then
+ * combined. Offsets of the form "s + len - N" read from the end of the
+ * string, so the two halves overlap when len is less than 64.
+ */
+static uint64_t HashLen33to64(const char *s, size_t len) {
+ uint64_t a, b, c, r, vf, vs, wf, ws, z;
+ z = Fetch64(s + 24);
+ a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
+ b = Rotate(a + z, 52);
+ c = Rotate(a, 37);
+ a += Fetch64(s + 8);
+ c += Rotate(a, 7);
+ a += Fetch64(s + 16);
+ vf = a + z;
+ vs = b + Rotate(a, 31) + c;
+ a = Fetch64(s + 16) + Fetch64(s + len - 32);
+ z = Fetch64(s + len - 8);
+ b = Rotate(a + z, 52);
+ c = Rotate(a, 37);
+ a += Fetch64(s + len - 24);
+ c += Rotate(a, 7);
+ a += Fetch64(s + len - 16);
+ wf = a + z;
+ ws = b + Rotate(a, 31) + c;
+ r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
+ return ShiftMix(r * k0 + vs) * k2;
+}
+
+/*
+ * CityHash64 --
+ * Return a 64-bit hash of the len bytes at s. Inputs of up to 64 bytes go
+ * to one of the length-specific helpers; longer inputs are processed in
+ * 64-byte chunks.
+ */
+static inline uint64_t CityHash64(const char *s, size_t len) {
+ uint64_t temp, x, y, z;
+ uint128 v, w;
+
+ if (len <= 32) {
+ if (len <= 16) {
+ return HashLen0to16(s, len);
+ } else {
+ return HashLen17to32(s, len);
+ }
+ } else if (len <= 64) {
+ return HashLen33to64(s, len);
+ }
+
+ /*
+ * For strings over 64 bytes we hash the end first, and then as we
+ * loop we keep 56 bytes of state: v, w, x, y, and z.
+ */
+ x = Fetch64(s + len - 40);
+ y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
+ z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
+ WeakHashLen32WithSeeds(s + len - 64, len, z, &v);
+ WeakHashLen32WithSeeds(s + len - 32, y + k1, x, &w);
+ x = x * k1 + Fetch64(s);
+
+ /*
+ * Use len to count multiples of 64, and operate on 64-byte chunks.
+ * len is greater than 64 here, so (len - 1) >> 6 is at least 1.
+ */
+ for (len = (len - 1) >> 6; len != 0; len--) {
+ x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
+ y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+ x ^= w.second;
+ y += v.first + Fetch64(s + 40);
+ z = Rotate(z + w.first, 33) * k1;
+ WeakHashLen32WithSeeds(s, v.second * k1, x + w.first, &v);
+ WeakHashLen32WithSeeds(
+ s + 32, z + w.second, y + Fetch64(s + 16), &w);
+ /* Swap x and z for the next round. */
+ temp = z;
+ z = x;
+ x = temp;
+ s += 64;
+ }
+ return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
+ HashLen16(v.second, w.second) + x);
+}
+
+/*
+ * __wt_hash_city64 --
+ *	WiredTiger entry point for the CityHash implementation in this file:
+ *	returns CityHash64 of the len bytes at s.
+ */
+uint64_t
+__wt_hash_city64(const void *s, size_t len)
+{
+	const char *bytes;
+
+	bytes = s;
+	return (CityHash64(bytes, len));
+}
diff --git a/src/third_party/wiredtiger/src/support/hash_fnv.c b/src/third_party/wiredtiger/src/support/hash_fnv.c
new file mode 100644
index 00000000000..68f8537a4a0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hash_fnv.c
@@ -0,0 +1,161 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
+ *
+ * @(#) $Revision: 5.1 $
+ * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
+ * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
+ *
+ ***
+ *
+ * Fowler/Noll/Vo hash
+ *
+ * The basis of this hash algorithm was taken from an idea sent
+ * as reviewer comments to the IEEE POSIX P1003.2 committee by:
+ *
+ * Phong Vo (http://www.research.att.com/info/kpv/)
+ * Glenn Fowler (http://www.research.att.com/~gsf/)
+ *
+ * In a subsequent ballot round:
+ *
+ * Landon Curt Noll (http://www.isthe.com/chongo/)
+ *
+ * improved on their algorithm. Some people tried this hash
+ * and found that it worked rather well. In an EMail message
+ * to Landon, they named it the ``Fowler/Noll/Vo'' or FNV hash.
+ *
+ * FNV hashes are designed to be fast while maintaining a low
+ * collision rate. The FNV speed allows one to quickly hash lots
+ * of data while maintaining a reasonable collision rate. See:
+ *
+ * http://www.isthe.com/chongo/tech/comp/fnv/index.html
+ *
+ * for more details as well as other forms of the FNV hash.
+ *
+ ***
+ *
+ * To use the recommended 64 bit FNV-1a hash, pass FNV1A_64_INIT as the
+ * uint64_t hashval argument to fnv_64a_buf() or fnv_64a_str().
+ *
+ ***
+ *
+ * Please do not copyright this code. This code is in the public domain.
+ *
+ * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
+ * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ *
+ * By:
+ * chongo <Landon Curt Noll> /\oo/\
+ * http://www.isthe.com/chongo/
+ *
+ * Share and Enjoy! :-)
+ */
+
+#include <stdlib.h>
+#include "wt_internal.h"
+
+/*
+ * This file contains a 64 bit hash implementation of the FNV 1a 64 bit hash
+ * function. The implementation is from a third party.
+ *
+ * The code has been updated to remove unnecessary content and better comply
+ * with WiredTiger coding standards. The original source code can be found at:
+ * FNV 1a 64 bit: http://www.isthe.com/chongo/src/fnv/hash_64a.c
+ */
+
+/*
+ * 64 bit FNV-1 non-zero initial basis
+ *
+ * The FNV-1 initial basis is the FNV-0 hash of the following 32 octets:
+ *
+ * chongo <Landon Curt Noll> /\../\
+ *
+ * NOTE: The \'s above are not back-slashing escape characters.
+ * They are literal ASCII backslash 0x5c characters.
+ *
+ * NOTE: The FNV-1a initial basis is the same value as FNV-1 by definition.
+ *
+ * This is the starting hash value __wt_hash_fnv64 passes to fnv_64a_buf.
+ */
+#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
+
+/*
+ * fnv_64a_buf --
+ *	Fold a buffer into a 64 bit Fowler/Noll/Vo FNV-1a hash.
+ *
+ * input:
+ *	buf	- start of buffer to hash
+ *	len	- length of buffer in octets
+ *	hval	- hash value to continue from: FNV1A_64_INIT for the
+ *		  recommended FNV-1a hash on the first call, or the value
+ *		  returned by a previous call when hashing in pieces
+ *
+ * returns:
+ *	the updated 64 bit hash value (hval itself when len is 0)
+ */
+static inline uint64_t
+fnv_64a_buf(const void *buf, size_t len, uint64_t hval)
+{
+	const unsigned char *octets;
+	size_t i;
+
+	octets = buf;
+	for (i = 0; i < len; i++) {
+		/* FNV-1a: xor the octet into the bottom of the hash ... */
+		hval ^= (uint64_t)octets[i];
+
+		/*
+		 * ... then multiply by the 64 bit FNV magic prime mod 2^64,
+		 * written as shifts and adds.
+		 */
+		hval += (hval << 1) + (hval << 4) + (hval << 5) +
+		    (hval << 7) + (hval << 8) + (hval << 40);
+	}
+
+	return (hval);
+}
+
+/*
+ * __wt_hash_fnv64 --
+ *	WiredTiger entry point for the FNV-1a implementation in this file:
+ *	hashes the len bytes at string, starting from FNV1A_64_INIT.
+ */
+uint64_t
+__wt_hash_fnv64(const void *string, size_t len)
+{
+	uint64_t hash;
+
+	hash = fnv_64a_buf(string, len, FNV1A_64_INIT);
+	return (hash);
+}
diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c
new file mode 100644
index 00000000000..12350ab52f4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hazard.c
@@ -0,0 +1,244 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+static void __hazard_dump(WT_SESSION_IMPL *);
+#endif
+
+/*
+ * __wt_hazard_set --
+ * Set a hazard pointer.
+ *
+ * Claims a free slot in the session's hazard array for ref->page, then
+ * re-checks the WT_REF after a full barrier. When 0 is returned, *busyp is
+ * 0 if the hazard pointer is held (or the tree doesn't need one) and 1 if
+ * the page was no longer in the WT_REF_MEM state or its pointer changed, in
+ * which case the slot has been released again. Returns ENOMEM when no slot
+ * is free and the table is already at the connection's hazard_max. In
+ * diagnostic builds the caller's file and line are stored in the slot.
+ */
+int
+__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_BTREE *btree;
+ WT_HAZARD *hp;
+ int restarts = 0;
+
+ btree = S2BT(session);
+ *busyp = 0;
+
+ /* If a file can never be evicted, hazard pointers aren't required. */
+ if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
+ return (0);
+
+ /*
+ * Do the dance:
+ *
+ * The memory location which makes a page "real" is the WT_REF's state
+ * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
+ * page eviction server.
+ *
+ * Add the WT_REF reference to the session's hazard list and flush the
+ * write, then see if the page's state is still valid. If so, we can
+ * use the page because the page eviction server will see our hazard
+ * pointer before it discards the page (the eviction server sets the
+ * state to WT_REF_LOCKED, then flushes memory and checks the hazard
+ * pointers).
+ *
+ * For sessions with many active hazard pointers, skip most of the
+ * active slots: there may be a free slot in there, but checking is
+ * expensive. Most hazard pointers are released quickly: optimize
+ * for that case.
+ */
+ for (hp = session->hazard + session->nhazard;; ++hp) {
+ /* Expand the number of hazard pointers if available.*/
+ if (hp >= session->hazard + session->hazard_size) {
+ /* Already at the configured maximum: report a full table. */
+ if (session->hazard_size >= S2C(session)->hazard_max)
+ break;
+ /*
+ * Restart the search: the active count is below the table
+ * size, so a slot before our starting point must be free.
+ * Rescan from the first slot, but only once per call.
+ */
+ if (session->nhazard < session->hazard_size &&
+ restarts++ == 0) {
+ hp = session->hazard;
+ continue;
+ }
+ /*
+ * Grow the in-use portion of the table by WT_HAZARD_INCR
+ * slots, capped at hazard_max; hp now points at the first
+ * newly exposed slot. (Presumably the array is allocated at
+ * hazard_max entries up front -- confirm at the allocation.)
+ */
+ WT_PUBLISH(session->hazard_size,
+ WT_MIN(session->hazard_size + WT_HAZARD_INCR,
+ S2C(session)->hazard_max));
+ }
+
+ /* Skip slots that are already in use. */
+ if (hp->page != NULL)
+ continue;
+
+ hp->page = ref->page;
+#ifdef HAVE_DIAGNOSTIC
+ hp->file = file;
+ hp->line = line;
+#endif
+ /* Publish the hazard pointer before reading page's state. */
+ WT_FULL_BARRIER();
+
+ /*
+ * Check if the page state is still valid, where valid means a
+ * state of WT_REF_MEM and the pointer is unchanged. (The
+ * pointer can change, it means the page was evicted between
+ * the time we set our hazard pointer and the publication. It
+ * would theoretically be possible for the page to be evicted
+ * and a different page read into the same memory, so the
+ * pointer hasn't changed but the contents have. That's OK, we
+ * found this page using the tree's key space, whatever page we
+ * find here is the page for us to use.)
+ */
+ if (ref->page == hp->page && ref->state == WT_REF_MEM) {
+ ++session->nhazard;
+ return (0);
+ }
+
+ /*
+ * The page isn't available, it's being considered for eviction
+ * (or being evicted, for all we know). If the eviction server
+ * sees our hazard pointer before evicting the page, it will
+ * return the page to use, no harm done, if it doesn't, it will
+ * go ahead and complete the eviction.
+ *
+ * We don't bother publishing this update: the worst case is we
+ * prevent some random page from being evicted.
+ */
+ hp->page = NULL;
+ *busyp = 1;
+ return (0);
+ }
+
+ /* Only reached via the break above: every slot is in use. */
+ __wt_errx(session, "session %p: hazard pointer table full", session);
+#ifdef HAVE_DIAGNOSTIC
+ __hazard_dump(session);
+#endif
+
+ return (ENOMEM);
+}
+
+/*
+ * __wt_hazard_clear --
+ * Clear a hazard pointer.
+ *
+ * Finds the slot in the session's hazard array holding page, clears it
+ * and decrements the session's active count. Returns 0 on success (or
+ * immediately, if the tree doesn't use hazard pointers). If no slot holds
+ * the page, the failure is reported through WT_PANIC_RET with EINVAL.
+ */
+int
+__wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_HAZARD *hp;
+
+ btree = S2BT(session);
+
+ /* If a file can never be evicted, hazard pointers aren't required. */
+ if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
+ return (0);
+
+ /*
+ * Clear the caller's hazard pointer.
+ * The common pattern is LIFO, so do a reverse search.
+ */
+ for (hp = session->hazard + session->hazard_size - 1;
+ hp >= session->hazard;
+ --hp)
+ if (hp->page == page) {
+ /*
+ * We don't publish the hazard pointer clear in the
+ * general case. It's not required for correctness;
+ * publishing would give an eviction thread faster
+ * access to the page if the page were selected for
+ * eviction, but the generation number was just set,
+ * so it's unlikely the page will be selected.
+ */
+ hp->page = NULL;
+
+ /*
+ * Account for the released slot in the session's count
+ * of active hazard pointers.
+ */
+ --session->nhazard;
+ return (0);
+ }
+
+ /*
+ * A serious error, we should always find the hazard pointer. Panic,
+ * because using a page we didn't have pinned down implies corruption.
+ */
+ WT_PANIC_RET(session, EINVAL,
+ "session %p: clear hazard pointer: %p: not found", session, page);
+}
+
+/*
+ * __wt_hazard_close --
+ *	Verify that no hazard pointers are set when a session is closed,
+ * complaining about (and then discarding) any that are found.
+ */
+void
+__wt_hazard_close(WT_SESSION_IMPL *session)
+{
+	WT_HAZARD *end, *slot;
+	int leftover;
+
+	end = session->hazard + session->hazard_size;
+
+	/*
+	 * Walk the table looking for a slot still in use. The session's
+	 * hazard pointer count alone would answer the question, but checking
+	 * the slots as well is a useful diagnostic.
+	 */
+	leftover = 0;
+	for (slot = session->hazard; slot < end; ++slot)
+		if (slot->page != NULL) {
+			leftover = 1;
+			break;
+		}
+	if (!leftover && session->nhazard == 0)
+		return;
+
+	__wt_errx(session,
+	    "session %p: close hazard pointer table: table not empty", session);
+
+#ifdef HAVE_DIAGNOSTIC
+	__hazard_dump(session);
+#endif
+
+	/*
+	 * Discard whatever is left. The session is being closed, so none of
+	 * these hazard pointers can be real, and leaving one behind would
+	 * keep a page from being evicted. Session close is rare enough that
+	 * the extra pass is cheap.
+	 *
+	 * This is deliberately not a panic: it shouldn't be a correctness
+	 * issue.
+	 */
+	slot = session->hazard;
+	while (slot < end) {
+		if (slot->page != NULL) {
+			slot->page = NULL;
+			--session->nhazard;
+		}
+		++slot;
+	}
+
+	/* After clearing every slot the count should have reached zero. */
+	if (session->nhazard != 0)
+		__wt_errx(session,
+		    "session %p: close hazard pointer table: count didn't "
+		    "match entries",
+		    session);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __hazard_dump --
+ *	Report every in-use hazard pointer slot of the session, along with
+ * the file and line recorded when it was set.
+ */
+static void
+__hazard_dump(WT_SESSION_IMPL *session)
+{
+	WT_HAZARD *end, *slot;
+
+	end = session->hazard + session->hazard_size;
+	for (slot = session->hazard; slot < end; ++slot) {
+		/* Empty slots have nothing to report. */
+		if (slot->page == NULL)
+			continue;
+		__wt_errx(session,
+		    "session %p: hazard pointer %p: %s, line %d",
+		    session, slot->page, slot->file, slot->line);
+	}
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/support/hex.c b/src/third_party/wiredtiger/src/support/hex.c
new file mode 100644
index 00000000000..9ee3e723fa2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hex.c
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static const u_char hex[] = "0123456789abcdef";
+
+/*
+ * __fill_hex --
+ *	In-memory conversion of raw bytes to a hexadecimal representation.
+ *
+ * Converts up to src_max bytes from src into lowercase hex digits in dest,
+ * never writing more than dest_max bytes in total, and nul-terminates the
+ * result. Conversion stops early if the destination can't hold another
+ * digit pair plus the terminator. If lenp is not NULL, it is set to the
+ * number of bytes written, including the nul terminator.
+ *
+ * A destination of zero bytes cannot even hold the terminator: nothing is
+ * written and the reported length is 0.
+ */
+static inline void
+__fill_hex(const uint8_t *src, size_t src_max,
+    uint8_t *dest, size_t dest_max, size_t *lenp)
+{
+	uint8_t *dest_orig;
+
+	/* Don't write the nul past the end of a zero-length buffer. */
+	if (dest_max == 0) {
+		if (lenp != NULL)
+			*lenp = 0;
+		return;
+	}
+
+	dest_orig = dest;
+	--dest_max;			/* save a byte for nul-termination */
+	for (; src_max > 0 && dest_max > 1;
+	    src_max -= 1, dest_max -= 2, ++src) {
+		*dest++ = hex[(*src & 0xf0) >> 4];	/* high nibble */
+		*dest++ = hex[*src & 0x0f];		/* low nibble */
+	}
+	*dest++ = '\0';
+	if (lenp != NULL)
+		*lenp = WT_PTRDIFF(dest, dest_orig);
+}
+
+/*
+ * __wt_raw_to_hex --
+ *	Convert a chunk of data to a nul-terminated printable hex string,
+ * stored in the caller's WT_ITEM.
+ */
+int
+__wt_raw_to_hex(
+    WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to)
+{
+	size_t hexlen;
+
+	/* Two hex digits per input byte, and one more byte for the nul. */
+	hexlen = size * 2 + 1;
+	WT_RET(__wt_buf_init(session, to, hexlen));
+
+	/* The helper reports the bytes written straight into the item. */
+	__fill_hex(from, size, to->mem, hexlen, &to->size);
+	return (0);
+}
+
+/*
+ * __wt_raw_to_esc_hex --
+ *	Convert a chunk of data to a nul-terminated printable string using
+ * escaped hex, as necessary: printable bytes are copied (a backslash is
+ * doubled), anything else becomes a backslash followed by two hex digits.
+ */
+int
+__wt_raw_to_esc_hex(
+    WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to)
+{
+	size_t remaining;
+	const uint8_t *p;
+	u_char *t;
+
+	/*
+	 * In the worst case, every character takes up 3 spaces, plus a
+	 * trailing nul byte.
+	 */
+	WT_RET(__wt_buf_init(session, to, size * 3 + 1));
+
+	p = from;
+	t = to->mem;
+	for (remaining = size; remaining != 0; --remaining, ++p) {
+		if (!isprint((int)*p)) {
+			/* Unprintable: backslash plus both nibbles. */
+			*t++ = '\\';
+			*t++ = hex[(*p & 0xf0) >> 4];
+			*t++ = hex[*p & 0x0f];
+			continue;
+		}
+
+		/* Printable: copy it, escaping a literal backslash. */
+		if (*p == '\\')
+			*t++ = '\\';
+		*t++ = *p;
+	}
+	*t++ = '\0';
+
+	/* The reported size includes the trailing nul. */
+	to->size = WT_PTRDIFF(t, to->mem);
+	return (0);
+}
+
+/*
+ * __hex_nibble --
+ *	Return the value of a single hex digit (0-9, lowercase a-f), or -1
+ * if the character isn't one.
+ */
+static inline int
+__hex_nibble(u_char ch)
+{
+	if (ch >= '0' && ch <= '9')
+		return (ch - '0');
+	if (ch >= 'a' && ch <= 'f')
+		return (ch - 'a' + 10);
+	return (-1);
+}
+
+/*
+ * __wt_hex2byte --
+ *	Convert a pair of hex characters into a byte. Returns 0 and stores
+ * the byte on success; returns 1, leaving the target untouched, if either
+ * character isn't a digit or a lowercase a-f.
+ */
+int
+__wt_hex2byte(const u_char *from, u_char *to)
+{
+	int hi, lo;
+
+	/* Reject a bad first digit before looking at the second one. */
+	if ((hi = __hex_nibble(from[0])) < 0)
+		return (1);
+	if ((lo = __hex_nibble(from[1])) < 0)
+		return (1);
+
+	*to = (u_char)((hi << 4) | lo);
+	return (0);
+}
+
+/*
+ * __hex_fmterr --
+ * Hex format error message.
+ *
+ * Shared failure path for the hex decoders in this file: hands a fixed
+ * "invalid format" message and EINVAL to WT_RET_MSG. (WT_RET_MSG is defined
+ * elsewhere; presumably it reports the message on the session and returns
+ * the error code -- confirm against its definition.)
+ */
+static int
+__hex_fmterr(WT_SESSION_IMPL *session)
+{
+ WT_RET_MSG(session, EINVAL, "Invalid format in hexadecimal string");
+}
+
+/*
+ * __wt_hex_to_raw --
+ *	Convert a nul-terminated printable hex string to a chunk of data; the
+ * length-taking variant does the work.
+ */
+int
+__wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to)
+{
+	size_t hexlen;
+
+	hexlen = strlen(from);
+	return (__wt_nhex_to_raw(session, from, hexlen, to));
+}
+
+/*
+ * __wt_nhex_to_raw --
+ *	Convert a printable hex string of the given length to a chunk of
+ * data. Fails with a format error if the length is odd or any character
+ * pair isn't valid hex.
+ */
+int
+__wt_nhex_to_raw(
+    WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to)
+{
+	const u_char *p;
+	u_char *t;
+
+	/* Hex digits come in pairs. */
+	if ((size & 1) != 0)
+		return (__hex_fmterr(session));
+
+	/* Each pair of digits produces one output byte. */
+	WT_RET(__wt_buf_init(session, to, size / 2));
+
+	p = (const u_char *)from;
+	t = to->mem;
+	while (size > 0) {
+		if (__wt_hex2byte(p, t))
+			return (__hex_fmterr(session));
+		p += 2;
+		size -= 2;
+		++t;
+	}
+
+	to->size = WT_PTRDIFF(t, to->mem);
+	return (0);
+}
+
+/*
+ * __wt_esc_hex_to_raw --
+ *	Convert a printable string, encoded in escaped hex, to a chunk of
+ * data: a doubled backslash decodes to one backslash, a backslash followed
+ * by two hex digits decodes to that byte, everything else is copied.
+ */
+int
+__wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to)
+{
+	const u_char *p;
+	u_char *t;
+
+	/* The decoded form is never longer than the encoded one. */
+	WT_RET(__wt_buf_init(session, to, strlen(from)));
+
+	for (p = (const u_char *)from, t = to->mem; *p != '\0'; ++p, ++t) {
+		/* Copy the byte; only a backslash needs more work. */
+		*t = *p;
+		if (*p != '\\')
+			continue;
+
+		/*
+		 * Step over the backslash. If another backslash follows, the
+		 * one already stored is the decoded character.
+		 */
+		++p;
+		if (p[0] == '\\')
+			continue;
+
+		/* Otherwise two hex digits must follow; decode over the slot. */
+		if (p[0] == '\0' || p[1] == '\0' || __wt_hex2byte(p, t))
+			return (__hex_fmterr(session));
+		++p;
+	}
+
+	to->size = WT_PTRDIFF(t, to->mem);
+	return (0);
+}
diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c
new file mode 100644
index 00000000000..5a06b72d33e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/huffman.c
@@ -0,0 +1,899 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define __HUFFMAN_DETAIL 0 /* Set to 1 for debugging output. */
+
+/* Length of header in compressed message, in bits. */
+#define WT_HUFFMAN_HEADER 3
+
+/*
+ * Maximum allowed length of Huffman code words, which otherwise can range up
+ * to (#symbols - 1) bits long. Lower value to use less memory for tables,
+ * higher value for better compression. Max value = 16 (or 32-7=25 or 64-7=57
+ * if adjust data types). FYI, JPEG uses 16. A side effect of limiting max
+ * code length is that the worst case compression (a message of the least
+ * frequent symbols) is shorter.
+ */
+#define MAX_CODE_LENGTH 16
+
+typedef struct __wt_freqtree_node {
+ /*
+ * Data structure representing a node of the huffman tree. It holds a
+ * 64-bit weight and pointers to the left and right child nodes. The
+ * node either has two child nodes or none.
+ *
+ * While the tree is being built, weight is the summed frequency of the
+ * symbols below the node; profile_tree() later overwrites it with the
+ * number of leaves below the node.
+ */
+ uint8_t symbol; /* only used in leaf nodes */
+ uint64_t weight;
+ struct __wt_freqtree_node *left; /* bit 0 */
+ struct __wt_freqtree_node *right; /* bit 1 */
+} WT_FREQTREE_NODE;
+
+/*
+ * The Huffman code assigned to one symbol: the code bits and how many of
+ * them are significant.
+ */
+typedef struct __wt_huffman_code {
+ uint16_t pattern; /* requirement: length of field's type
+ * in bits >= MAX_CODE_LENGTH.
+ */
+ uint8_t length; /* code length in bits; make_table() skips
+ * entries whose length is 0.
+ */
+} WT_HUFFMAN_CODE;
+
+typedef struct __wt_huffman_obj {
+ /*
+ * Data structure here defines specific instance of the encoder/decoder.
+ */
+ /*
+ * NOTE(review): __wt_huffman_open() uses numSymbols as the largest
+ * legal symbol value while validating its input (UINT16_MAX or
+ * UINT8_MAX), then sets it to 256, the number of per-byte codes.
+ */
+ u_int numSymbols; /* Symbols: UINT16_MAX or UINT8_MAX */
+
+ uint16_t max_depth, min_depth; /* Tree max/min depths */
+
+ /*
+ * use: codes[symbol] = struct with pattern and length.
+ * Used in encoding and decoding.
+ * memory: codes[0-to-(number of symbols - 1)]
+ */
+ WT_HUFFMAN_CODE *codes;
+
+ /*
+ * use: code2symbol[Huffman_code] = symbol.
+ * Used in decoding.
+ * memory: code2symbol[1 << max_code_length]
+ * (allocated as 1 << max_depth entries by __wt_huffman_open()).
+ */
+ uint8_t *code2symbol;
+} WT_HUFFMAN_OBJ;
+
+/*
+ * Queue element data structure.
+ *
+ * Consists of a pointer to a huffman tree node, and a pointer to the next
+ * element in the queue.
+ */
+typedef struct node_queue_elem {
+ WT_FREQTREE_NODE *node; /* tree node carried by this element */
+ struct node_queue_elem *next; /* following element in the list */
+} NODE_QUEUE_ELEM;
+
+/*
+ * Queue of huffman tree nodes.
+ *
+ * Contains a pointer to the beginning and the end of the queue, which is
+ * implemented as a linked list.
+ */
+typedef struct node_queue {
+ NODE_QUEUE_ELEM *first; /* head; NULL means the queue is empty */
+ NODE_QUEUE_ELEM *last; /* tail of the list */
+} NODE_QUEUE;
+
+/*
+ * Internal data structure used to preserve the symbol when rearranging the
+ * frequency array.
+ *
+ * __wt_huffman_open() also reads the caller's frequency table through this
+ * layout, so it has to match the externally supplied (symbol, frequency)
+ * pairs.
+ */
+typedef struct __indexed_byte {
+ uint32_t symbol; /* not uint8_t: match external data structure */
+ uint32_t frequency; /* how often the symbol is expected to occur */
+} INDEXED_SYMBOL;
+
+static int indexed_freq_compare(const void *, const void *);
+static int indexed_symbol_compare(const void *, const void *);
+static void make_table(
+ WT_SESSION_IMPL *, uint8_t *, uint16_t, WT_HUFFMAN_CODE *, u_int);
+static void node_queue_close(WT_SESSION_IMPL *, NODE_QUEUE *);
+static void node_queue_dequeue(
+ WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE **);
+static int node_queue_enqueue(
+ WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE *);
+static uint32_t profile_tree(
+ WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *);
+static void recursive_free_node(WT_SESSION_IMPL *, WT_FREQTREE_NODE *);
+static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t);
+
+#define node_queue_is_empty(queue) \
+ ((queue) == NULL || (queue)->first == NULL)
+
+/*
+ * indexed_symbol_compare --
+ *	Qsort comparator ordering INDEXED_SYMBOL entries by symbol value,
+ * lowest to highest.
+ */
+static int
+indexed_symbol_compare(const void *a, const void *b)
+{
+	const INDEXED_SYMBOL *lhs, *rhs;
+
+	lhs = a;
+	rhs = b;
+
+	if (lhs->symbol > rhs->symbol)
+		return (1);
+	if (lhs->symbol < rhs->symbol)
+		return (-1);
+	return (0);
+}
+
+/*
+ * indexed_freq_compare --
+ *	Qsort comparator ordering INDEXED_SYMBOL entries by frequency, so the
+ * most frequent symbols end up at the end of the array.
+ */
+static int
+indexed_freq_compare(const void *a, const void *b)
+{
+	const INDEXED_SYMBOL *lhs, *rhs;
+
+	lhs = a;
+	rhs = b;
+
+	if (lhs->frequency > rhs->frequency)
+		return (1);
+	if (lhs->frequency < rhs->frequency)
+		return (-1);
+	return (0);
+}
+
+/*
+ * profile_tree --
+ *	Walk the tree below node, which sits at depth len. Returns the number
+ * of leaves under the node and stores that count in the node's weight
+ * field; widens the caller's max/min leaf depths as leaves are found.
+ */
+static uint32_t
+profile_tree(WT_FREQTREE_NODE *node,
+    uint16_t len, uint16_t *max_depth, uint16_t *min_depth)
+{
+	uint32_t below_left, below_right, leaves;
+
+	if (node->left != NULL || node->right != NULL) {
+		/*
+		 * Internal node: the way the tree is constructed, it always
+		 * has both a left and a right child.
+		 */
+		below_left =
+		    profile_tree(node->left, len + 1, max_depth, min_depth);
+		below_right =
+		    profile_tree(node->right, len + 1, max_depth, min_depth);
+		leaves = below_left + below_right;
+	} else {
+		/* Leaf: counts as one, and its depth may set a new bound. */
+		leaves = 1;
+		if (len > *max_depth)
+			*max_depth = len;
+		if (len < *min_depth)
+			*min_depth = len;
+	}
+
+	/* The weight field is reused to remember the leaf count. */
+	node->weight = leaves;
+	return (leaves);
+}
+
+/*
+ * set_codes --
+ * Computes Huffman code for each symbol in tree.
+ *
+ * Method is standard way in the literature, except that limits maximum code
+ * length. A known max code length is important for limiting memory use by
+ * the tables and for knowing how large data types need to be such as the field
+ * that holds the code pattern.
+ *
+ * node - subtree to assign codes to
+ * codes - table indexed by symbol, filled in for every leaf reached
+ * pattern - code bits accumulated on the path from the root to node
+ * len - number of significant bits in pattern
+ *
+ * The node weights read here are the per-subtree leaf counts that
+ * profile_tree() stores in the weight field.
+ */
+static void
+set_codes(WT_FREQTREE_NODE *node,
+ WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len)
+{
+ WT_HUFFMAN_CODE *code;
+ uint16_t patternleft, patternright, half;
+ uint8_t remaining;
+
+ if (node->left == NULL && node->right == NULL) {
+ /* Leaf: the accumulated pattern is this symbol's code. */
+ code = &codes[node->symbol];
+ code->pattern = pattern;
+ code->length = len;
+#if __HUFFMAN_DETAIL
+ printf("%" PRIx16 ": code %" PRIx16 ", len %" PRIu8 "\n",
+ node->symbol, pattern, len);
+#endif
+ } else {
+ /*
+ * Check each subtree individually to see if can afford to split
+ * up bits into possibly shorter codes, or if need to employ all
+ * remaining bits up to MAX_CODE_LENGTH to consecutively number
+ * leaves.
+ */
+ remaining = MAX_CODE_LENGTH - len;
+ /*
+ * If not already in "low-bit mode", but need to be, open up
+ * lower-order bits for consecutive numbering.
+ *
+ * half is the number of codes left for each child once one more
+ * bit is spent on the left/right choice; if either child has
+ * more leaves than that, the remaining bits are claimed now.
+ */
+ if (len < MAX_CODE_LENGTH &&
+ ((half = 1 << (remaining - 1)) < node->left->weight ||
+ half < node->right->weight)) {
+ pattern = pattern << remaining;
+ len = MAX_CODE_LENGTH;
+ }
+
+ if (len < MAX_CODE_LENGTH) {
+ /* Normal mode: append a 0 bit for left, a 1 bit for right. */
+ patternleft = (pattern << 1) | 0;
+ patternright = (pattern << 1) | 1;
+ len++;
+ } else { /* "low bit mode" */
+ /*
+ * Number the leaves consecutively: the left subtree starts
+ * at pattern, the right one just past the left's leaves.
+ */
+ patternleft = pattern;
+ patternright = pattern + node->left->weight;
+ /* len unchanged */
+ }
+
+ set_codes(node->left, codes, patternleft, len);
+ set_codes(node->right, codes, patternright, len);
+ }
+}
+
+/*
+ * make_table --
+ *	Build the code-to-symbol lookup table used when decoding. Every table
+ * index whose leading bits match a symbol's code maps to that symbol, so
+ * decoding a code word is a single array lookup.
+ */
+static void
+make_table(WT_SESSION_IMPL *session, uint8_t *code2symbol,
+    uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt)
+{
+	/* 32 bit: these exceed uint16_t bounds at the loop boundary. */
+	uint32_t first, limit, slot, table_size;
+	uint16_t pattern, sym;
+	uint8_t codelen, shift;
+
+	/* Start from an all-zero table, for the assertion below. */
+	table_size = 1U << max_depth;
+	for (slot = 0; slot < table_size; slot++)
+		code2symbol[slot] = 0;
+
+	for (sym = 0; sym < symcnt; sym++) {
+		/* Symbols without a code don't appear in the table. */
+		codelen = codes[sym].length;
+		if (codelen == 0)
+			continue;
+
+		/*
+		 * Flood the table: a code shorter than max_depth owns every
+		 * index that starts with its bits, whatever the lower-order
+		 * bits are. Table entries are a single byte each because
+		 * symbols are packed 0-255 and there are 2^max_depth of them.
+		 */
+		pattern = codes[sym].pattern;
+		shift = max_depth - codelen;
+		first = (uint32_t)pattern << shift;
+		limit = (uint32_t)(pattern + 1) << shift;
+		for (slot = first; slot < limit; slot++) {
+			WT_ASSERT(session, code2symbol[slot] == 0);
+			code2symbol[slot] = sym;
+		}
+	}
+}
+
+/*
+ * recursive_free_node --
+ *	Free a huffman frequency tree node and everything below it; a NULL
+ * node is ignored.
+ */
+static void
+recursive_free_node(WT_SESSION_IMPL *session, WT_FREQTREE_NODE *node)
+{
+	if (node == NULL)
+		return;
+
+	/* Children first, then the node itself. */
+	recursive_free_node(session, node->left);
+	recursive_free_node(session, node->right);
+	__wt_free(session, node);
+}
+
+/*
+ * __wt_huffman_open --
+ *	Take a frequency table and return a pointer to a descriptor object.
+ *
+ * symbol_frequency_array is an array of symcnt (symbol, frequency) pairs
+ * laid out as INDEXED_SYMBOL; it is sorted in place by symbol but remains
+ * owned by the caller and is never freed here. numbytes is the symbol width
+ * in bytes and must be 1 or 2. On success 0 is returned and the new
+ * descriptor is stored through retp; on failure a non-zero error is
+ * returned, everything allocated here is released and retp is not written.
+ */
+int
+__wt_huffman_open(WT_SESSION_IMPL *session,
+    void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp)
+{
+	INDEXED_SYMBOL *indexed_freqs, *input, *sym;
+	NODE_QUEUE *combined_nodes, *leaves;
+	WT_DECL_RET;
+	WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
+	WT_HUFFMAN_OBJ *huffman;
+	uint64_t w1, w2;
+	u_int n;
+	uint16_t i;
+
+	/*
+	 * Keep the caller's table in its own pointer: indexed_freqs is freed
+	 * on the way out and must only ever reference memory allocated here.
+	 */
+	input = symbol_frequency_array;
+	indexed_freqs = NULL;
+
+	combined_nodes = leaves = NULL;
+	node = node2 = tempnode = NULL;
+
+	WT_RET(__wt_calloc_def(session, 1, &huffman));
+
+	/*
+	 * The frequency table is 4B pairs of symbol and frequency. The symbol
+	 * is either 1 or 2 bytes and the frequency ranges from 1 to UINT32_MAX
+	 * (a frequency of 0 means the value is never expected to appear in the
+	 * input). Validate the symbols are within range.
+	 */
+	if (numbytes != 1 && numbytes != 2)
+		WT_ERR_MSG(session, EINVAL,
+		    "illegal number of symbol bytes specified for a huffman "
+		    "table");
+
+	if (symcnt == 0)
+		WT_ERR_MSG(session, EINVAL,
+		    "illegal number of symbols specified for a huffman table");
+
+	/* For now, the largest legal symbol value for this symbol width. */
+	huffman->numSymbols = numbytes == 2 ? UINT16_MAX : UINT8_MAX;
+
+	/*
+	 * Order the array by symbol and check for invalid symbols and
+	 * duplicates. The index is as wide as symcnt: a 16 bit index would
+	 * wrap, and never terminate, on a table of 65536 or more entries.
+	 */
+	qsort((void *)input,
+	    symcnt, sizeof(INDEXED_SYMBOL), indexed_symbol_compare);
+	for (n = 0; n < symcnt; ++n) {
+		if (n > 0 && input[n].symbol == input[n - 1].symbol)
+			WT_ERR_MSG(session, EINVAL,
+			    "duplicate symbol %" PRIx32
+			    " specified in a huffman table",
+			    input[n].symbol);
+		if (input[n].symbol > huffman->numSymbols)
+			WT_ERR_MSG(session, EINVAL,
+			    "illegal symbol %" PRIx32
+			    " specified in a huffman table",
+			    input[n].symbol);
+	}
+
+	/*
+	 * Massage frequencies.
+	 */
+	WT_ERR(__wt_calloc_def(session, 256, &indexed_freqs));
+
+	/*
+	 * Minimum of frequency==1 so everybody gets a Huffman code, in case
+	 * data evolves and we need to represent this value.
+	 */
+	for (i = 0; i < 256; i++) {
+		sym = &indexed_freqs[i];
+		sym->symbol = i;
+		sym->frequency = 1;
+	}
+	/*
+	 * Avoid large tables by splitting UTF-16 frequencies into high byte
+	 * and low byte.
+	 */
+	for (n = 0; n < symcnt; n++) {
+		sym = &input[n];
+		indexed_freqs[sym->symbol & 0xff].frequency += sym->frequency;
+		if (numbytes == 2)
+			indexed_freqs[(sym->symbol >> 8) & 0xff].frequency +=
+			    sym->frequency;
+	}
+	/* From here on everything works on the 256 per-byte entries. */
+	huffman->numSymbols = symcnt = 256;
+
+	/*
+	 * The array must be sorted by frequency to be able to use a linear time
+	 * construction algorithm.
+	 */
+	qsort((void *)indexed_freqs,
+	    symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare);
+
+	/* We need two node queues to build the tree. */
+	WT_ERR(__wt_calloc_def(session, 1, &leaves));
+	WT_ERR(__wt_calloc_def(session, 1, &combined_nodes));
+
+	/*
+	 * Adding the leaves to the queue.
+	 *
+	 * Discard symbols with a frequency of 0; this assumes these symbols
+	 * never occur in the source stream, and the purpose is to reduce the
+	 * huffman tree's size.
+	 */
+	for (i = 0; i < symcnt; ++i)
+		if (indexed_freqs[i].frequency > 0) {
+			WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+			tempnode->symbol = (uint8_t)indexed_freqs[i].symbol;
+			tempnode->weight = indexed_freqs[i].frequency;
+			WT_ERR(node_queue_enqueue(session, leaves, tempnode));
+			/* The queue now references the node. */
+			tempnode = NULL;
+		}
+
+	while (!node_queue_is_empty(leaves) ||
+	    !node_queue_is_empty(combined_nodes)) {
+		/*
+		 * We have to get the node with the smaller weight, examining
+		 * both queues' first element. We are collecting pairs of these
+		 * items, by alternating between node and node2:
+		 */
+		refnode = !node ? &node : &node2;
+
+		/*
+		 * To decide which queue must be used, we get the weights of
+		 * the first items from both:
+		 */
+		w1 = node_queue_is_empty(leaves) ?
+		    UINT64_MAX : leaves->first->node->weight;
+		w2 = node_queue_is_empty(combined_nodes) ?
+		    UINT64_MAX : combined_nodes->first->node->weight;
+
+		/*
+		 * Based on the two weights we finally can dequeue the smaller
+		 * element and place it to the alternating target node pointer:
+		 */
+		if (w1 < w2)
+			node_queue_dequeue(session, leaves, refnode);
+		else
+			node_queue_dequeue(session, combined_nodes, refnode);
+
+		/*
+		 * In every second run, we have both node and node2 initialized.
+		 */
+		if (node != NULL && node2 != NULL) {
+			WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+
+			/* The new weight is the sum of the two weights. */
+			tempnode->weight = node->weight + node2->weight;
+			tempnode->left = node;
+			tempnode->right = node2;
+
+			/* Enqueue it to the combined nodes queue */
+			WT_ERR(node_queue_enqueue(
+			    session, combined_nodes, tempnode));
+			tempnode = NULL;
+
+			/* Reset the state pointers */
+			node = node2 = NULL;
+		}
+	}
+
+	/*
+	 * The remaining node is in the node variable, this is the root of the
+	 * tree. Profile it to find the longest and shortest code lengths;
+	 * the longest, capped at MAX_CODE_LENGTH, sizes the decode table.
+	 */
+	huffman->max_depth = 0;
+	huffman->min_depth = MAX_CODE_LENGTH;
+	(void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
+	if (huffman->max_depth > MAX_CODE_LENGTH)
+		huffman->max_depth = MAX_CODE_LENGTH;
+
+	WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes));
+	set_codes(node, huffman->codes, 0, 0);
+
+	WT_ERR(__wt_calloc_def(
+	    session, 1U << huffman->max_depth, &huffman->code2symbol));
+	make_table(session, huffman->code2symbol,
+	    huffman->max_depth, huffman->codes, huffman->numSymbols);
+
+#if __HUFFMAN_DETAIL
+	{
+	uint8_t symbol;
+	uint32_t weighted_length;
+
+	printf("leaf depth %" PRIu16 "..%" PRIu16 ", memory use: "
+	    "codes %u# * %uB + code2symbol %u# * %uB\n",
+	    huffman->min_depth, huffman->max_depth,
+	    huffman->numSymbols, (u_int)sizeof(WT_HUFFMAN_CODE),
+	    1U << huffman->max_depth, (u_int)sizeof(uint16_t));
+
+	/*
+	 * measure quality of computed Huffman codes, for different max bit
+	 * lengths (say, 16 vs 24 vs 32)
+	 */
+	weighted_length = 0;
+	for (i = 0; i < symcnt; i++) {
+		symbol = indexed_freqs[i].symbol;
+		weighted_length +=
+		    indexed_freqs[i].frequency * huffman->codes[symbol].length;
+		printf(
+		    "\t%" PRIu16 "->%" PRIu16 ". %" PRIu32 " * %" PRIu8 "\n",
+		    i, symbol,
+		    indexed_freqs[i].frequency, huffman->codes[symbol].length);
+	}
+	printf("weighted length of all codes (the smaller the better): "
+	    "%" PRIu32 "\n", weighted_length);
+	}
+#endif
+
+	*(void **)retp = huffman;
+
+	if (0) {
+err:		if (ret == 0)
+			ret = WT_ERROR;
+	}
+
+	/*
+	 * Shared cleanup for success and failure: the working table, the
+	 * queues and the tree are only needed during construction.
+	 */
+	__wt_free(session, indexed_freqs);
+	if (leaves != NULL)
+		node_queue_close(session, leaves);
+	if (combined_nodes != NULL)
+		node_queue_close(session, combined_nodes);
+	if (node != NULL)
+		recursive_free_node(session, node);
+	if (node2 != NULL)
+		recursive_free_node(session, node2);
+	__wt_free(session, tempnode);
+	if (ret != 0)
+		__wt_huffman_close(session, huffman);
+	return (ret);
+}
+
+/*
+ * __wt_huffman_close --
+ *	Discard a Huffman descriptor object: both lookup tables, then the
+ * descriptor itself.
+ */
+void
+__wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg)
+{
+	WT_HUFFMAN_OBJ *obj;
+
+	obj = huffman_arg;
+
+	__wt_free(session, obj->codes);
+	__wt_free(session, obj->code2symbol);
+	__wt_free(session, obj);
+}
+
+#if __HUFFMAN_DETAIL
+/*
+ * __wt_print_huffman_code --
+ *	Debugging aid: print the Huffman code assigned to a symbol, or a note
+ * saying why it has none. Always returns 0.
+ */
+int
+__wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
+{
+	const WT_HUFFMAN_CODE *code;
+	WT_HUFFMAN_OBJ *huffman;
+
+	huffman = huffman_arg;
+
+	if (symbol >= huffman->numSymbols) {
+		printf("symbol %" PRIu16 " out of range\n", symbol);
+		return (0);
+	}
+
+	code = &huffman->codes[symbol];
+	if (code->length == 0) {
+		printf(
+		    "symbol %" PRIu16 " not defined -- 0 frequency\n",
+		    symbol);
+		return (0);
+	}
+
+	/* should print code as binary */
+	printf(
+	    "%" PRIu16 " -> code pattern "
+	    "%" PRIx16 ", length %" PRIu8 "\n",
+	    symbol, code->pattern, code->length);
+	return (0);
+}
+#endif
+
+/*
+ * __wt_huffman_encode --
+ * Take a byte string, encode it into the target.
+ *
+ * Translation from symbol to Huffman code is a simple array lookup.
+ *
+ * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per
+ * symbol. Then, given a symbol:
+ * pattern = codes[symbol].pattern;
+ * length = codes[symbol].length;
+ *
+ * To encode byte-string, we iterate over the input symbols. For each symbol,
+ * look it up via table, shift bits onto a shift register (an int long enough
+ * to hold the longest code word + up to 7 bits remaining from the previous),
+ * then drain out full bytes. Finally, at the end flush remaining bits
+ * and write header bits.
+ *
+ * huffman_arg - descriptor created by __wt_huffman_open
+ * from_arg, from_len - the bytes to encode; a length of 0 yields an empty
+ * result without touching the descriptor
+ * to_buf - receives the encoded bytes, sized to the exact encoded length
+ *
+ * Returns 0 on success, or the error from allocating the scratch buffer or
+ * sizing the target buffer.
+ */
+int
+__wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg,
+ const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
+{
+ WT_DECL_RET;
+ WT_HUFFMAN_CODE code;
+ WT_HUFFMAN_OBJ *huffman;
+ WT_ITEM *tmp;
+ size_t max_len, outlen, bytes;
+ uint64_t bitpos;
+ const uint8_t *from;
+ uint8_t len, *out, padding_info, symbol;
+
+ /*
+ * Shift register to accumulate bits from input.
+ * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits
+ * and preferably in a machine register.
+ */
+ uint32_t bits;
+
+ /* Count of bits in shift register ('bits' above). */
+ uint8_t valid;
+
+ huffman = huffman_arg;
+ from = from_arg;
+ tmp = NULL;
+
+ /*
+ * We don't want to find all of our callers and ensure they don't pass
+ * 0-length byte strings, but there's no reason to do any work.
+ */
+ if (from_len == 0) {
+ to_buf->size = 0;
+ return (0);
+ }
+
+ /*
+ * Compute the largest compressed output size, which is if all symbols
+ * are least frequent and so have largest Huffman codes, and compressed
+ * output may be larger than the input size. This way we don't have to
+ * worry about resizing the buffer during compression. Use the shared
+ * system buffer while compressing, then allocate a new buffer of the
+ * right size and copy the result into it.
+ */
+ max_len = (WT_HUFFMAN_HEADER +
+ from_len * huffman->max_depth + 7 /* round up to full byte */) / 8;
+ WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+
+ /*
+ * Leave the first 3 bits of the encoded value empty, it holds the
+ * number of bits actually used in the last byte of the encoded value.
+ *
+ * Starting with an all-zero register that already counts the header
+ * bits as valid makes the first byte drained begin with zero header
+ * bits, which are or'd with the real value once it is known.
+ */
+ bits = 0;
+ bitpos = WT_HUFFMAN_HEADER;
+ valid = WT_HUFFMAN_HEADER;
+ out = tmp->mem;
+ for (bytes = 0; bytes < from_len; bytes++) {
+ WT_ASSERT(session, WT_PTR_IN_RANGE(from, from_arg, from_len));
+
+ symbol = *from++;
+
+ /* Translate symbol into Huffman code and stuff into buffer. */
+ code = huffman->codes[symbol];
+ len = code.length;
+ bits = (bits << len) | code.pattern;
+ valid += len;
+ bitpos += len;
+ /* Drain every complete byte, oldest (highest) bits first. */
+ while (valid >= 8) {
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
+ *out++ = (uint8_t)(bits >> (valid - 8));
+ valid -= 8;
+ }
+ }
+ if (valid > 0) { /* Flush shift register. */
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
+ /* Left-align the leftover bits; the low bits are padding. */
+ *out = (uint8_t)(bits << (8 - valid));
+ }
+
+ /*
+ * At this point, bitpos is the total number of used bits (including
+ * the 3 bits at the beginning of the buffer, which we'll set now to
+ * the number of bits used in the last byte). Note if the number of
+ * bits used in the last byte is 8, we set the 3 bits to 0, in other
+ * words, the first 3 bits of the encoded value are the number of bits
+ * used in the last byte, unless they're 0, in which case there are 8
+ * bits used in the last byte.
+ */
+ padding_info = (bitpos % 8) << (8 - WT_HUFFMAN_HEADER);
+ ((uint8_t *)tmp->mem)[0] |= padding_info;
+
+ /* Copy result of exact known size into caller's buffer. */
+ outlen = (uint32_t)((bitpos + 7) / 8);
+ WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
+ memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+ printf("encode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
+ max_len, outlen);
+#endif
+
+ /* Reached on success as well: the scratch buffer is always released. */
+err: __wt_scr_free(&tmp);
+ return (ret);
+
+}
+
+/*
+ * __wt_huffman_decode --
+ *	Take a byte string, decode it into the target.
+ *
+ * Translation from Huffman code to symbol is a simple array lookup.
+ *
+ * WT_HUFFMAN_OBJ contains an array called 'code2symbol' indexed by code word
+ * and whose value is the corresponding symbol.
+ * From the symbol, we index into the 'codes' array to get the code length.
+ *
+ * When decoding a message, we don't know where the boundaries are between
+ * codes. The trick is that we collect enough bits for the longest code word,
+ * and construct the table such that for codes with fewer bits we flood the
+ * table with all of the bit patterns in the lower order bits. This works
+ * because the Huffman code is a unique prefix, and by the flooding we are
+ * treating bits beyond the unique prefix as don't care bits.
+ *
+ * For example, we have table of length 2^max_code_length (1<<max_code_length).
+ * For a code of length, max_code_length, the position code2symbol[code] =
+ * symbol.
+ * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol,
+ * as well as code2symbol[(code << 1) | 1] = symbol.
+ * And so on, so in general we fill:
+ *	code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
+ *
+ * To decode a message, we read in enough bits from input to fill the shift
+ * register with at least MAX_CODE_LENGTH bits.
+ * We look up in the table code2symbol to obtain the symbol.
+ * We look up the symbol in 'codes' to obtain the code length.
+ * Finally, subtract off these bits from the shift register.
+ *
+ * Arguments:
+ *	huffman_arg	the WT_HUFFMAN_OBJ holding the code tables
+ *	from_arg	encoded input, from_len bytes long
+ *	to_buf		receives the decoded symbols; its size is set to the
+ *			number of symbols decoded (0 for empty input)
+ *
+ * Returns 0 on success. Input whose bit count does not add up to a whole
+ * number of code words is rejected with EINVAL instead of being decoded
+ * past the end of the scratch buffer.
+ */
+int
+__wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg,
+    const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
+{
+	WT_DECL_RET;
+	WT_ITEM *tmp;
+	WT_HUFFMAN_OBJ *huffman;
+	size_t from_bytes, len, max_len, outlen;
+	uint64_t from_len_bits;
+	uint32_t bits, mask, max;
+	uint16_t pattern;
+	const uint8_t *from;
+	uint8_t padding_info, symbol, *to, valid;
+
+	huffman = huffman_arg;
+	from = from_arg;
+	tmp = NULL;
+
+	/*
+	 * We don't want to find all of our callers and ensure they don't pass
+	 * 0-length byte strings, but there's no reason to do any work.
+	 */
+	if (from_len == 0) {
+		to_buf->size = 0;
+		return (0);
+	}
+
+	/*
+	 * The first 3 bits are the number of used bits in the last byte, unless
+	 * they're 0, in which case there are 8 bits used in the last byte.
+	 */
+	padding_info = (*from & 0xE0) >> (8 - WT_HUFFMAN_HEADER);
+	from_len_bits = from_len * 8;
+	if (padding_info != 0)
+		from_len_bits -= 8U - padding_info;
+
+	/*
+	 * Number of bits that have codes. The header itself must fit in the
+	 * bits the padding information claims are in use, otherwise the
+	 * unsigned subtraction would wrap to a huge value.
+	 */
+	if (from_len_bits < WT_HUFFMAN_HEADER)
+		WT_ERR_MSG(session, EINVAL,
+		    "huffman decode: corrupted input");
+	from_len_bits -= WT_HUFFMAN_HEADER;
+
+	/*
+	 * Compute largest uncompressed output size, which is if all symbols are
+	 * most frequent and so have smallest Huffman codes and therefore
+	 * largest expansion. Use the shared system buffer while uncompressing,
+	 * then allocate a new buffer of exactly the right size and copy the
+	 * result into it.
+	 */
+	max_len = (uint32_t)(from_len_bits / huffman->min_depth);
+	WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+	to = tmp->mem;
+
+	/* The first byte of input is a special case because of header bits. */
+	bits = *from++;
+	valid = 8 - WT_HUFFMAN_HEADER;
+	from_bytes = from_len - 1;
+
+	max = huffman->max_depth;
+	mask = (1U << max) - 1;
+	for (outlen = 0; from_len_bits > 0; outlen++) {
+		while (valid < max && from_bytes > 0) {
+			WT_ASSERT(session,
+			    WT_PTR_IN_RANGE(from, from_arg, from_len));
+			bits = (bits << 8) | *from++;
+			valid += 8;
+			from_bytes--;
+		}
+		pattern = valid >= max ?	/* short patterns near end */
+		    (bits >> (valid - max)) : (bits << (max - valid));
+		symbol = huffman->code2symbol[pattern & mask];
+		len = huffman->codes[symbol].length;
+
+		/*
+		 * Each step consumes one code word from the remaining input
+		 * bits. A code longer than what is left means the input is
+		 * corrupted; a zero-length code would never make progress.
+		 * Fail at run time rather than relying on an assertion, so
+		 * the counter cannot wrap and the loop cannot write beyond
+		 * the scratch buffer.
+		 */
+		if (len == 0 || len > from_len_bits)
+			WT_ERR_MSG(session, EINVAL,
+			    "huffman decode: corrupted input");
+		valid -= len;
+		from_len_bits -= len;
+
+		WT_ASSERT(session,
+		    WT_PTR_IN_RANGE(to, tmp->mem, tmp->memsize));
+		*to++ = symbol;
+	}
+
+	/* Return the number of bytes used. */
+	WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
+	memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+	printf("decode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
+	    max_len, outlen);
+#endif
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * node_queue_close --
+ *	Release a queue: every element of its linked list, then the queue
+ * record itself.
+ *
+ * The Huffman tree nodes the elements refer to are left untouched.
+ */
+static void
+node_queue_close(WT_SESSION_IMPL *session, NODE_QUEUE *queue)
+{
+	NODE_QUEUE_ELEM *cur, *following;
+
+	cur = queue->first;
+	while (cur != NULL) {
+		/* Remember the successor before the element goes away. */
+		following = cur->next;
+		__wt_free(session, cur);
+		cur = following;
+	}
+
+	/* Finally, the queue record. */
+	__wt_free(session, queue);
+}
+
+/*
+ * node_queue_enqueue --
+ *	Append a tree node at the tail of the queue.
+ */
+static int
+node_queue_enqueue(
+    WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
+{
+	NODE_QUEUE_ELEM *tail_elem;
+
+	/* A fresh list element wrapping the node; nothing follows it yet. */
+	WT_RET(__wt_calloc_def(session, 1, &tail_elem));
+	tail_elem->node = node;
+	tail_elem->next = NULL;
+
+	/* Link it behind the current tail, if there is one. */
+	if (queue->last != NULL)
+		queue->last->next = tail_elem;
+
+	/* An empty queue also gets it as its head. */
+	if (queue->first == NULL)
+		queue->first = tail_elem;
+
+	queue->last = tail_elem;
+	return (0);
+}
+
+/*
+ * node_queue_dequeue --
+ *	Pop the element at the head of the queue, handing its tree node back
+ * through retp and releasing the list element.
+ */
+static void
+node_queue_dequeue(
+    WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp)
+{
+	NODE_QUEUE_ELEM *head;
+
+	head = queue->first;
+
+	/* Hand the node to the caller. */
+	*retp = head->node;
+
+	/* If the head was also the tail, the queue has no tail any more. */
+	if (queue->last == head)
+		queue->last = NULL;
+
+	/* Advance the head, then release the unlinked element. */
+	queue->first = head->next;
+	__wt_free(session, head);
+}
diff --git a/src/third_party/wiredtiger/src/support/mutex.c b/src/third_party/wiredtiger/src/support/mutex.c
new file mode 100644
index 00000000000..ffe52cf28fd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/mutex.c
@@ -0,0 +1,257 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * __wt_spin_lock_register_lock --
+ *	Record a lock in the first free slot of the connection's spinlock list.
+ */
+int
+__wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_CONNECTION_IMPL *conn;
+	u_int slot;
+
+	/*
+	 * The global library lock is initialized before any connection exists;
+	 * it arrives here with a NULL session and cannot be tracked.
+	 */
+	if (session == NULL)
+		return (0);
+
+	conn = S2C(session);
+
+	for (slot = 0; slot < WT_SPINLOCK_MAX; slot++) {
+		/* Skip slots that are visibly taken. */
+		if (conn->spinlock_list[slot] != NULL)
+			continue;
+
+		/* Claim the slot; losing the swap means trying the next. */
+		if (WT_ATOMIC_CAS(conn->spinlock_list[slot], NULL, t))
+			return (0);
+	}
+
+	WT_RET_MSG(session, ENOMEM,
+	    "spinlock connection registry failed, increase the connection's "
+	    "spinlock list size");
+}
+
+/*
+ * __wt_spin_lock_unregister_lock --
+ *	Remove a lock from the connection's list.
+ *
+ * Every slot of the connection's spinlock list that refers to the lock is
+ * cleared. A NULL session is accepted and ignored, matching
+ * __wt_spin_lock_register_lock, which never tracks locks registered
+ * without a session.
+ */
+void
+__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	WT_CONNECTION_IMPL *conn;
+	u_int i;
+
+	/*
+	 * A lock registered with a NULL session (the global library lock) was
+	 * never added to any list: there is no connection to look in and
+	 * nothing to remove.
+	 */
+	if (session == NULL)
+		return;
+
+	conn = S2C(session);
+
+	for (i = 0; i < WT_SPINLOCK_MAX; i++)
+		if (conn->spinlock_list[i] == t)
+			conn->spinlock_list[i] = NULL;
+
+	/*
+	 * XXX
+	 * The statistics thread reads through this array, there's a possible
+	 * race: if that thread reads the pointer then goes to sleep, then we
+	 * free the spinlock, then the statistics thread wakes up, it can read
+	 * free'd memory.
+	 *
+	 * This is performance debugging code, so we're not fixing the race for
+	 * now, minimize the window.
+	 */
+	WT_FULL_BARRIER();
+}
+
+/*
+ * __spin_lock_next_id --
+ * Return the next spinlock caller ID.
+ *
+ * If *idp still holds WT_SPINLOCK_REGISTER, the next unused value of the
+ * function-static counter next_id is stored in *idp; an *idp that already
+ * holds something else is left alone and 0 is returned immediately. Once
+ * next_id reaches WT_SPINLOCK_MAX_LOCATION_ID the request is reported
+ * through WT_ERR_MSG with ENOMEM. The function-static lock_id serves as a
+ * small lock around the allocation: it is taken with WT_ATOMIC_CAS and
+ * given back with WT_PUBLISH on every path that took it.
+ */
+static int
+__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
+{
+ static int lock_id = 0, next_id = 0;
+ WT_DECL_RET;
+
+ /* If we've ever registered this location, we already have an ID. */
+ if (*idp != WT_SPINLOCK_REGISTER)
+ return (0);
+
+ /*
+ * We can't use the global spinlock to lock the ID allocation (duh!),
+ * use a CAS instruction to serialize access to a local variable.
+ * This work only gets done once per library instantiation, there
+ * isn't a performance concern.
+ */
+ while (!WT_ATOMIC_CAS(lock_id, 0, 1))
+ __wt_yield();
+
+ /*
+ * Allocate a blocking ID for this location. *idp is tested a second
+ * time now that lock_id is held: the first test above ran without it,
+ * so another thread may have assigned the ID in the meantime.
+ */
+ if (*idp == WT_SPINLOCK_REGISTER) {
+ if (next_id < WT_SPINLOCK_MAX_LOCATION_ID)
+ *idp = next_id++;
+ else
+ WT_ERR_MSG(session, ENOMEM,
+ "spinlock caller location registry failed, "
+ "increase the connection's blocking matrix size");
+ }
+
+err: WT_PUBLISH(lock_id, 0);
+ return (ret);
+}
+
+/*
+ * __wt_spin_lock_register_caller --
+ *	Record where a spin-lock is acquired from (lock name, source file and
+ * line) in the connection's blocking matrix, allocating a location ID for the
+ * call site first if it does not have one yet.
+ */
+int
+__wt_spin_lock_register_caller(WT_SESSION_IMPL *session,
+    const char *name, const char *file, int line, int *idp)
+{
+	WT_CONNECTION_STATS_SPINLOCK *entry;
+	const char *basename;
+
+	/*
+	 * The location ID is a static offset into a per-connection structure,
+	 * which is fragile: with several connections some lock would be needed
+	 * to set it without racing, and with several connections (or one that
+	 * is closed and re-opened) the ID can be initialized while the
+	 * connection's own information is not.
+	 *
+	 * Get an ID for this location if it has none.
+	 */
+	WT_RET(__spin_lock_next_id(session, idp));
+
+	/* Only the last component of the path is kept. */
+	basename = strrchr(file, '/');
+	basename = basename == NULL ? file : basename + 1;
+
+	/*
+	 * Fill in the matrix entry. Two threads registering the same location
+	 * at once may race here; that is tolerated because both store the
+	 * same information.
+	 */
+	entry = &S2C(session)->spinlock_block[*idp];
+	entry->name = name;
+	entry->file = basename;
+	entry->line = line;
+	return (0);
+}
+
+/*
+ * __wt_statlog_dump_spinlock --
+ * Log the spin-lock statistics.
+ *
+ * Everything is written with fprintf to conn->stat_fp, each line starting
+ * with conn->stat_stamp and carrying the caller's "tag". First comes one
+ * acquisition count per entry of conn->spinlock_list, except that locks
+ * named "block manager" and "btree page" are summed and printed as one line
+ * each. Then, for every entry of conn->spinlock_block that has a name, its
+ * total is printed followed by its row of the blocking matrix. Counts at or
+ * below the "ignore" threshold are printed as 0, and each counter is reset
+ * to 0 after it is read when WT_CONN_STAT_CLEAR is set in conn->stat_flags.
+ *
+ * A failing fprintf is handed to WT_RET_TEST together with __wt_errno();
+ * otherwise the function returns 0.
+ *
+ * NOTE(review): p->total and p->blocked[j] are printed with "%d" although
+ * they are compared against the uint64_t "ignore"; their declared types are
+ * not visible here -- confirm the format matches the field types.
+ */
+int
+__wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag)
+{
+ WT_SPINLOCK *spin;
+ WT_CONNECTION_STATS_SPINLOCK *p, *t;
+ uint64_t block_manager, btree_page, ignore;
+ u_int i, j;
+
+ /*
+ * Ignore rare acquisition of a spinlock using a base value of 10 per
+ * second so we don't create graphs we don't care about.
+ */
+ ignore = (uint64_t)(conn->stat_usecs / 1000000) * 10;
+
+ /* Output the number of times each spinlock was acquired. */
+ block_manager = btree_page = 0;
+ for (i = 0; i < WT_ELEMENTS(conn->spinlock_list); ++i) {
+ if ((spin = conn->spinlock_list[i]) == NULL)
+ continue;
+
+ /*
+ * There are two sets of spinlocks we aggregate, the btree page
+ * locks and the block manager per-file locks. The reason is
+ * the block manager locks grow with the number of files open
+ * (and LSM and bloom filters can open a lot of files), and
+ * there are 16 btree page locks and splitting them out has not
+ * historically been that informative.
+ */
+ if (strcmp(spin->name, "block manager") == 0) {
+ block_manager += spin->counter;
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ spin->counter = 0;
+ continue;
+ }
+ if (strcmp(spin->name, "btree page") == 0) {
+ btree_page += spin->counter;
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ spin->counter = 0;
+ continue;
+ }
+
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+ conn->stat_stamp,
+ spin->counter <= ignore ? 0 : spin->counter,
+ tag, spin->name) < 0),
+ __wt_errno());
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ spin->counter = 0;
+ }
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+ conn->stat_stamp,
+ block_manager <= ignore ? 0 : block_manager,
+ tag, "block manager") < 0),
+ __wt_errno());
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+ conn->stat_stamp,
+ btree_page <= ignore ? 0 : btree_page,
+ tag, "btree page") < 0),
+ __wt_errno());
+
+ /*
+ * Output the number of times each location acquires its spinlock and
+ * the blocking matrix.
+ */
+ for (i = 0; i < WT_ELEMENTS(conn->spinlock_block); ++i) {
+ p = &conn->spinlock_block[i];
+ if (p->name == NULL)
+ continue;
+
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %d %s spinlock %s acquired by %s(%d)\n",
+ conn->stat_stamp,
+ p->total <= ignore ? 0 : p->total,
+ tag,
+ p->name, p->file, p->line) < 0), __wt_errno());
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ p->total = 0;
+
+ for (j = 0; j < WT_ELEMENTS(conn->spinlock_block); ++j) {
+ t = &conn->spinlock_block[j];
+ if (t->name == NULL)
+ continue;
+
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %d %s spinlock %s: %s(%d) blocked by %s(%d)\n",
+ conn->stat_stamp,
+ p->blocked[j] <= ignore ? 0 : p->blocked[j],
+ tag,
+ p->name, p->file, p->line,
+ t->file, t->line) < 0), __wt_errno());
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ p->blocked[j] = 0;
+ }
+ }
+
+ WT_FULL_BARRIER(); /* Minimize the window. */
+ return (0);
+}
+
+#endif /* SPINLOCK_PTHREAD_MUTEX_LOGGING */
diff --git a/src/third_party/wiredtiger/src/support/pow.c b/src/third_party/wiredtiger/src/support/pow.c
new file mode 100644
index 00000000000..a6bf6c7227f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/pow.c
@@ -0,0 +1,130 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+#ifdef __WIREDTIGER_UNUSED__
+
+/*
+ * __wt_nlpo2_round --
+ * Round up to the next-largest power-of-two for a 32-bit unsigned value.
+ *
+ * In 12 operations, this code computes the next highest power of 2 for a 32-bit
+ * integer. The result may be expressed by the formula 1U << (lg(v - 1) + 1).
+ * Note that in the edge case where v is 0, it returns 0, which isn't a power of
+ * 2; you might append the expression v += (v == 0) to remedy this if it
+ * matters. It would be faster by 2 operations to use the formula and the
+ * log base 2 method that uses a lookup table, but in some situations, lookup
+ * tables are not suitable, so the above code may be best. (On a Athlon XP 2100+
+ * I've found the above shift-left and then OR code is as fast as using a single
+ * BSR assembly language instruction, which scans in reverse to find the highest
+ * set bit.) It works by copying the highest set bit to all of the lower bits,
+ * and then adding one, which results in carries that set all of the lower bits
+ * to 0 and one bit beyond the highest set bit to 1. If the original number was
+ * a power of 2, then the decrement will reduce it to one less, so that we round
+ * up to the same original value. Devised by Sean Anderson, September 14, 2001.
+ * Pete Hart pointed me to a couple newsgroup posts by him and William Lewis in
+ * February of 1997, where they arrive at the same algorithm.
+ * http://graphics.stanford.edu/~seander/bithacks.html
+ * Sean Eron Anderson, seander@cs.stanford.edu
+ */
+uint32_t
+__wt_nlpo2_round(uint32_t v)
+{
+	uint32_t shift;
+
+	/* Step back by one so an exact power of two maps to itself. */
+	v--;
+
+	/* Smear the highest set bit into every lower bit: 1, 2, 4, 8, 16. */
+	for (shift = 1; shift <= 16; shift <<= 1)
+		v |= v >> shift;
+
+	/* All-ones below the top bit, plus one, is the next power of two. */
+	return (v + 1);
+}
+
+/*
+ * __wt_nlpo2 --
+ *	Return the power-of-two strictly above the value's highest set bit
+ * (so a value that already is a power of two is doubled).
+ */
+uint32_t
+__wt_nlpo2(uint32_t v)
+{
+	uint32_t shift;
+
+	/* Copy the highest set bit into every lower bit: 1, 2, 4, 8, 16. */
+	for (shift = 1; shift <= 16; shift <<= 1)
+		v |= v >> shift;
+
+	return (v + 1);
+}
+#endif /* __WIREDTIGER_UNUSED__ */
+
+/*
+ * __wt_log2_int --
+ *	Return the integer log base 2 of a value, counted by shifting the
+ * value right until nothing is left (one step per bit);
+ *	http://graphics.stanford.edu/~seander/bithacks.html
+ *
+ * Both 0 and 1 yield 0.
+ */
+uint32_t
+__wt_log2_int(uint32_t n)
+{
+	uint32_t log;
+
+	for (log = 0; (n >>= 1) != 0; log++)
+		;
+	return (log);
+}
+
+/*
+ * __wt_ispo2 --
+ *	Return non-zero if a number is a power-of-two.
+ */
+int
+__wt_ispo2(uint32_t v)
+{
+	uint32_t below;
+
+	/*
+	 * A power of two has a single bit set, so it shares no bits with the
+	 * value one below it: (v & (v - 1)) == 0.
+	 *
+	 * Zero passes this test as well and is reported as a power of two; a
+	 * caller that needs to exclude it can use: (! (v & (v - 1)) && v)
+	 */
+	below = v - 1;
+	return ((v & below) == 0 ? 1 : 0);
+}
+
+/*
+ * __wt_rduppo2 --
+ *	Round the given int up to the next multiple of N, where N is power of 2.
+ *
+ * Returns 0 when N does not pass the __wt_ispo2 test.
+ */
+uint32_t
+__wt_rduppo2(uint32_t n, uint32_t po2)
+{
+	uint32_t shift;
+
+	if (!__wt_ispo2(po2))
+		return (0);
+
+	/* Count whole multiples covering n - 1, add one, scale back up. */
+	shift = __wt_log2_int(po2);
+	return ((((n - 1) >> shift) + 1) << shift);
+}
diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c
new file mode 100644
index 00000000000..b716eb8c58b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/rand.c
@@ -0,0 +1,69 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+#undef M_W
+#define M_W (rnd)[0]
+#undef M_Z
+#define M_Z (rnd)[1]
+
+/*
+ * __wt_random_init --
+ *	Seed the two words of generator state used by __wt_random.
+ */
+void
+__wt_random_init(uint32_t *rnd)
+{
+	static const uint32_t seed[] = { 521288629, 362436069 };
+
+	M_W = seed[0];
+	M_Z = seed[1];
+}
+
+/*
+ * __wt_random --
+ *	Return a 32-bit pseudo-random number.
+ *
+ * George Marsaglia's multiply-with-carry generator: cheap to compute and
+ * reasonably random.
+ *
+ * Several threads may call this on the same state at once. Two callers
+ * getting the same number is acceptable. What must not happen is a caller
+ * combining two different values of m_w or m_z read on either side of
+ * another thread's update: that could produce a zero, and a zero state
+ * stays zero forever. Each word of the shared state is therefore read
+ * exactly once, into a local, and all arithmetic is done on the locals.
+ */
+uint32_t
+__wt_random(uint32_t *rnd)
+{
+	uint32_t w, z;
+
+	/* Snapshot the shared state. */
+	w = M_W;
+	z = M_Z;
+
+	/* Advance each half and publish it. */
+	z = 36969 * (z & 65535) + (z >> 16);
+	M_Z = z;
+	w = 18000 * (w & 65535) + (w >> 16);
+	M_W = w;
+
+	return (z << 16) + (w & 65535);
+}
diff --git a/src/third_party/wiredtiger/src/support/scratch.c b/src/third_party/wiredtiger/src/support/scratch.c
new file mode 100644
index 00000000000..ca2cdac8377
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/scratch.c
@@ -0,0 +1,319 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_buf_grow_worker --
+ * Grow a buffer that may be in-use, and ensure that all data is local to
+ * the buffer.
+ *
+ * buf->mem is reallocated only when "size" exceeds buf->memsize, through
+ * __wt_realloc_aligned when WT_ITEM_ALIGNED is set on the buffer and
+ * through __wt_realloc otherwise. On a 0 return buf->data points into
+ * buf->mem: at the start of it when there was no data or the data lived
+ * elsewhere, at its previous offset when it was already local. A failing
+ * reallocation is handed to WT_RET.
+ *
+ * NOTE(review): when the data is copied in, buf->size bytes are copied but
+ * the memory is only grown to "size"; this presumably relies on callers
+ * passing a size of at least buf->size -- confirm.
+ */
+int
+__wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ size_t offset;
+ int copy_data;
+
+ /*
+ * Maintain the existing data: there are 3 cases:
+ * No existing data: allocate the required memory, and initialize
+ * the data to reference it.
+ * Existing data local to the buffer: set the data to the same
+ * offset in the re-allocated memory.
+ * Existing data not-local to the buffer: copy the data into the
+ * buffer and set the data to reference it.
+ *
+ * The offset is captured here, before any reallocation, because
+ * buf->mem may be changed by the realloc call below.
+ */
+ if (WT_DATA_IN_ITEM(buf)) {
+ offset = WT_PTRDIFF(buf->data, buf->mem);
+ copy_data = 0;
+ } else {
+ offset = 0;
+ copy_data = buf->size ? 1 : 0;
+ }
+
+ /*
+ * This function is also used to ensure data is local to the buffer,
+ * check to see if we actually need to grow anything.
+ */
+ if (size > buf->memsize) {
+ if (F_ISSET(buf, WT_ITEM_ALIGNED))
+ WT_RET(__wt_realloc_aligned(
+ session, &buf->memsize, size, &buf->mem));
+ else
+ WT_RET(__wt_realloc(
+ session, &buf->memsize, size, &buf->mem));
+ }
+
+ if (buf->data == NULL) {
+ buf->data = buf->mem;
+ buf->size = 0;
+ } else {
+ if (copy_data)
+ memcpy(buf->mem, buf->data, buf->size);
+ buf->data = (uint8_t *)buf->mem + offset;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_buf_fmt --
+ *	Grow a buffer to accommodate a formatted string.
+ *
+ * The string is formatted into buf->mem, replacing whatever the buffer
+ * held; on success buf->data refers to the start of that memory and
+ * buf->size is the string length, not counting the terminating nul.
+ * Returns 0 on success, an error from __wt_buf_extend if the buffer cannot
+ * be grown, or __wt_errno() if vsnprintf itself reports a failure.
+ */
+int
+__wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
+    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+	va_list ap;
+	size_t len;
+	int n;
+
+	for (;;) {
+		va_start(ap, fmt);
+		n = vsnprintf(buf->mem, buf->memsize, fmt, ap);
+		va_end(ap);
+
+		/*
+		 * A negative return is an output error, not a length. Cast
+		 * to size_t it would look like an enormous string, and the
+		 * "len + 1" below could wrap to 0, so the buffer would never
+		 * grow and this loop would never end.
+		 */
+		if (n < 0)
+			return (__wt_errno());
+		len = (size_t)n;
+
+		/* Check if there was enough space. */
+		if (len < buf->memsize) {
+			buf->data = buf->mem;
+			buf->size = len;
+			return (0);
+		}
+
+		/*
+		 * If not, vsnprintf told us how long the string is: ask for
+		 * room for it plus the terminating nul and format again.
+		 */
+		WT_RET(__wt_buf_extend(session, buf, len + 1));
+	}
+}
+
+/*
+ * __wt_buf_catfmt --
+ *	Grow a buffer to append a formatted string.
+ *
+ * The string is formatted at offset buf->size of buf->mem and buf->size is
+ * advanced by its length, not counting the terminating nul. Returns 0 on
+ * success, an error from __wt_buf_extend if the buffer cannot be grown, or
+ * __wt_errno() if vsnprintf itself reports a failure.
+ */
+int
+__wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
+    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+	va_list ap;
+	size_t len, space;
+	char *p;
+	int n;
+
+	/*
+	 * If we're appending data to an existing buffer, any data field should
+	 * point into the allocated memory. (It wouldn't be insane to copy any
+	 * previously existing data at this point, if data wasn't in the local
+	 * buffer, but we don't and it would be bad if we didn't notice it.)
+	 */
+	WT_ASSERT(session, buf->data == NULL || WT_DATA_IN_ITEM(buf));
+
+	for (;;) {
+		va_start(ap, fmt);
+		p = (char *)((uint8_t *)buf->mem + buf->size);
+		WT_ASSERT(session, buf->memsize >= buf->size);
+		space = buf->memsize - buf->size;
+		n = vsnprintf(p, (size_t)space, fmt, ap);
+		va_end(ap);
+
+		/*
+		 * A negative return is an output error, not a length. Cast
+		 * to size_t it would look like an enormous string, and the
+		 * size computed below could wrap, so the buffer would never
+		 * grow and this loop would never end.
+		 */
+		if (n < 0)
+			return (__wt_errno());
+		len = (size_t)n;
+
+		/* Check if there was enough space. */
+		if (len < space) {
+			buf->size += len;
+			return (0);
+		}
+
+		/*
+		 * If not, vsnprintf told us how long the string is: ask for
+		 * room for the existing data, the new string and the
+		 * terminating nul, and format again.
+		 */
+		WT_RET(__wt_buf_extend(session, buf, buf->size + len + 1));
+	}
+}
+
+/*
+ * __wt_scr_alloc_func --
+ * Scratch buffer allocation function.
+ *
+ * Picks a buffer from session->scratch that is not flagged WT_ITEM_INUSE,
+ * preferring the smallest one of at least "size" bytes and otherwise the
+ * largest available. With no such buffer, a new WT_ITEM is allocated in an
+ * empty slot, the array being grown by 10 slots first when it has no empty
+ * slot either. The chosen buffer is passed to __wt_buf_init with "size",
+ * flagged WT_ITEM_INUSE and returned through scratchp; *scratchp is NULL
+ * whenever the function does not return 0. In HAVE_DIAGNOSTIC builds the
+ * "file" and "line" arguments are recorded in session->scratch_track for
+ * the slot handed out.
+ */
+int
+__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_DECL_RET;
+ WT_ITEM *buf, **p, **best, **slot;
+ size_t allocated;
+ u_int i;
+
+ /* Don't risk the caller not catching the error. */
+ *scratchp = NULL;
+
+ /*
+ * Each WT_SESSION_IMPL has an array of scratch buffers available for
+ * use by any function. We use WT_ITEM structures for scratch memory
+ * because we already have functions that do variable-length allocation
+ * on a WT_ITEM. Scratch buffers are allocated only by a single thread
+ * of control, so no locking is necessary.
+ *
+ * Walk the array, looking for a buffer we can use.
+ */
+ for (i = 0, best = slot = NULL,
+ p = session->scratch; i < session->scratch_alloc; ++i, ++p) {
+ /* If we find an empty slot, remember it. */
+ if ((buf = *p) == NULL) {
+ if (slot == NULL)
+ slot = p;
+ continue;
+ }
+
+ if (F_ISSET(buf, WT_ITEM_INUSE))
+ continue;
+
+ /*
+ * If we find a buffer that's not in-use, check its size: we
+ * want the smallest buffer larger than the requested size,
+ * or the largest buffer if none are large enough.
+ */
+ if (best == NULL ||
+ ((*best)->memsize < size &&
+ buf->memsize > (*best)->memsize) ||
+ (buf->memsize >= size && buf->memsize < (*best)->memsize))
+ best = p;
+
+ /* If we find a perfect match, use it. */
+ if ((*best)->memsize == size)
+ break;
+ }
+
+ /*
+ * If we didn't find a free buffer, extend the array and use the first
+ * slot we allocated.
+ *
+ * "slot" is computed from session->scratch only after the realloc,
+ * and session->scratch_alloc is advanced last, so the new slot is the
+ * first element past the old end of the array.
+ */
+ if (best == NULL && slot == NULL) {
+ allocated = session->scratch_alloc * sizeof(WT_ITEM *);
+ WT_ERR(__wt_realloc(session, &allocated,
+ (session->scratch_alloc + 10) * sizeof(WT_ITEM *),
+ &session->scratch));
+#ifdef HAVE_DIAGNOSTIC
+ allocated = session->scratch_alloc * sizeof(WT_SCRATCH_TRACK);
+ WT_ERR(__wt_realloc(session, &allocated,
+ (session->scratch_alloc + 10) * sizeof(WT_SCRATCH_TRACK),
+ &session->scratch_track));
+#endif
+ slot = session->scratch + session->scratch_alloc;
+ session->scratch_alloc += 10;
+ }
+
+ /*
+ * If slot is non-NULL, we found an empty slot, try and allocate a
+ * buffer.
+ */
+ if (best == NULL) {
+ WT_ASSERT(session, slot != NULL);
+ best = slot;
+
+ WT_ERR(__wt_calloc_def(session, 1, best));
+
+ /* Scratch buffers must be aligned. */
+ F_SET(*best, WT_ITEM_ALIGNED);
+ }
+
+ /* Grow the buffer as necessary and return. */
+ WT_ERR(__wt_buf_init(session, *best, size));
+ F_SET(*best, WT_ITEM_INUSE);
+
+#ifdef HAVE_DIAGNOSTIC
+ session->scratch_track[best - session->scratch].file = file;
+ session->scratch_track[best - session->scratch].line = line;
+#endif
+
+ *scratchp = *best;
+ return (0);
+
+err: WT_RET_MSG(session, ret,
+ "session unable to allocate a scratch buffer");
+}
+
+/*
+ * __wt_scr_discard --
+ * Free all memory associated with the scratch buffers.
+ *
+ * Every non-NULL entry of session->scratch has its memory released with
+ * __wt_buf_free and is then freed itself, after which the array (and, in
+ * HAVE_DIAGNOSTIC builds, session->scratch_track) is freed. An entry still
+ * flagged WT_ITEM_INUSE is reported through __wt_errx before it is freed;
+ * in HAVE_DIAGNOSTIC builds the message includes the file and line recorded
+ * for that slot.
+ */
+void
+__wt_scr_discard(WT_SESSION_IMPL *session)
+{
+ WT_ITEM **bufp;
+ u_int i;
+
+ for (i = 0,
+ bufp = session->scratch; i < session->scratch_alloc; ++i, ++bufp) {
+ if (*bufp == NULL)
+ continue;
+ if (F_ISSET(*bufp, WT_ITEM_INUSE))
+ __wt_errx(session,
+ "scratch buffer allocated and never discarded"
+#ifdef HAVE_DIAGNOSTIC
+ ": %s: %d",
+ session->
+ scratch_track[bufp - session->scratch].file,
+ session->
+ scratch_track[bufp - session->scratch].line
+#endif
+ );
+
+ __wt_buf_free(session, *bufp);
+ __wt_free(session, *bufp);
+ }
+
+ __wt_free(session, session->scratch);
+#ifdef HAVE_DIAGNOSTIC
+ __wt_free(session, session->scratch_track);
+#endif
+}
+
+/*
+ * __wt_ext_scr_alloc --
+ *	Extension API: get a scratch buffer and return a reference to its
+ * memory, or NULL if no buffer could be allocated.
+ */
+void *
+__wt_ext_scr_alloc(
+    WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size)
+{
+	WT_ITEM *scratch;
+	WT_SESSION_IMPL *session;
+
+	/* Without a session, use the connection's default one. */
+	session = (WT_SESSION_IMPL *)wt_session;
+	if (session == NULL)
+		session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+	if (__wt_scr_alloc(session, size, &scratch) != 0)
+		return (NULL);
+	return (scratch->mem);
+}
+
+/*
+ * __wt_ext_scr_free --
+ *	Extension API: give back the scratch buffer whose memory is p.
+ */
+void
+__wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p)
+{
+	WT_ITEM *item;
+	WT_SESSION_IMPL *session;
+	u_int i;
+
+	/* Without a session, use the connection's default one. */
+	session = (WT_SESSION_IMPL *)wt_session;
+	if (session == NULL)
+		session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+	for (i = 0; i < session->scratch_alloc; ++i) {
+		item = session->scratch[i];
+		if (item == NULL || item->mem != p)
+			continue;
+
+		/*
+		 * Only clear the in-use flag. __wt_scr_free() must NOT be
+		 * called here: it clears the caller's pointer, which would
+		 * truncate the list.
+		 */
+		F_CLR(item, WT_ITEM_INUSE);
+		return;
+	}
+	__wt_errx(session, "extension free'd non-existent scratch buffer");
+}
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
new file mode 100644
index 00000000000..bc468fbe938
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -0,0 +1,567 @@
+/* DO NOT EDIT: automatically built by dist/stat.py. */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_stat_init_dsrc_stats --
+ * Zero a data-source statistics structure and attach the description
+ * string to each statistic. Zeroing the whole structure first means every
+ * value is cleared too, so the function can be called again to
+ * reinitialize an existing structure.
+ *
+ * This file is generated by dist/stat.py (see the file header); changes
+ * belong in the generator, not here.
+ */
+void
+__wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
+{
+ /* Clear, so can also be called for reinitialization. */
+ memset(stats, 0, sizeof(*stats));
+
+ stats->allocation_size.desc =
+ "block manager: file allocation unit size";
+ stats->block_alloc.desc = "block manager: blocks allocated";
+ stats->block_checkpoint_size.desc = "block manager: checkpoint size";
+ stats->block_extension.desc =
+ "block manager: allocations requiring file extension";
+ stats->block_free.desc = "block manager: blocks freed";
+ stats->block_magic.desc = "block manager: file magic number";
+ stats->block_major.desc = "block manager: file major version number";
+ stats->block_minor.desc = "block manager: minor version number";
+ stats->block_reuse_bytes.desc =
+ "block manager: file bytes available for reuse";
+ stats->block_size.desc = "block manager: file size in bytes";
+ stats->bloom_count.desc = "LSM: bloom filters in the LSM tree";
+ stats->bloom_false_positive.desc = "LSM: bloom filter false positives";
+ stats->bloom_hit.desc = "LSM: bloom filter hits";
+ stats->bloom_miss.desc = "LSM: bloom filter misses";
+ stats->bloom_page_evict.desc =
+ "LSM: bloom filter pages evicted from cache";
+ stats->bloom_page_read.desc =
+ "LSM: bloom filter pages read into cache";
+ stats->bloom_size.desc = "LSM: total size of bloom filters";
+ stats->btree_column_deleted.desc =
+ "btree: column-store variable-size deleted values";
+ stats->btree_column_fix.desc =
+ "btree: column-store fixed-size leaf pages";
+ stats->btree_column_internal.desc =
+ "btree: column-store internal pages";
+ stats->btree_column_variable.desc =
+ "btree: column-store variable-size leaf pages";
+ stats->btree_compact_rewrite.desc =
+ "btree: pages rewritten by compaction";
+ stats->btree_entries.desc = "btree: number of key/value pairs";
+ stats->btree_fixed_len.desc = "btree: fixed-record size";
+ stats->btree_maximum_depth.desc = "btree: maximum tree depth";
+ stats->btree_maxintlitem.desc =
+ "btree: maximum internal page item size";
+ stats->btree_maxintlpage.desc = "btree: maximum internal page size";
+ stats->btree_maxleafitem.desc = "btree: maximum leaf page item size";
+ stats->btree_maxleafpage.desc = "btree: maximum leaf page size";
+ stats->btree_overflow.desc = "btree: overflow pages";
+ stats->btree_row_internal.desc = "btree: row-store internal pages";
+ stats->btree_row_leaf.desc = "btree: row-store leaf pages";
+ stats->cache_bytes_read.desc = "cache: bytes read into cache";
+ stats->cache_bytes_write.desc = "cache: bytes written from cache";
+ stats->cache_eviction_checkpoint.desc =
+ "cache: checkpoint blocked page eviction";
+ stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
+ stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
+ stats->cache_eviction_fail.desc =
+ "cache: data source pages selected for eviction unable to be evicted";
+ stats->cache_eviction_hazard.desc =
+ "cache: hazard pointer blocked page eviction";
+ stats->cache_eviction_internal.desc = "cache: internal pages evicted";
+ stats->cache_overflow_value.desc =
+ "cache: overflow values cached in memory";
+ stats->cache_read.desc = "cache: pages read into cache";
+ stats->cache_read_overflow.desc =
+ "cache: overflow pages read into cache";
+ stats->cache_write.desc = "cache: pages written from cache";
+ stats->compress_raw_fail.desc =
+ "compression: raw compression call failed, no additional data available";
+ stats->compress_raw_fail_temporary.desc =
+ "compression: raw compression call failed, additional data available";
+ stats->compress_raw_ok.desc =
+ "compression: raw compression call succeeded";
+ stats->compress_read.desc = "compression: compressed pages read";
+ stats->compress_write.desc = "compression: compressed pages written";
+ stats->compress_write_fail.desc =
+ "compression: page written failed to compress";
+ stats->compress_write_too_small.desc =
+ "compression: page written was too small to compress";
+ stats->cursor_create.desc = "cursor: create calls";
+ stats->cursor_insert.desc = "cursor: insert calls";
+ stats->cursor_insert_bulk.desc =
+ "cursor: bulk-loaded cursor-insert calls";
+ stats->cursor_insert_bytes.desc =
+ "cursor: cursor-insert key and value bytes inserted";
+ stats->cursor_next.desc = "cursor: next calls";
+ stats->cursor_prev.desc = "cursor: prev calls";
+ stats->cursor_remove.desc = "cursor: remove calls";
+ stats->cursor_remove_bytes.desc =
+ "cursor: cursor-remove key bytes removed";
+ stats->cursor_reset.desc = "cursor: reset calls";
+ stats->cursor_search.desc = "cursor: search calls";
+ stats->cursor_search_near.desc = "cursor: search near calls";
+ stats->cursor_update.desc = "cursor: update calls";
+ stats->cursor_update_bytes.desc =
+ "cursor: cursor-update value bytes updated";
+ stats->lsm_checkpoint_throttle.desc =
+ "LSM: sleep for LSM checkpoint throttle";
+ stats->lsm_chunk_count.desc = "LSM: chunks in the LSM tree";
+ stats->lsm_generation_max.desc =
+ "LSM: highest merge generation in the LSM tree";
+ stats->lsm_lookup_no_bloom.desc =
+ "LSM: queries that could have benefited from a Bloom filter that did not exist";
+ stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
+ stats->rec_dictionary.desc = "reconciliation: dictionary matches";
+ stats->rec_multiblock_internal.desc =
+ "reconciliation: internal page multi-block writes";
+ stats->rec_multiblock_leaf.desc =
+ "reconciliation: leaf page multi-block writes";
+ stats->rec_multiblock_max.desc =
+ "reconciliation: maximum blocks required for a page";
+ stats->rec_overflow_key_internal.desc =
+ "reconciliation: internal-page overflow keys";
+ stats->rec_overflow_key_leaf.desc =
+ "reconciliation: leaf-page overflow keys";
+ stats->rec_overflow_value.desc =
+ "reconciliation: overflow values written";
+ stats->rec_page_delete.desc = "reconciliation: pages deleted";
+ stats->rec_page_match.desc = "reconciliation: page checksum matches";
+ stats->rec_pages.desc = "reconciliation: page reconciliation calls";
+ stats->rec_pages_eviction.desc =
+ "reconciliation: page reconciliation calls for eviction";
+ stats->rec_prefix_compression.desc =
+ "reconciliation: leaf page key bytes discarded using prefix compression";
+ stats->rec_suffix_compression.desc =
+ "reconciliation: internal page key bytes discarded using suffix compression";
+ stats->session_compact.desc = "session: object compaction";
+ stats->session_cursor_open.desc = "session: open cursor count";
+ stats->txn_update_conflict.desc = "txn: update conflicts";
+}
+
+/*
+ * __wt_stat_refresh_dsrc_stats --
+ * Reset data-source statistic values to zero, leaving the description
+ * strings in place. The argument is an untyped pointer that is cast to
+ * WT_DSRC_STATS.
+ *
+ * Not every statistic is reset: session_cursor_open has a description in
+ * __wt_stat_init_dsrc_stats but is not cleared here. NOTE(review):
+ * presumably because it tracks a current count rather than a cumulative
+ * one; confirm against dist/stat.py, which generates this file.
+ */
+void
+__wt_stat_refresh_dsrc_stats(void *stats_arg)
+{
+ WT_DSRC_STATS *stats;
+
+ stats = (WT_DSRC_STATS *)stats_arg;
+ stats->allocation_size.v = 0;
+ stats->block_alloc.v = 0;
+ stats->block_checkpoint_size.v = 0;
+ stats->block_extension.v = 0;
+ stats->block_free.v = 0;
+ stats->block_magic.v = 0;
+ stats->block_major.v = 0;
+ stats->block_minor.v = 0;
+ stats->block_reuse_bytes.v = 0;
+ stats->block_size.v = 0;
+ stats->bloom_count.v = 0;
+ stats->bloom_false_positive.v = 0;
+ stats->bloom_hit.v = 0;
+ stats->bloom_miss.v = 0;
+ stats->bloom_page_evict.v = 0;
+ stats->bloom_page_read.v = 0;
+ stats->bloom_size.v = 0;
+ stats->btree_column_deleted.v = 0;
+ stats->btree_column_fix.v = 0;
+ stats->btree_column_internal.v = 0;
+ stats->btree_column_variable.v = 0;
+ stats->btree_compact_rewrite.v = 0;
+ stats->btree_entries.v = 0;
+ stats->btree_fixed_len.v = 0;
+ stats->btree_maximum_depth.v = 0;
+ stats->btree_maxintlitem.v = 0;
+ stats->btree_maxintlpage.v = 0;
+ stats->btree_maxleafitem.v = 0;
+ stats->btree_maxleafpage.v = 0;
+ stats->btree_overflow.v = 0;
+ stats->btree_row_internal.v = 0;
+ stats->btree_row_leaf.v = 0;
+ stats->cache_bytes_read.v = 0;
+ stats->cache_bytes_write.v = 0;
+ stats->cache_eviction_checkpoint.v = 0;
+ stats->cache_eviction_clean.v = 0;
+ stats->cache_eviction_dirty.v = 0;
+ stats->cache_eviction_fail.v = 0;
+ stats->cache_eviction_hazard.v = 0;
+ stats->cache_eviction_internal.v = 0;
+ stats->cache_overflow_value.v = 0;
+ stats->cache_read.v = 0;
+ stats->cache_read_overflow.v = 0;
+ stats->cache_write.v = 0;
+ stats->compress_raw_fail.v = 0;
+ stats->compress_raw_fail_temporary.v = 0;
+ stats->compress_raw_ok.v = 0;
+ stats->compress_read.v = 0;
+ stats->compress_write.v = 0;
+ stats->compress_write_fail.v = 0;
+ stats->compress_write_too_small.v = 0;
+ stats->cursor_create.v = 0;
+ stats->cursor_insert.v = 0;
+ stats->cursor_insert_bulk.v = 0;
+ stats->cursor_insert_bytes.v = 0;
+ stats->cursor_next.v = 0;
+ stats->cursor_prev.v = 0;
+ stats->cursor_remove.v = 0;
+ stats->cursor_remove_bytes.v = 0;
+ stats->cursor_reset.v = 0;
+ stats->cursor_search.v = 0;
+ stats->cursor_search_near.v = 0;
+ stats->cursor_update.v = 0;
+ stats->cursor_update_bytes.v = 0;
+ stats->lsm_checkpoint_throttle.v = 0;
+ stats->lsm_chunk_count.v = 0;
+ stats->lsm_generation_max.v = 0;
+ stats->lsm_lookup_no_bloom.v = 0;
+ stats->lsm_merge_throttle.v = 0;
+ stats->rec_dictionary.v = 0;
+ stats->rec_multiblock_internal.v = 0;
+ stats->rec_multiblock_leaf.v = 0;
+ stats->rec_multiblock_max.v = 0;
+ stats->rec_overflow_key_internal.v = 0;
+ stats->rec_overflow_key_leaf.v = 0;
+ stats->rec_overflow_value.v = 0;
+ stats->rec_page_delete.v = 0;
+ stats->rec_page_match.v = 0;
+ stats->rec_pages.v = 0;
+ stats->rec_pages_eviction.v = 0;
+ stats->rec_prefix_compression.v = 0;
+ stats->rec_suffix_compression.v = 0;
+ stats->session_compact.v = 0;
+ stats->txn_update_conflict.v = 0;
+}
+
+/*
+ * __wt_stat_aggregate_dsrc_stats --
+ * Fold a child's data-source statistics into a parent's. Most values are
+ * added to the parent; btree_maximum_depth, lsm_generation_max and
+ * rec_multiblock_max instead keep the larger of the two values.
+ *
+ * Both arguments are declared const void * and cast to WT_DSRC_STATS;
+ * note the parent IS modified through the cast pointer.
+ *
+ * Some statistics are not aggregated at all (for example allocation_size,
+ * block_magic, block_major, block_minor, btree_fixed_len, the btree_max*
+ * page/item sizes and lsm_chunk_count). NOTE(review): presumably because
+ * summing them is meaningless; confirm against dist/stat.py, which
+ * generates this file.
+ */
+void
+__wt_stat_aggregate_dsrc_stats(const void *child, const void *parent)
+{
+ WT_DSRC_STATS *c, *p;
+
+ c = (WT_DSRC_STATS *)child;
+ p = (WT_DSRC_STATS *)parent;
+ p->block_alloc.v += c->block_alloc.v;
+ p->block_checkpoint_size.v += c->block_checkpoint_size.v;
+ p->block_extension.v += c->block_extension.v;
+ p->block_free.v += c->block_free.v;
+ p->block_reuse_bytes.v += c->block_reuse_bytes.v;
+ p->block_size.v += c->block_size.v;
+ p->bloom_count.v += c->bloom_count.v;
+ p->bloom_false_positive.v += c->bloom_false_positive.v;
+ p->bloom_hit.v += c->bloom_hit.v;
+ p->bloom_miss.v += c->bloom_miss.v;
+ p->bloom_page_evict.v += c->bloom_page_evict.v;
+ p->bloom_page_read.v += c->bloom_page_read.v;
+ p->bloom_size.v += c->bloom_size.v;
+ p->btree_column_deleted.v += c->btree_column_deleted.v;
+ p->btree_column_fix.v += c->btree_column_fix.v;
+ p->btree_column_internal.v += c->btree_column_internal.v;
+ p->btree_column_variable.v += c->btree_column_variable.v;
+ p->btree_compact_rewrite.v += c->btree_compact_rewrite.v;
+ p->btree_entries.v += c->btree_entries.v;
+ if (c->btree_maximum_depth.v > p->btree_maximum_depth.v)
+ p->btree_maximum_depth.v = c->btree_maximum_depth.v;
+ p->btree_overflow.v += c->btree_overflow.v;
+ p->btree_row_internal.v += c->btree_row_internal.v;
+ p->btree_row_leaf.v += c->btree_row_leaf.v;
+ p->cache_bytes_read.v += c->cache_bytes_read.v;
+ p->cache_bytes_write.v += c->cache_bytes_write.v;
+ p->cache_eviction_checkpoint.v += c->cache_eviction_checkpoint.v;
+ p->cache_eviction_clean.v += c->cache_eviction_clean.v;
+ p->cache_eviction_dirty.v += c->cache_eviction_dirty.v;
+ p->cache_eviction_fail.v += c->cache_eviction_fail.v;
+ p->cache_eviction_hazard.v += c->cache_eviction_hazard.v;
+ p->cache_eviction_internal.v += c->cache_eviction_internal.v;
+ p->cache_overflow_value.v += c->cache_overflow_value.v;
+ p->cache_read.v += c->cache_read.v;
+ p->cache_read_overflow.v += c->cache_read_overflow.v;
+ p->cache_write.v += c->cache_write.v;
+ p->compress_raw_fail.v += c->compress_raw_fail.v;
+ p->compress_raw_fail_temporary.v += c->compress_raw_fail_temporary.v;
+ p->compress_raw_ok.v += c->compress_raw_ok.v;
+ p->compress_read.v += c->compress_read.v;
+ p->compress_write.v += c->compress_write.v;
+ p->compress_write_fail.v += c->compress_write_fail.v;
+ p->compress_write_too_small.v += c->compress_write_too_small.v;
+ p->cursor_create.v += c->cursor_create.v;
+ p->cursor_insert.v += c->cursor_insert.v;
+ p->cursor_insert_bulk.v += c->cursor_insert_bulk.v;
+ p->cursor_insert_bytes.v += c->cursor_insert_bytes.v;
+ p->cursor_next.v += c->cursor_next.v;
+ p->cursor_prev.v += c->cursor_prev.v;
+ p->cursor_remove.v += c->cursor_remove.v;
+ p->cursor_remove_bytes.v += c->cursor_remove_bytes.v;
+ p->cursor_reset.v += c->cursor_reset.v;
+ p->cursor_search.v += c->cursor_search.v;
+ p->cursor_search_near.v += c->cursor_search_near.v;
+ p->cursor_update.v += c->cursor_update.v;
+ p->cursor_update_bytes.v += c->cursor_update_bytes.v;
+ p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v;
+ if (c->lsm_generation_max.v > p->lsm_generation_max.v)
+ p->lsm_generation_max.v = c->lsm_generation_max.v;
+ p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v;
+ p->lsm_merge_throttle.v += c->lsm_merge_throttle.v;
+ p->rec_dictionary.v += c->rec_dictionary.v;
+ p->rec_multiblock_internal.v += c->rec_multiblock_internal.v;
+ p->rec_multiblock_leaf.v += c->rec_multiblock_leaf.v;
+ if (c->rec_multiblock_max.v > p->rec_multiblock_max.v)
+ p->rec_multiblock_max.v = c->rec_multiblock_max.v;
+ p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v;
+ p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v;
+ p->rec_overflow_value.v += c->rec_overflow_value.v;
+ p->rec_page_delete.v += c->rec_page_delete.v;
+ p->rec_page_match.v += c->rec_page_match.v;
+ p->rec_pages.v += c->rec_pages.v;
+ p->rec_pages_eviction.v += c->rec_pages_eviction.v;
+ p->rec_prefix_compression.v += c->rec_prefix_compression.v;
+ p->rec_suffix_compression.v += c->rec_suffix_compression.v;
+ p->session_compact.v += c->session_compact.v;
+ p->session_cursor_open.v += c->session_cursor_open.v;
+ p->txn_update_conflict.v += c->txn_update_conflict.v;
+}
+
+/*
+ * __wt_stat_init_connection_stats --
+ * Zero a connection statistics structure and attach the description
+ * string to each statistic. Zeroing the whole structure first means every
+ * value is cleared too, so the function can be called again to
+ * reinitialize an existing structure.
+ *
+ * This file is generated by dist/stat.py (see the file header); changes
+ * belong in the generator, not here.
+ */
+void
+__wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
+{
+ /* Clear, so can also be called for reinitialization. */
+ memset(stats, 0, sizeof(*stats));
+
+ stats->async_alloc_race.desc =
+ "async: number of allocation state races";
+ stats->async_alloc_view.desc =
+ "async: number of op slots viewed for alloc";
+ stats->async_cur_queue.desc = "async: current work queue length";
+ stats->async_flush.desc = "async: number of async flush calls";
+ stats->async_full.desc = "async: number of times op allocation failed";
+ stats->async_max_queue.desc = "async: maximum work queue length";
+ stats->async_nowork.desc =
+ "async: number of times worker found no work";
+ stats->async_op_alloc.desc = "async: op allocations";
+ stats->async_op_compact.desc = "async: op compact calls";
+ stats->async_op_insert.desc = "async: op insert calls";
+ stats->async_op_remove.desc = "async: op remove calls";
+ stats->async_op_search.desc = "async: op search calls";
+ stats->async_op_update.desc = "async: op update calls";
+ stats->block_byte_map_read.desc = "block manager: mapped bytes read";
+ stats->block_byte_read.desc = "block manager: bytes read";
+ stats->block_byte_write.desc = "block manager: bytes written";
+ stats->block_map_read.desc = "block manager: mapped blocks read";
+ stats->block_preload.desc = "block manager: blocks pre-loaded";
+ stats->block_read.desc = "block manager: blocks read";
+ stats->block_write.desc = "block manager: blocks written";
+ stats->cache_bytes_dirty.desc =
+ "cache: tracked dirty bytes in the cache";
+ stats->cache_bytes_inuse.desc = "cache: bytes currently in the cache";
+ stats->cache_bytes_max.desc = "cache: maximum bytes configured";
+ stats->cache_bytes_read.desc = "cache: bytes read into cache";
+ stats->cache_bytes_write.desc = "cache: bytes written from cache";
+ stats->cache_eviction_checkpoint.desc =
+ "cache: checkpoint blocked page eviction";
+ stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
+ stats->cache_eviction_deepen.desc =
+ "cache: page split during eviction deepened the tree";
+ stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
+ stats->cache_eviction_fail.desc =
+ "cache: pages selected for eviction unable to be evicted";
+ stats->cache_eviction_force.desc =
+ "cache: pages evicted because they exceeded the in-memory maximum";
+ stats->cache_eviction_force_fail.desc =
+ "cache: failed eviction of pages that exceeded the in-memory maximum";
+ stats->cache_eviction_hazard.desc =
+ "cache: hazard pointer blocked page eviction";
+ stats->cache_eviction_internal.desc = "cache: internal pages evicted";
+ stats->cache_eviction_queue_empty.desc =
+ "cache: eviction server candidate queue empty when topping up";
+ stats->cache_eviction_queue_not_empty.desc =
+ "cache: eviction server candidate queue not empty when topping up";
+ stats->cache_eviction_server_evicting.desc =
+ "cache: eviction server evicting pages";
+ stats->cache_eviction_server_not_evicting.desc =
+ "cache: eviction server populating queue, but not evicting pages";
+ stats->cache_eviction_slow.desc =
+ "cache: eviction server unable to reach eviction goal";
+ stats->cache_eviction_split.desc =
+ "cache: pages split during eviction";
+ stats->cache_eviction_walk.desc = "cache: pages walked for eviction";
+ stats->cache_pages_dirty.desc =
+ "cache: tracked dirty pages in the cache";
+ stats->cache_pages_inuse.desc =
+ "cache: pages currently held in the cache";
+ stats->cache_read.desc = "cache: pages read into cache";
+ stats->cache_write.desc = "cache: pages written from cache";
+ stats->cond_wait.desc = "conn: pthread mutex condition wait calls";
+ stats->cursor_create.desc = "Btree: cursor create calls";
+ stats->cursor_insert.desc = "Btree: cursor insert calls";
+ stats->cursor_next.desc = "Btree: cursor next calls";
+ stats->cursor_prev.desc = "Btree: cursor prev calls";
+ stats->cursor_remove.desc = "Btree: cursor remove calls";
+ stats->cursor_reset.desc = "Btree: cursor reset calls";
+ stats->cursor_search.desc = "Btree: cursor search calls";
+ stats->cursor_search_near.desc = "Btree: cursor search near calls";
+ stats->cursor_update.desc = "Btree: cursor update calls";
+ stats->dh_session_handles.desc = "dhandle: session dhandles swept";
+ stats->dh_session_sweeps.desc = "dhandle: session sweep attempts";
+ stats->file_open.desc = "conn: files currently open";
+ stats->log_buffer_grow.desc = "log: log buffer size increases";
+ stats->log_buffer_size.desc = "log: total log buffer size";
+ stats->log_bytes_user.desc = "log: user provided log bytes written";
+ stats->log_bytes_written.desc = "log: log bytes written";
+ stats->log_close_yields.desc =
+ "log: yields waiting for previous log file close";
+ stats->log_max_filesize.desc = "log: maximum log file size";
+ stats->log_reads.desc = "log: log read operations";
+ stats->log_scan_records.desc = "log: records processed by log scan";
+ stats->log_scan_rereads.desc =
+ "log: log scan records requiring two reads";
+ stats->log_scans.desc = "log: log scan operations";
+ stats->log_slot_closes.desc = "log: consolidated slot closures";
+ stats->log_slot_consolidated.desc = "log: logging bytes consolidated";
+ stats->log_slot_joins.desc = "log: consolidated slot joins";
+ stats->log_slot_races.desc = "log: consolidated slot join races";
+ stats->log_slot_switch_fails.desc =
+ "log: slots selected for switching that were unavailable";
+ stats->log_slot_toobig.desc = "log: record size exceeded maximum";
+ stats->log_slot_toosmall.desc =
+ "log: failed to find a slot large enough for record";
+ stats->log_slot_transitions.desc =
+ "log: consolidated slot join transitions";
+ stats->log_sync.desc = "log: log sync operations";
+ stats->log_writes.desc = "log: log write operations";
+ stats->lsm_checkpoint_throttle.desc =
+ "LSM: sleep for LSM checkpoint throttle";
+ stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
+ stats->lsm_rows_merged.desc = "LSM: rows merged in an LSM tree";
+ stats->lsm_work_queue_app.desc =
+ "LSM: App work units currently queued";
+ stats->lsm_work_queue_manager.desc =
+ "LSM: Merge work units currently queued";
+ stats->lsm_work_queue_max.desc = "LSM: tree queue hit maximum";
+ stats->lsm_work_queue_switch.desc =
+ "LSM: Switch work units currently queued";
+ stats->lsm_work_units_created.desc =
+ "LSM: tree maintenance operations scheduled";
+ stats->lsm_work_units_discarded.desc =
+ "LSM: tree maintenance operations discarded";
+ stats->lsm_work_units_done.desc =
+ "LSM: tree maintenance operations executed";
+ stats->memory_allocation.desc = "conn: memory allocations";
+ stats->memory_free.desc = "conn: memory frees";
+ stats->memory_grow.desc = "conn: memory re-allocations";
+ stats->read_io.desc = "conn: total read I/Os";
+ stats->rec_pages.desc = "reconciliation: page reconciliation calls";
+ stats->rec_pages_eviction.desc =
+ "reconciliation: page reconciliation calls for eviction";
+ stats->rec_split_stashed_bytes.desc =
+ "reconciliation: split bytes currently awaiting free";
+ stats->rec_split_stashed_objects.desc =
+ "reconciliation: split objects currently awaiting free";
+ stats->rwlock_read.desc =
+ "conn: pthread mutex shared lock read-lock calls";
+ stats->rwlock_write.desc =
+ "conn: pthread mutex shared lock write-lock calls";
+ stats->session_cursor_open.desc = "session: open cursor count";
+ stats->session_open.desc = "session: open session count";
+ stats->txn_begin.desc = "txn: transaction begins";
+ stats->txn_checkpoint.desc = "txn: transaction checkpoints";
+ stats->txn_checkpoint_running.desc =
+ "txn: transaction checkpoint currently running";
+ stats->txn_commit.desc = "txn: transactions committed";
+ stats->txn_fail_cache.desc =
+ "txn: transaction failures due to cache overflow";
+ stats->txn_pinned_range.desc =
+ "txn: transaction range of IDs currently pinned";
+ stats->txn_rollback.desc = "txn: transactions rolled back";
+ stats->write_io.desc = "conn: total write I/Os";
+}
+
+/*
+ * __wt_stat_refresh_connection_stats --
+ * Reset connection statistic values to zero, leaving the description
+ * strings in place. The argument is an untyped pointer that is cast to
+ * WT_CONNECTION_STATS.
+ *
+ * Not every statistic is reset: cache_bytes_inuse, cache_bytes_max,
+ * cache_pages_inuse, file_open, log_buffer_size, log_max_filesize, the
+ * lsm_work_queue_app/manager/switch counts, rec_split_stashed_bytes,
+ * rec_split_stashed_objects, session_cursor_open, session_open,
+ * txn_checkpoint_running and txn_pinned_range are left untouched.
+ * NOTE(review): presumably because they describe current state or
+ * configuration rather than cumulative counts; confirm against
+ * dist/stat.py, which generates this file.
+ */
+void
+__wt_stat_refresh_connection_stats(void *stats_arg)
+{
+ WT_CONNECTION_STATS *stats;
+
+ stats = (WT_CONNECTION_STATS *)stats_arg;
+ stats->async_alloc_race.v = 0;
+ stats->async_alloc_view.v = 0;
+ stats->async_cur_queue.v = 0;
+ stats->async_flush.v = 0;
+ stats->async_full.v = 0;
+ stats->async_max_queue.v = 0;
+ stats->async_nowork.v = 0;
+ stats->async_op_alloc.v = 0;
+ stats->async_op_compact.v = 0;
+ stats->async_op_insert.v = 0;
+ stats->async_op_remove.v = 0;
+ stats->async_op_search.v = 0;
+ stats->async_op_update.v = 0;
+ stats->block_byte_map_read.v = 0;
+ stats->block_byte_read.v = 0;
+ stats->block_byte_write.v = 0;
+ stats->block_map_read.v = 0;
+ stats->block_preload.v = 0;
+ stats->block_read.v = 0;
+ stats->block_write.v = 0;
+ stats->cache_bytes_dirty.v = 0;
+ stats->cache_bytes_read.v = 0;
+ stats->cache_bytes_write.v = 0;
+ stats->cache_eviction_checkpoint.v = 0;
+ stats->cache_eviction_clean.v = 0;
+ stats->cache_eviction_deepen.v = 0;
+ stats->cache_eviction_dirty.v = 0;
+ stats->cache_eviction_fail.v = 0;
+ stats->cache_eviction_force.v = 0;
+ stats->cache_eviction_force_fail.v = 0;
+ stats->cache_eviction_hazard.v = 0;
+ stats->cache_eviction_internal.v = 0;
+ stats->cache_eviction_queue_empty.v = 0;
+ stats->cache_eviction_queue_not_empty.v = 0;
+ stats->cache_eviction_server_evicting.v = 0;
+ stats->cache_eviction_server_not_evicting.v = 0;
+ stats->cache_eviction_slow.v = 0;
+ stats->cache_eviction_split.v = 0;
+ stats->cache_eviction_walk.v = 0;
+ stats->cache_pages_dirty.v = 0;
+ stats->cache_read.v = 0;
+ stats->cache_write.v = 0;
+ stats->cond_wait.v = 0;
+ stats->cursor_create.v = 0;
+ stats->cursor_insert.v = 0;
+ stats->cursor_next.v = 0;
+ stats->cursor_prev.v = 0;
+ stats->cursor_remove.v = 0;
+ stats->cursor_reset.v = 0;
+ stats->cursor_search.v = 0;
+ stats->cursor_search_near.v = 0;
+ stats->cursor_update.v = 0;
+ stats->dh_session_handles.v = 0;
+ stats->dh_session_sweeps.v = 0;
+ stats->log_buffer_grow.v = 0;
+ stats->log_bytes_user.v = 0;
+ stats->log_bytes_written.v = 0;
+ stats->log_close_yields.v = 0;
+ stats->log_reads.v = 0;
+ stats->log_scan_records.v = 0;
+ stats->log_scan_rereads.v = 0;
+ stats->log_scans.v = 0;
+ stats->log_slot_closes.v = 0;
+ stats->log_slot_consolidated.v = 0;
+ stats->log_slot_joins.v = 0;
+ stats->log_slot_races.v = 0;
+ stats->log_slot_switch_fails.v = 0;
+ stats->log_slot_toobig.v = 0;
+ stats->log_slot_toosmall.v = 0;
+ stats->log_slot_transitions.v = 0;
+ stats->log_sync.v = 0;
+ stats->log_writes.v = 0;
+ stats->lsm_checkpoint_throttle.v = 0;
+ stats->lsm_merge_throttle.v = 0;
+ stats->lsm_rows_merged.v = 0;
+ stats->lsm_work_queue_max.v = 0;
+ stats->lsm_work_units_created.v = 0;
+ stats->lsm_work_units_discarded.v = 0;
+ stats->lsm_work_units_done.v = 0;
+ stats->memory_allocation.v = 0;
+ stats->memory_free.v = 0;
+ stats->memory_grow.v = 0;
+ stats->read_io.v = 0;
+ stats->rec_pages.v = 0;
+ stats->rec_pages_eviction.v = 0;
+ stats->rwlock_read.v = 0;
+ stats->rwlock_write.v = 0;
+ stats->txn_begin.v = 0;
+ stats->txn_checkpoint.v = 0;
+ stats->txn_commit.v = 0;
+ stats->txn_fail_cache.v = 0;
+ stats->txn_rollback.v = 0;
+ stats->write_io.v = 0;
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
new file mode 100644
index 00000000000..292d1a37ceb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -0,0 +1,554 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_txnid_cmp --
+ *	qsort-style comparator for transaction IDs: returns 0 when the IDs are
+ *	equal, -1 when the first orders before the second, 1 otherwise.
+ */
+int
+__wt_txnid_cmp(const void *v1, const void *v2)
+{
+	uint64_t lhs, rhs;
+
+	lhs = *(uint64_t *)v1;
+	rhs = *(uint64_t *)v2;
+
+	if (lhs == rhs)
+		return (0);
+	if (TXNID_LT(lhs, rhs))
+		return (-1);
+	return (1);
+}
+
+/*
+ * __txn_sort_snapshot --
+ *	Order the session's snapshot array so it can be searched quickly,
+ *	record its size and bounds, and mark the transaction as having a
+ *	snapshot.
+ */
+static void
+__txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max)
+{
+	WT_TXN *txn;
+	uint64_t lowest;
+
+	txn = &session->txn;
+
+	/* Zero or one entries are already in order. */
+	if (n > 1)
+		qsort(txn->snapshot, n, sizeof(uint64_t), __wt_txnid_cmp);
+
+	/*
+	 * The lower bound is the smallest ID in the snapshot, unless the
+	 * snapshot is empty or that ID is past the upper bound, in which case
+	 * the upper bound is used for both.
+	 */
+	lowest = snap_max;
+	if (n > 0 && TXNID_LE(txn->snapshot[0], snap_max))
+		lowest = txn->snapshot[0];
+
+	txn->snapshot_count = n;
+	txn->snap_max = snap_max;
+	txn->snap_min = lowest;
+	F_SET(txn, TXN_HAS_SNAPSHOT);
+	WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE);
+}
+
+/*
+ * __wt_txn_release_snapshot --
+ *	Drop the current transaction's snapshot: withdraw the snap_min this
+ *	session published in the global state table and clear the snapshot
+ *	flag.
+ */
+void
+__wt_txn_release_snapshot(WT_SESSION_IMPL *session)
+{
+	WT_TXN_STATE *state;
+
+	state = &S2C(session)->txn_global.states[session->id];
+
+	/* Only a published snap_min needs to be withdrawn. */
+	if (state->snap_min != WT_TXN_NONE) {
+		WT_ASSERT(session,
+		    session->txn.isolation == TXN_ISO_READ_UNCOMMITTED ||
+		    !__wt_txn_visible_all(session, state->snap_min));
+		state->snap_min = WT_TXN_NONE;
+	}
+	F_CLR(&session->txn, TXN_HAS_SNAPSHOT);
+}
+
+/*
+ * __wt_txn_update_oldest --
+ *	Rescan the running transactions so the oldest required transaction ID
+ *	is up to date.
+ *
+ * !!!
+ * A data-source may call WT_EXTENSION_API.transaction_oldest to get the
+ * oldest transaction ID not yet visible to a running transaction, then
+ * compare it against committed transactions to decide whether their updates
+ * are still visible to running transactions. If the transaction state was not
+ * refreshed after the last commit, that oldest ID may equal the last committed
+ * transaction ID. Refreshing here pushes past the last committed transaction.
+ */
+void
+__wt_txn_update_oldest(WT_SESSION_IMPL *session)
+{
+	/* A zero second argument asks for a scan without a new snapshot. */
+	__wt_txn_refresh(session, 0);
+}
+
+/*
+ * __wt_txn_refresh --
+ *	Scan the global transaction state table to refresh the oldest
+ *	transaction ID and, if requested, build a snapshot for this session.
+ *
+ *	session: the calling session; its slot in the global state table is
+ *	    found through session->id.
+ *	get_snapshot: non-zero to collect concurrent transaction IDs into the
+ *	    session's snapshot and publish a new snap_min; zero to only update
+ *	    the global bookkeeping (last_running is then always updated, and
+ *	    the oldest ID is advanced whenever a newer value is found).
+ */
+void
+__wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_TXN *txn;
+	WT_TXN_GLOBAL *txn_global;
+	WT_TXN_STATE *s, *txn_state;
+	uint64_t current_id, id, oldest_id;
+	uint64_t prev_oldest_id, snap_min;
+	uint32_t i, n, oldest_session, session_cnt;
+	int32_t count;
+
+	conn = S2C(session);
+	txn = &session->txn;
+	txn_global = &conn->txn_global;
+	txn_state = &txn_global->states[session->id];
+
+	current_id = snap_min = txn_global->current;
+	prev_oldest_id = txn_global->oldest_id;
+
+	/* For pure read-only workloads, avoid scanning. */
+	if (prev_oldest_id == current_id) {
+		if (get_snapshot) {
+			txn_state->snap_min = current_id;
+			__txn_sort_snapshot(session, 0, current_id);
+		}
+		/* Check that the oldest ID has not moved in the meantime. */
+		if (prev_oldest_id == txn_global->oldest_id &&
+		    txn_global->scan_count == 0)
+			return;
+	}
+
+	/*
+	 * We're going to scan.  Increment the count of scanners to prevent the
+	 * oldest ID from moving forwards.  Spin if the count is negative,
+	 * which indicates that some thread is moving the oldest ID forwards.
+	 */
+	do {
+		if ((count = txn_global->scan_count) < 0)
+			WT_PAUSE();
+	} while (count < 0 ||
+	    !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
+
+	/* The oldest ID cannot change until the scan count goes to zero. */
+	prev_oldest_id = txn_global->oldest_id;
+	current_id = oldest_id = snap_min = txn_global->current;
+	oldest_session = 0;
+
+	/* Walk the array of concurrent transactions. */
+	WT_ORDERED_READ(session_cnt, conn->session_cnt);
+	for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+		/*
+		 * Build our snapshot of any concurrent transaction IDs.
+		 *
+		 * Ignore our own ID: we always read our own updates.
+		 *
+		 * Also ignore the ID if it is older than the oldest ID we saw.
+		 * This can happen if we race with a thread that is allocating
+		 * an ID -- the ID will not be used because the thread will
+		 * keep spinning until it gets a valid one.
+		 */
+		if (s != txn_state &&
+		    (id = s->id) != WT_TXN_NONE &&
+		    TXNID_LE(prev_oldest_id, id)) {
+			if (get_snapshot)
+				txn->snapshot[n++] = id;
+			if (TXNID_LT(id, snap_min))
+				snap_min = id;
+		}
+
+		/*
+		 * Ignore the session's own snap_min: we are about to update
+		 * it.
+		 */
+		if (get_snapshot && s == txn_state)
+			continue;
+
+		/*
+		 * !!!
+		 * Note: Don't ignore snap_min values older than the previous
+		 * oldest ID.  Read-uncommitted operations publish snap_min
+		 * values without incrementing scan_count to protect the global
+		 * table.  See the comment in __wt_txn_cursor_op for
+		 * more details.
+		 */
+		if ((id = s->snap_min) != WT_TXN_NONE &&
+		    TXNID_LT(id, oldest_id)) {
+			oldest_id = id;
+			oldest_session = i;
+		}
+	}
+
+	/* The oldest ID can be no newer than any running ID, ours included. */
+	if (TXNID_LT(snap_min, oldest_id))
+		oldest_id = snap_min;
+	if (txn->id != WT_TXN_NONE && TXNID_LT(txn->id, oldest_id))
+		oldest_id = txn->id;
+
+	/*
+	 * If we got a new snapshot, update the published snap_min for this
+	 * session.
+	 */
+	if (get_snapshot) {
+		WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
+		WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
+		txn_state->snap_min = snap_min;
+	}
+
+	/*
+	 * Update the last running ID if we have a much newer value or we are
+	 * forcing an update.
+	 */
+	if (!get_snapshot || snap_min > txn_global->last_running + 100)
+		txn_global->last_running = snap_min;
+
+	/*
+	 * Update the oldest ID if we have a newer ID and we can get exclusive
+	 * access.  During normal snapshot refresh, only do this if we have a
+	 * much newer value.  Once we get exclusive access, do another pass to
+	 * make sure nobody else is using an earlier ID.
+	 *
+	 * Exclusive access is the scan count going from 1 (only this scan) to
+	 * -1, the value other scanners spin on above.
+	 */
+	if (TXNID_LT(prev_oldest_id, oldest_id) &&
+	    (!get_snapshot || oldest_id - prev_oldest_id > 100) &&
+	    WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
+		WT_ORDERED_READ(session_cnt, conn->session_cnt);
+		for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+			if ((id = s->id) != WT_TXN_NONE &&
+			    TXNID_LT(id, oldest_id))
+				oldest_id = id;
+			if ((id = s->snap_min) != WT_TXN_NONE &&
+			    TXNID_LT(id, oldest_id))
+				oldest_id = id;
+		}
+		if (TXNID_LT(txn_global->oldest_id, oldest_id))
+			txn_global->oldest_id = oldest_id;
+		/* Release exclusive access, letting scanners in again. */
+		txn_global->scan_count = 0;
+	} else {
+		/*
+		 * Report a session pinning an old snapshot, once per change
+		 * of offending session.  The session index is a uint32_t, so
+		 * it is printed with PRIu32 rather than a signed conversion.
+		 */
+		if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
+		    current_id - oldest_id > 10000 &&
+		    txn_global->oldest_session != oldest_session) {
+			(void)__wt_verbose(session, WT_VERB_TRANSACTION,
+			    "old snapshot %" PRIu64
+			    " pinned in session %" PRIu32 " [%s]"
+			    " with snap_min %" PRIu64 "\n",
+			    oldest_id, oldest_session,
+			    conn->sessions[oldest_session].lastop,
+			    conn->sessions[oldest_session].txn.snap_min);
+			txn_global->oldest_session = oldest_session;
+		}
+		/* Drop the scanner reference taken before the walk. */
+		WT_ASSERT(session, txn_global->scan_count > 0);
+		(void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+	}
+
+	if (get_snapshot)
+		__txn_sort_snapshot(session, n, current_id);
+}
+
+/*
+ * __wt_txn_begin --
+ *	Start a transaction on the session, applying the "isolation" and
+ *	"sync" settings found in the configuration.
+ */
+int
+__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CONFIG_ITEM cval;
+	WT_TXN *txn;
+
+	txn = &session->txn;
+
+	/*
+	 * An empty "isolation" value inherits the session's level; any value
+	 * other than "snapshot" or "read-committed" selects read-uncommitted.
+	 */
+	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
+	if (cval.len == 0)
+		txn->isolation = session->isolation;
+	else if (WT_STRING_MATCH("snapshot", cval.str, cval.len))
+		txn->isolation = TXN_ISO_SNAPSHOT;
+	else if (WT_STRING_MATCH("read-committed", cval.str, cval.len))
+		txn->isolation = TXN_ISO_READ_COMMITTED;
+	else
+		txn->isolation = TXN_ISO_READ_UNCOMMITTED;
+
+	/*
+	 * Start from the connection's log-sync setting, which also supplies
+	 * the default for "sync"; a false "sync" value turns syncing off for
+	 * this transaction.
+	 */
+	txn->txn_logsync = S2C(session)->txn_logsync;
+	WT_RET(__wt_config_gets_def(session, cfg, "sync",
+	    FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH), &cval));
+	if (cval.val == 0)
+		txn->txn_logsync = 0;
+
+	F_SET(txn, TXN_RUNNING);
+
+	/* Only snapshot isolation needs a snapshot taken up front. */
+	if (txn->isolation != TXN_ISO_SNAPSHOT)
+		return (0);
+
+	/* Copy values into any open cursors before refreshing the snapshot. */
+	if (session->ncursors > 0)
+		WT_RET(__wt_session_copy_values(session));
+	__wt_txn_refresh(session, 1);
+	return (0);
+}
+
+/*
+ * __wt_txn_release --
+ * Release the resources associated with the current transaction.
+ *
+ * Clears the session's published transaction ID (if it has one), frees the
+ * logging scratch buffer, discards what it can of the split stash, releases
+ * the snapshot, and resets the isolation level and the error, has-ID and
+ * running flags.
+ */
+void
+__wt_txn_release(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+ /*
+ * The commit and rollback paths both zero mod_count before calling
+ * here; the assertion catches any caller that has not resolved its
+ * modifications first.
+ */
+ WT_ASSERT(session, txn->mod_count == 0);
+ txn->notify = NULL;
+
+ txn_global = &S2C(session)->txn_global;
+ /* This session's slot in the global state array. */
+ txn_state = &txn_global->states[session->id];
+
+ /* Clear the transaction's ID from the global table. */
+ if (F_ISSET(txn, TXN_HAS_ID)) {
+ WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
+ txn->id != WT_TXN_NONE);
+ /*
+ * The global slot is cleared before the session-local copy.
+ * NOTE(review): WT_PUBLISH presumably adds a write barrier so
+ * other threads see the cleared slot promptly -- confirm.
+ */
+ WT_PUBLISH(txn_state->id, WT_TXN_NONE);
+ txn->id = WT_TXN_NONE;
+ }
+
+ /* Free the scratch buffer allocated for logging. */
+ __wt_logrec_free(session, &txn->logrec);
+
+ /* Discard any memory from the session's split stash that we can. */
+ if (session->split_stash_cnt > 0)
+ __wt_split_stash_discard(session);
+
+ /*
+ * Reset the transaction state to not running and release the snapshot.
+ */
+ __wt_txn_release_snapshot(session);
+ txn->isolation = session->isolation;
+ F_CLR(txn, TXN_ERROR | TXN_HAS_ID | TXN_RUNNING);
+}
+
+/*
+ * __wt_txn_commit --
+ * Commit the current transaction.
+ *
+ * Fails with EINVAL if no transaction is running. If the commit
+ * notification or the commit log record fails, the transaction is rolled
+ * back and that error is returned. Otherwise the per-operation memory is
+ * freed and the transaction is released.
+ */
+int
+__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+ u_int i;
+
+ txn = &session->txn;
+ WT_ASSERT(session, !F_ISSET(txn, TXN_ERROR));
+
+ if (!F_ISSET(txn, TXN_RUNNING))
+ WT_RET_MSG(session, EINVAL, "No transaction is active");
+
+ /* Commit notification. */
+ if (txn->notify != NULL)
+ WT_TRET(txn->notify->notify(txn->notify,
+ (WT_SESSION *)session, txn->id, 1));
+
+ /*
+ * If we are logging, write a commit log record. Skipped when the
+ * notification failed, when the transaction made no modifications, or
+ * when logging is disabled for the connection or the session.
+ */
+ if (ret == 0 &&
+ txn->mod_count > 0 && S2C(session)->logging &&
+ !F_ISSET(session, WT_SESSION_NO_LOGGING))
+ ret = __wt_txn_log_commit(session, cfg);
+
+ /*
+ * If anything went wrong, roll back.
+ *
+ * !!!
+ * Nothing can fail after this point.
+ */
+ if (ret != 0) {
+ WT_TRET(__wt_txn_rollback(session, cfg));
+ return (ret);
+ }
+
+ /* Free memory associated with updates. */
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
+ __wt_txn_op_free(session, op);
+ txn->mod_count = 0;
+
+ /*
+ * We are about to release the snapshot: copy values into any
+ * positioned cursors so they don't point to updates that could be
+ * freed once we don't have a transaction ID pinned.
+ *
+ * NOTE(review): the comment above says nothing can fail past the
+ * rollback check, yet this WT_RET can return early; that appears to
+ * leave the transaction marked running with mod_count already reset.
+ * Confirm whether that is intended.
+ */
+ if (session->ncursors > 0)
+ WT_RET(__wt_session_copy_values(session));
+
+ __wt_txn_release(session);
+ return (0);
+}
+
+/*
+ * __wt_txn_rollback --
+ * Roll back the current transaction.
+ *
+ * Fails with EINVAL if no transaction is running. Otherwise every recorded
+ * operation is undone (metadata operations are skipped), its memory is
+ * freed, and the transaction is released. The return value carries any
+ * error from the rollback notification. The configuration is unused.
+ */
+int
+__wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+ u_int i;
+
+ WT_UNUSED(cfg);
+
+ txn = &session->txn;
+ if (!F_ISSET(txn, TXN_RUNNING))
+ WT_RET_MSG(session, EINVAL, "No transaction is active");
+
+ /* Rollback notification. */
+ if (txn->notify != NULL)
+ WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session,
+ txn->id, 0));
+
+ /* Rollback updates. */
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
+ /*
+ * Metadata updates are never rolled back. Note the continue
+ * also skips the __wt_txn_op_free call below for these
+ * operations.
+ */
+ if (op->fileid == WT_METAFILE_ID)
+ continue;
+
+ switch (op->type) {
+ case TXN_OP_BASIC:
+ case TXN_OP_INMEM:
+ /* Mark the update aborted in place. */
+ op->u.upd->txnid = WT_TXN_ABORTED;
+ break;
+ case TXN_OP_REF:
+ __wt_delete_page_rollback(session, op->u.ref);
+ break;
+ case TXN_OP_TRUNCATE_COL:
+ case TXN_OP_TRUNCATE_ROW:
+ /*
+ * Nothing to do: these operations are only logged for
+ * recovery. The in-memory changes will be rolled back
+ * with a combination of TXN_OP_REF and TXN_OP_INMEM
+ * operations.
+ */
+ break;
+ }
+
+ /* Free any memory allocated for the operation. */
+ __wt_txn_op_free(session, op);
+ }
+ txn->mod_count = 0;
+
+ __wt_txn_release(session);
+ return (ret);
+}
+
+/*
+ * __wt_txn_init --
+ * Initialize a session's transaction data.
+ *
+ * Allocates the snapshot array with one entry per possible session, clears
+ * the transaction ID and modification list, and inherits the session's
+ * isolation level. Returns any allocation error.
+ */
+int
+__wt_txn_init(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ txn->id = WT_TXN_NONE;
+
+ /* Sized by the connection's session_size. */
+ WT_RET(__wt_calloc_def(session,
+ S2C(session)->session_size, &txn->snapshot));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Diagnostic builds only: if the global state array exists, this
+ * session's slot must not have a snap_min published.
+ */
+ if (S2C(session)->txn_global.states != NULL) {
+ WT_TXN_STATE *txn_state;
+ txn_state = &S2C(session)->txn_global.states[session->id];
+ WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE);
+ }
+#endif
+
+ /*
+ * Take care to clean these out in case we are reusing the transaction
+ * for eviction.
+ */
+ txn->mod = NULL;
+
+ txn->isolation = session->isolation;
+ return (0);
+}
+
+/*
+ * __wt_txn_stats_update --
+ *	Refresh the transaction statistics reported to the application: the
+ *	pinned range is the distance from the oldest ID to the current ID.
+ */
+void
+__wt_txn_stats_update(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_CONNECTION_STATS *conn_stats;
+	WT_TXN_GLOBAL *global;
+
+	conn = S2C(session);
+	conn_stats = &conn->stats;
+	global = &conn->txn_global;
+
+	WT_STAT_SET(conn_stats,
+	    txn_pinned_range, global->current - global->oldest_id);
+}
+
+/*
+ * __wt_txn_destroy --
+ *	Free the per-session transaction memory: the modification list and
+ *	the snapshot array.
+ */
+void
+__wt_txn_destroy(WT_SESSION_IMPL *session)
+{
+	__wt_free(session, session->txn.mod);
+	__wt_free(session, session->txn.snapshot);
+}
+
+/*
+ * __wt_txn_global_init --
+ *	Set up the connection-wide transaction state: the ID counters start at
+ *	1 and every session slot starts with no ID and no snapshot minimum.
+ *	The configuration is unused. Returns any allocation error.
+ */
+int
+__wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_TXN_GLOBAL *global;
+	u_int slot;
+
+	WT_UNUSED(cfg);
+
+	conn = S2C(session);
+	global = &conn->txn_global;
+
+	global->current = 1;
+	global->oldest_id = 1;
+	global->last_running = 1;
+
+	/* One state slot per possible session. */
+	WT_RET(__wt_calloc_def(session, conn->session_size, &global->states));
+	for (slot = 0; slot < conn->session_size; slot++) {
+		global->states[slot].snap_min = WT_TXN_NONE;
+		global->states[slot].id = WT_TXN_NONE;
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_txn_global_destroy --
+ *	Destroy the global transaction state by freeing the per-session state
+ *	array.
+ */
+void
+__wt_txn_global_destroy(WT_SESSION_IMPL *session)
+{
+	/*
+	 * The global transaction structure is embedded in the connection, so
+	 * its address is never NULL and needs no check; only the state array
+	 * is separately allocated.
+	 */
+	__wt_free(session, S2C(session)->txn_global.states);
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
new file mode 100644
index 00000000000..555eec649c6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -0,0 +1,944 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_checkpoint_name_ok --
+ *	Validate an application-supplied checkpoint name of the given length:
+ *	it must pass the generic name check and must not begin with the
+ *	reserved internal checkpoint name.
+ */
+int
+__wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len)
+{
+	/* Check for characters we don't want to see in a metadata file. */
+	WT_RET(__wt_name_check(session, name, len));
+
+	/*
+	 * The internal checkpoint name is off-limits to applications, and so
+	 * is anything that merely starts with it: rejecting the whole prefix
+	 * keeps the checks elsewhere simple. Names shorter than the reserved
+	 * name can't match.
+	 */
+	if (len >= strlen(WT_CHECKPOINT) &&
+	    WT_PREFIX_MATCH(name, WT_CHECKPOINT))
+		WT_RET_MSG(session, EINVAL,
+		    "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT);
+
+	return (0);
+}
+
+/*
+ * __checkpoint_name_check --
+ * Check for an attempt to name a checkpoint that includes anything
+ * other than a file object.
+ *
+ * If uri is NULL, every key in the metadata is checked; otherwise only uri
+ * is checked. Fails with EINVAL naming the first object whose URI does not
+ * start with "colgroup:", "file:", "index:" or "table:". uri is printed
+ * with "%s", so it is expected to be NUL-terminated.
+ */
+static int
+__checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *fail;
+
+ cursor = NULL;
+ fail = NULL;
+
+ /*
+ * This function exists as a place for this comment: named checkpoints
+ * are only supported on file objects, and not on LSM trees or Helium
+ * devices. If a target list is configured for the checkpoint, this
+ * function is called with each target list entry; check the entry to
+ * make sure it's backed by a file. If no target list is configured,
+ * confirm the metadata file contains no non-file objects.
+ */
+ if (uri == NULL) {
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:")) {
+ /*
+ * The failing key is reported below, before
+ * the cursor that returned it is closed.
+ */
+ fail = uri;
+ break;
+ }
+ }
+ /* Running off the end of the metadata is not an error. */
+ WT_ERR_NOTFOUND_OK(ret);
+ } else
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:"))
+ fail = uri;
+
+ if (fail != NULL)
+ WT_ERR_MSG(session, EINVAL,
+ "%s object does not support named checkpoints", fail);
+
+ /* Close the metadata cursor (if opened), keeping any earlier error. */
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __checkpoint_apply --
+ *	Apply an operation to all files involved in a checkpoint.
+ *
+ *	The "name", "target" and (for unnamed checkpoints without targets)
+ *	"drop" settings are read from cfg. If op is NULL the configuration is
+ *	only validated and no object is touched. If fullp is not NULL, it is
+ *	set to true when no target list was configured.
+ */
+static int
+__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
+    int (*op)(WT_SESSION_IMPL *, const char *[]), int *fullp)
+{
+	WT_CONFIG targetconf;
+	WT_CONFIG_ITEM cval, k, v;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	int ckpt_closed, named, target_list;
+
+	target_list = 0;
+
+	/* Flag if this is a named checkpoint, and check if the name is OK. */
+	WT_RET(__wt_config_gets(session, cfg, "name", &cval));
+	named = cval.len != 0;
+	if (named)
+		WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len));
+
+	/* Step through the targets and optionally operate on each one. */
+	WT_ERR(__wt_config_gets(session, cfg, "target", &cval));
+	WT_ERR(__wt_config_subinit(session, &targetconf, &cval));
+	while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) {
+		/* The scratch buffer is allocated on the first target only. */
+		if (!target_list) {
+			WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+			target_list = 1;
+		}
+
+		if (v.len != 0)
+			WT_ERR_MSG(session, EINVAL,
+			    "invalid checkpoint target %.*s: URIs may require "
+			    "quoting",
+			    (int)cval.len, (char *)cval.str);
+
+		/*
+		 * Configuration items are length-delimited, not
+		 * NUL-terminated: copy the target into the scratch buffer
+		 * before handing it to code that treats it as a C string.
+		 * Without the copy, the name check's error message would
+		 * print the target followed by the rest of the configuration.
+		 */
+		WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
+
+		/* Some objects don't support named checkpoints. */
+		if (named)
+			WT_ERR(__checkpoint_name_check(session, tmp->data));
+
+		if (op == NULL)
+			continue;
+		if ((ret = __wt_schema_worker(
+		    session, tmp->data, op, NULL, cfg, 0)) != 0)
+			WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data);
+	}
+	WT_ERR_NOTFOUND_OK(ret);
+
+	if (!target_list && named)
+		/* Some objects don't support named checkpoints. */
+		WT_ERR(__checkpoint_name_check(session, NULL));
+
+	if (!target_list && op != NULL) {
+		/*
+		 * If the checkpoint is named or we're dropping checkpoints, we
+		 * checkpoint both open and closed files; else, only checkpoint
+		 * open files.
+		 *
+		 * XXX
+		 * We don't optimize unnamed checkpoints of a list of targets,
+		 * we open the targets and checkpoint them even if they are
+		 * quiescent and don't need a checkpoint, believing applications
+		 * unlikely to checkpoint a list of closed targets.
+		 */
+		ckpt_closed = named;
+		if (!ckpt_closed) {
+			WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
+			ckpt_closed = cval.len != 0;
+		}
+		WT_ERR(ckpt_closed ?
+		    __wt_meta_btree_apply(session, op, cfg) :
+		    __wt_conn_btree_apply(session, 0, op, cfg));
+	}
+
+	if (fullp != NULL)
+		*fullp = !target_list;
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __checkpoint_data_source --
+ *	Call the checkpoint method of every registered data source that has
+ *	one, stopping at the first error.
+ */
+static int
+__checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_DATA_SOURCE *dsrc;
+	WT_NAMED_DATA_SOURCE *ndsrc;
+
+	/*
+	 * A place-holder, to support Helium devices: we assume calling the
+	 * underlying data-source session checkpoint function is sufficient to
+	 * checkpoint all objects in the data source, open or closed, and we
+	 * don't attempt to optimize the checkpoint of individual targets.
+	 * Those assumptions are correct for the Helium device, but they are
+	 * not necessarily going to be true for other data sources.
+	 *
+	 * It's not difficult to support data-source checkpoints of individual
+	 * targets (__wt_schema_worker is the underlying function that will do
+	 * the work, and it's already written to support data-sources, although
+	 * we'd probably need to pass the URI of the object to the data source
+	 * checkpoint function which we don't currently do). However, doing a
+	 * full data checkpoint is trickier: currently, the connection code is
+	 * written to ignore all objects other than "file:", and that code will
+	 * require significant changes to work with data sources.
+	 */
+	TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) {
+		dsrc = ndsrc->dsrc;
+
+		/* Data sources without a checkpoint method are skipped. */
+		if (dsrc->checkpoint == NULL)
+			continue;
+
+		WT_RET(dsrc->checkpoint(
+		    dsrc, (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg));
+	}
+	return (0);
+}
+
+/*
+ * __wt_checkpoint_list --
+ *	Add the session's current file handle to the session's list of
+ *	handles to flush. Busy files are skipped without error. The
+ *	configuration is unused.
+ */
+int
+__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_DATA_HANDLE *caller_dhandle;
+	WT_DECL_RET;
+	const char *uri;
+
+	WT_UNUSED(cfg);
+
+	/* Should not be called with anything other than a file object. */
+	WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+	WT_ASSERT(session,
+	    memcmp(session->dhandle->name, "file:", strlen("file:")) == 0);
+
+	/* Make sure there is space for the next entry. */
+	WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
+	    session->ckpt_handle_next + 1, &session->ckpt_handle));
+
+	/*
+	 * Remember the caller's handle and clear it while we get our own
+	 * reference; clearing isn't strictly necessary, but it is cleaner.
+	 */
+	caller_dhandle = session->dhandle;
+	uri = caller_dhandle->name;
+	session->dhandle = NULL;
+
+	/* Ignore busy files, we'll deal with them in the checkpoint. */
+	ret = __wt_session_get_btree(session, uri, NULL, NULL, 0);
+	if (ret == 0)
+		session->ckpt_handle[
+		    session->ckpt_handle_next++] = session->dhandle;
+	else if (ret == EBUSY)
+		ret = 0;
+
+	session->dhandle = caller_dhandle;
+	return (ret);
+}
+
+/*
+ * __checkpoint_write_leaves --
+ * Write any dirty leaf pages for all checkpoint handles.
+ *
+ * Builds the session's handle list under the schema lock, flushes and
+ * releases each handle in turn, and always frees the list before
+ * returning. Returns the first error seen.
+ */
+static int
+__checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ u_int i;
+
+ /*
+ * i counts handles already processed; it must be zero before any jump
+ * to the error label so the cleanup loop releases every handle.
+ */
+ i = 0;
+
+ /* Should not be called with any handle reference. */
+ WT_ASSERT(session, session->dhandle == NULL);
+
+ /*
+ * Get a list of handles we want to flush; this may pull closed objects
+ * into the session cache, but we're going to do that eventually anyway.
+ */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __checkpoint_apply(session, cfg, __wt_checkpoint_list, NULL));
+ WT_ERR(ret);
+
+ /*
+ * Walk the list, flushing the leaf pages from each file, then releasing
+ * the file. Note that we increment inside the loop to simplify error
+ * handling.
+ */
+ while (i < session->ckpt_handle_next) {
+ dhandle = session->ckpt_handle[i++];
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES));
+ /* The handle is released even if the flush failed. */
+ WT_WITH_DHANDLE(session, dhandle,
+ WT_TRET(__wt_session_release_btree(session)));
+ WT_ERR(ret);
+ }
+
+ /*
+ * Release any handles not yet processed (only possible after an
+ * error), then discard the handle list.
+ */
+err: while (i < session->ckpt_handle_next) {
+ dhandle = session->ckpt_handle[i++];
+ WT_WITH_DHANDLE(session, dhandle,
+ WT_TRET(__wt_session_release_btree(session)));
+ }
+ __wt_free(session, session->ckpt_handle);
+ session->ckpt_handle_allocated = session->ckpt_handle_next = 0;
+ return (ret);
+}
+
+/*
+ * __wt_txn_checkpoint --
+ * Checkpoint a database or a list of objects in the database.
+ *
+ * In order: validate the configuration, update the oldest transaction ID,
+ * checkpoint data sources, write dirty leaf pages, then under the schema
+ * lock run the checkpoint inside a snapshot transaction, optionally sync
+ * the files, and finally checkpoint the metadata file. The session's
+ * isolation level is restored before returning. Returns the first error
+ * seen.
+ */
+int
+__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_ISOLATION saved_isolation;
+ int full, logging, tracking;
+ /* Configuration for the internal snapshot transaction. */
+ const char *txn_cfg[] =
+ { WT_CONFIG_BASE(session, session_begin_transaction),
+ "isolation=snapshot", NULL };
+ void *saved_meta_next;
+
+ conn = S2C(session);
+ saved_isolation = session->isolation;
+ txn = &session->txn;
+ /*
+ * full: no target list was configured; logging: the checkpoint-start
+ * log record was written; tracking: metadata tracking was turned on.
+ * The latter two tell the error path what needs to be undone.
+ */
+ full = logging = tracking = 0;
+
+ /*
+ * Do a pass over the configuration arguments and figure out what kind
+ * of checkpoint this is.
+ */
+ WT_RET(__checkpoint_apply(session, cfg, NULL, &full));
+
+ /*
+ * Update the global oldest ID so we do all possible cleanup.
+ *
+ * This is particularly important for compact, so that all dirty pages
+ * can be fully written.
+ */
+ __wt_txn_update_oldest(session);
+
+ /* Flush data-sources before we start the checkpoint. */
+ WT_ERR(__checkpoint_data_source(session, cfg));
+
+ /* Flush dirty leaf pages before we start the checkpoint. */
+ session->isolation = txn->isolation = TXN_ISO_READ_COMMITTED;
+ WT_ERR(__checkpoint_write_leaves(session, cfg));
+
+ /*
+ * Acquire the schema lock. The session flag is what the error path
+ * tests to decide whether the lock must be released.
+ */
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED);
+ __wt_spin_lock(session, &conn->schema_lock);
+
+ WT_ERR(__wt_meta_track_on(session));
+ tracking = 1;
+
+ /* Tell logging that we are about to start a database checkpoint. */
+ if (conn->logging && full)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, full, WT_TXN_LOG_CKPT_PREPARE, NULL));
+
+ /*
+ * Start a snapshot transaction for the checkpoint.
+ *
+ * Note: we don't go through the public API calls because they have
+ * side effects on cursors, which applications can hold open across
+ * calls to checkpoint.
+ */
+ WT_ERR(__wt_txn_begin(session, txn_cfg));
+
+ /* Tell logging that we have started a database checkpoint. */
+ if (conn->logging && full) {
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, full, WT_TXN_LOG_CKPT_START, NULL));
+ logging = 1;
+ }
+
+ WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint, NULL));
+
+ /* Commit the transaction before syncing the file(s). */
+ WT_ERR(__wt_txn_commit(session, NULL));
+
+ /*
+ * Checkpoints have to hit disk (it would be reasonable to configure for
+ * lazy checkpoints, but we don't support them yet).
+ */
+ if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
+ WT_ERR(__checkpoint_apply(
+ session, cfg, __wt_checkpoint_sync, NULL));
+
+ /*
+ * Checkpoint the metadata file.
+ *
+ * NOTE(review): as written, the search stops at the first handle that
+ * is the metadata handle OR whose name lacks the "file:" prefix, so a
+ * non-file handle earlier in the list would be selected -- confirm
+ * this matches the intent.
+ */
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (WT_IS_METADATA(dhandle) ||
+ !WT_PREFIX_MATCH(dhandle->name, "file:"))
+ break;
+ }
+ if (dhandle == NULL)
+ WT_ERR_MSG(session, EINVAL,
+ "checkpoint unable to find open meta-data handle");
+
+ /*
+ * Disable metadata tracking during the metadata checkpoint.
+ *
+ * We don't lock old checkpoints in the metadata file: there is no way
+ * to open one. We are holding other handle locks, it is not safe to
+ * lock conn->spinlock.
+ */
+ session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED;
+ saved_meta_next = session->meta_track_next;
+ session->meta_track_next = NULL;
+ WT_WITH_DHANDLE(session, dhandle, ret = __wt_checkpoint(session, cfg));
+ session->meta_track_next = saved_meta_next;
+
+err: /*
+ * XXX
+ * Rolling back the changes here is problematic.
+ *
+ * If we unroll here, we need a way to roll back changes to the avail
+ * list for each tree that was successfully synced before the error
+ * occurred. Otherwise, the next time we try this operation, we will
+ * try to free an old checkpoint again.
+ *
+ * OTOH, if we commit the changes after a failure, we have partially
+ * overwritten the checkpoint, so what ends up on disk is not
+ * consistent.
+ */
+ session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED;
+ /* The second argument is non-zero only when an error occurred. */
+ if (tracking)
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ /* A transaction still running here means an error interrupted it. */
+ if (F_ISSET(txn, TXN_RUNNING))
+ WT_TRET(__wt_txn_rollback(session, NULL));
+
+ /* Tell logging that we have finished a database checkpoint. */
+ if (logging)
+ WT_TRET(__wt_txn_checkpoint_log(session, full,
+ (ret == 0) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_FAIL,
+ NULL));
+
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
+ __wt_spin_unlock(session, &conn->schema_lock);
+ }
+
+ /* Restore the isolation level the session had on entry. */
+ session->isolation = txn->isolation = saved_isolation;
+
+ return (ret);
+}
+
+/*
+ * __drop --
+ *	Flag for deletion every checkpoint in the list that matches a name.
+ */
+static void
+__drop(WT_CKPT *ckptbase, const char *name, size_t len)
+{
+	WT_CKPT *ckpt;
+	int internal;
+
+	/*
+	 * Internal checkpoints carry a generational suffix after the reserved
+	 * name, so they are matched by prefix and all of them are taken.
+	 * Applications aren't allowed to use any variant of the reserved
+	 * name, which keeps the test simple: if the leading bytes of the
+	 * request match it, we're dropping internal checkpoints. Any other
+	 * name must match a checkpoint's name exactly.
+	 */
+	internal = strncmp(WT_CHECKPOINT, name, len) == 0;
+
+	WT_CKPT_FOREACH(ckptbase, ckpt)
+		if (internal ?
+		    WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT) :
+		    WT_STRING_MATCH(ckpt->name, name, len))
+			F_SET(ckpt, WT_CKPT_DELETE);
+}
+
+/*
+ * __drop_from --
+ *	Flag for deletion the named checkpoint and every checkpoint after it
+ *	in the list.
+ */
+static void
+__drop_from(WT_CKPT *ckptbase, const char *name, size_t len)
+{
+	WT_CKPT *ckpt;
+	int dropping;
+
+	/*
+	 * The name "all" is special: dropping starts at the head of the list,
+	 * so every checkpoint is deleted.
+	 */
+	dropping = WT_STRING_MATCH("all", name, len);
+
+	/*
+	 * Otherwise dropping starts at the first checkpoint with a matching
+	 * name: if two checkpoints share the name, everything from the
+	 * earlier one to the end of the list goes.
+	 */
+	WT_CKPT_FOREACH(ckptbase, ckpt) {
+		if (!dropping && WT_STRING_MATCH(ckpt->name, name, len))
+			dropping = 1;
+		if (dropping)
+			F_SET(ckpt, WT_CKPT_DELETE);
+	}
+}
+
+/*
+ * __drop_to --
+ *	Flag for deletion the named checkpoint and every checkpoint before it
+ *	in the list. Nothing is flagged if no checkpoint has that name.
+ */
+static void
+__drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
+{
+	WT_CKPT *ckpt, *last;
+
+	/*
+	 * Find the last checkpoint with a matching name: if two checkpoints
+	 * share the name, we delete from the start of the list through the
+	 * later one.
+	 */
+	last = NULL;
+	WT_CKPT_FOREACH(ckptbase, ckpt)
+		if (WT_STRING_MATCH(ckpt->name, name, len))
+			last = ckpt;
+
+	/* Flag everything up to and including that checkpoint. */
+	if (last != NULL)
+		WT_CKPT_FOREACH(ckptbase, ckpt) {
+			F_SET(ckpt, WT_CKPT_DELETE);
+			if (ckpt == last)
+				return;
+		}
+}
+
+/*
+ * __checkpoint_worker --
+ * Checkpoint a tree.
+ */
+static int
+__checkpoint_worker(
+ WT_SESSION_IMPL *session, const char *cfg[], int is_checkpoint)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT *ckpt, *ckptbase;
+ WT_CONFIG dropconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_LSN ckptlsn;
+ const char *name;
+ int deleted, force, hot_backup_locked, track_ckpt, was_modified;
+ char *name_alloc;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ conn = S2C(session);
+ ckpt = ckptbase = NULL;
+ INIT_LSN(&ckptlsn);
+ dhandle = session->dhandle;
+ name_alloc = NULL;
+ hot_backup_locked = 0;
+ name_alloc = NULL;
+ track_ckpt = 1;
+ was_modified = btree->modified;
+
+ /* Get the list of checkpoints for this file. */
+ WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase));
+
+ /* This may be a named checkpoint, check the configuration. */
+ cval.len = 0;
+ if (cfg != NULL)
+ WT_ERR(__wt_config_gets(session, cfg, "name", &cval));
+ if (cval.len == 0)
+ name = WT_CHECKPOINT;
+ else {
+ WT_ERR(__wt_checkpoint_name_ok(session, cval.str, cval.len));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc));
+ name = name_alloc;
+ }
+
+ /* We may be dropping specific checkpoints, check the configuration. */
+ if (cfg != NULL) {
+ cval.len = 0;
+ WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
+ if (cval.len != 0) {
+ WT_ERR(__wt_config_subinit(session, &dropconf, &cval));
+ while ((ret =
+ __wt_config_next(&dropconf, &k, &v)) == 0) {
+ /* Disallow unsafe checkpoint names. */
+ if (v.len == 0)
+ WT_ERR(__wt_checkpoint_name_ok(
+ session, k.str, k.len));
+ else
+ WT_ERR(__wt_checkpoint_name_ok(
+ session, v.str, v.len));
+
+ if (v.len == 0)
+ __drop(ckptbase, k.str, k.len);
+ else if (WT_STRING_MATCH("from", k.str, k.len))
+ __drop_from(ckptbase, v.str, v.len);
+ else if (WT_STRING_MATCH("to", k.str, k.len))
+ __drop_to(ckptbase, v.str, v.len);
+ else
+ WT_ERR_MSG(session, EINVAL,
+ "unexpected value for checkpoint "
+ "key: %.*s",
+ (int)k.len, k.str);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ }
+
+ /* Drop checkpoints with the same name as the one we're taking. */
+ __drop(ckptbase, name, strlen(name));
+
+ /*
+ * Check for clean objects not requiring a checkpoint.
+ *
+ * If we're closing a handle, and the object is clean, we can skip the
+ * checkpoint, whatever checkpoints we have are sufficient. (We might
+ * not have any checkpoints if the object was never modified, and that's
+ * OK: the object creation code doesn't mark the tree modified so we can
+ * skip newly created trees here.)
+ *
+ * If the application repeatedly checkpoints an object (imagine hourly
+ * checkpoints using the same explicit or internal name), there's no
+ * reason to repeat the checkpoint for clean objects. The test is if
+ * the only checkpoint we're deleting is the last one in the list and
+ * it has the same name as the checkpoint we're about to take, skip the
+ * work. (We can't skip checkpoints that delete more than the last
+ * checkpoint because deleting those checkpoints might free up space in
+ * the file.) This means an application toggling between two (or more)
+ * checkpoint names will repeatedly take empty checkpoints, but that's
+ * not likely enough to make detection worthwhile.
+ *
+ * Checkpoint read-only objects otherwise: the application must be able
+ * to open the checkpoint in a cursor after taking any checkpoint, which
+ * means it must exist.
+ */
+ force = 0;
+ if (!btree->modified && cfg != NULL) {
+ ret = __wt_config_gets(session, cfg, "force", &cval);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ WT_ERR(ret);
+ if (ret == 0 && cval.val != 0)
+ force = 1;
+ }
+ if (!btree->modified && !force) {
+ if (!is_checkpoint)
+ goto done;
+
+ deleted = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_DELETE))
+ ++deleted;
+ /*
+ * Complicated test: if we only deleted a single checkpoint, and
+ * it was the last checkpoint in the object, and it has the same
+ * name as the checkpoint we're taking (correcting for internal
+ * checkpoint names with their generational suffix numbers), we
+ * can skip the checkpoint, there's nothing to do.
+ */
+ if (deleted == 1 &&
+ F_ISSET(ckpt - 1, WT_CKPT_DELETE) &&
+ (strcmp(name, (ckpt - 1)->name) == 0 ||
+ (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
+ WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))))
+ goto done;
+ }
+
+ /* Add a new checkpoint entry at the end of the list. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ ;
+ WT_ERR(__wt_strdup(session, name, &ckpt->name));
+ F_SET(ckpt, WT_CKPT_ADD);
+
+ /*
+ * We can't delete checkpoints if a backup cursor is open. WiredTiger
+ * checkpoints are uniquely named and it's OK to have multiple of them
+ * in the system: clear the delete flag for them, and otherwise fail.
+ * Hold the lock until we're done (blocking hot backups from starting),
+ * we don't want to race with a future hot backup.
+ */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ hot_backup_locked = 1;
+ if (conn->hot_backup)
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+ if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
+ F_CLR(ckpt, WT_CKPT_DELETE);
+ continue;
+ }
+ WT_ERR_MSG(session, EBUSY,
+ "checkpoint %s blocked by hot backup: it would "
+ "delete an existing checkpoint, and checkpoints "
+ "cannot be deleted during a hot backup",
+ ckpt->name);
+ }
+
+ /*
+ * Lock the checkpoints that will be deleted.
+ *
+ * Checkpoints are only locked when tracking is enabled, which covers
+ * checkpoint and drop operations, but not close. The reasoning is
+ * there should be no access to a checkpoint during close, because any
+ * thread accessing a checkpoint will also have the current file handle
+ * open.
+ */
+ if (WT_META_TRACKING(session))
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ /*
+ * We can't delete checkpoints referenced by a cursor.
+ * WiredTiger checkpoints are uniquely named and it's
+ * OK to have multiple in the system: clear the delete
+ * flag for them, and otherwise fail.
+ */
+ ret = __wt_session_lock_checkpoint(session, ckpt->name);
+ if (ret == 0)
+ continue;
+ if (ret == EBUSY &&
+ WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
+ F_CLR(ckpt, WT_CKPT_DELETE);
+ continue;
+ }
+ WT_ERR_MSG(session, ret,
+ "checkpoints cannot be dropped when in-use");
+ }
+
+ /*
+ * There are special files: those being bulk-loaded, salvaged, upgraded
+ * or verified during the checkpoint. We have to do something for those
+ * objects because a checkpoint is an external name the application can
+ * reference and the name must exist no matter what's happening during
+ * the checkpoint. For bulk-loaded files, we could block until the load
+ * completes, checkpoint the partial load, or magic up an empty-file
+ * checkpoint. The first is too slow, the second is insane, so do the
+ * third.
+ * Salvage, upgrade and verify don't currently require any work, all
+ * three hold the schema lock, blocking checkpoints. If we ever want to
+ * fix that (and I bet we eventually will, at least for verify), we can
+ * copy the last checkpoint the file has. That works if we guarantee
+ * salvage, upgrade and verify act on objects with previous checkpoints
+ * (true if handles are closed/re-opened between object creation and a
+ * subsequent salvage, upgrade or verify operation). Presumably,
+ * salvage and upgrade will discard all previous checkpoints when they
+ * complete, which is fine with us. This change will require reference
+ * counting checkpoints, and once that's done, we should use checkpoint
+ * copy instead of forcing checkpoints on clean objects to associate
+ * names with checkpoints.
+ */
+ if (is_checkpoint)
+ switch (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
+ case 0:
+ break;
+ case WT_BTREE_BULK:
+ /*
+ * The only checkpoints a bulk-loaded file should have
+ * are fake ones we created without the underlying block
+ * manager. I'm leaving this code here because it's a
+ * cheap test and a nasty race.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE))
+ WT_ERR_MSG(session, ret,
+ "block-manager checkpoint found "
+ "for a bulk-loaded file");
+ track_ckpt = 0;
+ goto fake;
+ case WT_BTREE_SALVAGE:
+ case WT_BTREE_UPGRADE:
+ case WT_BTREE_VERIFY:
+ WT_ERR_MSG(session, EINVAL,
+ "checkpoints are blocked during salvage, upgrade "
+ "or verify operations");
+ }
+
+ /*
+ * If an object has never been used (in other words, if it could become
+ * a bulk-loaded file), then we must fake the checkpoint. This is good
+ * because we don't write physical checkpoint blocks for just-created
+ * files, but it's not just a good idea. The reason is because deleting
+ * a physical checkpoint requires writing the file, and fake checkpoints
+ * can't write the file. If you (1) create a physical checkpoint for an
+ * empty file which writes blocks, (2) start bulk-loading records into
+ * the file, (3) during the bulk-load perform another checkpoint with
+ * the same name; in order to keep from having two checkpoints with the
+ * same name you would have to use the bulk-load's fake checkpoint to
+ * delete a physical checkpoint, and that will end in tears.
+ */
+ if (is_checkpoint)
+ if (btree->bulk_load_ok) {
+ track_ckpt = 0;
+ goto fake;
+ }
+
+ /*
+ * Mark the root page dirty to ensure something gets written. (If the
+ * tree is modified, we must write the root page anyway, this doesn't
+ * add additional writes to the process. If the tree is not modified,
+ * we have to dirty the root page to ensure something gets written.)
+ * This is really about paranoia: if the tree modification value gets
+ * out of sync with the set of dirty pages (modify is set, but there
+ * are no dirty pages), we perform a checkpoint without any writes, no
+ * checkpoint is created, and then things get bad.
+ */
+ WT_ERR(__wt_page_modify_init(session, btree->root.page));
+ __wt_page_modify_set(session, btree->root.page);
+
+ /*
+ * Clear the tree's modified flag; any changes before we clear the flag
+ * are guaranteed to be part of this checkpoint (unless reconciliation
+ * skips updates for transactional reasons), and changes subsequent to
+ * the checkpoint start, which might not be included, will re-set the
+ * modified flag. The "unless reconciliation skips updates" problem is
+ * handled in the reconciliation code: if reconciliation skips updates,
+ * it sets the modified flag itself. Use a full barrier so we get the
+ * store done quickly, this isn't a performance path.
+ */
+ btree->modified = 0;
+ WT_FULL_BARRIER();
+
+ /* Tell logging that a file checkpoint is starting. */
+ if (conn->logging)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, 0, WT_TXN_LOG_CKPT_START, &ckptlsn));
+
+ /* Flush the file from the cache, creating the checkpoint. */
+ if (is_checkpoint)
+ WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT));
+ else
+ WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE));
+
+ /*
+ * All blocks being written have been written; set the object's write
+ * generation.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_ADD))
+ ckpt->write_gen = btree->write_gen;
+
+fake: /* Update the object's metadata. */
+ WT_ERR(__wt_meta_ckptlist_set(
+ session, dhandle->name, ckptbase, &ckptlsn));
+
+ /*
+ * If we wrote a checkpoint (rather than faking one), pages may be
+ * available for re-use. If tracking enabled, defer making pages
+ * available until transaction end. The exception is if the handle
+ * is being discarded, in which case the handle will be gone by the
+ * time we try to apply or unroll the meta tracking event.
+ */
+ if (track_ckpt) {
+ if (WT_META_TRACKING(session) && is_checkpoint)
+ WT_ERR(__wt_meta_track_checkpoint(session));
+ else
+ WT_ERR(bm->checkpoint_resolve(bm, session));
+ }
+
+ /* Tell logging that the checkpoint is complete. */
+ if (conn->logging)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, 0, WT_TXN_LOG_CKPT_STOP, NULL));
+
+done: err:
+ /*
+ * If the checkpoint didn't complete successfully, make sure the
+ * tree is marked dirty.
+ */
+ if (ret != 0 && !btree->modified && was_modified)
+ btree->modified = 1;
+
+ if (hot_backup_locked)
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+
+ __wt_meta_ckptlist_free(session, ckptbase);
+ __wt_free(session, name_alloc);
+
+ return (ret);
+}
+
+/*
+ * __wt_checkpoint --
+ *	Checkpoint the file referenced by the session's current data handle.
+ */
+int
+__wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	/*
+	 * Preconditions: the handle must not itself be a checkpoint handle,
+	 * and the caller must already hold the schema lock.
+	 */
+	WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+	/* A final argument of 1 requests a checkpoint rather than a close. */
+	return (__checkpoint_worker(session, cfg, 1));
+}
+
+/*
+ * __wt_checkpoint_sync --
+ *	Ask the block manager to sync a checkpointed file and return the
+ *	result of that call.  The configuration argument is ignored.
+ */
+int
+__wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_BM *block_mgr;
+
+	WT_UNUSED(cfg);
+
+	/* Checkpoint handles are never synced through this path. */
+	WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
+	/* The tree must have an underlying block manager reference. */
+	block_mgr = S2BT(session)->bm;
+	WT_ASSERT(session, block_mgr != NULL);
+
+	return (block_mgr->sync(block_mgr, session, 0));
+}
+
+/*
+ * __wt_checkpoint_close --
+ *	Checkpoint a single file as part of closing its handle.
+ */
+int
+__wt_checkpoint_close(WT_SESSION_IMPL *session, int force)
+{
+	/* A forced close discards the file's blocks unconditionally. */
+	if (force)
+		return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD_FORCE));
+
+	/* An unmodified file has nothing to write: discard its blocks. */
+	if (!S2BT(session)->modified)
+		return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+
+	/*
+	 * Otherwise write a checkpoint; the checkpoint call discards the
+	 * blocks itself, so no separate discard step follows.
+	 */
+	WT_RET(__checkpoint_worker(session, NULL, 0));
+
+	/* Flush the writes only if the connection is configured to. */
+	if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC))
+		return (0);
+	return (__wt_checkpoint_sync(session, NULL));
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c
new file mode 100644
index 00000000000..31d5506be5b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_ext.c
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_transaction_id --
+ *	Extension API: return the transaction ID of the session's transaction.
+ */
+uint64_t
+__wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
+{
+	WT_SESSION_IMPL *impl;
+
+	(void)wt_api;				/* Unused parameters */
+
+	impl = (WT_SESSION_IMPL *)wt_session;
+
+	/*
+	 * The result of the ID check is deliberately dropped: the only
+	 * failure case is running out of transaction IDs.
+	 */
+	(void)__wt_txn_id_check(impl);
+
+	return (impl->txn.id);
+}
+
+/*
+ * __wt_ext_transaction_isolation_level --
+ *	Extension API: return the isolation level of the session's current
+ *	transaction, translated to the public WT_TXN_ISO_XXX values.
+ */
+int
+__wt_ext_transaction_isolation_level(
+    WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
+{
+	(void)wt_api;				/* Unused parameters */
+
+	switch (((WT_SESSION_IMPL *)wt_session)->txn.isolation) {
+	case TXN_ISO_READ_COMMITTED:
+		return (WT_TXN_ISO_READ_COMMITTED);
+	case TXN_ISO_READ_UNCOMMITTED:
+		return (WT_TXN_ISO_READ_UNCOMMITTED);
+	default:
+		/* Anything else is reported as snapshot isolation. */
+		return (WT_TXN_ISO_SNAPSHOT);
+	}
+}
+
+/*
+ * __wt_ext_transaction_notify --
+ *	Extension API: register a handler to be notified when the session's
+ *	transaction is resolved.  Returns 0 on success (including when the
+ *	same handler is registered again) and ENOMEM if a different handler
+ *	already occupies the slot.
+ */
+int
+__wt_ext_transaction_notify(
+    WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify)
+{
+	WT_TXN *txn;
+
+	(void)wt_api;				/* Unused parameters */
+
+	txn = &((WT_SESSION_IMPL *)wt_session)->txn;
+
+	/*
+	 * XXX
+	 * There is exactly one notification slot per transaction: more than
+	 * one data-source in a transaction doesn't work anyway, so nothing
+	 * more elaborate is attempted.
+	 */
+	if (txn->notify == NULL) {
+		txn->notify = notify;
+		return (0);
+	}
+
+	/* The slot is taken: fine if by this handler, an error otherwise. */
+	return (txn->notify == notify ? 0 : ENOMEM);
+}
+
+/*
+ * __wt_ext_transaction_oldest --
+ *	Extension API: return the oldest transaction ID not yet visible to a
+ *	running transaction, as recorded in the connection's global
+ *	transaction state.
+ */
+uint64_t
+__wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+
+	return (conn->txn_global.oldest_id);
+}
+
+/*
+ * __wt_ext_transaction_visible --
+ *	Extension API: return whether the session's current transaction can
+ *	see updates made by the given transaction ID.
+ */
+int
+__wt_ext_transaction_visible(
+    WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id)
+{
+	WT_SESSION_IMPL *impl;
+
+	(void)wt_api;				/* Unused parameters */
+
+	impl = (WT_SESSION_IMPL *)wt_session;
+
+	return (__wt_txn_visible(impl, transaction_id));
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
new file mode 100644
index 00000000000..03a71056a9a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -0,0 +1,500 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __txn_op_log --
+ *	Append the log operation for a single update to the transaction's
+ *	log record.  The update is taken from the operation, the key (or
+ *	record number) from the btree cursor's current position.
+ */
+static int
+__txn_op_log(WT_SESSION_IMPL *session,
+    WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt)
+{
+	WT_DECL_RET;
+	WT_ITEM key, value;
+	WT_UPDATE *upd;
+	uint64_t recno;
+	int removed;
+
+	WT_CLEAR(key);
+
+	/* The value logged is the update's own data. */
+	upd = op->u.upd;
+	value.data = WT_UPDATE_DATA(upd);
+	value.size = upd->size;
+	removed = WT_UPDATE_DELETED_ISSET(upd);
+
+	/*
+	 * Four cases: remove or insert/update, for row stores or for
+	 * column stores.
+	 */
+	if (cbt->btree->type == BTREE_ROW) {
+		/* Row store: the key comes from the cursor's leaf position. */
+		WT_ERR(__wt_cursor_row_leaf_key(cbt, &key));
+
+		if (removed)
+			WT_ERR(__wt_logop_row_remove_pack(session, logrec,
+			    op->fileid, &key));
+		else
+			WT_ERR(__wt_logop_row_put_pack(session, logrec,
+			    op->fileid, &key, &value));
+	} else {
+		/* Column store: the key is the insert's record number. */
+		WT_ASSERT(session, cbt->ins != NULL);
+		recno = WT_INSERT_RECNO(cbt->ins);
+		WT_ASSERT(session, recno != 0);
+
+		if (removed)
+			WT_ERR(__wt_logop_col_remove_pack(session, logrec,
+			    op->fileid, recno));
+		else
+			WT_ERR(__wt_logop_col_put_pack(session, logrec,
+			    op->fileid, recno, &value));
+	}
+
+	/* The key buffer is released on success and on failure. */
+err:	__wt_buf_free(session, &key);
+	return (ret);
+}
+
+/*
+ * __txn_commit_printlog --
+ *	Print every operation contained in a commit log record.
+ */
+static int
+__txn_commit_printlog(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+	while (*pp < end) {
+		/* The logging subsystem zero-pads records: stop at padding. */
+		if (**pp == 0)
+			break;
+		WT_RET(__wt_txn_op_printlog(session, pp, end, out));
+	}
+	return (0);
+}
+
+/*
+ * __wt_txn_op_free --
+ *	Release any memory owned by a transactional operation.
+ */
+void
+__wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op)
+{
+	/*
+	 * Only row-store truncates own buffers (their start and stop keys);
+	 * all other operation types have nothing to free here.
+	 */
+	if (op->type != TXN_OP_TRUNCATE_ROW)
+		return;
+
+	__wt_buf_free(session, &op->u.truncate_row.start);
+	__wt_buf_free(session, &op->u.truncate_row.stop);
+}
+
+/*
+ * __txn_logrec_init --
+ * Allocate and initialize a buffer for a transaction's log records.
+ *
+ * The buffer is created at most once per transaction: if the transaction
+ * already has a log record this is a no-op. A new buffer starts with a
+ * commit-record header (the record type followed by the transaction ID)
+ * and is stored in txn->logrec. Returns 0 on success, otherwise an error
+ * code, in which case no buffer is attached to the transaction.
+ */
+static int
+__txn_logrec_init(WT_SESSION_IMPL *session)
+{
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ WT_TXN *txn;
+ const char *fmt = WT_UNCHECKED_STRING(Iq);
+ uint32_t rectype = WT_LOGREC_COMMIT;
+ size_t header_size;
+
+ txn = &session->txn;
+ /* Already initialized for this transaction, nothing to do. */
+ if (txn->logrec != NULL)
+ return (0);
+
+ /* The header embeds the transaction ID, so one must be allocated. */
+ WT_ASSERT(session, txn->id != WT_TXN_NONE);
+ WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id));
+ WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
+
+ /* Pack the header at the current end of the record and account for it. */
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ fmt, rectype, txn->id));
+ logrec->size += (uint32_t)header_size;
+ txn->logrec = logrec;
+
+ /*
+ * The cleanup below is only reachable through the err label: on success
+ * the buffer now belongs to the transaction and must not be freed.
+ */
+ if (0) {
+err: __wt_logrec_free(session, &logrec);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_txn_log_op --
+ *	Pack the transaction's most recent operation into its in-memory log
+ *	record.  Does nothing if logging is disabled for the connection or
+ *	for the session.
+ */
+int
+__wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+	WT_ITEM *logrec;
+	WT_TXN *txn;
+	WT_TXN_OP *last_op;
+
+	if (F_ISSET(session, WT_SESSION_NO_LOGGING) || !S2C(session)->logging)
+		return (0);
+
+	txn = &session->txn;
+
+	/* A running transaction with an ID and at least one operation. */
+	WT_ASSERT(session,
+	    F_ISSET(txn, TXN_RUNNING) && F_ISSET(txn, TXN_HAS_ID));
+	WT_ASSERT(session, txn->mod_count > 0);
+
+	/* The operation to log is the last one in the modification list. */
+	last_op = &txn->mod[txn->mod_count - 1];
+
+	/* Make sure the log record buffer exists before packing into it. */
+	WT_RET(__txn_logrec_init(session));
+	logrec = txn->logrec;
+
+	switch (last_op->type) {
+	case TXN_OP_INMEM:
+	case TXN_OP_REF:
+		/* These operation types have nothing to log. */
+		return (0);
+	case TXN_OP_BASIC:
+		return (__txn_op_log(session, logrec, last_op, cbt));
+	case TXN_OP_TRUNCATE_COL:
+		return (__wt_logop_col_truncate_pack(session, logrec,
+		    last_op->fileid,
+		    last_op->u.truncate_col.start,
+		    last_op->u.truncate_col.stop));
+	case TXN_OP_TRUNCATE_ROW:
+		return (__wt_logop_row_truncate_pack(session, logrec,
+		    last_op->fileid,
+		    &last_op->u.truncate_row.start,
+		    &last_op->u.truncate_row.stop,
+		    (uint32_t)last_op->u.truncate_row.mode));
+	WT_ILLEGAL_VALUE(session);
+	}
+
+	/* NOTREACHED */
+}
+
+/*
+ * __wt_txn_log_commit --
+ *	At commit time, write the transaction's accumulated log record to
+ *	the log using the transaction's sync setting.  The configuration
+ *	argument is ignored.
+ */
+int
+__wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_UNUSED(cfg);
+
+	return (__wt_log_write(session,
+	    session->txn.logrec, NULL, session->txn.txn_logsync));
+}
+
+/*
+ * __txn_log_file_sync --
+ *	Build and write a file-sync log record for the session's btree.  The
+ *	record holds the record type, the btree's ID and the "checkpoint
+ *	start" flag bit taken from flags.
+ */
+static int
+__txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp)
+{
+	WT_BTREE *tree;
+	WT_DECL_ITEM(rec);
+	WT_DECL_RET;
+	size_t hdr_len;
+	uint32_t rectype = WT_LOGREC_FILE_SYNC;
+	int is_start;
+	const char *fmt = WT_UNCHECKED_STRING(III);
+
+	tree = S2BT(session);
+	is_start = LF_ISSET(WT_TXN_LOG_CKPT_START);
+
+	/* Size the record body, then allocate a log record to hold it. */
+	WT_RET(__wt_struct_size(
+	    session, &hdr_len, fmt, rectype, tree->id, is_start));
+	WT_RET(__wt_logrec_alloc(session, hdr_len, &rec));
+
+	/* Pack the body at the current end of the record. */
+	WT_ERR(__wt_struct_pack(session,
+	    (uint8_t *)rec->data + rec->size, hdr_len,
+	    fmt, rectype, tree->id, is_start));
+	rec->size += (uint32_t)hdr_len;
+
+	WT_ERR(__wt_log_write(session, rec, lsnp, 0));
+
+	/* The record is freed whether or not the write succeeded. */
+err:	__wt_logrec_free(session, &rec);
+	return (ret);
+}
+
+/*
+ * __wt_txn_checkpoint_logread --
+ *	Decode a checkpoint log record, returning its checkpoint LSN in
+ *	ckpt_lsn.  The snapshot stored in the record is unpacked and then
+ *	ignored.  On success *pp is advanced to the end of the record.
+ */
+int
+__wt_txn_checkpoint_logread(
+    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+    WT_LSN *ckpt_lsn)
+{
+	WT_ITEM snapshot;
+	u_int nsnapshot;
+
+	WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp),
+	    WT_UNCHECKED_STRING(IQIU),
+	    &ckpt_lsn->file, &ckpt_lsn->offset, &nsnapshot, &snapshot));
+
+	/* Only the LSN is of interest to callers. */
+	WT_UNUSED(nsnapshot);
+	WT_UNUSED(snapshot);
+
+	*pp = end;
+	return (0);
+}
+
+/*
+ * __wt_txn_checkpoint_log --
+ *	Write a log record for a checkpoint operation.
+ *
+ *	When full is zero the call is for a single file: if a full checkpoint
+ *	is in progress only the full checkpoint's LSN is returned in lsnp,
+ *	otherwise a file-sync record is written.  When full is non-zero,
+ *	flags selects the phase of the full checkpoint:
+ *	    WT_TXN_LOG_CKPT_PREPARE	remember the checkpoint LSN
+ *	    WT_TXN_LOG_CKPT_START	copy the transaction's snapshot
+ *	    WT_TXN_LOG_CKPT_STOP	write the checkpoint record, clean up
+ *	    WT_TXN_LOG_CKPT_FAIL	clean up only
+ *	Returns 0 on success, otherwise an error code.
+ */
+int
+__wt_txn_checkpoint_log(
+    WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp)
+{
+	WT_DECL_ITEM(logrec);
+	WT_DECL_RET;
+	WT_ITEM *ckpt_snapshot, empty;
+	WT_LSN *ckpt_lsn;
+	WT_TXN *txn;
+	uint8_t *end, *p;
+	size_t recsize;
+	uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
+	const char *fmt = WT_UNCHECKED_STRING(IIQIU);
+
+	txn = &session->txn;
+	ckpt_lsn = &txn->ckpt_lsn;
+
+	/*
+	 * If this is a file sync, log it unless there is a full checkpoint in
+	 * progress.
+	 */
+	if (!full) {
+		if (txn->full_ckpt) {
+			if (lsnp != NULL)
+				*lsnp = *ckpt_lsn;
+			return (0);
+		} else
+			return (__txn_log_file_sync(session, flags, lsnp));
+	}
+
+	switch (flags) {
+	case WT_TXN_LOG_CKPT_PREPARE:
+		txn->full_ckpt = 1;
+		*ckpt_lsn = S2C(session)->log->alloc_lsn;
+		break;
+
+	case WT_TXN_LOG_CKPT_START:
+		/* Take a copy of the transaction snapshot. */
+		txn->ckpt_nsnapshot = txn->snapshot_count;
+		recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
+		WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
+		p = txn->ckpt_snapshot->mem;
+		end = p + recsize;
+		for (i = 0; i < txn->snapshot_count; i++)
+			WT_ERR(__wt_vpack_uint(
+			    &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
+		break;
+
+	case WT_TXN_LOG_CKPT_STOP:
+		/*
+		 * During a clean connection close, we get here without the
+		 * prepare or start steps. In that case, log the current LSN
+		 * as the checkpoint LSN.
+		 */
+		if (!txn->full_ckpt) {
+			txn->ckpt_nsnapshot = 0;
+			*ckpt_lsn = S2C(session)->log->alloc_lsn;
+		}
+
+		/*
+		 * The 'U' format consumes a WT_ITEM pointer, so pass the
+		 * snapshot item itself, never the address of the pointer
+		 * field.  If no snapshot was taken (no start step) there is
+		 * no item at all: pack an empty one instead.
+		 */
+		WT_CLEAR(empty);
+		if (txn->full_ckpt && txn->ckpt_snapshot != NULL)
+			ckpt_snapshot = txn->ckpt_snapshot;
+		else
+			ckpt_snapshot = &empty;
+
+		/* Write the checkpoint log record. */
+		WT_ERR(__wt_struct_size(session, &recsize, fmt,
+		    rectype, ckpt_lsn->file, ckpt_lsn->offset,
+		    txn->ckpt_nsnapshot, ckpt_snapshot));
+		WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
+
+		WT_ERR(__wt_struct_pack(session,
+		    (uint8_t *)logrec->data + logrec->size, recsize, fmt,
+		    rectype, ckpt_lsn->file, ckpt_lsn->offset,
+		    txn->ckpt_nsnapshot, ckpt_snapshot));
+		logrec->size += (uint32_t)recsize;
+		WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
+
+		/*
+		 * If this full checkpoint completed successfully and there is
+		 * no hot backup in progress, tell the logging subsystem the
+		 * checkpoint LSN so that it can archive.
+		 */
+		if (!S2C(session)->hot_backup)
+			WT_ERR(__wt_log_ckpt(session, ckpt_lsn));
+
+		/* FALLTHROUGH */
+	case WT_TXN_LOG_CKPT_FAIL:
+		/* Cleanup any allocated resources */
+		INIT_LSN(ckpt_lsn);
+		txn->ckpt_nsnapshot = 0;
+		__wt_scr_free(&txn->ckpt_snapshot);
+		txn->full_ckpt = 0;
+		break;
+	}
+
+	/* logrec is only non-NULL if the stop step allocated it. */
+err:	__wt_logrec_free(session, &logrec);
+	return (ret);
+}
+
+/*
+ * __wt_txn_truncate_log --
+ * Begin truncating a range of a file.
+ *
+ * Adds a truncate operation to the session's transaction, describing the
+ * range by the start and stop cursors (either may be NULL), packs it into
+ * the transaction's log record and then sets WT_SESSION_LOGGING_INMEM on
+ * the session. __wt_txn_truncate_end clears that flag again.
+ */
+int
+__wt_txn_truncate_log(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
+{
+ WT_BTREE *btree;
+ WT_ITEM *item;
+ WT_TXN_OP *op;
+
+ btree = S2BT(session);
+
+ WT_RET(__txn_next_op(session, &op));
+
+ if (btree->type == BTREE_ROW) {
+ /*
+ * Row store: the mode records which ends of the range are bounded.
+ * It starts as "all" and is narrowed as each cursor is found.
+ */
+ op->type = TXN_OP_TRUNCATE_ROW;
+ op->u.truncate_row.mode = TXN_TRUNC_ALL;
+ WT_CLEAR(op->u.truncate_row.start);
+ WT_CLEAR(op->u.truncate_row.stop);
+ if (start != NULL) {
+ op->u.truncate_row.mode = TXN_TRUNC_START;
+ item = &op->u.truncate_row.start;
+ /*
+ * Fetch the raw key, then set the item from its own data;
+ * presumably this copies the key into memory the operation
+ * owns -- TODO confirm against __wt_buf_set.
+ */
+ WT_RET(__wt_cursor_get_raw_key(&start->iface, item));
+ WT_RET(__wt_buf_set(
+ session, item, item->data, item->size));
+ }
+ if (stop != NULL) {
+ /* Stop only, or both ends if a start key was also given. */
+ op->u.truncate_row.mode =
+ (op->u.truncate_row.mode == TXN_TRUNC_ALL) ?
+ TXN_TRUNC_STOP : TXN_TRUNC_BOTH;
+ item = &op->u.truncate_row.stop;
+ WT_RET(__wt_cursor_get_raw_key(&stop->iface, item));
+ WT_RET(__wt_buf_set(
+ session, item, item->data, item->size));
+ }
+ } else {
+ /* Column store: a record number of 0 marks an unbounded end. */
+ op->type = TXN_OP_TRUNCATE_COL;
+ op->u.truncate_col.start =
+ (start == NULL) ? 0 : start->recno;
+ op->u.truncate_col.stop =
+ (stop == NULL) ? 0 : stop->recno;
+ }
+
+ /* Write that operation into the in-memory log. */
+ WT_RET(__wt_txn_log_op(session, NULL));
+
+ /* The flag must not already be set: truncates are not nested here. */
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM));
+ F_SET(session, WT_SESSION_LOGGING_INMEM);
+ return (0);
+}
+
+/*
+ * __wt_txn_truncate_end --
+ *	Finish truncating a range of a file by clearing the session's
+ *	WT_SESSION_LOGGING_INMEM flag.  Always returns 0.
+ */
+int
+__wt_txn_truncate_end(WT_SESSION_IMPL *session)
+{
+	F_CLR(session, WT_SESSION_LOGGING_INMEM);
+
+	return (0);
+}
+
+/*
+ * __txn_printlog --
+ *	Print a log record in a human-readable (JSON-style) format.
+ *
+ *	Used as a log-scan callback: logrec is the raw record, lsnp its LSN
+ *	and cookie the FILE stream to print to.  Each record is printed as
+ *	one object whose members are separated by commas; record types not
+ *	handled below print only their LSN.  Returns 0 on success, an error
+ *	from unpacking the record, or errno if writing to the stream fails.
+ */
+static int
+__txn_printlog(
+    WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+	FILE *out;
+	WT_LSN ckpt_lsn;
+	uint64_t txnid;
+	uint32_t fileid, rectype;
+	int32_t start;
+	const uint8_t *end, *p;
+	const char *msg;
+
+	out = cookie;
+
+	p = LOG_SKIP_HEADER(logrec->data);
+	end = (const uint8_t *)logrec->data + logrec->size;
+
+	/* First, peek at the log record type. */
+	WT_RET(__wt_logrec_read(session, &p, end, &rectype));
+
+	if (fprintf(out, " { \"lsn\" : [%" PRIu32 ",%" PRId64 "],\n",
+	    lsnp->file, lsnp->offset) < 0)
+		return (errno);
+
+	/*
+	 * Within an object every member but the last is followed by a comma,
+	 * and the last one is not.
+	 */
+	switch (rectype) {
+	case WT_LOGREC_CHECKPOINT:
+		WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+		    WT_UNCHECKED_STRING(IQ), &ckpt_lsn.file, &ckpt_lsn.offset));
+		if (fprintf(out, " \"type\" : \"checkpoint\",\n") < 0 ||
+		    fprintf(
+		    out, " \"ckpt_lsn\" : [%" PRIu32 ",%" PRId64 "]\n",
+		    ckpt_lsn.file, ckpt_lsn.offset) < 0)
+			return (errno);
+		break;
+
+	case WT_LOGREC_COMMIT:
+		WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
+		/*
+		 * The transaction ID keeps its trailing comma because the
+		 * record's operations are printed after it.
+		 */
+		if (fprintf(out, " \"type\" : \"commit\",\n") < 0 ||
+		    fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid) < 0)
+			return (errno);
+		WT_RET(__txn_commit_printlog(session, &p, end, out));
+		break;
+
+	case WT_LOGREC_FILE_SYNC:
+		WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+		    WT_UNCHECKED_STRING(Ii), &fileid, &start));
+		if (fprintf(out, " \"type\" : \"file_sync\",\n") < 0 ||
+		    fprintf(out, " \"fileid\" : %" PRIu32 ",\n",
+		    fileid) < 0 ||
+		    fprintf(out, " \"start\" : %" PRId32 "\n", start) < 0)
+			return (errno);
+		break;
+
+	case WT_LOGREC_MESSAGE:
+		WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+		    WT_UNCHECKED_STRING(S), &msg));
+		/* NOTE(review): the message text is not JSON-escaped. */
+		if (fprintf(out, " \"type\" : \"message\",\n") < 0 ||
+		    fprintf(out, " \"message\" : \"%s\"\n", msg) < 0)
+			return (errno);
+		break;
+	}
+
+	if (fprintf(out, " },\n") < 0)
+		return (errno);
+
+	return (0);
+}
+
+/*
+ * __wt_txn_printlog --
+ *	Print the whole log to a stream in a human-readable format: an
+ *	opening bracket, one entry per log record starting from the first
+ *	record, and a closing bracket.
+ */
+int
+__wt_txn_printlog(WT_SESSION *wt_session, FILE *out)
+{
+	if (fprintf(out, "[\n") < 0)
+		return (errno);
+
+	/* Each record is printed by the scan callback. */
+	WT_RET(__wt_log_scan((WT_SESSION_IMPL *)wt_session,
+	    NULL, WT_LOGSCAN_FIRST, __txn_printlog, out));
+
+	return (fprintf(out, "]\n") < 0 ? errno : 0);
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
new file mode 100644
index 00000000000..38c606320ef
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -0,0 +1,491 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * State maintained during recovery. One instance is created per recovery
+ * run and handed to the log-scan callback as its cookie.
+ */
+typedef struct {
+ WT_SESSION_IMPL *session; /* Session used for all recovery work. */
+
+ /* Files from the metadata, indexed by file ID. */
+ struct WT_RECOVERY_FILE {
+ const char *uri; /* File URI, NULL if the slot is unused. */
+ WT_CURSOR *c; /* Cursor used for recovery, opened lazily. */
+ WT_LSN ckpt_lsn; /* File's checkpoint LSN. */
+ } *files;
+ size_t file_alloc; /* Allocated size of files array. */
+ u_int max_fileid; /* Maximum file ID seen. */
+ u_int nfiles; /* Slots in use in files: highest ID + 1. */
+
+ WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */
+
+ int missing; /* Were there missing files? */
+ int modified; /* Did recovery make any changes? */
+ int metadata_only; /*
+ * Set during the first recovery pass,
+ * when only the metadata is recovered.
+ */
+} WT_RECOVERY;
+
+/*
+ * __recovery_cursor --
+ * Get a cursor for a recovery operation.
+ *
+ * On return *cp is the cursor to apply the operation with, or NULL if the
+ * operation must be skipped (wrong recovery pass, unknown file, or an LSN
+ * older than the file's checkpoint). lsnp is the LSN of the log record
+ * and id the file ID the operation refers to. If duplicate is non-zero
+ * and the operation is to be applied, a second, newly opened cursor on
+ * the same file is returned instead of the cached one; it is not cached,
+ * so the caller is responsible for closing it.
+ */
+static int
+__recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
+ WT_LSN *lsnp, u_int id, int duplicate, WT_CURSOR **cp)
+{
+ WT_CURSOR *c;
+ const char *cfg[] = { WT_CONFIG_BASE(session, session_open_cursor),
+ "overwrite", NULL };
+ int metadata_op;
+
+ c = NULL;
+
+ /* Track the largest file ID we have seen. */
+ if (id > r->max_fileid)
+ r->max_fileid = id;
+
+ /*
+ * Metadata operations have an id of 0. Match operations based
+ * on the id and the current pass of recovery for metadata.
+ *
+ * Only apply operations in the correct metadata phase, and if the LSN
+ * is more recent than the last checkpoint. If there is no entry for a
+ * file, assume it was dropped or missing after a hot backup.
+ */
+ metadata_op = (id == WT_METAFILE_ID);
+ if (r->metadata_only != metadata_op)
+ ; /* Not this pass: leave the cursor NULL so the op is skipped. */
+ else if (id >= r->nfiles || r->files[id].uri == NULL) {
+ /* If a file is missing, output a verbose message once. */
+ if (!r->missing)
+ WT_RET(__wt_verbose(session, WT_VERB_RECOVERY,
+ "No file found with ID %u (max %u)",
+ id, r->nfiles));
+ r->missing = 1;
+ } else if (LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) {
+ /*
+ * We're going to apply the operation. Get the cursor, opening
+ * one if none is cached.
+ */
+ if ((c = r->files[id].c) == NULL) {
+ WT_RET(__wt_open_cursor(
+ session, r->files[id].uri, NULL, cfg, &c));
+ r->files[id].c = c;
+ }
+ }
+
+ /*
+ * A non-NULL cursor here implies the file slot is valid, so indexing
+ * r->files[id] below is safe. The new cursor replaces c locally only.
+ */
+ if (duplicate && c != NULL)
+ WT_RET(__wt_open_cursor(
+ session, r->files[id].uri, NULL, cfg, &c));
+
+ *cp = c;
+ return (0);
+}
+
+/*
+ * Helper to get a cursor if this operation is to be applied during recovery.
+ *
+ * Beyond its arguments, the macro relies on its expansion site: it reads
+ * the local variables "cursor" and "optype", jumps to an "err" label on
+ * failure (through WT_ERR), and ends in a "break" that leaves the
+ * enclosing switch when the operation is skipped. It expands to several
+ * statements and is only usable as a full statement inside a case.
+ */
+#define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \
+ WT_ERR(__recovery_cursor( \
+ (session), (r), (lsnp), (fileid), 0, (cp))); \
+ WT_ERR(__wt_verbose((session), WT_VERB_RECOVERY, \
+ "%s op %d to file %d at LSN %u/%" PRIuMAX, \
+ (cursor == NULL) ? "Skipping" : "Applying", \
+ optype, fileid, lsnp->file, (uintmax_t)lsnp->offset)); \
+ if (cursor == NULL) \
+ break
+
+/*
+ * __txn_op_apply --
+ * Apply a transactional operation during recovery.
+ *
+ * Decodes one operation at *pp (advancing *pp past it), looks up the
+ * recovery cursor for the file it refers to and, unless the operation is
+ * to be skipped, replays it through that cursor. lsnp is the LSN of the
+ * log record holding the operation. Returns 0 on success; on failure an
+ * error message is reported and the error code returned.
+ */
+static int
+__txn_op_apply(
+ WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+{
+ WT_CURSOR *cursor, *start, *stop;
+ WT_DECL_RET;
+ WT_ITEM key, start_key, stop_key, value;
+ WT_SESSION_IMPL *session;
+ uint64_t recno, start_recno, stop_recno;
+ uint32_t fileid, mode, optype, opsize;
+
+ session = r->session;
+ cursor = NULL;
+
+ /* Peek at the size and the type. */
+ WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
+ /* From here on, "end" is the end of this operation only. */
+ end = *pp + opsize;
+
+ /*
+ * In every case below GET_RECOVERY_CURSOR breaks out of the switch
+ * with cursor still NULL when the operation is to be skipped.
+ */
+ switch (optype) {
+ case WT_LOGOP_COL_PUT:
+ WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
+ &fileid, &recno, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_COL_REMOVE:
+ WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
+ &fileid, &recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_COL_TRUNCATE:
+ WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
+ &fileid, &start_recno, &stop_recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+
+ /*
+ * Set up the cursors. A record number of 0 means that end of
+ * the range is unbounded; a second cursor is only needed when
+ * both ends are bounded.
+ */
+ if (start_recno == 0) {
+ start = NULL;
+ stop = cursor;
+ } else if (stop_recno == 0) {
+ start = cursor;
+ stop = NULL;
+ } else {
+ start = cursor;
+ WT_ERR(__recovery_cursor(
+ session, r, lsnp, fileid, 1, &stop));
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ start->set_key(start, start_recno);
+ if (stop != NULL)
+ stop->set_key(stop, stop_recno);
+
+ /*
+ * Use WT_TRET so the duplicate cursor is closed even if the
+ * truncate fails; the combined result is checked afterwards.
+ */
+ WT_TRET(session->iface.truncate(&session->iface, NULL,
+ start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+
+ case WT_LOGOP_ROW_PUT:
+ WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
+ &fileid, &key, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_ROW_REMOVE:
+ WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
+ &fileid, &key));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_ROW_TRUNCATE:
+ WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
+ &fileid, &start_key, &stop_key, &mode));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ /* Set up the cursors. */
+ start = stop = NULL;
+ switch (mode) {
+ case TXN_TRUNC_ALL:
+ /* Both cursors stay NULL. */
+ break;
+ case TXN_TRUNC_BOTH:
+ start = cursor;
+ WT_ERR(__recovery_cursor(
+ session, r, lsnp, fileid, 1, &stop));
+ break;
+ case TXN_TRUNC_START:
+ start = cursor;
+ break;
+ case TXN_TRUNC_STOP:
+ stop = cursor;
+ break;
+
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ __wt_cursor_set_raw_key(start, &start_key);
+ if (stop != NULL)
+ __wt_cursor_set_raw_key(stop, &stop_key);
+
+ /* As above: always close the duplicate, then check the result. */
+ WT_TRET(session->iface.truncate(&session->iface, NULL,
+ start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Reset the cursor so it doesn't block eviction. */
+ if (cursor != NULL)
+ WT_ERR(cursor->reset(cursor));
+
+ /*
+ * NOTE(review): this is also reached when the operation was skipped
+ * (cursor == NULL), so "modified" is set for skipped operations too.
+ */
+ r->modified = 1;
+
+err: if (ret != 0)
+ __wt_err(session, ret, "Operation failed during recovery");
+ return (ret);
+}
+
+/*
+ * __txn_commit_apply --
+ *	Replay every operation of a commit log record during recovery.
+ */
+static int
+__txn_commit_apply(
+    WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+{
+	while (*pp < end) {
+		/* The logging subsystem zero-pads records: stop at padding. */
+		if (**pp == 0)
+			break;
+		WT_RET(__txn_op_apply(r, lsnp, pp, end));
+	}
+
+	return (0);
+}
+
+/*
+ * __txn_log_recover --
+ *	Log-scan callback that rolls the log forward: checkpoint records
+ *	update the recovery start LSN (during the metadata pass only) and
+ *	commit records have their operations replayed.  Other record types
+ *	are ignored.
+ */
+static int
+__txn_log_recover(
+    WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+	WT_RECOVERY *recovery;
+	const uint8_t *rec_end, *rec_p;
+	uint64_t txnid;
+	uint32_t rectype;
+
+	recovery = cookie;
+	rec_p = LOG_SKIP_HEADER(logrec->data);
+	rec_end = (const uint8_t *)logrec->data + logrec->size;
+
+	/* The record type comes first. */
+	WT_RET(__wt_logrec_read(session, &rec_p, rec_end, &rectype));
+
+	if (rectype == WT_LOGREC_CHECKPOINT) {
+		if (recovery->metadata_only)
+			WT_RET(__wt_txn_checkpoint_logread(
+			    session, &rec_p, rec_end, &recovery->ckpt_lsn));
+	} else if (rectype == WT_LOGREC_COMMIT) {
+		/* The transaction ID is read to skip it, nothing more. */
+		WT_RET(__wt_vunpack_uint(
+		    &rec_p, WT_PTRDIFF(rec_end, rec_p), &txnid));
+		WT_UNUSED(txnid);
+		WT_RET(__txn_commit_apply(recovery, lsnp, &rec_p, rec_end));
+	}
+
+	return (0);
+}
+
+/*
+ * __recovery_setup_file --
+ *	Set up the recovery slot for a file.
+ *
+ *	The file's ID and checkpoint LSN are parsed from its metadata
+ *	configuration string; the files array is grown as needed so it can
+ *	be indexed by the ID.  Returns 0 on success, EINVAL if the checkpoint
+ *	LSN cannot be parsed, or an error from configuration parsing or
+ *	memory allocation.
+ */
+static int
+__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
+{
+	WT_CONFIG_ITEM cval;
+	WT_LSN lsn;
+	intmax_t offset;
+	uint32_t fileid;
+
+	WT_RET(__wt_config_getones(r->session, config, "id", &cval));
+	fileid = (uint32_t)cval.val;
+
+	/* Grow the array so the slot for this file ID exists. */
+	if (r->nfiles <= fileid) {
+		WT_RET(__wt_realloc_def(
+		    r->session, &r->file_alloc, fileid + 1, &r->files));
+		r->nfiles = fileid + 1;
+	}
+
+	WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
+	WT_RET(
+	    __wt_config_getones(r->session, config, "checkpoint_lsn", &cval));
+
+	/* If there is no checkpoint logged for the file, apply everything. */
+	if (cval.type != WT_CONFIG_ITEM_STRUCT)
+		INIT_LSN(&lsn);
+	else {
+		/*
+		 * Scan the offset into a real intmax_t and convert it
+		 * afterwards: storing through a cast pointer to the LSN's
+		 * offset field is only correct if the two types have the
+		 * same size.
+		 */
+		if (sscanf(cval.str, "(%" PRIu32 ",%" PRIdMAX ")",
+		    &lsn.file, &offset) != 2)
+			WT_RET_MSG(r->session, EINVAL,
+			    "Failed to parse checkpoint LSN '%.*s'",
+			    (int)cval.len, cval.str);
+		lsn.offset = (off_t)offset;
+	}
+	r->files[fileid].ckpt_lsn = lsn;
+
+	/* Cast the offset so the argument matches the conversion. */
+	WT_RET(__wt_verbose(r->session, WT_VERB_RECOVERY,
+	    "Recovering %s with id %u @ (%" PRIu32 ", %" PRIdMAX ")",
+	    uri, fileid, lsn.file, (intmax_t)lsn.offset));
+
+	return (0);
+}
+
+/*
+ * __recovery_free --
+ *	Release the recovery state: every file's URI and cached cursor, then
+ *	the files array itself.  Returns the first error seen while closing
+ *	cursors, if any; cleanup continues regardless.
+ */
+static int
+__recovery_free(WT_RECOVERY *r)
+{
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	struct WT_RECOVERY_FILE *file;
+	u_int slot;
+
+	session = r->session;
+
+	for (slot = 0; slot < r->nfiles; slot++) {
+		file = &r->files[slot];
+
+		__wt_free(session, file->uri);
+		if (file->c != NULL)
+			WT_TRET(file->c->close(file->c));
+	}
+
+	__wt_free(session, r->files);
+	return (ret);
+}
+
+/*
+ * __recovery_file_scan --
+ * Scan the files referenced from the metadata and gather information
+ * about them for recovery.
+ *
+ * Uses the cursor stored in slot 0 of the files array to walk every
+ * metadata entry whose key starts with "file:", setting up a recovery
+ * slot for each. An empty metadata is not an error.
+ */
+static int
+__recovery_file_scan(WT_RECOVERY *r)
+{
+ WT_DECL_RET;
+ WT_CURSOR *c;
+ const char *uri, *config;
+ int cmp;
+
+ /* Scan through all files in the metadata. */
+ c = r->files[0].c;
+ c->set_key(c, "file:");
+ if ((ret = c->search_near(c, &cmp)) != 0) {
+ /* Is the metadata empty? */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+ /* Positioned before the prefix: step forward to the first candidate. */
+ if (cmp < 0)
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ /* Walk forward until the keys no longer have the "file:" prefix. */
+ for (; ret == 0; ret = c->next(c)) {
+ WT_ERR(c->get_key(c, &uri));
+ if (!WT_PREFIX_MATCH(uri, "file:"))
+ break;
+ WT_ERR(c->get_value(c, &config));
+ WT_ERR(__recovery_setup_file(r, uri, config));
+ }
+ /* Running off the end of the metadata is the normal way to finish. */
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /*
+ * Runs on success and on failure: make sure the maximum file ID is at
+ * least the number of slots in use.
+ */
+err: if (r->nfiles > r->max_fileid)
+ r->max_fileid = r->nfiles;
+ return (ret);
+}
+
+/*
+ * __wt_txn_recover --
+ *	Run recovery.
+ *
+ *	Opens a dedicated session with logging disabled, recovers the
+ *	metadata from the log (skipped when opening a hot backup), scans the
+ *	metadata for the live files, replays the log against those files and
+ *	finally forces a checkpoint.  Returns 0 on success, otherwise the
+ *	first error encountered; the session and all recovery state are
+ *	released in either case.
+ */
+int
+__wt_txn_recover(WT_CONNECTION_IMPL *conn)
+{
+	WT_CURSOR *metac;
+	WT_DECL_RET;
+	WT_RECOVERY r;
+	WT_SESSION_IMPL *session;
+	struct WT_RECOVERY_FILE *metafile;
+	const char *config;
+	int was_backup;
+
+	WT_CLEAR(r);
+	INIT_LSN(&r.ckpt_lsn);
+	was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;
+
+	/*
+	 * The cleanup code frees the configuration string unconditionally,
+	 * so it must hold a known value even if the metadata search fails
+	 * before storing a result.
+	 */
+	config = NULL;
+
+	/* We need a real session for recovery. */
+	WT_RET(__wt_open_session(conn, NULL, NULL, &session));
+	F_SET(session, WT_SESSION_NO_LOGGING);
+	r.session = session;
+
+	WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
+	WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
+	WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
+	metafile = &r.files[WT_METAFILE_ID];
+	metafile->c = metac;
+
+	/*
+	 * First, do a pass through the log to recover the metadata, and
+	 * establish the last checkpoint LSN.  Skip this when opening a hot
+	 * backup: we already have the correct metadata in that case.
+	 */
+	if (!was_backup) {
+		r.metadata_only = 1;
+		if (IS_INIT_LSN(&metafile->ckpt_lsn))
+			WT_ERR(__wt_log_scan(session,
+			    NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
+		else
+			WT_ERR(__wt_log_scan(session,
+			    &metafile->ckpt_lsn, 0, __txn_log_recover, &r));
+
+		WT_ASSERT(session,
+		    LOG_CMP(&r.ckpt_lsn, &conn->log->first_lsn) >= 0);
+	}
+
+	/* Scan the metadata to find the live files and their IDs. */
+	WT_ERR(__recovery_file_scan(&r));
+
+	/*
+	 * We no longer need the metadata cursor: close it to avoid pinning any
+	 * resources that could block eviction during recovery.  Clear the
+	 * cached reference first so the cleanup code can't close it again.
+	 */
+	r.files[0].c = NULL;
+	WT_ERR(metac->close(metac));
+
+	/*
+	 * Now, recover all the files apart from the metadata.
+	 * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
+	 */
+	r.metadata_only = 0;
+	WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
+	    "Main recovery loop: starting at %u/%" PRIuMAX,
+	    r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset));
+	if (IS_INIT_LSN(&r.ckpt_lsn))
+		WT_ERR(__wt_log_scan(session, NULL,
+		    WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
+		    __txn_log_recover, &r));
+	else
+		WT_ERR(__wt_log_scan(session, &r.ckpt_lsn,
+		    WT_LOGSCAN_RECOVER,
+		    __txn_log_recover, &r));
+
+	conn->next_file_id = r.max_fileid;
+
+	/*
+	 * If recovery ran successfully forcibly log a checkpoint so the next
+	 * open is fast and keep the metadata up to date with the checkpoint
+	 * LSN and archiving.
+	 */
+	WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
+
+err:	WT_TRET(__recovery_free(&r));
+	__wt_free(session, config);
+	WT_TRET(session->iface.close(&session->iface, NULL));
+
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util.h b/src/third_party/wiredtiger/src/utilities/util.h
new file mode 100644
index 00000000000..1f2f0b7211a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include <wt_internal.h>
+
+/*
+ * ULINE --
+ * A memory chunk paired with its size; util_read_line takes a pointer to one.
+ */
+typedef struct {
+ void *mem; /* Managed memory chunk */
+ size_t memsize; /* Managed memory size */
+} ULINE;
+
+extern const char *home; /* Home directory */
+extern const char *progname; /* Program name */
+extern const char *usage_prefix; /* Global arguments */
+extern int verbose; /* Verbose flag */
+
+extern WT_EVENT_HANDLER *verbose_handler;
+
+extern int __wt_opterr; /* if error message should be printed */
+extern int __wt_optind; /* index into parent argv vector */
+extern int __wt_optopt; /* character checked for validity */
+extern int __wt_optreset; /* reset getopt */
+extern char *__wt_optarg; /* argument associated with option */
+
+int util_backup(WT_SESSION *, int, char *[]);
+int util_cerr(const char *, const char *, int);
+int util_compact(WT_SESSION *, int, char *[]);
+void util_copyright(void);
+int util_create(WT_SESSION *, int, char *[]);
+int util_drop(WT_SESSION *, int, char *[]);
+int util_dump(WT_SESSION *, int, char *[]);
+int util_err(int, const char *, ...);
+int util_flush(WT_SESSION *, const char *);
+int util_list(WT_SESSION *, int, char *[]);
+int util_load(WT_SESSION *, int, char *[]);
+int util_loadtext(WT_SESSION *, int, char *[]);
+char *util_name(const char *, const char *);
+int util_printlog(WT_SESSION *, int, char *[]);
+int util_read(WT_SESSION *, int, char *[]);
+int util_read_line(ULINE *, int, int *);
+int util_rename(WT_SESSION *, int, char *[]);
+int util_salvage(WT_SESSION *, int, char *[]);
+int util_stat(WT_SESSION *, int, char *[]);
+int util_str2recno(const char *p, uint64_t *recnop);
+int util_upgrade(WT_SESSION *, int, char *[]);
+int util_verify(WT_SESSION *, int, char *[]);
+int util_write(WT_SESSION *, int, char *[]);
diff --git a/src/third_party/wiredtiger/src/utilities/util_backup.c b/src/third_party/wiredtiger/src/utilities/util_backup.c
new file mode 100644
index 00000000000..aa61cc338f0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_backup.c
@@ -0,0 +1,205 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int copy(const char *, const char *);
+static int usage(void);
+
+#define CBUF_LEN (128 * 1024) /* Copy buffer and size. */
+static char *cbuf;
+
+/*
+ * append_target --
+ *	Append a target to the comma-separated "target=(...)" configuration
+ *	string being built for the backup cursor.
+ *
+ *	The string lives in a static buffer that grows as needed; *bufp is
+ *	updated whenever the buffer is (re)allocated. Returns 0 on success, or
+ *	the value returned by util_err if memory cannot be allocated (in which
+ *	case the previously built string is left intact).
+ */
+static int
+append_target(const char *target, char **bufp)
+{
+	static size_t alloc = 0, used = 0;	/* Buffer size, string length */
+	static char *buf = NULL;
+	size_t need, off;
+	const char *prefix;
+	char *newbuf;
+
+	/*
+	 * The first target opens the list; every later target overwrites the
+	 * previous closing ")" with a separating comma.
+	 */
+	if (used == 0) {
+		prefix = "target=(";
+		off = 0;
+	} else {
+		prefix = ",";
+		off = used - 1;
+	}
+
+	/*
+	 * Exact space required: everything kept from the existing string, the
+	 * prefix, the target, two quotes, the closing ")" and the trailing nul.
+	 */
+	need = off + strlen(prefix) + strlen(target) + 4;
+	if (need > alloc) {
+		/* Grow with some slack so we don't reallocate on every call. */
+		if ((newbuf = realloc(buf, need + 512)) == NULL)
+			return (util_err(errno, NULL));
+		buf = newbuf;
+		alloc = need + 512;
+		*bufp = buf;
+	}
+
+	(void)snprintf(buf + off, alloc - off, "%s\"%s\")", prefix, target);
+	used = need - 1;
+
+	return (0);
+}
+
+/*
+ * util_backup --
+ *	The "backup" command: open a backup cursor (optionally restricted to
+ *	the targets named with -t) and copy every file it returns into the
+ *	directory named by the single remaining argument.
+ *
+ *	Returns 0 on success and non-zero on failure, including a usage error.
+ */
+int
+util_backup(WT_SESSION *session, int argc, char *argv[])
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	int ch, tret;
+	char *config;
+	const char *directory, *name;
+
+	config = NULL;
+	cursor = NULL;
+	while ((ch = __wt_getopt(progname, argc, argv, "t:")) != EOF)
+		switch (ch) {
+		case 't':
+			if (append_target(__wt_optarg, &config)) {
+				ret = 1;
+				goto err;
+			}
+			break;
+		case '?':
+		default:
+			ret = usage();
+			goto err;
+		}
+	argc -= __wt_optind;
+	argv += __wt_optind;
+
+	/* Exactly one argument, the destination directory, must remain. */
+	if (argc != 1) {
+		ret = usage();
+		goto err;
+	}
+	directory = *argv;
+
+	if ((ret = session->open_cursor(
+	    session, "backup:", NULL, config, &cursor)) != 0) {
+		fprintf(stderr, "%s: cursor open(backup:) failed: %s\n",
+		    progname, wiredtiger_strerror(ret));
+		cursor = NULL;
+		goto err;
+	}
+
+	/* Copy the files. */
+	while (
+	    (ret = cursor->next(cursor)) == 0 &&
+	    (ret = cursor->get_key(cursor, &name)) == 0)
+		if ((ret = copy(name, directory)) != 0)
+			goto err;
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+
+	if (ret != 0) {
+		fprintf(stderr, "%s: cursor next(backup:) failed: %s\n",
+		    progname, wiredtiger_strerror(ret));
+		goto err;
+	}
+
+	/* Close the backup cursor, keeping the first error we saw. */
+err:	if (cursor != NULL && (tret = cursor->close(cursor)) != 0) {
+		fprintf(stderr, "%s: cursor close(backup:) failed: %s\n",
+		    progname, wiredtiger_strerror(tret));
+		if (ret == 0)
+			ret = tret;
+	}
+	free(config);
+	free(cbuf);
+	cbuf = NULL;
+
+	return (ret);
+}
+
+/*
+ * copy --
+ *	Copy the file "name" from the home directory into "directory" and
+ *	flush it to stable storage.
+ *
+ *	Returns 0 on success; on failure prints a message to stderr and
+ *	returns non-zero.
+ */
+static int
+copy(const char *name, const char *directory)
+{
+	WT_DECL_RET;
+	ssize_t n;
+	int ifd, ofd, tret;
+
+	/*
+	 * Assume failure: the return value is only cleared once every step,
+	 * including the final flush and close, has succeeded.
+	 */
+	ret = 1;
+	ifd = ofd = -1;
+
+	if (verbose &&
+	    printf("Backing up %s/%s to %s\n", home, name, directory) < 0) {
+		fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+		return (1);
+	}
+
+	/* Allocate a large copy buffer (use it to build pathnames as well). */
+	if (cbuf == NULL && (cbuf = malloc(CBUF_LEN)) == NULL)
+		goto memerr;
+
+	/* Open the read file. */
+	if (snprintf(cbuf, CBUF_LEN, "%s/%s", home, name) >= CBUF_LEN) {
+		/* snprintf doesn't set errno on truncation, do it ourselves. */
+		errno = ENAMETOOLONG;
+		goto memerr;
+	}
+	if ((ifd = open(cbuf, O_BINARY | O_RDONLY, 0)) < 0)
+		goto readerr;
+
+	/* Open the write file. */
+	if (snprintf(cbuf, CBUF_LEN, "%s/%s", directory, name) >= CBUF_LEN) {
+		errno = ENAMETOOLONG;
+		goto memerr;
+	}
+	if ((ofd = open(
+	    cbuf, O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, 0666)) < 0)
+		goto writerr;
+
+	/* Copy the file. */
+	while ((n = read(ifd, cbuf, CBUF_LEN)) > 0)
+		if (write(ofd, cbuf, (size_t)n) != n)
+			goto writerr;
+	if (n != 0)
+		goto readerr;
+
+	/*
+	 * Close file descriptors (forcing a flush on the write side), and
+	 * check for any errors. The close results are kept in a separate
+	 * variable so a successful close can't mask a later failure.
+	 */
+	tret = close(ifd);
+	ifd = -1;
+	if (tret != 0)
+		goto readerr;
+
+	/*
+	 * We need to know this file was successfully written, it's a backup.
+	 */
+#ifdef _WIN32
+	if (FlushFileBuffers((HANDLE)_get_osfhandle(ofd)) == 0) {
+		DWORD err = GetLastError();
+		ret = err;
+		goto writerr;
+	}
+#else
+	if (fsync(ofd))
+		goto writerr;
+#endif
+	tret = close(ofd);
+	ofd = -1;
+	if (tret != 0)
+		goto writerr;
+
+	/* Success. */
+	ret = 0;
+
+	if (0) {
+readerr:	fprintf(stderr,
+		    "%s: %s/%s: %s\n", progname, home, name, strerror(errno));
+	}
+	if (0) {
+writerr:	fprintf(stderr, "%s: %s/%s: %s\n",
+		    progname, directory, name, strerror(errno));
+	}
+	if (0) {
+memerr:		fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+	}
+
+	/* Release any descriptor an error path left open. */
+	if (ifd >= 0)
+		(void)close(ifd);
+	if (ofd >= 0)
+		(void)close(ofd);
+
+	return (ret);
+}
+
+/*
+ * usage --
+ *	Print the backup command's usage line to stderr; always returns 1.
+ */
+static int
+usage(void)
+{
+	(void)fprintf(stderr,
+	    "usage: %s %s backup [-t uri] directory\n", progname, usage_prefix);
+	return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_compact.c b/src/third_party/wiredtiger/src/utilities/util_compact.c
new file mode 100644
index 00000000000..51d5461e43c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_compact.c
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+/*
+ * util_compact --
+ *	The "compact" command: compact the single object named on the command
+ *	line. Returns 0 on success and 1 on failure.
+ */
+int
+util_compact(WT_SESSION *session, int argc, char *argv[])
+{
+	WT_DECL_RET;
+	char *uri;
+
+	/* The command takes no options: anything getopt reports is an error. */
+	if (__wt_getopt(progname, argc, argv, "") != EOF)
+		return (usage());
+	argc -= __wt_optind;
+	argv += __wt_optind;
+
+	/* The remaining argument is the table name. */
+	if (argc != 1)
+		return (usage());
+	uri = util_name(*argv, "table");
+	if (uri == NULL)
+		return (1);
+
+	ret = session->compact(session, uri, NULL);
+	if (ret != 0) {
+		fprintf(stderr, "%s: compact(%s): %s\n",
+		    progname, uri, wiredtiger_strerror(ret));
+		ret = 1;
+	}
+
+	free(uri);
+	return (ret);
+}
+
+/*
+ * usage --
+ *	Print the compact command's usage line to stderr; always returns 1.
+ */
+static int
+usage(void)
+{
+	(void)fprintf(stderr,
+	    "usage: %s %s compact uri\n", progname, usage_prefix);
+	return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_cpyright.c b/src/third_party/wiredtiger/src/utilities/util_cpyright.c
new file mode 100644
index 00000000000..21d82828863
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_cpyright.c
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+/*
+ * util_copyright --
+ *	Print the copyright and license notice to standard output.
+ */
+void
+util_copyright(void)
+{
+	/* Each entry is written verbatim, including its trailing newlines. */
+	static const char * const notice[] = {
+		"Copyright (c) 2008-2014 WiredTiger, Inc.\n",
+		"All rights reserved.\n\n",
+
+		"This program is free software: you can redistribute it and/or\n"
+		"modify it under the terms of version 3 of the GNU General\n"
+		"Public License as published by the Free Software Foundation."
+		"\n\n",
+
+		"This program is distributed in the hope that it will be useful,\n"
+		"but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
+		"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
+		"GNU General Public License for more details:"
+		"\n\n",
+
+		"\t"
+		"http://www.gnu.org/licenses/gpl-3.0-standalone.html"
+		"\n\n",
+
+		"For a license to use the WiredTiger software under conditions\n"
+		"other than those described by the GNU General Public License,\n"
+		"or for technical support for this software, contact WiredTiger,\n"
+		"Inc. at info@wiredtiger.com."
+		"\n"
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(notice) / sizeof(notice[0]); ++i)
+		printf("%s", notice[i]);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_create.c b/src/third_party/wiredtiger/src/utilities/util_create.c
new file mode 100644
index 00000000000..ebff3a8ad05
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_create.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+/*
+ * util_create --
+ *	The "create" command: create the object named on the command line,
+ *	passing along any configuration string given with -c.
+ *
+ *	Returns 0 on success and non-zero on failure.
+ */
+int
+util_create(WT_SESSION *session, int argc, char *argv[])
+{
+	WT_DECL_RET;
+	int ch;
+	const char *config;
+	char *uri;
+
+	config = NULL;
+	while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF)
+		switch (ch) {
+		case 'c':			/* command-line configuration */
+			config = __wt_optarg;
+			break;
+		case '?':
+		default:
+			return (usage());
+		}
+
+	argc -= __wt_optind;
+	argv += __wt_optind;
+
+	/* The remaining argument is the uri. */
+	if (argc != 1)
+		return (usage());
+
+	if ((uri = util_name(*argv, "table")) == NULL)
+		return (1);
+
+	if ((ret = session->create(session, uri, config)) != 0)
+		ret = util_err(ret, "%s: session.create", uri);
+
+	/* The name was allocated by util_name, release it as siblings do. */
+	free(uri);
+	return (ret);
+}
+
+/*
+ * usage --
+ *	Print the create command's usage line to stderr; always returns 1.
+ */
+static int
+usage(void)
+{
+	(void)fprintf(stderr,
+	    "usage: %s %s create [-c configuration] uri\n",
+	    progname, usage_prefix);
+	return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_drop.c b/src/third_party/wiredtiger/src/utilities/util_drop.c
new file mode 100644
index 00000000000..6fe416882a3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_drop.c
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+/*
+ * util_drop --
+ *	The "drop" command: drop the single object named on the command line,
+ *	using the "force" configuration. Returns the result of the drop call
+ *	(or 1 for a usage or naming error).
+ */
+int
+util_drop(WT_SESSION *session, int argc, char *argv[])
+{
+	WT_DECL_RET;
+	char *name;
+
+	/* The command takes no options: anything getopt reports is an error. */
+	if (__wt_getopt(progname, argc, argv, "") != EOF)
+		return (usage());
+
+	argc -= __wt_optind;
+	argv += __wt_optind;
+
+	/* The remaining argument is the uri. */
+	if (argc != 1)
+		return (usage());
+	name = util_name(*argv, "table");
+	if (name == NULL)
+		return (1);
+
+	ret = session->drop(session, name, "force");
+
+	free(name);
+	return (ret);
+}
+
+/*
+ * usage --
+ *	Print the drop command's usage line to stderr; always returns 1.
+ */
+static int
+usage(void)
+{
+	(void)fprintf(stderr,
+	    "usage: %s %s drop uri\n", progname, usage_prefix);
+	return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_dump.c b/src/third_party/wiredtiger/src/utilities/util_dump.c
new file mode 100644
index 00000000000..bd0590948b4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_dump.c
@@ -0,0 +1,701 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int dump_config(WT_SESSION *, const char *, int);
+static int dump_json_begin(void);
+static int dump_json_end(void);
+static int dump_json_separator(void);
+static int dump_json_table_begin(WT_CURSOR *, const char *, const char *);
+static int dump_json_table_cg(WT_CURSOR *, const char *, const char *,
+ const char *, const char *);
+static int dump_json_table_config(WT_SESSION *, const char *);
+static int dump_json_table_end(void);
+static int dump_prefix(int);
+static int dump_record(WT_CURSOR *, const char *, int, int);
+static int dump_suffix(void);
+static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *);
+static int dump_table_config_type(WT_SESSION *,
+ WT_CURSOR *, WT_CURSOR *, const char *, const char *, const char *);
+static int dup_json_string(const char *, char **);
+static int print_config(WT_SESSION *, const char *, const char *, const char *);
+static int usage(void);
+
+/*
+ * util_dump --
+ *	The "dump" command: write the configuration and records of one object
+ *	(or, with -j, one or more objects) to standard output.
+ *
+ *	Options: -c names a checkpoint to dump, -f reopens standard output on
+ *	the given file, -j selects JSON output, -r walks the object in reverse
+ *	and -x selects hexadecimal output (-j and -x are incompatible).
+ *
+ *	Returns 0 on success and non-zero on failure.
+ */
+int
+util_dump(WT_SESSION *session, int argc, char *argv[])
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	size_t len;
+	int ch, hex, i, json, reverse;
+	char *checkpoint, *config, *name;
+	const char *dumpcfg;
+
+	hex = json = reverse = 0;
+	checkpoint = config = name = NULL;
+	cursor = NULL;
+	while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF)
+		switch (ch) {
+		case 'c':
+			checkpoint = __wt_optarg;
+			break;
+		case 'f':			/* output file */
+			if (freopen(__wt_optarg, "w", stdout) == NULL)
+				return (
+				    util_err(errno, "%s: reopen", __wt_optarg));
+			break;
+		case 'j':
+			json = 1;
+			break;
+		case 'r':
+			reverse = 1;
+			break;
+		case 'x':
+			hex = 1;
+			break;
+		case '?':
+		default:
+			return (usage());
+		}
+	argc -= __wt_optind;
+	argv += __wt_optind;
+
+	/* -j and -x are incompatible. */
+	if (hex && json) {
+		fprintf(stderr,
+		    "%s: the -j and -x dump options are incompatible\n",
+		    progname);
+		goto err;
+	}
+
+	/* The remaining argument is the uri (JSON dumps accept several). */
+	if (argc < 1 || (argc != 1 && !json))
+		return (usage());
+
+	/* The dump format is the same for every object, choose it once. */
+	dumpcfg = json ? "dump=json" : (hex ? "dump=hex" : "dump=print");
+
+	if (json && (ret = dump_json_begin()) != 0)
+		goto err;
+
+	for (i = 0; i < argc; i++) {
+		if (json && i > 0)
+			if ((ret = dump_json_separator()) != 0)
+				goto err;
+
+		/* Release the previous object's name before building a new one. */
+		free(name);
+		name = NULL;
+		if ((name = util_name(argv[i], "table")) == NULL)
+			goto err;
+
+		if (json && dump_json_table_config(session, name) != 0)
+			goto err;
+		if (!json && dump_config(session, name, hex) != 0)
+			goto err;
+
+		/*
+		 * Build the cursor configuration, releasing the string left by
+		 * the previous iteration so it isn't leaked.
+		 */
+		free(config);
+		config = NULL;
+		len = strlen(dumpcfg) + 1;
+		if (checkpoint != NULL)
+			len += strlen("checkpoint=") + strlen(checkpoint) + 1;
+		if ((config = malloc(len)) == NULL) {
+			(void)util_err(errno, NULL);
+			goto err;
+		}
+		if (checkpoint == NULL)
+			(void)snprintf(config, len, "%s", dumpcfg);
+		else
+			(void)snprintf(config, len,
+			    "checkpoint=%s,%s", checkpoint, dumpcfg);
+
+		if ((ret = session->open_cursor(
+		    session, name, NULL, config, &cursor)) != 0) {
+			fprintf(stderr, "%s: cursor open(%s) failed: %s\n",
+			    progname, name, wiredtiger_strerror(ret));
+			cursor = NULL;
+			goto err;
+		}
+
+		if ((ret = dump_record(cursor, name, reverse, json)) != 0)
+			goto err;
+		if (json && (ret = dump_json_table_end()) != 0)
+			goto err;
+
+		/* Done with this object: close its cursor before the next. */
+		ret = cursor->close(cursor);
+		cursor = NULL;
+		if (ret != 0) {
+			(void)util_cerr(name, "close", ret);
+			goto err;
+		}
+	}
+	if (json && ((ret = dump_json_end()) != 0))
+		goto err;
+
+	if (0) {
+err:		ret = 1;
+	}
+
+	/* An error path may have left a cursor open. */
+	if (cursor != NULL)
+		(void)cursor->close(cursor);
+	free(config);
+	free(name);
+
+	return (ret);
+}
+
+/*
+ * dump_config --
+ *	Look the uri up in the metadata and, if it exists, write the dump
+ *	header, the object's configuration and the header terminator.
+ *	Returns 0 on success and non-zero on failure.
+ */
+static int
+dump_config(WT_SESSION *session, const char *uri, int hex)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	int tret;
+
+	/* Open a metadata cursor. */
+	ret = session->open_cursor(
+	    session, WT_METADATA_URI, NULL, NULL, &cursor);
+	if (ret != 0) {
+		fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+		    progname, WT_METADATA_URI, wiredtiger_strerror(ret));
+		return (1);
+	}
+
+	/*
+	 * Check the object exists before writing anything, so a mistyped name
+	 * produces a simple error message rather than a partial header.
+	 */
+	cursor->set_key(cursor, uri);
+	ret = cursor->search(cursor);
+	if (ret == WT_NOTFOUND)
+		ret = util_err(0, "%s: No such object exists", uri);
+	else if (ret != 0)
+		ret = util_err(ret, "%s", uri);
+	else if (dump_prefix(hex) != 0 ||
+	    dump_table_config(session, cursor, uri) != 0 ||
+	    dump_suffix() != 0)
+		ret = 1;
+
+	/* Close the cursor, keeping the first error we saw. */
+	tret = cursor->close(cursor);
+	if (tret != 0) {
+		tret = util_cerr(uri, "close", tret);
+		if (ret == 0)
+			ret = tret;
+	}
+
+	return (ret);
+}
+
+/*
+ * dump_json_begin --
+ *	Output the opening brace of a JSON dump.
+ */
+static int
+dump_json_begin(void)
+{
+	return (printf("{\n") < 0 ? util_err(EIO, NULL) : 0);
+}
+
+/*
+ * dump_json_end --
+ *	Output the closing brace of a JSON dump.
+ */
+static int
+dump_json_end(void)
+{
+	return (printf("\n}\n") < 0 ? util_err(EIO, NULL) : 0);
+}
+
+/*
+ * dump_json_separator --
+ *	Output the comma that separates two objects in a JSON dump.
+ */
+static int
+dump_json_separator(void)
+{
+	return (printf(",\n") < 0 ? util_err(EIO, NULL) : 0);
+}
+
+/*
+ * dump_json_table_begin --
+ *	Output the JSON that opens an object's entry: its uri, its escaped
+ *	configuration string, its column groups and indices, and the start of
+ *	the data array. Returns 0 on success and non-zero on failure.
+ */
+static int
+dump_json_table_begin(WT_CURSOR *cursor, const char *uri, const char *config)
+{
+	WT_DECL_RET;
+	const char *name;
+	char *jsonconfig;
+
+	jsonconfig = NULL;
+
+	/* The table name is whatever follows the first colon in the uri. */
+	name = strchr(uri, ':');
+	if (name == NULL) {
+		fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
+		return (1);
+	}
+	++name;
+
+	ret = dup_json_string(config, &jsonconfig);
+	if (ret != 0)
+		return (util_cerr(uri, "config dup", ret));
+
+	if (printf(" \"%s\" : [\n {\n", uri) < 0 ||
+	    printf(" \"config\" : \"%s\",\n", jsonconfig) < 0)
+		goto eio;
+
+	/* Column groups first, then (only if that worked) the indices. */
+	ret = dump_json_table_cg(cursor, uri, name, "colgroup:", "colgroups");
+	if (ret == 0) {
+		if (printf(",\n") < 0)
+			goto eio;
+		ret =
+		    dump_json_table_cg(cursor, uri, name, "index:", "indices");
+	}
+
+	if (printf("\n },\n {\n \"data\" : [") < 0)
+		goto eio;
+
+	if (0) {
+eio:		ret = util_err(EIO, NULL);
+	}
+
+	free(jsonconfig);
+	return (ret);
+}
+
+/*
+ * dump_json_table_cg --
+ * Dump the column groups or indices for a table as a JSON array.
+ *
+ * cursor is the metadata cursor to scan, or NULL to emit an empty array;
+ * name is the table name the entries must match; entry is the metadata
+ * key prefix to scan for (the caller passes "colgroup:" or "index:");
+ * header is the JSON member name to print. Returns 0 on success and
+ * non-zero on failure.
+ */
+static int
+dump_json_table_cg(WT_CURSOR *cursor,
+ const char *uri, const char *name, const char *entry, const char *header)
+{
+ WT_DECL_RET;
+ const char *key, *skip, *value;
+ int exact, once;
+ char *jsonconfig;
+ static const char * const indent = " ";
+
+ /* once records whether an element has been printed yet. */
+ once = 0;
+ if (printf(" \"%s\" : [", header) < 0)
+ return (util_err(EIO, NULL));
+
+ /*
+ * Without a cursor there is nothing to scan: close the (empty) array
+ * and we're done.
+ */
+ if (cursor == NULL) {
+ if (printf("]") < 0)
+ return (util_err(EIO, NULL));
+ else
+ return (0);
+ }
+
+ /*
+ * Search the file looking for column group and index key/value pairs:
+ * for each one, print its key and its JSON-escaped configuration.
+ */
+ cursor->set_key(cursor, entry);
+ if ((ret = cursor->search_near(cursor, &exact)) != 0) {
+ if (ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "search_near", ret));
+ }
+ /*
+ * If exact is non-negative, process the record the cursor is already on
+ * by jumping into the loop body; otherwise step forward first.
+ */
+ if (exact >= 0)
+ goto match;
+ while ((ret = cursor->next(cursor)) == 0) {
+match: if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(uri, "get_key", ret));
+
+ /* Check if we've finished the list of entries. */
+ if (!WT_PREFIX_MATCH(key, entry))
+ break;
+
+ /* Check for a table name match: "<entry><name>:" is required. */
+ skip = key + strlen(entry);
+ if (strncmp(
+ skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':')
+ continue;
+
+ /* Get the value. */
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+
+ if ((ret = dup_json_string(value, &jsonconfig)) != 0)
+ return (util_cerr(uri, "config dup", ret));
+ /* Every element after the first is preceded by a comma. */
+ ret = printf("%s\n"
+ "%s{\n"
+ "%s \"uri\" : \"%s\",\n"
+ "%s \"config\" : \"%s\"\n"
+ "%s}",
+ (once == 0 ? "" : ","),
+ indent, indent, key, indent, jsonconfig, indent);
+ free(jsonconfig);
+ if (ret < 0)
+ return (util_err(EIO, NULL));
+
+ once = 1;
+ }
+ /* Close the array; a non-empty array gets a newline first. */
+ if (printf("%s]", (once == 0 ? "" : "\n ")) < 0)
+ return (util_err(EIO, NULL));
+ if (ret == 0 || ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "next", ret));
+}
+
+/*
+ * dump_json_table_config --
+ * Dump the config for the uri, as the opening of its JSON entry.
+ *
+ * For "table:" uris the configuration is read through a metadata cursor,
+ * which is also handed on so column groups and indices can be listed;
+ * for any other uri the configuration is fetched through the extension
+ * API and no cursor is passed on. Returns 0 on success and non-zero on
+ * failure.
+ */
+static int
+dump_json_table_config(WT_SESSION *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_EXTENSION_API *wtext;
+ int tret;
+ const char *value;
+
+ /* Dump the config. */
+ if (WT_PREFIX_MATCH(uri, "table:")) {
+ /* Open a metadata cursor. */
+ if ((ret = session->open_cursor(
+ session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
+ fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+ progname, WT_METADATA_URI,
+ wiredtiger_strerror(ret));
+ return (1);
+ }
+
+ /*
+ * Search for the object itself, to make sure it
+ * exists, and get its config string. This is where we
+ * find out a table object doesn't exist, use a simple
+ * error message.
+ */
+ cursor->set_key(cursor, uri);
+ if ((ret = cursor->search(cursor)) == 0) {
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ ret = util_cerr(uri, "get_value", ret);
+ else if (dump_json_table_begin(cursor, uri,
+ value) != 0)
+ ret = 1;
+ } else if (ret == WT_NOTFOUND)
+ ret = util_err(0, "%s: No such object exists", uri);
+ else
+ ret = util_err(ret, "%s", uri);
+
+ /* Close the cursor, keeping the first error we saw. */
+ if ((tret = cursor->close(cursor)) != 0) {
+ tret = util_cerr(uri, "close", tret);
+ if (ret == 0)
+ ret = tret;
+ }
+ } else {
+ /*
+ * We want to be able to dump the metadata file itself, but the
+ * configuration for that file lives in the turtle file. Reach
+ * down into the library and ask for the file's configuration,
+ * that will work in all cases.
+ *
+ * This is where we find out a file object doesn't exist, use a
+ * simple error message.
+ */
+ wtext = session->
+ connection->get_extension_api(session->connection);
+ if ((ret =
+ wtext->metadata_search(wtext, session, uri, &value)) == 0) {
+ /* No cursor: the colgroup and index arrays are left empty. */
+ if (dump_json_table_begin(NULL, uri, value) != 0)
+ ret = 1;
+ } else if (ret == WT_NOTFOUND)
+ ret = util_err(0, "%s: No such object exists", uri);
+ else
+ ret = util_err(ret, "%s", uri);
+ }
+
+ return (ret);
+}
+
+/*
+ * dump_json_table_end --
+ *	Output the JSON that closes an object's data array and its entry.
+ */
+static int
+dump_json_table_end(void)
+{
+	return (printf(" ]\n }\n ]") < 0 ? util_err(EIO, NULL) : 0);
+}
+
+/*
+ * dump_table_config --
+ *	Print the metadata entry for the uri itself, followed by the entries
+ *	for its column groups and then its indices. Returns 0 on success and
+ *	non-zero on failure.
+ */
+static int
+dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
+{
+	WT_CURSOR *srch;
+	WT_DECL_RET;
+	int tret;
+	const char *key, *name, *value;
+
+	/* The table name is whatever follows the first colon in the uri. */
+	name = strchr(uri, ':');
+	if (name == NULL) {
+		fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
+		return (1);
+	}
+	++name;
+
+	/* Start with the uri's own entry, which has to be looked up. */
+	cursor->set_key(cursor, uri);
+	ret = cursor->search(cursor);
+	if (ret != 0)
+		return (util_cerr(uri, "search", ret));
+	ret = cursor->get_key(cursor, &key);
+	if (ret != 0)
+		return (util_cerr(uri, "get_key", ret));
+	ret = cursor->get_value(cursor, &value);
+	if (ret != 0)
+		return (util_cerr(uri, "get_value", ret));
+	if (print_config(session, key, value, NULL) != 0)
+		return (1);
+
+	/*
+	 * The per-type helper needs a second cursor for its lookups; opening
+	 * it here, once, keeps the helper's error handling simple.
+	 */
+	ret = session->open_cursor(session, NULL, cursor, NULL, &srch);
+	if (ret != 0)
+		return (util_cerr(uri, "open_cursor", ret));
+
+	/* Column groups first, then (only if that worked) the indices. */
+	ret = dump_table_config_type(
+	    session, cursor, srch, uri, name, "colgroup:");
+	if (ret == 0)
+		ret = dump_table_config_type(
+		    session, cursor, srch, uri, name, "index:");
+
+	/* Close the second cursor, keeping the first error we saw. */
+	tret = srch->close(srch);
+	if (tret != 0) {
+		tret = util_cerr(uri, "close", tret);
+		if (ret == 0)
+			ret = tret;
+	}
+
+	return (ret);
+}
+
+/*
+ * dump_table_config_type --
+ * Dump the column groups or indices for a table.
+ *
+ * cursor scans the entries whose keys start with the entry prefix; srch
+ * is a second cursor used to look up each entry's "source"; name is the
+ * table name the entries must match. Returns 0 on success and non-zero
+ * on failure.
+ */
+static int
+dump_table_config_type(WT_SESSION *session,
+ WT_CURSOR *cursor, WT_CURSOR *srch,
+ const char *uri, const char *name, const char *entry)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ const char *key, *skip, *value, *value_source;
+ int exact;
+ char *p;
+
+ /*
+ * Search the file looking for column group and index key/value pairs:
+ * for each one, look up the related source information and append it
+ * to the base record.
+ */
+ cursor->set_key(cursor, entry);
+ if ((ret = cursor->search_near(cursor, &exact)) != 0) {
+ if (ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "search_near", ret));
+ }
+ /*
+ * If exact is non-negative, process the record the cursor is already on
+ * by jumping into the loop body; otherwise step forward first.
+ */
+ if (exact >= 0)
+ goto match;
+ while ((ret = cursor->next(cursor)) == 0) {
+match: if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(uri, "get_key", ret));
+
+ /* Check if we've finished the list of entries. */
+ if (!WT_PREFIX_MATCH(key, entry))
+ return (0);
+
+ /* Check for a table name match: "<entry><name>:" is required. */
+ skip = key + strlen(entry);
+ if (strncmp(
+ skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':')
+ continue;
+
+ /* Get the value. */
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+
+ /* Crack it and get the underlying source. */
+ if ((ret = __wt_config_getones(
+ (WT_SESSION_IMPL *)session, value, "source", &cval)) != 0)
+ return (util_err(ret, "%s: source entry", key));
+
+ /*
+ * Nul-terminate the source entry: the configuration item is a
+ * pointer and length, so copy it into a temporary buffer to use
+ * it as a search key.
+ */
+ if ((p = malloc(cval.len + 10)) == NULL)
+ return (util_err(errno, NULL));
+ (void)strncpy(p, cval.str, cval.len);
+ p[cval.len] = '\0';
+ srch->set_key(srch, p);
+ if ((ret = srch->search(srch)) != 0)
+ ret = util_err(ret, "%s: %s", key, p);
+ /* The buffer is only needed for the search and its error message. */
+ free(p);
+ if (ret != 0)
+ return (1);
+
+ /* Get the source's value. */
+ if ((ret = srch->get_value(srch, &value_source)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+
+ /*
+ * The dumped configuration string is the original key plus the
+ * source's configuration.
+ */
+ if (print_config(session, key, value, value_source) != 0)
+ return (util_err(EIO, NULL));
+ }
+ if (ret == 0 || ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "next", ret));
+}
+
+/*
+ * dump_prefix --
+ *	Output the opening lines of the dump header: the WiredTiger version,
+ *	the dump format ("hex" or "print") and the "Header" marker.
+ */
+static int
+dump_prefix(int hex)
+{
+	int vmajor, vminor, vpatch;
+	const char *format;
+
+	(void)wiredtiger_version(&vmajor, &vminor, &vpatch);
+	format = hex ? "hex" : "print";
+
+	/* Stop at the first line that can't be written. */
+	if (printf("WiredTiger Dump (WiredTiger Version %d.%d.%d)\n",
+	    vmajor, vminor, vpatch) < 0)
+		return (util_err(EIO, NULL));
+	if (printf("Format=%s\n", format) < 0)
+		return (util_err(EIO, NULL));
+	if (printf("Header\n") < 0)
+		return (util_err(EIO, NULL));
+	return (0);
+}
+
+/*
+ * dump_record --
+ *	Walk the cursor to its end (forward, or backward when reverse is set),
+ *	printing each key/value pair, with JSON punctuation when json is set.
+ *	Returns 0 on success and non-zero on failure.
+ */
+static int
+dump_record(WT_CURSOR *cursor, const char *name, int reverse, int json)
+{
+	WT_DECL_RET;
+	const char *infix, *key, *prefix, *suffix, *value;
+	int once;
+
+	/* Text printed before, between and after each key/value pair. */
+	prefix = json ? "\n{\n" : "";
+	infix = json ? ",\n" : "\n";
+	suffix = json ? "\n}" : "\n";
+
+	once = 0;
+	for (;;) {
+		ret = reverse ? cursor->prev(cursor) : cursor->next(cursor);
+		if (ret != 0)
+			break;
+
+		ret = cursor->get_key(cursor, &key);
+		if (ret != 0)
+			return (util_cerr(name, "get_key", ret));
+		ret = cursor->get_value(cursor, &value);
+		if (ret != 0)
+			return (util_cerr(name, "get_value", ret));
+
+		/* JSON records after the first are preceded by a comma. */
+		if (printf("%s%s%s%s%s%s", (json && once) ? "," : "",
+		    prefix, key, infix, value, suffix) < 0)
+			return (util_err(EIO, NULL));
+		once = 1;
+	}
+	if (json && once && printf("\n") < 0)
+		return (util_err(EIO, NULL));
+
+	/* Running off the end of the object is the normal way to finish. */
+	if (ret == WT_NOTFOUND)
+		return (0);
+	return (util_cerr(name, (reverse ? "prev" : "next"), ret));
+}
+
+/*
+ * dump_suffix --
+ *	Output the "Data" line that ends the dump header.
+ */
+static int
+dump_suffix(void)
+{
+	return (printf("Data\n") < 0 ? util_err(EIO, NULL) : 0);
+}
+
+/*
+ * dup_json_string --
+ *	Like strdup, but escape any characters that are special for JSON.
+ *	The result will be embedded in a JSON string. On success the new
+ *	string is stored in *result and 0 is returned; the caller frees it.
+ *	Returns 1 if memory cannot be allocated.
+ */
+static int
+dup_json_string(const char *str, char **result)
+{
+	size_t left, nchars;
+	const char *p;
+	char *q;
+
+	/*
+	 * Sizing pass: add up what the unpack call reports for each character
+	 * (presumably the bytes its escaped form needs -- confirm against
+	 * __wt_json_unpack_char), plus one more per character.
+	 */
+	nchars = 0;
+	p = str;
+	while (*p != '\0') {
+		nchars += __wt_json_unpack_char(*p, NULL, 0, 0);
+		++nchars;
+		++p;
+	}
+
+	q = malloc(nchars + 1);
+	if (q == NULL)
+		return (1);
+	*result = q;
+
+	/* Conversion pass: write each character, advancing by what it used. */
+	left = nchars;
+	for (p = str; *p != '\0'; ++p) {
+		nchars = __wt_json_unpack_char(*p, (u_char *)q, left, 0);
+		left -= nchars;
+		q += nchars;
+	}
+	*q = '\0';
+	return (0);
+}
+
+/*
+ * print_config --
+ *	Output a key/value URI pair, where the value is built by combining
+ *	v1 and v2. Returns 0 on success and non-zero on failure.
+ */
+static int
+print_config(WT_SESSION *session,
+    const char *key, const char *v1, const char *v2)
+{
+	WT_DECL_RET;
+	const char *value_ret;
+
+	/*
+	 * The underlying call ignores v2 when v1 is NULL, so move v2 into the
+	 * first slot in that case.
+	 */
+	if (v1 == NULL) {
+		v1 = v2;
+		v2 = NULL;
+	}
+
+	ret = __wt_session_create_strip(session, v1, v2, &value_ret);
+	if (ret != 0)
+		return (util_err(ret, NULL));
+
+	/* Release the combined string before looking at printf's result. */
+	ret = printf("%s\n%s\n", key, value_ret);
+	free((char *)value_ret);
+	return (ret < 0 ? util_err(EIO, NULL) : 0);
+}
+
+/*
+ * usage --
+ *	Print the dump command's usage line to stderr; always returns 1.
+ */
+static int
+usage(void)
+{
+	(void)fprintf(stderr,
+	    "usage: %s %s dump [-jrx] [-c checkpoint] [-f output-file] uri\n",
+	    progname, usage_prefix);
+	return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c
new file mode 100644
index 00000000000..4a1489628d1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_list.c
@@ -0,0 +1,193 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int list_print(WT_SESSION *, const char *, int, int);
+static int list_print_checkpoint(WT_SESSION *, const char *);
+static int usage(void);
+
+/*
+ * util_list --
+ *	The "list" command: list the objects in the database, optionally
+ *	restricted to names starting with the given uri. -c also prints
+ *	checkpoint information and -v also prints each object's metadata value.
+ */
+int
+util_list(WT_SESSION *session, int argc, char *argv[])
+{
+	WT_DECL_RET;
+	int cflag, ch, vflag;
+	char *name;
+
+	cflag = vflag = 0;
+	name = NULL;
+	while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF) {
+		if (ch == 'c')
+			cflag = 1;
+		else if (ch == 'v')
+			vflag = 1;
+		else
+			return (usage());
+	}
+	argc -= __wt_optind;
+	argv += __wt_optind;
+
+	/* At most one argument, the name to match, may remain. */
+	if (argc == 1) {
+		name = util_name(*argv, "table");
+		if (name == NULL)
+			return (1);
+	} else if (argc != 0)
+		return (usage());
+
+	ret = list_print(session, name, cflag, vflag);
+
+	free(name);
+	return (ret);
+}
+
+/*
+ * list_print --
+ *	List the high-level objects in the database.
+ *
+ *	If name is not NULL, only objects whose metadata key starts with it
+ *	are shown, and it is an error if none match. cflag adds each object's
+ *	checkpoint list and vflag adds its metadata value. Returns 0 on
+ *	success and non-zero on failure.
+ */
+static int
+list_print(WT_SESSION *session, const char *name, int cflag, int vflag)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	int found, tret;
+	const char *key, *value;
+
+	/* Open the metadata file. */
+	if ((ret = session->open_cursor(
+	    session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
+		/*
+		 * If there is no metadata (yet), this will return ENOENT.
+		 * Treat that the same as an empty metadata.
+		 */
+		if (ret == ENOENT)
+			return (0);
+
+		fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+		    progname, WT_METADATA_URI, wiredtiger_strerror(ret));
+		return (1);
+	}
+
+	/* Without a name to match, there's nothing that must be found. */
+	found = name == NULL;
+	while ((ret = cursor->next(cursor)) == 0) {
+		/* Get the key. */
+		if ((ret = cursor->get_key(cursor, &key)) != 0) {
+			ret = util_cerr("metadata", "get_key", ret);
+			goto done;
+		}
+
+		/*
+		 * If a name is specified, only show objects that match.
+		 */
+		if (name != NULL) {
+			if (!WT_PREFIX_MATCH(key, name))
+				continue;
+			found = 1;
+		}
+
+		/*
+		 * XXX
+		 * We don't normally say anything about the WiredTiger
+		 * metadata, it's not a normal "object" in the database.  I'm
+		 * making an exception for the checkpoint and verbose options.
+		 */
+		if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag)
+			printf("%s\n", key);
+
+		if (!cflag && !vflag)
+			continue;
+
+		if (cflag && (ret = list_print_checkpoint(session, key)) != 0)
+			goto done;
+		if (vflag) {
+			if ((ret = cursor->get_value(cursor, &value)) != 0) {
+				ret = util_cerr("metadata", "get_value", ret);
+				goto done;
+			}
+			printf("%s\n", value);
+		}
+	}
+	if (ret != WT_NOTFOUND) {
+		ret = util_cerr("metadata", "next", ret);
+		goto done;
+	}
+	ret = 0;
+
+	if (!found) {
+		fprintf(stderr, "%s: %s: not found\n", progname, name);
+		ret = 1;
+	}
+
+	/* Every path closes the cursor, keeping the first error we saw. */
+done:	if ((tret = cursor->close(cursor)) != 0) {
+		tret = util_cerr("metadata", "close", tret);
+		if (ret == 0)
+			ret = tret;
+	}
+
+	return (ret);
+}
+
+/*
+ * list_print_checkpoint --
+ * List the checkpoint information: one line per checkpoint of the object
+ * named by key, giving its name, time and size. Returns 0 on success
+ * (including when the object has no checkpoints) and non-zero on failure.
+ */
+static int
+list_print_checkpoint(WT_SESSION *session, const char *key)
+{
+ WT_DECL_RET;
+ WT_CKPT *ckpt, *ckptbase;
+ size_t len;
+ time_t t;
+ uint64_t v;
+
+ /*
+ * We may not find any checkpoints for this file, in which case we don't
+ * report an error, and continue our caller's loop. Otherwise, read the
+ * list of checkpoints and print each checkpoint's name and time.
+ */
+ if ((ret = __wt_metadata_get_ckptlist(session, key, &ckptbase)) != 0)
+ return (ret == WT_NOTFOUND ? 0 : ret);
+
+ /* Find the longest name, so we can pretty-print. */
+ len = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (strlen(ckpt->name) > len)
+ len = strlen(ckpt->name);
+ /* One extra column, so the longest name is still indented. */
+ ++len;
+
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ /*
+ * Call ctime, not ctime_r; ctime_r has portability problems,
+ * the Solaris version is different from the POSIX standard.
+ */
+ t = (time_t)ckpt->sec;
+ /* "%.24s" prints ctime's text without its trailing newline. */
+ printf("\t%*s: %.24s", (int)len, ckpt->name, ctime(&t));
+
+ /*
+ * Print the size in the largest unit that fits; the division
+ * truncates to a whole number of units.
+ */
+ v = ckpt->ckpt_size;
+ if (v >= WT_PETABYTE)
+ printf(" (%" PRIu64 " PB)\n", v / WT_PETABYTE);
+ else if (v >= WT_TERABYTE)
+ printf(" (%" PRIu64 " TB)\n", v / WT_TERABYTE);
+ else if (v >= WT_GIGABYTE)
+ printf(" (%" PRIu64 " GB)\n", v / WT_GIGABYTE);
+ else if (v >= WT_MEGABYTE)
+ printf(" (%" PRIu64 " MB)\n", v / WT_MEGABYTE);
+ else if (v >= WT_KILOBYTE)
+ printf(" (%" PRIu64 " KB)\n", v / WT_KILOBYTE);
+ else
+ printf(" (%" PRIu64 " B)\n", v);
+ }
+
+ __wt_metadata_free_ckptlist(session, ckptbase);
+ return (0);
+}
+
+/*
+ * usage --
+ *	Print the list command's usage line to stderr; always returns 1.
+ */
+static int
+usage(void)
+{
+	(void)fprintf(stderr,
+	    "usage: %s %s list [-cv] [uri]\n", progname, usage_prefix);
+	return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c
new file mode 100644
index 00000000000..7d9dfa445dc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load.c
@@ -0,0 +1,595 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+#include "util_load.h"
+
+/* Forward declarations of the helpers private to this file. */
+static int config_read(char ***, int *);
+static int config_rename(char **, const char *);
+static void config_remove(char *, const char *);
+static int format(void);
+static int insert(WT_CURSOR *, const char *);
+static int load_dump(WT_SESSION *);
+static int usage(void);
+
+/*
+ * Command-line state: assigned by util_load while it parses its arguments,
+ * then read by the other functions in this file.
+ */
+static int append; /* -a append (ignore record number keys) */
+static char *cmdname; /* -r rename */
+static char **cmdconfig; /* configuration pairs */
+static int json; /* -j input is JSON format */
+static int no_overwrite; /* -n don't overwrite existing data */
+
+/*
+ * util_load --
+ *	The load command: parse the command-line options, remember any
+ *	trailing uri/configuration pairs, then load either JSON input or
+ *	WiredTiger dump-format input.
+ */
+int
+util_load(WT_SESSION *session, int argc, char *argv[])
+{
+	uint32_t flags;
+	int opt;
+	const char *filename;
+
+	/* The name is only used for error reporting by the JSON loader. */
+	filename = "<stdin>";
+
+	while ((opt = __wt_getopt(progname, argc, argv, "af:jnr:")) != EOF) {
+		switch (opt) {
+		case 'a':	/* append (ignore record number keys) */
+			append = 1;
+			break;
+		case 'f':	/* input file: replaces standard input */
+			if (freopen(__wt_optarg, "r", stdin) == NULL)
+				return (util_err(
+				    errno, "%s: reopen", __wt_optarg));
+			filename = __wt_optarg;
+			break;
+		case 'j':	/* input is JSON */
+			json = 1;
+			break;
+		case 'n':	/* don't overwrite existing data */
+			no_overwrite = 1;
+			break;
+		case 'r':	/* rename */
+			cmdname = __wt_optarg;
+			break;
+		case '?':
+		default:
+			return (usage());
+		}
+	}
+	argv += __wt_optind;
+	argc -= __wt_optind;
+
+	/* -a and -n are mutually exclusive. */
+	if (append && no_overwrite)
+		return (util_err(EINVAL,
+		    "the -a (append) and -n (no-overwrite) flags are mutually "
+		    "exclusive"));
+
+	/*
+	 * Whatever remains is a list of uri/configuration-string pairs, so
+	 * there must be an even number of arguments.
+	 */
+	if (argc != 0) {
+		if (argc % 2 != 0)
+			return (usage());
+		cmdconfig = argv;
+	}
+
+	if (!json)
+		return (load_dump(session));
+
+	/* The JSON loader takes the options as a flag word. */
+	flags = 0;
+	if (append)
+		flags |= LOAD_JSON_APPEND;
+	if (no_overwrite)
+		flags |= LOAD_JSON_NO_OVERWRITE;
+	return (util_load_json(session, filename, flags));
+}
+
+/*
+ * load_dump --
+ * Load from the WiredTiger dump format.
+ *
+ * Reads the dump header into a list of uri/config strings, reorders and
+ * updates that list, creates the objects it names, then inserts the data
+ * through a dump cursor opened on the first uri in the list.
+ * Returns 0 on success and 1 on any failure after the header is read; a
+ * failure reading the header returns config_read's error unchanged.
+ */
+static int
+load_dump(WT_SESSION *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int hex, tret;
+ char **list, **tlist, *uri, config[64];
+
+ cursor = NULL;
+ list = NULL; /* -Wuninitialized */
+ hex = 0; /* -Wuninitialized */
+ uri = NULL;
+
+ /* Read the dump header lines; nothing to clean up if this fails. */
+ if ((ret = config_read(&list, &hex)) != 0)
+ return (ret);
+
+ /* Reorder and check the list. */
+ if ((ret = config_reorder(list)) != 0)
+ goto err;
+
+ /* Update the config based on any command-line configuration. */
+ if ((ret = config_update(session, list)) != 0)
+ goto err;
+
+ /* After reordering and renaming, the first entry is the object to load. */
+ uri = list[0];
+ /* Create the items in the list. */
+ if ((ret = config_exec(session, list)) != 0)
+ goto err;
+
+ /* Open the insert cursor. */
+ (void)snprintf(config, sizeof(config),
+ "dump=%s%s%s",
+ hex ? "hex" : "print",
+ append ? ",append" : "", no_overwrite ? ",overwrite=false" : "");
+ if ((ret = session->open_cursor(
+ session, uri, NULL, config, &cursor)) != 0) {
+ ret = util_err(ret, "%s: session.open", uri);
+ goto err;
+ }
+
+ /*
+ * Check the append flag (it only applies to objects where the primary
+ * key is a record number).
+ */
+ if (append && strcmp(cursor->key_format, "r") != 0) {
+ fprintf(stderr,
+ "%s: %s: -a option illegal unless the primary key is a "
+ "record number\n",
+ progname, uri);
+ ret = 1;
+ } else
+ ret = insert(cursor, uri);
+
+err: /*
+ * Technically, we don't have to close the cursor because the session
+ * handle will do it for us, but I'd like to see the flush to disk and
+ * the close succeed, it's better to fail early when loading files.
+ */
+ if (cursor != NULL && (tret = cursor->close(cursor)) != 0) {
+ tret = util_err(tret, "%s: cursor.close", uri);
+ /* Don't let a close failure hide an earlier error. */
+ if (ret == 0)
+ ret = tret;
+ }
+ /* Only flush when everything so far, including the close, succeeded. */
+ if (ret == 0)
+ ret = util_flush(session, uri);
+
+ /* The list is non-NULL here: config_read succeeded to get this far. */
+ for (tlist = list; *tlist != NULL; ++tlist)
+ free(*tlist);
+ free(list);
+
+ /* Collapse any error code into the utility's 0/1 exit status. */
+ return (ret == 0 ? 0 : 1);
+}
+
+/*
+ * config_exec --
+ *	Create the tables/indices/colgroups implied by the list, a
+ *	NULL-terminated array of alternating (uri, config) strings. Stops at
+ *	the first object that can't be created and reports the error.
+ */
+int
+config_exec(WT_SESSION *session, char **list)
+{
+	WT_DECL_RET;
+	char **pair;
+
+	pair = list;
+	while (*pair != NULL) {
+		ret = session->create(session, pair[0], pair[1]);
+		if (ret != 0)
+			return (util_err(ret, "%s: session.create", pair[0]));
+		pair += 2;
+	}
+	return (0);
+}
+
+/*
+ * config_list_add --
+ *	Append a value to the config list, growing the array as needed and
+ *	keeping it NULL-terminated. The list stores the pointer itself, it
+ *	doesn't copy the string.
+ *
+ *	Returns 0 on success. If the array can't be grown, the array is
+ *	released, the list is reset to empty and an error is returned; val is
+ *	not stored in that case.
+ */
+int
+config_list_add(CONFIG_LIST *clp, char *val)
+{
+	char **grown;
+	int grown_max, saved_errno;
+
+	/* Always leave a spare slot for the NULL terminator. */
+	if (clp->entry + 1 >= clp->max_entry) {
+		grown_max = clp->max_entry + 100;
+		grown = realloc(
+		    clp->list, (size_t)grown_max * sizeof(char *));
+		if (grown == NULL) {
+			/*
+			 * A failed realloc leaves the old array allocated, so
+			 * free it here rather than dropping the only pointer
+			 * to it. The strings it referenced are not freed:
+			 * callers may still hold, and free, some of them.
+			 */
+			saved_errno = errno;
+			free(clp->list);
+			clp->list = NULL;
+			clp->entry = clp->max_entry = 0;
+			return (util_err(saved_errno, NULL));
+		}
+		clp->list = grown;
+		clp->max_entry = grown_max;
+	}
+
+	clp->list[clp->entry++] = val;
+	clp->list[clp->entry] = NULL;
+	return (0);
+}
+
+/*
+ * config_list_free --
+ *	Free the list and any of its entries, leaving the list empty and
+ *	ready for reuse.
+ */
+void
+config_list_free(CONFIG_LIST *clp)
+{
+	char **entry;
+
+	if (clp->list != NULL)
+		for (entry = &clp->list[0]; *entry != NULL; entry++)
+			free(*entry);
+	free(clp->list);
+	clp->list = NULL;
+
+	/*
+	 * Reset the counters along with the array: config_list_add only
+	 * allocates when the counters say the array is full, so stale values
+	 * would make the next add store through the NULL list pointer (for
+	 * example, when a JSON dump contains more than one table).
+	 */
+	clp->entry = 0;
+	clp->max_entry = 0;
+}
+
+/*
+ * config_read --
+ *	Read the config lines and do some basic validation.
+ *
+ *	On success, returns 0, stores a NULL-terminated array of the header
+ *	lines (alternating uri and config strings) in *listp, and sets *hexp
+ *	to 1 for a "hex" format dump or 0 for a "print" format dump. The
+ *	caller owns the array and its strings. On failure returns non-zero
+ *	and *listp is not set.
+ */
+static int
+config_read(char ***listp, int *hexp)
+{
+	ULINE l;
+	WT_DECL_RET;
+	int entry, eof, max_entry;
+	const char *s;
+	char **list, **tlist;
+
+	list = NULL;
+	memset(&l, 0, sizeof(l));
+
+	/* Header line #1: "WiredTiger Dump" and a WiredTiger version. */
+	if (util_read_line(&l, 0, &eof)) {
+		ret = 1;
+		goto err;
+	}
+	s = "WiredTiger Dump ";
+	if (strncmp(l.mem, s, strlen(s)) != 0) {
+		ret = format();
+		goto err;
+	}
+
+	/* Header line #2: "Format={hex,print}". */
+	if (util_read_line(&l, 0, &eof)) {
+		ret = 1;
+		goto err;
+	}
+	if (strcmp(l.mem, "Format=print") == 0)
+		*hexp = 0;
+	else if (strcmp(l.mem, "Format=hex") == 0)
+		*hexp = 1;
+	else {
+		ret = format();
+		goto err;
+	}
+
+	/* Header line #3: "Header". */
+	if (util_read_line(&l, 0, &eof)) {
+		ret = 1;
+		goto err;
+	}
+	if (strcmp(l.mem, "Header") != 0) {
+		ret = format();
+		goto err;
+	}
+
+	/* Now, read in lines until we get to the end of the headers. */
+	for (entry = max_entry = 0;; ++entry) {
+		if ((ret = util_read_line(&l, 0, &eof)) != 0)
+			goto err;
+		if (strcmp(l.mem, "Data") == 0)
+			break;
+
+		/*
+		 * Grow the array of header lines as necessary -- we need an
+		 * extra slot for NULL termination.
+		 */
+		if (entry + 1 >= max_entry) {
+			if ((tlist = realloc(list, (size_t)
+			    (max_entry += 100) * sizeof(char *))) == NULL) {
+				ret = util_err(errno, NULL);
+
+				/*
+				 * A failed realloc leaves the old array (and
+				 * the strings it holds) allocated and still
+				 * NULL-terminated, so keep the pointer and let
+				 * the error path free everything.
+				 */
+				goto err;
+			}
+			list = tlist;
+		}
+		if ((list[entry] = strdup(l.mem)) == NULL) {
+			ret = util_err(errno, NULL);
+			goto err;
+		}
+		list[entry + 1] = NULL;
+	}
+
+	/* Headers are required, and they're supposed to be in pairs. */
+	if (list == NULL || entry % 2 != 0) {
+		ret = format();
+		goto err;
+	}
+	*listp = list;
+
+	/* The line buffer is only scratch space, every line kept was copied. */
+	free(l.mem);
+	return (0);
+
+err:	if (list != NULL) {
+		for (tlist = list; *tlist != NULL; ++tlist)
+			free(*tlist);
+		free(list);
+	}
+	free(l.mem);
+	return (ret);
+}
+
+/*
+ * config_reorder --
+ *	For table dumps, reorder the list so tables are first.
+ *	For other dumps, make any needed checks.
+ *
+ *	The list is a NULL-terminated array of alternating (uri, config)
+ *	strings and is modified in place. Returns 0 on success, or format()'s
+ *	result if a single-object dump is malformed.
+ */
+int
+config_reorder(char **list)
+{
+	char **entry, *p;
+
+	/*
+	 * Search for a table name -- if we find one, then it's table dump,
+	 * otherwise, it's a single file dump.
+	 */
+	for (entry = list; *entry != NULL; ++entry)
+		if (WT_PREFIX_MATCH(*entry, "table:"))
+			break;
+	if (*entry == NULL) {
+		/*
+		 * Single file dumps can only have two lines, the file name and
+		 * the configuration information, and the name must be a file
+		 * or an LSM object: reject a name that has neither prefix.
+		 * (Testing that the name has both prefixes at once can never
+		 * succeed, so the two tests must each be negated.)
+		 */
+		if (list[0] == NULL || list[1] == NULL || list[2] != NULL ||
+		    (!WT_PREFIX_MATCH(list[0], "file:") &&
+		    !WT_PREFIX_MATCH(list[0], "lsm:")))
+			return (format());
+
+		entry = list;
+	}
+
+	/*
+	 * Make sure the table key/value pair comes first, then we can just
+	 * run through the array in order. (We already checked that we had
+	 * a multiple of 2 entries, so this is safe.)
+	 */
+	if (entry != list) {
+		p = list[0]; list[0] = entry[0]; entry[0] = p;
+		p = list[1]; list[1] = entry[1]; entry[1] = p;
+	}
+	return (0);
+}
+
+/*
+ * config_update --
+ * Reconcile and update the command line configuration against the
+ * config we found.
+ *
+ * The list is a NULL-terminated array of alternating (uri, config) strings
+ * read from the dump; its entries may be replaced in place. The steps are:
+ * apply any -r rename, strip settings that must not be carried over, reject
+ * command-line attempts to change key/value formats, then merge each
+ * command-line configuration into the matching dump entry.
+ * Returns 0 on success, non-zero on error.
+ */
+int
+config_update(WT_SESSION *session, char **list)
+{
+ int found;
+ const char *cfg[] = { NULL, NULL, NULL };
+ char **configp, **listp;
+ const char **rm;
+ /* Configuration keys removed from every dumped config string. */
+ static const char *rmnames[] = {
+ "filename", "id", "checkpoint", "checkpoint_lsn",
+ "version", "source", NULL };
+
+ /*
+ * If the object has been renamed, replace all of the column group,
+ * index, file and table names with the new name.
+ */
+ if (cmdname != NULL) {
+ for (listp = list; *listp != NULL; listp += 2)
+ if (WT_PREFIX_MATCH(*listp, "colgroup:") ||
+ WT_PREFIX_MATCH(*listp, "file:") ||
+ WT_PREFIX_MATCH(*listp, "index:") ||
+ WT_PREFIX_MATCH(*listp, "table:"))
+ if (config_rename(listp, cmdname))
+ return (1);
+
+ /*
+ * If the object was renamed, and there are configuration pairs,
+ * rename the configuration pairs as well, because we don't know
+ * if the user used the old or new names for the pair's URI.
+ */
+ for (configp = cmdconfig;
+ cmdconfig != NULL && *configp != NULL; configp += 2)
+ if (config_rename(configp, cmdname))
+ return (1);
+ }
+
+ /*
+ * Remove all "filename=", "source=" and other configurations
+ * that foil loading from the values. New filenames are chosen
+ * as part of table load.
+ */
+ for (listp = list; *listp != NULL; listp += 2)
+ for (rm = rmnames; *rm != NULL; rm++)
+ /* Cheap substring pre-check; config_remove looks for "name=". */
+ if (strstr(listp[1], *rm) != NULL)
+ config_remove(listp[1], *rm);
+
+ /*
+ * It's possible to update everything except the key/value formats.
+ * If there were command-line configuration pairs, walk the list of
+ * command-line configuration strings, and check.
+ */
+ for (configp = cmdconfig;
+ cmdconfig != NULL && *configp != NULL; configp += 2)
+ if (strstr(configp[1], "key_format=") ||
+ strstr(configp[1], "value_format="))
+ return (util_err(0,
+ "the command line configuration string may not "
+ "modify the object's key or value format"));
+
+ /*
+ * If there were command-line configuration pairs, walk the list of
+ * command-line URIs and find a matching dump URI. For each match,
+ * rewrite the dump configuration as described by the command-line
+ * configuration. It is an error if a command-line URI doesn't find
+ * a single, exact match, that's likely a mistake.
+ */
+ for (configp = cmdconfig;
+ cmdconfig != NULL && *configp != NULL; configp += 2) {
+ found = 0;
+ for (listp = list; *listp != NULL; listp += 2) {
+ /* The command-line URI only has to be a prefix of the dump URI. */
+ if (strncmp(*configp, listp[0], strlen(*configp)) != 0)
+ continue;
+ /*
+ * !!!
+ * We support JSON configuration strings, which leads to
+ * configuration strings with brackets. Unfortunately,
+ * that implies we can't simply append new configuration
+ * strings to existing ones. We call an unpublished
+ * WiredTiger API to do the concatenation: if anyone
+ * else ever needs it we can make it public, but I think
+ * that's unlikely. We're also playing fast and loose
+ * with types, but it should work.
+ */
+ cfg[0] = listp[1];
+ cfg[1] = configp[1];
+ /* The merged string replaces listp[1]; the old string isn't freed. */
+ if (__wt_config_concat(
+ (WT_SESSION_IMPL *)session, cfg,
+ (const char **)&listp[1]) != 0)
+ return (1);
+ ++found;
+ }
+ switch (found) {
+ case 0:
+ return (util_err(0,
+ "the command line object name %s was not matched "
+ "by any loaded object name", *configp));
+ case 1:
+ break;
+ default:
+ return (util_err(0,
+ "the command line object name %s was not unique, "
+ "matching more than a single loaded object name",
+ *configp));
+ }
+ }
+
+ /*
+ * Leak the memory, I don't care: strings replaced by config_rename and
+ * by the concatenation above are never freed here.
+ */
+ return (0);
+}
+
+/*
+ * config_rename --
+ *	Update the URI name: replace the name part of "type:name[:suffix]"
+ *	with the new name, keeping the type and any suffix. On success *urip
+ *	points to a newly allocated string; the previous string is neither
+ *	freed nor left intact (its first colon is overwritten).
+ */
+static int
+config_rename(char **urip, const char *name)
+{
+	size_t size;
+	char *colon, *renamed;
+	const char *suffix;
+
+	/* Allocate room, with slack for the separator and terminator. */
+	size = strlen(*urip) + strlen(name) + 10;
+	renamed = malloc(size);
+	if (renamed == NULL)
+		return (util_err(errno, NULL));
+
+	/*
+	 * Find the separating colon characters: the first one is required,
+	 * the trailing one may not be there.
+	 */
+	colon = strchr(*urip, ':');
+	if (colon == NULL) {
+		free(renamed);
+		return (format());
+	}
+
+	/* Terminate the type prefix so it can be printed as a string. */
+	*colon = '\0';
+	colon = strchr(colon + 1, ':');
+	if (colon == NULL)
+		suffix = "";
+	else
+		suffix = colon;
+
+	snprintf(renamed, size, "%s:%s%s", *urip, name, suffix);
+	*urip = renamed;
+
+	return (0);
+}
+
+/*
+ * config_remove --
+ *	Remove a single config key and its value, editing the string in
+ *	place. The value ends at the first comma that is outside quotes and
+ *	outside parentheses; if there is no such comma the string is
+ *	truncated where the key starts.
+ */
+static void
+config_remove(char *config, const char *ckey)
+{
+	int depth, in_quotes;
+	char *cur, pattern[100], *rest, *start;
+
+	snprintf(pattern, sizeof(pattern), "%s=", ckey);
+	start = strstr(config, pattern);
+	if (start == NULL)
+		return;
+
+	/* Scan the value, looking for the start of the following key. */
+	depth = 0;
+	in_quotes = 0;
+	rest = NULL;
+	for (cur = start + strlen(pattern); *cur != '\0'; cur++) {
+		if (*cur == '"')
+			in_quotes = !in_quotes;
+		else if (in_quotes)
+			continue;
+		else if (*cur == '(')
+			depth++;
+		else if (*cur == ')')
+			depth--;
+		else if (*cur == ',' && depth == 0) {
+			rest = cur + 1;
+			break;
+		}
+	}
+
+	if (rest == NULL)
+		*start = '\0';
+	else
+		memmove(start, rest, strlen(rest) + 1);
+}
+
+/*
+ * format --
+ *	Report that the input doesn't match the dump format, returning
+ *	whatever util_err returns for the message.
+ */
+static int
+format(void)
+{
+	int ret;
+
+	ret = util_err(0, "input does not match WiredTiger dump format");
+	return (ret);
+}
+
+/*
+ * insert --
+ *	Read key/value line pairs and insert them through the cursor until
+ *	end of input. Returns 0 on success, 1 if a line can't be read, or the
+ *	reported error if an insert fails.
+ */
+static int
+insert(WT_CURSOR *cursor, const char *name)
+{
+	ULINE key, value;
+	WT_DECL_RET;
+	uint64_t insert_count;
+	int eof;
+
+	memset(&key, 0, sizeof(key));
+	memset(&value, 0, sizeof(value));
+	insert_count = 0;
+
+	for (;;) {
+		/*
+		 * The key line is always read; end of input is only legal
+		 * here, between records. With "append" set the key is read
+		 * and then ignored.
+		 */
+		if (util_read_line(&key, 1, &eof))
+			return (1);
+		if (eof == 1)
+			break;
+		if (!append)
+			cursor->set_key(cursor, key.mem);
+
+		/* A key without a value is an error, so EOF isn't OK. */
+		if (util_read_line(&value, 0, &eof))
+			return (1);
+		cursor->set_value(cursor, value.mem);
+
+		ret = cursor->insert(cursor);
+		if (ret != 0)
+			return (util_err(ret, "%s: cursor.insert", name));
+
+		/* Report on progress every 100 inserts (verbose only). */
+		if (!verbose)
+			continue;
+		if (++insert_count % 100 != 0)
+			continue;
+		printf("\r\t%s: %" PRIu64, name, insert_count);
+		fflush(stdout);
+	}
+
+	if (verbose)
+		printf("\r\t%s: %" PRIu64 "\n", name, insert_count);
+
+	return (0);
+}
+
+/*
+ * usage --
+ *	Write the load command's usage message to stderr; always returns 1.
+ *	The option list mirrors the getopt string in util_load ("af:jnr:").
+ */
+static int
+usage(void)
+{
+	(void)fprintf(stderr,
+	    "usage: %s %s "
+	    "load [-ajn] [-f input-file] [-r name] [object configuration ...]\n",
+	    progname, usage_prefix);
+	return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_load.h b/src/third_party/wiredtiger/src/utilities/util_load.h
new file mode 100644
index 00000000000..7bca677e178
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * A list of configuration strings.
+ *
+ * The array is kept NULL-terminated by config_list_add and grows on demand;
+ * config_list_free releases the array and every string stored in it.
+ */
+typedef struct {
+ char **list; /* array of alternating (uri, config) values */
+ int entry; /* next entry available in list */
+ int max_entry; /* how many allocated in list */
+} CONFIG_LIST;
+
+/* Configuration list helpers defined in util_load.c. */
+int config_exec(WT_SESSION *, char **);
+int config_list_add(CONFIG_LIST *, char *);
+void config_list_free(CONFIG_LIST *);
+int config_reorder(char **);
+int config_update(WT_SESSION *, char **);
+
+/* Flags for util_load_json */
+#define LOAD_JSON_APPEND 0x0001 /* append (ignore record number keys) */
+#define LOAD_JSON_NO_OVERWRITE 0x0002 /* don't overwrite existing data */
+
+/* JSON loader entry point defined in util_load_json.c. */
+int util_load_json(WT_SESSION *, const char *, uint32_t);
diff --git a/src/third_party/wiredtiger/src/utilities/util_load_json.c b/src/third_party/wiredtiger/src/utilities/util_load_json.c
new file mode 100644
index 00000000000..fb61df9ab16
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load_json.c
@@ -0,0 +1,573 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+#include "util_load.h"
+
+/*
+ * Encapsulates the input state for parsing JSON.
+ *
+ * At any time, we may be peeking at an unconsumed token; this is
+ * indicated by 'peeking' as true. toktype, tokstart, toklen will be
+ * set in this case.
+ *
+ * Generally we are collecting and processing tokens one by one.
+ * In JSON, tokens never span lines so this makes processing easy.
+ * The exception is that a JSON dump cursor takes the complete
+ * set of keys or values during cursor->set_key/set_value calls,
+ * which may contain many tokens and span lines. E.g.
+ * cursor->set_value("\"name\" : \"John\", \"phone\" : 2348765");
+ * The raw key/value string is collected in the kvraw field.
+ *
+ * The kvraw buffer is first allocated by json_data, and is released by
+ * util_load_json along with the line buffer.
+ */
+typedef struct {
+ WT_SESSION *session; /* associated session */
+ ULINE line; /* current line */
+ const char *p; /* points to cur position in line.mem */
+ int ateof; /* current token is EOF */
+ int peeking; /* peeking at next token */
+ int toktype; /* next token, defined by __wt_json_token() */
+ const char *tokstart; /* next token start (points into line.mem) */
+ size_t toklen; /* next token length */
+ char *kvraw; /* multiple line raw content collected so far */
+ size_t kvrawstart; /* pos on cur line that JSON key/value starts */
+ const char *filename; /* filename for error reporting */
+ int linenum; /* line number for error reporting */
+} JSON_INPUT_STATE;
+
+/* Forward declarations of the helpers private to this file. */
+static int json_column_group_index(WT_SESSION *, JSON_INPUT_STATE *,
+ CONFIG_LIST *, int);
+static int json_data(WT_SESSION *, JSON_INPUT_STATE *, CONFIG_LIST *, uint32_t);
+static int json_expect(WT_SESSION *, JSON_INPUT_STATE *, int);
+static int json_peek(WT_SESSION *, JSON_INPUT_STATE *);
+static int json_skip(WT_SESSION *, JSON_INPUT_STATE *, const char **);
+static int json_kvraw_append(JSON_INPUT_STATE *, const char *, size_t);
+static int json_strdup(JSON_INPUT_STATE *, char **);
+static int json_top_level(WT_SESSION *, JSON_INPUT_STATE *, uint32_t);
+
+/*
+ * True if the current string token, with its first and last characters (the
+ * surrounding quotes) stripped, is exactly the given string.
+ */
+#define JSON_STRING_MATCH(ins, match) \
+ ((ins)->toklen - 2 == strlen(match) && \
+ strncmp((ins)->tokstart + 1, (match), (ins)->toklen - 2) == 0)
+
+/* Offset of the current input position within the current line. */
+#define JSON_INPUT_POS(ins) \
+ ((size_t)((ins)->p - (const char *)(ins)->line.mem))
+
+/*
+ * Consume the next token, which must be of the given type; otherwise jump to
+ * the enclosing function's "err" label, which every user must provide.
+ */
+#define JSON_EXPECT(session, ins, tok) do { \
+ if (json_expect(session, ins, tok)) \
+ goto err; \
+} while (0)
+
+/*
+ * json_column_group_index --
+ * Parse a column group or index entry from JSON input.
+ *
+ * Reads a comma-separated sequence of objects, each holding a "uri" and a
+ * "config" string in either order, and appends each (uri, config) pair to
+ * the list. When idx is non-zero every uri must start with "index:",
+ * otherwise with "colgroup:". Returns 0 on success, non-zero on a parse,
+ * allocation or validation error.
+ */
+static int
+json_column_group_index(WT_SESSION *session, JSON_INPUT_STATE *ins,
+ CONFIG_LIST *clp, int idx)
+{
+ WT_DECL_RET;
+ char *config, *p, *uri;
+ int isconfig;
+
+ uri = NULL;
+ config = NULL;
+
+ while (json_peek(session, ins) == '{') {
+ JSON_EXPECT(session, ins, '{');
+ /* First member: either "config" or "uri". */
+ JSON_EXPECT(session, ins, 's');
+ isconfig = JSON_STRING_MATCH(ins, "config");
+ if (!isconfig && !JSON_STRING_MATCH(ins, "uri"))
+ goto err;
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, 's');
+
+ if ((ret = json_strdup(ins, &p)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if (isconfig)
+ config = p;
+ else
+ uri = p;
+
+ /* Second member: must be whichever of the two wasn't seen first. */
+ isconfig = !isconfig;
+ JSON_EXPECT(session, ins, ',');
+ JSON_EXPECT(session, ins, 's');
+ if (!JSON_STRING_MATCH(ins, isconfig ? "config" : "uri"))
+ goto err;
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, 's');
+
+ if ((ret = json_strdup(ins, &p)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if (isconfig)
+ config = p;
+ else
+ uri = p;
+ JSON_EXPECT(session, ins, '}');
+ /* Indices and column groups may not appear in each other's array. */
+ if ((idx && strncmp(uri, "index:", 6) != 0) ||
+ (!idx && strncmp(uri, "colgroup:", 9) != 0)) {
+ ret = util_err(EINVAL,
+ "%s: misplaced colgroup or index", uri);
+ goto err;
+ }
+ /* The list takes the pointers; the strings aren't copied. */
+ if ((ret = config_list_add(clp, uri)) != 0 ||
+ (ret = config_list_add(clp, config)) != 0)
+ goto err;
+
+ if (json_peek(session, ins) != ',')
+ break;
+ JSON_EXPECT(session, ins, ',');
+ /* A comma must be followed by another object. */
+ if (json_peek(session, ins) != '{')
+ goto err;
+ }
+ /* Only reached by "goto err": make sure a failure is returned. */
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ }
+ return (ret);
+}
+
+/*
+ * json_kvraw_append --
+ *	Append len bytes of str to the kvraw buffer, separated from the
+ *	existing content by a single space. The buffer collects the raw
+ *	key/value text from the JSON input. Appending nothing is a no-op.
+ */
+static int
+json_kvraw_append(JSON_INPUT_STATE *ins, const char *str, size_t len)
+{
+	size_t size;
+	char *joined;
+
+	if (len == 0)
+		return (0);
+
+	/* Old content, the separating space, the new text and a nul. */
+	size = strlen(ins->kvraw) + len + 2;
+	joined = malloc(size);
+	if (joined == NULL)
+		return (util_err(errno, NULL));
+
+	snprintf(joined, size, "%s %.*s", ins->kvraw, (int)len, str);
+	free(ins->kvraw);
+	ins->kvraw = joined;
+	return (0);
+}
+
+/*
+ * json_strdup --
+ *	Return a string, with no escapes or other JSON-isms, from the
+ *	JSON string at the current input position. On success *resultp is a
+ *	newly allocated string owned by the caller; on failure *resultp is
+ *	NULL and a non-zero value is returned.
+ */
+static int
+json_strdup(JSON_INPUT_STATE *ins, char **resultp)
+{
+	WT_DECL_RET;
+	char *buf, *dst;
+	const char *src;
+	ssize_t resultlen;
+	size_t srclen;
+
+	*resultp = NULL;
+
+	/* Strip the surrounding quotes from the token. */
+	src = ins->tokstart + 1;
+	srclen = ins->toklen - 2;
+
+	resultlen = __wt_json_strlen(src, srclen);
+	if (resultlen < 0) {
+		ret = util_err(EINVAL, "Invalid config string");
+		return (ret == 0 ? EINVAL : ret);
+	}
+
+	/* One extra byte for the trailing nul. */
+	resultlen += 1;
+	buf = malloc((size_t)resultlen);
+	if (buf == NULL) {
+		ret = util_err(errno, NULL);
+		return (ret == 0 ? EINVAL : ret);
+	}
+
+	/* The copy routine advances the pointer it's given, so use a copy. */
+	dst = buf;
+	ret = __wt_json_strncpy(&dst, (size_t)resultlen, src, srclen);
+	if (ret != 0) {
+		ret = util_err(ret, NULL);
+		free(buf);
+		return (ret == 0 ? EINVAL : ret);
+	}
+
+	*resultp = buf;
+	return (0);
+}
+
+/*
+ * json_data --
+ * Parse the data portion of the JSON input, and insert all
+ * values.
+ *
+ * First reorders, updates and creates the objects collected in the list,
+ * then opens a JSON dump cursor on the first uri in the list and inserts one
+ * record per JSON object read. The raw text of each record is gathered in
+ * ins->kvraw and split into a key part and a value part.
+ * Returns 0 on success, non-zero on error.
+ */
+static int
+json_data(WT_SESSION *session, JSON_INPUT_STATE *ins, CONFIG_LIST *clp,
+ uint32_t flags)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ char config[64], *endp, *uri;
+ const char *keyformat;
+ int isrec, nfield, nkeys, toktype, tret;
+ size_t keystrlen;
+ ssize_t gotnolen;
+ uint64_t gotno, recno;
+
+ cursor = NULL;
+ uri = NULL;
+
+ /* Reorder and check the list. */
+ if ((ret = config_reorder(clp->list)) != 0)
+ goto err;
+
+ /* Update config based on command-line configuration. */
+ if ((ret = config_update(session, clp->list)) != 0)
+ goto err;
+
+ /* Create the items collected. */
+ if ((ret = config_exec(session, clp->list)) != 0)
+ goto err;
+
+ uri = clp->list[0];
+ (void)snprintf(config, sizeof(config),
+ "dump=json%s%s",
+ LF_ISSET(LOAD_JSON_APPEND) ? ",append" : "",
+ LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : "");
+ if ((ret = session->open_cursor(
+ session, uri, NULL, config, &cursor)) != 0) {
+ ret = util_err(ret, "%s: session.open", uri);
+ goto err;
+ }
+ keyformat = cursor->key_format;
+ isrec = (strcmp(keyformat, "r") == 0);
+ /* Count the key fields: every non-digit character in the key format. */
+ for (nkeys = 0; *keyformat; keyformat++)
+ if (!isdigit(*keyformat))
+ nkeys++;
+
+ recno = 0;
+ while (json_peek(session, ins) == '{') {
+ nfield = 0;
+ JSON_EXPECT(session, ins, '{');
+ /* Start each record with an empty raw buffer, allocating it once. */
+ if (ins->kvraw == NULL) {
+ if ((ins->kvraw = (char *)malloc(1)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ }
+ ins->kvraw[0] = '\0';
+ ins->kvrawstart = JSON_INPUT_POS(ins);
+ keystrlen = 0;
+ /* Each field is a "name" : value pair. */
+ while (json_peek(session, ins) == 's') {
+ JSON_EXPECT(session, ins, 's');
+ JSON_EXPECT(session, ins, ':');
+ /* Accept a value token of whatever type comes next. */
+ toktype = json_peek(session, ins);
+ JSON_EXPECT(session, ins, toktype);
+ if (isrec && nfield == 0) {
+ /* Verify the dump has recnos in order. */
+ recno++;
+ gotno = __wt_strtouq(ins->tokstart, &endp, 0);
+ gotnolen = (endp - ins->tokstart);
+ if (recno != gotno ||
+ ins->toklen != (size_t)gotnolen) {
+ ret = util_err(0,
+ "%s: recno out of order", uri);
+ goto err;
+ }
+ }
+ /*
+ * After the last key field, flush the raw text so far into
+ * kvraw and remember its length: that's where the key ends.
+ */
+ if (++nfield == nkeys) {
+ size_t curpos = JSON_INPUT_POS(ins);
+ if ((ret = json_kvraw_append(ins,
+ (char *)ins->line.mem + ins->kvrawstart,
+ curpos - ins->kvrawstart)) != 0)
+ goto err;
+ ins->kvrawstart = curpos;
+ keystrlen = strlen(ins->kvraw);
+ }
+ if (json_peek(session, ins) != ',')
+ break;
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != 's')
+ goto err;
+ }
+ if (json_kvraw_append(ins, ins->line.mem, JSON_INPUT_POS(ins)))
+ goto err;
+
+ /* Split kvraw in place: the key text is nul-terminated at its end. */
+ ins->kvraw[keystrlen] = '\0';
+ if (!LF_ISSET(LOAD_JSON_APPEND))
+ cursor->set_key(cursor, ins->kvraw);
+ /* skip over inserted space and comma */
+ cursor->set_value(cursor, &ins->kvraw[keystrlen+2]);
+ if ((ret = cursor->insert(cursor)) != 0) {
+ ret = util_err(ret, "%s: cursor.insert", uri);
+ goto err;
+ }
+
+ JSON_EXPECT(session, ins, '}');
+ if (json_peek(session, ins) != ',')
+ break;
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != '{')
+ goto err;
+ }
+ /* Only reached by "goto err": make sure a failure is returned. */
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ }
+ /*
+ * Technically, we don't have to close the cursor because the session
+ * handle will do it for us, but I'd like to see the flush to disk and
+ * the close succeed, it's better to fail early when loading files.
+ */
+ if (cursor != NULL && (tret = cursor->close(cursor)) != 0) {
+ tret = util_err(tret, "%s: cursor.close", uri);
+ if (ret == 0)
+ ret = tret;
+ }
+ if (ret == 0)
+ ret = util_flush(session, uri);
+ return (ret);
+}
+
+/*
+ * json_top_level --
+ *	Parse the top level JSON input: an object whose members are named by
+ *	table uri, each holding "config", "colgroups" and "indices" entries
+ *	followed by "data". The non-data entries are collected into a list of
+ *	objects to create; json_data creates them and loads the records.
+ *	Returns 0 on success, non-zero on error.
+ */
+static int
+json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags)
+{
+	CONFIG_LIST cl;
+	WT_DECL_RET;
+	char *config, *tableuri, *tmp;
+	int toktype;
+	static const char *json_markers[] = {
+	    "\"config\"", "\"colgroups\"", "\"indices\"", "\"data\"", NULL };
+
+	memset(&cl, 0, sizeof(cl));
+	tableuri = NULL;
+	JSON_EXPECT(session, ins, '{');
+	while (json_peek(session, ins) == 's') {
+		JSON_EXPECT(session, ins, 's');
+
+		/*
+		 * Copy the uri without its quotes: the token length, less the
+		 * two quotes, plus a nul, always fits in toklen bytes. Check
+		 * the allocation, and keep the old buffer on failure so the
+		 * cleanup below still frees it.
+		 */
+		if ((tmp = realloc(tableuri, ins->toklen)) == NULL) {
+			ret = util_err(errno, NULL);
+			goto err;
+		}
+		tableuri = tmp;
+		snprintf(tableuri, ins->toklen, "%.*s",
+		    (int)(ins->toklen - 2), ins->tokstart + 1);
+		JSON_EXPECT(session, ins, ':');
+
+		/*
+		 * Allow any ordering of 'config', 'colgroups',
+		 * 'indices' before 'data', which must appear last.
+		 * The non-'data' items build up a list of entries
+		 * that are created in our session before the data is
+		 * inserted.
+		 */
+		for (;;) {
+			if (json_skip(session, ins, json_markers) != 0)
+				goto err;
+			JSON_EXPECT(session, ins, 's');
+			if (JSON_STRING_MATCH(ins, "config")) {
+				JSON_EXPECT(session, ins, ':');
+				JSON_EXPECT(session, ins, 's');
+				if ((ret = json_strdup(ins, &config)) != 0) {
+					ret = util_err(ret, NULL);
+					goto err;
+				}
+				/*
+				 * The list doesn't own config until it has
+				 * been added, so free it if either add fails.
+				 */
+				if ((ret = config_list_add(&cl, tableuri)) != 0) {
+					free(config);
+					goto err;
+				}
+				if ((ret = config_list_add(&cl, config)) != 0) {
+					free(config);
+					goto err;
+				}
+				/* The list owns the uri now. */
+				tableuri = NULL;
+			} else if (JSON_STRING_MATCH(ins, "colgroups")) {
+				JSON_EXPECT(session, ins, ':');
+				JSON_EXPECT(session, ins, '[');
+				if ((ret = json_column_group_index(
+				    session, ins, &cl, 0)) != 0)
+					goto err;
+				JSON_EXPECT(session, ins, ']');
+			} else if (JSON_STRING_MATCH(ins, "indices")) {
+				JSON_EXPECT(session, ins, ':');
+				JSON_EXPECT(session, ins, '[');
+				if ((ret = json_column_group_index(
+				    session, ins, &cl, 1)) != 0)
+					goto err;
+				JSON_EXPECT(session, ins, ']');
+			} else if (JSON_STRING_MATCH(ins, "data")) {
+				JSON_EXPECT(session, ins, ':');
+				JSON_EXPECT(session, ins, '[');
+				if ((ret = json_data(session, ins, &cl,
+				    flags)) != 0)
+					goto err;
+				config_list_free(&cl);
+				break;
+			}
+			else
+				goto err;
+		}
+
+		/* Consume whatever closes the data array and the table. */
+		while ((toktype = json_peek(session, ins)) == '}' ||
+		    toktype == ']')
+			JSON_EXPECT(session, ins, toktype);
+		if (toktype == 0) /* Check EOF. */
+			break;
+		if (toktype == ',') {
+			JSON_EXPECT(session, ins, ',');
+			if (json_peek(session, ins) != 's')
+				goto err;
+			continue;
+		}
+	}
+	JSON_EXPECT(session, ins, 0);
+
+	/* Only reached by "goto err": make sure a failure is returned. */
+	if (0) {
+err:		if (ret == 0)
+			ret = EINVAL;
+	}
+	config_list_free(&cl);
+	if (tableuri != NULL)
+		free(tableuri);
+	return (ret);
+}
+
+/*
+ * json_peek --
+ *	Set the input state to the next available token in the input
+ *	and return its token type, a code defined by __wt_json_token().
+ *	Returns 0 at end of input and -1 on error. Peeking again without
+ *	consuming the token returns the same token.
+ */
+static int
+json_peek(WT_SESSION *session, JSON_INPUT_STATE *ins)
+{
+	WT_DECL_RET;
+
+	if (!ins->peeking) {
+		while (!ins->ateof) {
+			/*
+			 * The <ctype.h> functions are only defined for values
+			 * representable as an unsigned char (or EOF), so
+			 * convert before testing: input bytes with the high
+			 * bit set may be negative as a plain char.
+			 */
+			while (isspace((unsigned char)*ins->p))
+				ins->p++;
+			if (*ins->p)
+				break;
+
+			/*
+			 * The line is used up: if raw key/value text is being
+			 * collected, save the rest of this line before it's
+			 * overwritten by the next one.
+			 */
+			if (ins->kvraw != NULL) {
+				if (json_kvraw_append(ins,
+				    (char *)ins->line.mem + ins->kvrawstart,
+				    strlen(ins->line.mem) - ins->kvrawstart)) {
+					ret = -1;
+					goto err;
+				}
+				ins->kvrawstart = 0;
+			}
+			if (util_read_line(&ins->line, 1,
+			    &ins->ateof)) {
+				ins->toktype = -1;
+				ret = -1;
+				goto err;
+			}
+			ins->linenum++;
+			ins->p = (const char *)ins->line.mem;
+		}
+		if (ins->ateof)
+			ins->toktype = 0;
+		else if (__wt_json_token(session, ins->p,
+		    &ins->toktype, &ins->tokstart,
+		    &ins->toklen) != 0)
+			ins->toktype = -1;
+		ins->peeking = 1;
+	}
+	/* Only reached by "goto err": make sure a failure is returned. */
+	if (0) {
+err:		if (ret == 0)
+			ret = -1;
+	}
+	return (ret == 0 ? ins->toktype : -1);
+}
+
+/*
+ * json_expect --
+ *	Consume the next token and check it has the wanted type, reporting
+ *	the file, line and column when it doesn't. The token is consumed
+ *	either way; its start and length stay in the input state so callers
+ *	can pull out specific string or integer values afterwards.
+ *	Returns 0 on a match, 1 otherwise.
+ */
+static int
+json_expect(WT_SESSION *session, JSON_INPUT_STATE *ins, int wanttok)
+{
+	if (json_peek(session, ins) < 0)
+		return (1);
+
+	/* Step past the token before checking it. */
+	ins->peeking = 0;
+	ins->p += ins->toklen;
+	if (ins->toktype == wanttok)
+		return (0);
+
+	fprintf(stderr,
+	    "%s: %d: %" WT_SIZET_FMT ": expected %s, got %s\n",
+	    ins->filename,
+	    ins->linenum,
+	    JSON_INPUT_POS(ins) + 1,
+	    __wt_json_tokname(wanttok),
+	    __wt_json_tokname(ins->toktype));
+	return (1);
+}
+
+/*
+ * json_skip --
+ *	Skip over JSON input, reading further lines as needed, until one of
+ *	the specified strings appears; earlier entries in the array take
+ *	precedence when several appear on the same line. The tokenizer is
+ *	left peeking at the start of that string. Returns 0 if a string was
+ *	found, 1 on error, at end of input, or if raw key/value text is being
+ *	collected.
+ */
+static int
+json_skip(WT_SESSION *session, JSON_INPUT_STATE *ins, const char **matches)
+{
+	const char *found;
+	const char **m;
+
+	if (ins->kvraw != NULL)
+		return (1);
+
+	found = NULL;
+	while (found == NULL && !ins->ateof) {
+		for (m = matches; found == NULL && *m != NULL; m++)
+			found = strstr(ins->p, *m);
+		if (found != NULL)
+			break;
+
+		/* Nothing on this line, move on to the next one. */
+		if (util_read_line(&ins->line, 1, &ins->ateof) != 0) {
+			ins->toktype = -1;
+			return (1);
+		}
+		ins->linenum++;
+		ins->p = (const char *)ins->line.mem;
+	}
+	if (found == NULL)
+		return (1);
+
+	/* Set to this token. */
+	ins->p = found;
+	ins->peeking = 0;
+	ins->toktype = 0;
+	(void)json_peek(session, ins);
+	return (0);
+}
+
+/*
+ * util_load_json --
+ *	Load from the JSON format produced by 'wt dump -j'. The filename is
+ *	only used in error messages. Returns 0 on success, non-zero on error.
+ */
+int
+util_load_json(WT_SESSION *session, const char *filename, uint32_t flags)
+{
+	JSON_INPUT_STATE instate;
+	WT_DECL_RET;
+
+	memset(&instate, 0, sizeof(instate));
+	instate.session = session;
+	instate.filename = filename;
+
+	/* Prime the parser with the first line of input. */
+	if (util_read_line(&instate.line, 0, &instate.ateof))
+		return (1);
+	instate.linenum = 1;
+	instate.p = (const char *)instate.line.mem;
+
+	ret = json_top_level(session, &instate, flags);
+
+	/* Release the buffers whether or not the load succeeded. */
+	free(instate.line.mem);
+	free(instate.kvraw);
+	return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_loadtext.c b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
new file mode 100644
index 00000000000..27c4c23b50c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
@@ -0,0 +1,157 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int insert(WT_CURSOR *, const char *, int);
+static int text(WT_SESSION *, const char *);
+static int usage(void);
+
+int
+util_loadtext(WT_SESSION *session, int argc, char *argv[]) /* "loadtext" command: parse options, build the object uri and load text from stdin; returns 0 on success, 1 on failure. */
+{
+ int ch;
+ const char *uri;
+
+ while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF)
+ switch (ch) {
+ case 'f': /* input file: stdin is reopened on it, so the readers keep using stdin */
+ if (freopen(__wt_optarg, "r", stdin) == NULL)
+ return (
+ util_err(errno, "%s: reopen", __wt_optarg));
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the uri. */
+ if (argc != 1)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL) /* NOTE(review): util_name allocates this string and it is not freed on this path. */
+ return (1);
+
+ return (text(session, uri));
+}
+
+/*
+ * text --
+ * Load flat-text into a file/table. Opens a cursor on "uri", checks the key/value formats, inserts the records, closes the cursor and flushes; returns 0 on success, 1 on failure.
+ */
+static int
+text(WT_SESSION *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int readkey, tret;
+
+ /*
+ * Open the cursor, configured to append new records (in the case of
+ * column-store objects), or to overwrite existing strings (in the
+ * case of row-store objects). The two flags are mutually exclusive,
+ * but the library doesn't currently care that we set both of them.
+ */
+ if ((ret = session->open_cursor(
+ session, uri, NULL, "append,overwrite", &cursor)) != 0)
+ return (util_err(ret, "%s: session.open", uri));
+
+ /*
+ * We're about to load strings, make sure the formats match.
+ *
+ * Row-store tables have key/value pairs, column-store tables only have
+ * values.
+ */
+ if (strcmp(cursor->value_format, "S") != 0 ||
+ (strcmp(cursor->key_format, "S") != 0 &&
+ strcmp(cursor->key_format, "r") != 0))
+ return (util_err(EINVAL, /* NOTE(review): this early return does not close the cursor. */
+ "the loadtext command can only load objects configured "
+ "for record number or string keys, and string values"));
+ readkey = strcmp(cursor->key_format, "r") == 0 ? 0 : 1; /* Keys are read from the input only for string-keyed objects. */
+
+ /* Insert the records */
+ ret = insert(cursor, uri, readkey);
+
+ /*
+ * Technically, we don't have to close the cursor because the session
+ * handle will do it for us, but I'd like to see the flush to disk and
+ * the close succeed, it's better to fail early when loading files.
+ */
+ if ((tret = cursor->close(cursor)) != 0) {
+ tret = util_err(tret, "%s: cursor.close", uri);
+ if (ret == 0) /* Keep the first error: an insert failure wins over a close failure. */
+ ret = tret;
+ }
+ if (ret == 0) /* Only flush when everything so far succeeded. */
+ ret = util_flush(session, uri);
+
+ return (ret == 0 ? 0 : 1);
+}
+
+/*
+ * insert --
+ * Read and insert data. Reads lines with util_read_line until end-of-file and inserts each record through "cursor"; "name" is used only in messages. Returns 0 on success, 1 on failure.
+ */
+static int
+insert(WT_CURSOR *cursor, const char *name, int readkey)
+{
+ ULINE key, value;
+ WT_DECL_RET;
+ uint64_t insert_count;
+ int eof;
+
+ memset(&key, 0, sizeof(key)); /* NOTE(review): the key/value line buffers are never freed in this function. */
+ memset(&value, 0, sizeof(value));
+
+ /* Read key/value pairs and insert them into the file. */
+ for (insert_count = 0;;) {
+ /*
+ * Two modes: when readkey is set, each record is a key line
+ * followed by a value line; otherwise only value lines are
+ * read and no key is set on the cursor (the caller opens the
+ * cursor with "append" for that case).
+ */
+ if (readkey) {
+ if (util_read_line(&key, 1, &eof))
+ return (1);
+ if (eof == 1)
+ break;
+ cursor->set_key(cursor, key.mem);
+ }
+ if (util_read_line(&value, readkey ? 0 : 1, &eof)) /* After a key, end-of-file is an error: the value is missing. */
+ return (1);
+ if (eof == 1)
+ break;
+ cursor->set_value(cursor, value.mem);
+
+ if ((ret = cursor->insert(cursor)) != 0)
+ return (util_err(ret, "%s: cursor.insert", name));
+
+ /* Report on progress every 100 inserts (the counter only advances in verbose mode). */
+ if (verbose && ++insert_count % 100 == 0) {
+ printf("\r\t%s: %" PRIu64, name, insert_count);
+ fflush(stdout);
+ }
+ }
+
+ if (verbose) /* Final count, terminated with a newline. */
+ printf("\r\t%s: %" PRIu64 "\n", name, insert_count);
+
+ return (0);
+}
+
+static int
+usage(void) /* Print the loadtext usage line to stderr; always returns 1. */
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "loadtext [-f input-file] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c
new file mode 100644
index 00000000000..04ab59f1ca9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_main.c
@@ -0,0 +1,262 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+const char *home = "."; /* Home directory */
+const char *progname; /* Program name */
+ /* Global arguments */
+const char *usage_prefix = "[-Vv] [-C config] [-h home]";
+int verbose; /* Verbose flag */
+
+static const char *command; /* Command name */
+
+static int usage(void);
+
+int
+main(int argc, char *argv[]) /* Utility entry point: parse global options, pick the command, open the database and a session, run the command, close the database. */
+{
+ WT_CONNECTION *conn;
+ WT_DECL_RET;
+ WT_SESSION *session;
+ size_t len;
+ int ch, major_v, minor_v, tret, (*func)(WT_SESSION *, int, char *[]);
+ char *p;
+ const char *cmd_config, *config;
+
+ conn = NULL;
+ p = NULL;
+
+ /* Get the program name. */
+ if ((progname = strrchr(argv[0], '/')) == NULL)
+ progname = argv[0];
+ else
+ ++progname;
+ command = "";
+
+ /* Check the version against the library build. */
+ (void)wiredtiger_version(&major_v, & minor_v, NULL);
+ if (major_v != WIREDTIGER_VERSION_MAJOR ||
+ minor_v != WIREDTIGER_VERSION_MINOR) {
+ fprintf(stderr,
+ "%s: program build version %d.%d does not match "
+ "library build version %d.%d\n",
+ progname,
+ WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR,
+ major_v, minor_v);
+ return (EXIT_FAILURE);
+ }
+
+ /* Check for standard options. */
+ cmd_config = config = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "C:h:Vv")) != EOF)
+ switch (ch) {
+ case 'C': /* wiredtiger_open config */
+ cmd_config = __wt_optarg;
+ break;
+ case 'h': /* home directory */
+ home = __wt_optarg;
+ break;
+ case 'V': /* version */
+ printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+ return (EXIT_SUCCESS);
+ case 'v': /* verbose */
+ verbose = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The next argument is the command name. */
+ if (argc < 1)
+ return (usage());
+ command = argv[0];
+
+ /* Reset getopt, so the command can parse its own options from argv. */
+ __wt_optreset = __wt_optind = 1;
+
+ func = NULL;
+ switch (command[0]) { /* Dispatch on the first letter, then on the full command name. */
+ case 'b':
+ if (strcmp(command, "backup") == 0)
+ func = util_backup;
+ break;
+ case 'c':
+ if (strcmp(command, "compact") == 0)
+ func = util_compact;
+ else if (strcmp(command, "copyright") == 0) {
+ util_copyright();
+ return (EXIT_SUCCESS);
+ } else if (strcmp(command, "create") == 0) {
+ func = util_create;
+ config = "create";
+ }
+ break;
+ case 'd':
+ if (strcmp(command, "drop") == 0)
+ func = util_drop;
+ else if (strcmp(command, "dump") == 0)
+ func = util_dump;
+ break;
+ case 'l':
+ if (strcmp(command, "list") == 0)
+ func = util_list;
+ else if (strcmp(command, "load") == 0) {
+ func = util_load;
+ config = "create";
+ } else if (strcmp(command, "loadtext") == 0) {
+ func = util_loadtext;
+ config = "create";
+ }
+ break;
+ case 'p':
+ if (strcmp(command, "printlog") == 0)
+ func = util_printlog;
+ break;
+ case 'r':
+ if (strcmp(command, "read") == 0)
+ func = util_read;
+ else if (strcmp(command, "rename") == 0)
+ func = util_rename;
+ break;
+ case 's':
+ if (strcmp(command, "salvage") == 0)
+ func = util_salvage;
+ else if (strcmp(command, "stat") == 0) {
+ func = util_stat;
+ config = "statistics=(all)";
+ }
+ break;
+ case 'u':
+ if (strcmp(command, "upgrade") == 0)
+ func = util_upgrade;
+ break;
+ case 'v':
+ if (strcmp(command, "verify") == 0)
+ func = util_verify;
+ break;
+ case 'w':
+ if (strcmp(command, "write") == 0)
+ func = util_write;
+ break;
+ default:
+ break;
+ }
+ if (func == NULL) /* Unknown command. */
+ return (usage());
+
+ /* Build the configuration string, as necessary: the command's own config first, then the -C string. */
+ if (config == NULL)
+ config = cmd_config;
+ else if (cmd_config != NULL) {
+ len = strlen(cmd_config) + strlen(config) + 10;
+ if ((p = malloc(len)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ (void)snprintf(p, len, "%s,%s", config, cmd_config);
+ config = p;
+ }
+
+ /* Open the database and a session. */
+ if ((ret = wiredtiger_open(home,
+ verbose ? verbose_handler : NULL, config, &conn)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+
+ /* Call the function. */
+ ret = func(session, argc, argv);
+
+ /* Close the database; a close failure is reported only if nothing failed earlier. */
+
+err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0)
+ ret = tret;
+
+ if (p != NULL)
+ free(p);
+
+ return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+static int
+usage(void) /* Print the version banner, the global options and the command list to stderr; always returns EXIT_FAILURE. */
+{
+ fprintf(stderr,
+ "WiredTiger Data Engine (version %d.%d)\n",
+ WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);
+ fprintf(stderr,
+ "global options:\n"
+ "\t" "-C\twiredtiger_open configuration\n"
+ "\t" "-h\tdatabase directory\n"
+ "\t" "-V\tdisplay library version and exit\n"
+ "\t" "-v\tverbose\n");
+ fprintf(stderr,
+ "commands:\n"
+ "\t" "backup\t database backup\n"
+ "\t" "compact\t compact an object\n"
+ "\t" "copyright copyright information\n"
+ "\t" "create\t create an object\n"
+ "\t" "drop\t drop an object\n"
+ "\t" "dump\t dump an object\n"
+ "\t" "list\t list database objects\n"
+ "\t" "load\t load an object\n"
+ "\t" "loadtext\t load an object from a text file\n"
+ "\t" "printlog display the database log\n"
+ "\t" "read\t read values from an object\n"
+ "\t" "rename\t rename an object\n"
+ "\t" "salvage\t salvage a file\n"
+ "\t" "stat\t display statistics for an object\n"
+ "\t" "upgrade\t upgrade an object\n"
+ "\t" "verify\t verify an object\n"
+ "\t" "write\t write values to an object\n");
+
+ return (EXIT_FAILURE);
+}
+
+/*
+ * util_name --
+ * Build a name: returns a newly allocated object URI (the caller frees it), or NULL after reporting an error. "s" is the user-supplied name, "type" the default URI prefix.
+ */
+char *
+util_name(const char *s, const char *type)
+{
+ size_t len;
+ char *name;
+
+ if (WT_PREFIX_MATCH(s, "backup:") || /* These URI types are rejected for every command that builds a name here. */
+ WT_PREFIX_MATCH(s, "config:") ||
+ WT_PREFIX_MATCH(s, "statistics:")) {
+ fprintf(stderr,
+ "%s: %s: unsupported object type: %s\n",
+ progname, command, s);
+ return (NULL);
+ }
+
+ len = strlen(type) + strlen(s) + 2; /* Room for "type", ':', "s" and the trailing nul. */
+ if ((name = calloc(len, 1)) == NULL) {
+ (void)util_err(errno, NULL);
+ return (NULL);
+ }
+
+ /*
+ * If the string has a URI prefix, use it verbatim, otherwise prepend
+ * the default type for the operation.
+ */
+ if (strchr(s, ':') != NULL)
+ strcpy(name, s);
+ else
+ snprintf(name, len, "%s:%s", type, s);
+ return (name);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_misc.c b/src/third_party/wiredtiger/src/utilities/util_misc.c
new file mode 100644
index 00000000000..71e307a2e0e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_misc.c
@@ -0,0 +1,146 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+int
+util_cerr(const char *uri, const char *op, int ret) /* Report a failed cursor operation as "<uri>: cursor.<op>" plus the error text; returns util_err's result. */
+{
+ return (util_err(ret, "%s: cursor.%s", uri, op));
+}
+
+/*
+ * util_err --
+ * Report an error: write "<progname>: ", the optional printf-style message and, when "e" is non-zero, its wiredtiger_strerror text to stderr. Always returns 1.
+ */
+int
+util_err(int e, const char *fmt, ...)
+{
+ va_list ap;
+
+ (void)fprintf(stderr, "%s: ", progname);
+ if (fmt != NULL) {
+ va_start(ap, fmt);
+ (void)vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ if (e != 0) /* Separator between the message and the error string. */
+ (void)fprintf(stderr, ": ");
+ }
+ if (e != 0)
+ (void)fprintf(stderr, "%s", wiredtiger_strerror(e));
+ (void)fprintf(stderr, "\n");
+ return (1);
+}
+
+/*
+ * util_read_line --
+ * Read a line from stdin into a ULINE. The newline is not stored and the result is nul-terminated. Returns 0 on success and non-zero after reporting an error; *eofp is set to 1 only when end-of-file is hit before any byte is read and eof_expected is non-zero.
+ */
+int
+util_read_line(ULINE *l, int eof_expected, int *eofp)
+{
+ static uint64_t line = 0; /* Counts calls to this function (all callers share it); used in error messages. */
+ size_t len;
+ int ch;
+
+ ++line;
+ *eofp = 0;
+
+ if (l->memsize == 0) { /* First use of this ULINE: allocate the initial buffer. */
+ if ((l->mem = realloc(l->mem, l->memsize + 1024)) == NULL)
+ return (util_err(errno, NULL));
+ l->memsize = 1024;
+ }
+ for (len = 0;; ++len) {
+ if ((ch = getchar()) == EOF) {
+ if (len == 0) {
+ if (eof_expected) {
+ *eofp = 1;
+ return (0);
+ }
+ return (util_err(0,
+ "line %" PRIu64 ": unexpected end-of-file",
+ line));
+ }
+ return (util_err(0, /* End-of-file in the middle of a line is always an error. */
+ "line %" PRIu64 ": no newline terminator", line));
+ }
+ if (ch == '\n')
+ break;
+ /*
+ * We nul-terminate the string so it's easier to convert the
+ * line into a record number, that means we always need one
+ * extra byte at the end.
+ */
+ if (len >= l->memsize - 1) { /* Grow the buffer in 1024-byte steps. */
+ if ((l->mem =
+ realloc(l->mem, l->memsize + 1024)) == NULL)
+ return (util_err(errno, NULL));
+ l->memsize += 1024;
+ }
+ ((uint8_t *)l->mem)[len] = (uint8_t)ch;
+ }
+
+ ((uint8_t *)l->mem)[len] = '\0'; /* nul-terminate */
+
+ return (0);
+}
+
+/*
+ * util_str2recno --
+ * Convert a string to a record number. Returns 0 and stores the value in *recnop on success; otherwise reports the error and returns non-zero, leaving *recnop untouched.
+ */
+int
+util_str2recno(const char *p, uint64_t *recnop)
+{
+ uint64_t recno;
+ char *endptr;
+
+ /*
+ * strtouq takes lots of things like hex values, signs and so on and so
+ * forth -- none of them are OK with us. Check the string starts with
+ * digit, that turns off the special processing. The cast is required: passing a negative char value to isdigit is undefined behavior.
+ */
+ if (!isdigit((unsigned char)p[0]))
+ goto format;
+
+ errno = 0; /* Cleared so ERANGE can be told apart from a legitimate maximum value. */
+ recno = __wt_strtouq(p, &endptr, 0);
+ if (recno == ULLONG_MAX && errno == ERANGE)
+ return (util_err(ERANGE, "%s: invalid record number", p));
+
+ if (endptr[0] != '\0') /* Trailing characters after the number. */
+format: return (util_err(EINVAL, "%s: invalid record number", p));
+
+ *recnop = recno;
+ return (0);
+}
+
+/*
+ * util_flush --
+ * Flush the file successfully, or drop it: checkpoint "uri" and, if the checkpoint fails, report the error and drop the object. Returns 0 on success, non-zero on failure.
+ */
+int
+util_flush(WT_SESSION *session, const char *uri)
+{
+ WT_DECL_RET;
+ size_t len;
+ char *buf;
+
+ len = strlen(uri) + 100; /* Room for the uri plus the fixed target=("...") wrapper. */
+ if ((buf = malloc(len)) == NULL)
+ return (util_err(errno, NULL));
+
+ (void)snprintf(buf, len, "target=(\"%s\")", uri);
+ if ((ret = session->checkpoint(session, buf)) != 0) {
+ ret = util_err(ret, "%s: session.checkpoint", uri);
+ (void)session->drop(session, uri, NULL); /* The drop's own result is ignored. */
+ }
+
+ free(buf);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_printlog.c b/src/third_party/wiredtiger/src/utilities/util_printlog.c
new file mode 100644
index 00000000000..7fc9bfa39b0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_printlog.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_printlog(WT_SESSION *session, int argc, char *argv[]) /* "printlog" command: print the log to stdout (or the -f file) via __wt_txn_printlog; returns 0 on success, 1 on failure. */
+{
+ WT_DECL_RET;
+ int ch, printable;
+
+ printable = 0;
+ while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF)
+ switch (ch) {
+ case 'f': /* output file */
+ if (freopen(__wt_optarg, "w", stdout) == NULL) {
+ fprintf(stderr, "%s: %s: reopen: %s\n",
+ progname, __wt_optarg, strerror(errno));
+ return (1);
+ }
+ break;
+ case 'p':
+ printable = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* There should not be any more arguments. */
+ if (argc != 0)
+ return (usage());
+
+ WT_UNUSED(printable); /* -p is accepted but has no effect in this function. */
+ ret = __wt_txn_printlog(session, stdout);
+
+ if (ret != 0) {
+ fprintf(stderr, "%s: printlog failed: %s\n",
+ progname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) { /* Entered only through "goto err": turns any failure into a return of 1. */
+err: ret = 1;
+ }
+ return (ret);
+}
+
+static int
+usage(void) /* Print the printlog usage line to stderr; always returns 1. */
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "printlog [-p] [-f output-file]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_read.c b/src/third_party/wiredtiger/src/utilities/util_read.c
new file mode 100644
index 00000000000..d9a629e40e2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_read.c
@@ -0,0 +1,101 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_read(WT_SESSION *session, int argc, char *argv[]) /* "read" command: look up each key given on the command line and print its value; returns 0 only if every key was found. */
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ uint64_t recno;
+ int ch, rkey, rval;
+ const char *uri, *value;
+
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining arguments are a uri followed by a list of keys. */
+ if (argc < 2)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL) /* NOTE(review): util_name allocates this string and it is not freed in this function. */
+ return (1);
+
+ /* Open the object. */
+ if ((ret = session->open_cursor(
+ session, uri, NULL, NULL, &cursor)) != 0)
+ return (util_err(ret, "%s: session.open", uri));
+
+ /*
+ * A simple search only makes sense if the key format is a string or a
+ * record number, and the value format is a single string.
+ */
+ if (strcmp(cursor->key_format, "r") != 0 &&
+ strcmp(cursor->key_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: read command only possible when the key format is "
+ "a record number or string\n",
+ progname);
+ return (1);
+ }
+ rkey = strcmp(cursor->key_format, "r") == 0 ? 1 : 0; /* Non-zero when keys are record numbers rather than strings. */
+ if (strcmp(cursor->value_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: read command only possible when the value format is "
+ "a string\n",
+ progname);
+ return (1);
+ }
+
+ /*
+ * Run through the keys, returning non-zero on error or if any requested
+ * key isn't found.
+ */
+ for (rval = 0; *++argv != NULL;) { /* The pre-increment steps past the uri argument first. */
+ if (rkey) {
+ if (util_str2recno(*argv, &recno))
+ return (1);
+ cursor->set_key(cursor, recno);
+ } else
+ cursor->set_key(cursor, *argv);
+
+ switch (ret = cursor->search(cursor)) {
+ case 0:
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+ if (printf("%s\n", value) < 0)
+ return (util_err(EIO, NULL));
+ break;
+ case WT_NOTFOUND: /* Report the miss but keep going with the remaining keys. */
+ (void)util_err(0, "%s: not found", *argv);
+ rval = 1;
+ break;
+ default:
+ return (util_cerr(uri, "search", ret));
+ }
+ }
+
+ return (rval);
+}
+
+static int
+usage(void) /* Print the read usage line to stderr; always returns 1. */
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "read uri key ...\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_rename.c b/src/third_party/wiredtiger/src/utilities/util_rename.c
new file mode 100644
index 00000000000..8c2aeb30c59
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_rename.c
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_rename(WT_SESSION *session, int argc, char *argv[]) /* "rename" command: rename the object named by the first argument to the second; returns 0 on success, 1 on failure. */
+{
+ WT_DECL_RET;
+ int ch;
+ char *uri, *newuri;
+
+ uri = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining arguments are the object uri and new name. */
+ if (argc != 2)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+ newuri = argv[1]; /* The new name is passed through as given; only the old name goes through util_name. */
+
+ if ((ret = session->rename(session, uri, newuri, NULL)) != 0) {
+ fprintf(stderr, "%s: rename %s to %s: %s\n",
+ progname, uri, newuri, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) { /* Entered only through "goto err": turns any failure into a return of 1. */
+err: ret = 1;
+ }
+
+ if (uri != NULL)
+ free(uri);
+
+ return (ret);
+}
+
+static int
+usage(void) /* Print the rename usage line to stderr; always returns 1. */
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "rename uri newuri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_salvage.c b/src/third_party/wiredtiger/src/utilities/util_salvage.c
new file mode 100644
index 00000000000..386365d8875
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_salvage.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_salvage(WT_SESSION *session, int argc, char *argv[]) /* "salvage" command: run session->salvage on the named object; returns 0 on success, 1 on failure. */
+{
+ WT_DECL_RET;
+ int ch;
+ const char *force;
+ char *name;
+
+ force = NULL;
+ name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF)
+ switch (ch) {
+ case 'F': /* Passed to salvage as its configuration string. */
+ force = "force";
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the file name. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "file")) == NULL)
+ return (1);
+
+ if ((ret = session->salvage(session, name, force)) != 0) {
+ fprintf(stderr, "%s: salvage(%s): %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Verbose configures a progress counter, move to the next line. */
+ if (verbose)
+ printf("\n");
+
+ if (0) { /* Entered only through "goto err": turns any failure into a return of 1. */
+err: ret = 1;
+ }
+
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+static int
+usage(void) /* Print the salvage usage line to stderr; always returns 1. */
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "salvage [-F] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_stat.c b/src/third_party/wiredtiger/src/utilities/util_stat.c
new file mode 100644
index 00000000000..caac560e839
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_stat.c
@@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_stat(WT_SESSION *session, int argc, char *argv[]) /* "stat" command: open a "statistics:" cursor for the connection or the named object and print each entry as "description=value"; returns 0 on success, 1 on failure. */
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ size_t urilen;
+ int all, ch, objname_free;
+ const char *pval, *desc;
+ char *objname, *uri;
+
+ all = objname_free = 0;
+ objname = uri = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "a")) != EOF)
+ switch (ch) {
+ case 'a': /* Open the cursor with "statistics=(all)". */
+ all = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /*
+ * If there are no arguments, the statistics cursor operates on the
+ * connection, otherwise, the optional remaining argument is a file
+ * or LSM name.
+ */
+ switch (argc) {
+ case 0:
+ objname = (char *)""; /* String literal: must not be freed, hence objname_free stays 0. */
+ break;
+ case 1:
+ if ((objname = util_name(*argv, "table")) == NULL)
+ return (1);
+ objname_free = 1;
+ break;
+ default:
+ return (usage());
+ }
+
+ urilen = strlen("statistics:") + strlen(objname) + 1;
+ if ((uri = calloc(urilen, 1)) == NULL) {
+ fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ goto err;
+ }
+ snprintf(uri, urilen, "statistics:%s", objname);
+
+ if ((ret = session->open_cursor(session, uri, NULL,
+ all ? "statistics=(all)" : NULL, &cursor)) != 0) {
+ fprintf(stderr, "%s: cursor open(%s) failed: %s\n",
+ progname, uri, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* List the statistics. */
+ while (
+ (ret = cursor->next(cursor)) == 0 &&
+ (ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0)
+ if (printf("%s=%s\n", desc, pval) < 0) {
+ ret = errno;
+ break;
+ }
+ if (ret == WT_NOTFOUND) /* WT_NOTFOUND from next() is the normal end of the walk. */
+ ret = 0;
+
+ if (ret != 0) {
+ fprintf(stderr, "%s: cursor get(%s) failed: %s\n",
+ progname, objname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) { /* Entered only through "goto err": turns any failure into a return of 1. */
+err: ret = 1;
+ }
+ if (objname_free)
+ free(objname);
+ free(uri);
+
+ return (ret);
+}
+
+static int
+usage(void) /* Print the stat usage line to stderr; always returns 1. The -a flag is optional, so it is shown in brackets. */
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "stat [-a] [uri]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_upgrade.c b/src/third_party/wiredtiger/src/utilities/util_upgrade.c
new file mode 100644
index 00000000000..b56caca2ccd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_upgrade.c
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_upgrade(WT_SESSION *session, int argc, char *argv[]) /* "upgrade" command: run session->upgrade on the named object; returns 0 on success, 1 on failure. */
+{
+ WT_DECL_RET;
+ int ch;
+ char *name;
+
+ name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the table name. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ if ((ret = session->upgrade(session, name, NULL)) != 0) {
+ fprintf(stderr, "%s: upgrade(%s): %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Verbose configures a progress counter, move to the next line. */
+ if (verbose)
+ printf("\n");
+
+ if (0) { /* Entered only through "goto err": turns any failure into a return of 1. */
+err: ret = 1;
+ }
+
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+static int
+usage(void) /* Print the upgrade usage line to stderr; always returns 1. */
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "upgrade uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_verbose.c b/src/third_party/wiredtiger/src/utilities/util_verbose.c
new file mode 100644
index 00000000000..12ff1c5463c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_verbose.c
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+/*
+ * __handle_error_verbose --
+ * Verbose WT_EVENT_HANDLER->handle_error implementation: send to stderr. Returns EIO if the write fails, 0 otherwise.
+ */
+static int
+__handle_error_verbose(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *errmsg)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+ WT_UNUSED(error); /* Only the message text is printed; the error code is not used. */
+
+ return (fprintf(stderr, "%s\n", errmsg) < 0 ? EIO : 0);
+}
+
+/*
+ * __handle_message_verbose --
+ * Verbose WT_EVENT_HANDLER->handle_message implementation: send to stdout. Returns EIO if the write fails, 0 otherwise.
+ */
+static int
+__handle_message_verbose(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *message)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+
+ return (printf("%s\n", message) < 0 ? EIO : 0);
+}
+
+/*
+ * __handle_progress_verbose --
+ * Verbose WT_EVENT_HANDLER->handle_progress implementation: print the operation name and progress counter to stdout, preceded by a carriage return and with no newline. Returns EIO if the write fails, 0 otherwise.
+ */
+static int
+__handle_progress_verbose(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *operation, uint64_t progress)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+
+ return (
+ printf("\r\t%s %-20" PRIu64, operation, progress) < 0 ? EIO : 0);
+}
+
+static WT_EVENT_HANDLER __event_handler_verbose = { /* Event handler table passed to wiredtiger_open when running verbose. */
+ __handle_error_verbose,
+ __handle_message_verbose,
+ __handle_progress_verbose,
+ NULL /* Close handler. */
+
+};
+
+WT_EVENT_HANDLER *verbose_handler = &__event_handler_verbose; /* Exported pointer to the table above. */
diff --git a/src/third_party/wiredtiger/src/utilities/util_verify.c b/src/third_party/wiredtiger/src/utilities/util_verify.c
new file mode 100644
index 00000000000..6ae5fdeec26
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_verify.c
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+#undef OPT_ARGS
+#undef USAGE_ARGS
+#ifdef HAVE_DIAGNOSTIC
+#define OPT_ARGS "d:"
+#define USAGE_ARGS \
+ "[-d dump_address | dump_blocks | dump_offsets=#,# | dump_pages] uri"
+#else
+#define OPT_ARGS ""
+#define USAGE_ARGS "uri"
+#endif
+
+int
+util_verify(WT_SESSION *session, int argc, char *argv[]) /* "verify" command: run session->verify on the named object, with optional dump settings in diagnostic builds; returns 0 on success, 1 on failure. */
+{
+ WT_DECL_RET;
+ size_t size;
+ int ch, dump_address, dump_blocks, dump_pages;
+ char *config, *dump_offsets, *name;
+
+ dump_address = dump_blocks = dump_pages = 0;
+ config = dump_offsets = name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, OPT_ARGS)) != EOF)
+ switch (ch) {
+ case 'd': /* Only accepted when OPT_ARGS includes "d:" (HAVE_DIAGNOSTIC builds). */
+ if (strcmp(__wt_optarg, "dump_address") == 0)
+ dump_address = 1;
+ else if (strcmp(__wt_optarg, "dump_blocks") == 0)
+ dump_blocks = 1;
+ else if (
+ WT_PREFIX_MATCH(__wt_optarg, "dump_offsets=")) {
+ if (dump_offsets != NULL) {
+ fprintf(stderr,
+ "%s: only a single 'dump_offsets' "
+ "argument supported\n", progname);
+ return (usage());
+ }
+ dump_offsets = /* Points into argv, just past the "dump_offsets=" prefix. */
+ __wt_optarg + strlen("dump_offsets=");
+ } else if (strcmp(__wt_optarg, "dump_pages") == 0)
+ dump_pages = 1;
+ else
+ return (usage());
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the table name. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ /* Build the configuration string as necessary. */
+ if (dump_address || dump_blocks || dump_offsets != NULL || dump_pages) {
+ size = /* Upper bound: every possible piece plus 20 bytes of slack. */
+ strlen("dump_address,") +
+ strlen("dump_blocks,") +
+ strlen("dump_pages,") +
+ strlen("dump_offsets[],") +
+ (dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20;
+ if ((config = malloc(size)) == NULL) {
+ (void)util_err(errno, NULL);
+ goto err;
+ }
+ snprintf(config, size,
+ "%s%s%s%s%s%s",
+ dump_address ? "dump_address," : "",
+ dump_blocks ? "dump_blocks," : "",
+ dump_offsets != NULL ? "dump_offsets=[" : "",
+ dump_offsets != NULL ? dump_offsets : "",
+ dump_offsets != NULL ? "]," : "",
+ dump_pages ? "dump_pages" : "");
+ }
+ if ((ret = session->verify(session, name, config)) != 0) { /* config stays NULL when no dump option was given. */
+ fprintf(stderr, "%s: verify(%s): %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Verbose configures a progress counter, move to the next line. */
+ if (verbose)
+ printf("\n");
+
+ if (0) { /* Entered only through "goto err": turns any failure into a return of 1. */
+err: ret = 1;
+ }
+
+ if (config != NULL)
+ free(config);
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+static int
+usage(void) /* Print the verify usage line (USAGE_ARGS depends on HAVE_DIAGNOSTIC) to stderr; always returns 1. */
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "verify %s\n",
+ progname, usage_prefix, USAGE_ARGS);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_write.c b/src/third_party/wiredtiger/src/utilities/util_write.c
new file mode 100644
index 00000000000..067b951c0cc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_write.c
@@ -0,0 +1,107 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_write(WT_SESSION *session, int argc, char *argv[]) /* "write" command: insert the values (with -a) or key/value pairs given on the command line into the named object; returns 0 on success, 1 on failure. */
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ uint64_t recno;
+ int append, ch, overwrite, rkey;
+ const char *uri;
+ char config[100];
+
+ append = overwrite = 0;
+ while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF)
+ switch (ch) {
+ case 'a': /* Open the cursor with append=true; arguments are values only. */
+ append = 1;
+ break;
+ case 'o': /* Open the cursor with overwrite=true. */
+ overwrite = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /*
+ * The remaining arguments are a uri followed by a list of values (if
+ * append is set), or key/value pairs (if append is not set).
+ */
+ if (append) {
+ if (argc < 2)
+ return (usage());
+ } else
+ if (argc < 3 || ((argc - 1) % 2 != 0))
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ /* Open the object. */
+ (void)snprintf(config, sizeof(config), "%s,%s",
+ append ? "append=true" : "", overwrite ? "overwrite=true" : "");
+ if ((ret = session->open_cursor(
+ session, uri, NULL, config, &cursor)) != 0)
+ return (util_err(ret, "%s: session.open", uri));
+
+ /*
+ * A simple write only makes sense if the key format is a string or a
+ * record number, and the value format is a single string.
+ */
+ if (strcmp(cursor->key_format, "r") != 0 &&
+ strcmp(cursor->key_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: write command only possible when the key format is "
+ "a record number or string\n",
+ progname);
+ return (1);
+ }
+ rkey = strcmp(cursor->key_format, "r") == 0 ? 1 : 0; /* Non-zero when keys are record numbers rather than strings. */
+ if (strcmp(cursor->value_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: write command only possible when the value format is "
+ "a string\n",
+ progname);
+ return (1);
+ }
+
+ /* Run through the values or key/value pairs; the pre-increment skips the uri first. */
+ while (*++argv != NULL) {
+ if (!append) {
+ if (rkey) {
+ if (util_str2recno(*argv, &recno))
+ return (1);
+ cursor->set_key(cursor, recno);
+ } else
+ cursor->set_key(cursor, *argv);
+ ++argv; /* Step from the key to its value; the argc check above guarantees pairs. */
+ }
+ cursor->set_value(cursor, *argv);
+
+ if ((ret = cursor->insert(cursor)) != 0) /* Name the operation that actually failed. */
+ return (util_cerr(uri, "insert", ret));
+ }
+
+ return (0);
+}
+
+static int
+usage(void) /* Print the write usage line to stderr; always returns 1. */
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "write [-ao] uri key ...\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/tools/stat_data.py b/src/third_party/wiredtiger/tools/stat_data.py
new file mode 100644
index 00000000000..75a3b577472
--- /dev/null
+++ b/src/third_party/wiredtiger/tools/stat_data.py
@@ -0,0 +1,66 @@
# DO NOT EDIT: automatically built by dist/stat.py.

# Statistic descriptions that the graphing tools report as-is instead of
# dividing by the sampling interval. Each description appears exactly once.
no_scale_per_second_list = [
    'async: maximum work queue length',
    'cache: tracked dirty bytes in the cache',
    'cache: bytes currently in the cache',
    'cache: maximum bytes configured',
    'cache: tracked dirty pages in the cache',
    'cache: pages currently held in the cache',
    'conn: files currently open',
    'log: total log buffer size',
    'LSM: App work units currently queued',
    'LSM: Merge work units currently queued',
    'LSM: Switch work units currently queued',
    'reconciliation: split bytes currently awaiting free',
    'reconciliation: split objects currently awaiting free',
    'session: open cursor count',
    'session: open session count',
    'txn: transaction checkpoint currently running',
    'txn: transaction range of IDs currently pinned',
    'block manager: file allocation unit size',
    'block manager: checkpoint size',
    'block manager: file magic number',
    'block manager: file major version number',
    'block manager: minor version number',
    'block manager: file size in bytes',
    'LSM: bloom filters in the LSM tree',
    'LSM: total size of bloom filters',
    'btree: column-store variable-size deleted values',
    'btree: column-store fixed-size leaf pages',
    'btree: column-store internal pages',
    'btree: column-store variable-size leaf pages',
    'btree: number of key/value pairs',
    'btree: fixed-record size',
    'btree: maximum tree depth',
    'btree: maximum internal page item size',
    'btree: maximum internal page size',
    'btree: maximum leaf page item size',
    'btree: maximum leaf page size',
    'btree: overflow pages',
    'btree: row-store internal pages',
    'btree: row-store leaf pages',
    'cache: overflow values cached in memory',
    'LSM: chunks in the LSM tree',
    'LSM: highest merge generation in the LSM tree',
    'reconciliation: maximum blocks required for a page',
]
# Statistic descriptions that the graphing tools plot as absolute values
# rather than as differences between samples. Each description appears
# exactly once.
no_clear_list = [
    'cache: bytes currently in the cache',
    'cache: maximum bytes configured',
    'cache: pages currently held in the cache',
    'conn: files currently open',
    'log: total log buffer size',
    'log: maximum log file size',
    'LSM: App work units currently queued',
    'LSM: Merge work units currently queued',
    'LSM: Switch work units currently queued',
    'reconciliation: split bytes currently awaiting free',
    'reconciliation: split objects currently awaiting free',
    'session: open cursor count',
    'session: open session count',
    'txn: transaction checkpoint currently running',
    'txn: transaction range of IDs currently pinned',
]
diff --git a/src/third_party/wiredtiger/tools/statlog.py b/src/third_party/wiredtiger/tools/statlog.py
new file mode 100644
index 00000000000..f32b46a9ec7
--- /dev/null
+++ b/src/third_party/wiredtiger/tools/statlog.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+import fileinput, os, shutil, sys, textwrap
+from collections import defaultdict
+from datetime import datetime
+from subprocess import call
+
+# Import the data describing which statistics should not be scaled
+from stat_data import no_scale_per_second_list
+
TIMEFMT = "%b %d %H:%M:%S"
reportno = 0


# Plot a set of entries for a title.
def plot(title, values):
    """Graph one statistic with gnuplot.

    title is the statistic's description; its first space-separated word is
    ignored when looking the statistic up in no_scale_per_second_list.  The
    caller may have wrapped it with literal "\\n" separators for gnuplot.

    values is a non-empty list of (timestamp, value) string pairs, with
    timestamps in TIMEFMT form.

    Nothing is written when every value is identical.  Otherwise the raw
    data goes to reports/raw/report.NNN.raw and the graph to
    reports/report.NNN.png, where NNN is a running report number.
    """
    global reportno

    # Ignore entries where the value never changes.
    t0, v0 = values[0]
    if all(v == v0 for t, v in values):
        print('skipping: ' + title)
        return

    print('building: ' + title)
    reportno += 1
    num = "%03d" % reportno

    # Undo the caller's line wrapping before the lookup, otherwise long
    # titles can never match an entry in the no-scale list.
    words = title.replace('\\n', ' ').split(' ')

    ylabel = 'Value'
    if words[1] != 'spinlock' and \
            ' '.join(words[1:]) in no_scale_per_second_list:
        seconds = 1
    else:
        # Scale by the interval between the first two samples.  Use the
        # full interval: timedelta.seconds alone discards whole days.
        t1, v1 = values[1]
        elapsed = (datetime.strptime(t1, TIMEFMT) -
                   datetime.strptime(t0, TIMEFMT))
        seconds = int(elapsed.total_seconds())
        if seconds <= 0:
            seconds = 1
        ylabel += ' per second'

    # Write the raw data into a file for processing.
    with open("reports/raw/report.%s.raw" % num, "w") as of:
        for t, v in sorted(values):
            of.write("%s %g\n" % (t, float(v) / seconds))

    # Write a command file for gnuplot.
    with open("gnuplot.cmd", "w") as of:
        of.write('''
set terminal png nocrop size 800,600
set autoscale
set grid
set style data linespoints
set title "%(title)s"
set xlabel "Time"
set xtics rotate by -45
set xdata time
set timefmt "%(timefmt)s"
set format x "%(timefmt)s"
set ylabel "%(ylabel)s"
set yrange [0:]
set output 'reports/report.%(num)s.png'
plot "reports/raw/report.%(num)s.raw" using 1:4 notitle''' % {
            'num' : num,
            'timefmt' : TIMEFMT,
            'title' : title,
            'ylabel' : ylabel,
        })

    # Run gnuplot, always removing the command file afterwards.
    try:
        call(["gnuplot", "gnuplot.cmd"])
    finally:
        os.remove("gnuplot.cmd")
+
# At least one statistics log file is required.
if not sys.argv[1:]:
    print("usage: " + sys.argv[0] + " file ...")
    sys.exit(1)

# Remove and re-create the reports folder.
shutil.rmtree("reports", True)
os.makedirs("reports/raw")

# Read the input into a dictionary of lists, keyed by description.  Each
# line is "month day time value description".
d = defaultdict(list)
for line in fileinput.input(sys.argv[1:]):
    month, day, time, v, desc = line.strip('\n').split(" ", 4)
    d[desc].append((" ".join((month, day, time)), v))

# Plot each entry in the dictionary, in description order, wrapping long
# descriptions with gnuplot line breaks.
for desc in sorted(d):
    plot('\\n'.join(textwrap.wrap(desc, 60)), d[desc])
+
diff --git a/src/third_party/wiredtiger/tools/wt_nvd3_util.py b/src/third_party/wiredtiger/tools/wt_nvd3_util.py
new file mode 100644
index 00000000000..6bf1396b0ff
--- /dev/null
+++ b/src/third_party/wiredtiger/tools/wt_nvd3_util.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+from datetime import datetime
+from nvd3 import lineChart
+
# Add a multiChart type so we can overlay line graphs
class multiChart(lineChart):
    """A lineChart whose single y axis is replaced by 'yAxis1' and 'yAxis2'."""

    def __init__(self, **kwargs):
        lineChart.__init__(self, **kwargs)

        # Both replacement axes share the caller's format (default '.02f').
        y_format = kwargs.get('y_axis_format', '.02f')

        # Fix the axes
        del self.axislist['yAxis']
        for axis_name in ('yAxis1', 'yAxis2'):
            self.create_y_axis(axis_name, format=y_format)
+
TIMEFMT = "%b %d %H:%M:%S"

thisyear = datetime.today().year


def parsetime(s):
    """Parse a TIMEFMT timestamp (no year) as a datetime in the current year.

    Raises ValueError if s does not match TIMEFMT or names a day that does
    not exist in the current year.
    """
    # Parse the year together with the rest of the timestamp.  strptime
    # defaults a missing year to 1900, which is not a leap year, so
    # "Feb 29" would be rejected before the year could be replaced.
    return datetime.strptime("%d %s" % (thisyear, s), "%Y " + TIMEFMT)
+
diff --git a/src/third_party/wiredtiger/tools/wtperf_graph.py b/src/third_party/wiredtiger/tools/wtperf_graph.py
new file mode 100644
index 00000000000..f45145cf801
--- /dev/null
+++ b/src/third_party/wiredtiger/tools/wtperf_graph.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+import csv, os, sys
+from subprocess import call
+# Python script to read wtperf monitor output and create a performance
+# graph.
+
+TIMEFMT = "%b %d %H:%M:%S"
+
def process_monitor(fname, sfx, ckptlist, opdict):
    """Scan a wtperf monitor file and graph its operations per second.

    fname is the monitor CSV file; sfx is appended to the output image name
    ('monitor<sfx>.png').

    ckptlist is emptied and then filled, in place, with the timestamps at
    which the checkpoint column changed value: alternating start and stop
    times, closed with the last data row's time if a checkpoint was still
    running at the end of the file.

    opdict maps 'read', 'insert' and 'update' to 0; an entry is set to 1 once
    a non-zero count for that operation is seen.
    """
    # Read the monitor file and figure out when a checkpoint was running.
    in_ckpt = 'N'

    # Empty the caller's list in place.  Rebinding the name to a new list
    # would leave the caller's list empty, and the caller passes it on to
    # the latency plots to shade checkpoints.
    del ckptlist[:]

    ofname = 'monitor%s.png' % (sfx)
    # Monitor output format currently is:
    # time,totalsec,read,insert,update,ckpt,...latencies...
    ops = ('read', 'insert', 'update')
    csvcol = (2, 3, 4)
    last_time = None
    with open(fname, 'r') as csvfile:
        for row in csv.reader(csvfile):
            # Skip blank lines and comments.
            if not row or row[0].lstrip().startswith('#'):
                continue
            # Look for checkpoints and operations.
            if row[5] != in_ckpt:
                ckptlist.append(row[0])
                in_ckpt = row[5]
            for op, col in zip(ops, csvcol):
                if row[col] != '0' and opdict[op] == 0:
                    opdict[op] = 1
            last_time = row[0]

    # Close a checkpoint that was still running at the last data row.
    if in_ckpt == 'Y':
        ckptlist.append(last_time)

    # Graph time vs. read, insert and update operations per second.
    gcmd = "gnuplot.mon.cmd"
    with open(gcmd, "w") as of:
        of.write('''
set autoscale
set datafile sep ','
set grid
set style data lines
set terminal png nocrop size 800,600
set timefmt "%(TIMEFMT)s"
set title "read, insert and update operations per second"
set format x "%(TIMEFMT)s"
set xlabel "Time"
set xtics rotate by -45
set xdata time
set ylabel "Operations per second (thousands)"
set yrange [0:]\n''' % {
            'TIMEFMT' : TIMEFMT
        })
        # Shade each checkpoint; zipping the iterator with itself pairs up
        # consecutive (start, stop) entries.
        it = iter(ckptlist)
        for start, stop in zip(it, it):
            of.write("set object rectangle from first '%s', graph 0 "
                     "to first '%s', graph 1 fc rgb \"gray\" back\n"
                     % (start, stop))
        of.write('set output "%s"\n' % (ofname))
        of.write("""plot "{name}" using 1:($3/1000) title "Reads", \\
    "{name}" using 1:($4/1000) title "Inserts",\\
    "{name}" using 1:($5/1000) title "Updates"
    """.format(name=fname))

    # Run gnuplot, always removing the command file afterwards.
    try:
        call(["gnuplot", gcmd])
    finally:
        os.remove(gcmd)
+
# Graph time vs. average, minimum, maximum latency for an operation.
def plot_latency_operation(name, fname, sfx, ckptlist, col_avg, col_min, col_max):
    """Write '<name><sfx>.latency1.png' from the monitor file using gnuplot.

    name is the operation name, fname the monitor CSV file, and col_avg,
    col_min and col_max the CSV columns to plot.  ckptlist holds
    alternating start/stop timestamps that are drawn as gray rectangles.
    """
    gcmd = "gnuplot." + name + ".l1.cmd"
    ofname = name + sfx + '.latency1.png'

    preamble = '''
set autoscale
set datafile sep ','
set grid
set style data lines
set terminal png nocrop size 800,600
set timefmt "%(TIMEFMT)s"
set title "%(NAME)s: average, minimum and maximum latency"
set format x "%(TIMEFMT)s"
set xlabel "Time"
set xtics rotate by -45
set xdata time
set ylabel "Latency (us)"
set logscale y
set yrange [1:]\n''' % {
        'NAME' : name,
        'TIMEFMT' : TIMEFMT
    }

    # One curve per latency column, all read from the same file.
    series = (
        ('Average', col_avg),
        ('Minimum', col_min),
        ('Maximum', col_max),
    )
    curves = ', '.join(
        '"%s" using 1:($%r) title "%s Latency"' % (fname, column, label)
        for label, column in series)

    with open(gcmd, "w") as of:
        of.write(preamble)
        # Consecutive list entries form (start, stop) pairs.
        marks = iter(ckptlist)
        for start, stop in zip(marks, marks):
            of.write("set object rectangle from first '%s', graph 0  "
                     "to first '%s', graph 1 fc rgb \"gray\" back\n"
                     % (start, stop))
        of.write('set output "%s"\n' % ofname)
        of.write('plot ' + curves + '\n')

    call(["gnuplot", gcmd])
    os.remove(gcmd)
+
+
# Graph latency vs. % operations
def plot_latency_percent(name, dirname, sfx, ckptlist):
    """Write '<name><sfx>.latency2.png', the latency distribution graph.

    The data comes from '<dirname>/latency.<name><sfx>'; nothing is done if
    that file does not exist.  ckptlist is accepted for symmetry with the
    other plot functions and is not used.
    """
    # Check for the file that is actually plotted, suffix included.
    lfile = os.path.join(dirname, 'latency.' + name) + sfx
    if not os.path.exists(lfile):
        return
    gcmd = "gnuplot." + name + ".l2.cmd"
    ofname = name + sfx + '.latency2.png'
    with open(gcmd, "w") as of:
        of.write('''
set autoscale
set datafile sep ','
set grid
set style data points
set terminal png nocrop size 800,600\n''')
        of.write('set title "' + name + ': latency distribution"\n')
        # This string is written verbatim (no % formatting), so the percent
        # sign must not be doubled.
        of.write('''
set xlabel "Latency (us)"
set xrange [1:]
set xtics rotate by -45
set logscale x
set ylabel "% operations"
set yrange [0:]\n''')
        of.write('set output "' + ofname + '"\n')
        # Plot against column 1 so the x axis matches its "Latency (us)"
        # label; with a single "using" entry gnuplot plots against the row
        # number instead.
        # NOTE(review): assumes column 1 of the latency file is the latency
        # in microseconds, as the cumulative plot does -- confirm.
        of.write('plot "' + lfile +
                 '" using 1:(($2 * 100)/$4) title "' + name + '"\n')

    # Run gnuplot, always removing the command file afterwards.
    try:
        call(["gnuplot", gcmd])
    finally:
        os.remove(gcmd)
+
+
# Graph latency vs. % operations (cumulative)
def plot_latency_cumulative_percent(name, dirname, sfx, ckptlist):
    """Write '<name><sfx>.latency3.png', the cumulative latency graph.

    The data comes from '<dirname>/latency.<name><sfx>'; nothing is done if
    that file does not exist.  ckptlist is accepted for symmetry with the
    other plot functions and is not used.
    """
    # Check for the file that is actually plotted, suffix included.
    lfile = os.path.join(dirname, 'latency.' + name) + sfx
    if not os.path.exists(lfile):
        return
    # Latency plot: cumulative operations vs. latency
    gcmd = "gnuplot." + name + ".l3.cmd"
    ofname = name + sfx + '.latency3.png'
    with open(gcmd, "w") as of:
        of.write('''
set autoscale
set datafile sep ','
set grid
set style data lines
set terminal png nocrop size 800,600
set title "%(NAME)s: cumulative latency distribution"
set xlabel "Latency (us)"
set xrange [1:]
set xtics rotate by -45
set logscale x
set ylabel "%% operations"
set yrange [0:]\n''' % {
            'NAME' : name
        })
        of.write('set output "' + ofname + '"\n')
        of.write('plot "' + lfile +
                 '" using 1:(($3 * 100)/$4) title "' + name + '"\n')

    # Run gnuplot, always removing the command file afterwards.
    try:
        call(["gnuplot", gcmd])
    finally:
        os.remove(gcmd)
+
def process_file(fname):
    """Generate every graph for one wtperf monitor file."""
    ckptlist = []
    # NOTE: The operations below must be in this exact order to match
    # the operation latency output in the monitor file.  A tuple fixes
    # that order; iterating the dictionary would not guarantee it.
    ops = ('read', 'insert', 'update')
    opdict = dict.fromkeys(ops, 0)

    # This assumes the monitor file has the string "monitor"
    # and any other (optional) characters in the filename are a suffix.
    sfx = os.path.basename(fname).replace('monitor','')
    dirname = os.path.dirname(fname)

    process_monitor(fname, sfx, ckptlist, opdict)
    column = 7      # average, minimum, maximum start in column 7
    for k in ops:
        if opdict[k] != 0:
            plot_latency_operation(
                k, fname, sfx, ckptlist, column, column + 1, column + 2)
            plot_latency_percent(k, dirname, sfx, ckptlist)
            plot_latency_cumulative_percent(k, dirname, sfx, ckptlist)
        else:
            print(fname + ': no ' + k + ' operations found. Skip.')
        # Each operation owns three consecutive latency columns.
        column = column + 3
+
def main():
    """Process each monitor file named on the command line.

    This program takes a list of monitor files generated by wtperf.  If no
    args are given, it looks for a single file named 'monitor'.
    """
    monitor_files = sys.argv[1:]
    if not monitor_files:
        monitor_files = ['monitor']
    for monitor_file in monitor_files:
        process_file(monitor_file)

if __name__ == '__main__':
    main()
diff --git a/src/third_party/wiredtiger/tools/wtperf_stats.py b/src/third_party/wiredtiger/tools/wtperf_stats.py
new file mode 100644
index 00000000000..6f2f6dda682
--- /dev/null
+++ b/src/third_party/wiredtiger/tools/wtperf_stats.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
import csv
import operator
import os
# sys is needed by the import-failure handler below; without it the handler
# itself would raise NameError.
import sys
from time import mktime

try:
    from wt_nvd3_util import multiChart, parsetime
except ImportError:
    sys.stderr.write("Could not import wt_nvd3_util.py, it should be "
                     "in the same directory as %s\n" % sys.argv[0])
    sys.exit(-1)
+
def timesort(s):
    """Sort key for a monitor row: its '#time' field parsed by parsetime()."""
    # Sort on the parsed value rather than the raw string: times are only
    # %b %d %H:%M:%S, so the strings may improperly sort if the data
    # crosses a month boundary.
    return parsetime(s['#time'])
+
# Fixup the names and values in the rows read in from a csv file.  One
# field must be "#time" - which is used to calculate the interval.
def munge_dict(values_dict, abstime):
    """Convert monitor rows into plottable series data.

    values_dict is an iterable of row dictionaries (for example a
    csv.DictReader), each with a '#time' field in parsetime() format.  The
    rows' '#time' entries are overwritten with numbers as a side effect.

    If abstime is true, times become milliseconds since the epoch,
    otherwise seconds since the earliest row.

    Returns a list with one dictionary per row, in time order, with renamed
    fields and numeric values.  Fields whose value is the same in every row
    are removed.  An empty input gives an empty list.
    """
    sorted_values = sorted(values_dict, key=timesort)
    if not sorted_values:
        return []
    start_time = parsetime(sorted_values[0]['#time'])

    ret = []
    for v in sorted_values:
        when = parsetime(v['#time'])
        if abstime:
            # Build the time series, milliseconds since the epoch
            v['#time'] = int(mktime(when.timetuple())) * 1000
        else:
            # Build the time series as seconds since the start of the
            # data.  Use the full interval: timedelta.seconds alone wraps
            # around after 24 hours.
            v['#time'] = int((when - start_time).total_seconds())
        next_val = {}
        for title, value in v.items():
            if title.find('uS') != -1:
                title = title.replace('uS', 'ms')
                value = float(value) / 1000
            if title == 'totalsec':
                value = 0
            if title == 'checkpoints':
                # Make the flag numeric so it can be plotted.
                # NOTE(review): assumes the column only holds 'Y'/'N'.
                if value == 'N':
                    value = 0
                elif value == 'Y':
                    value = 1
            elif title.find('time') != -1:
                title = 'time'
            elif title.find('latency') == -1:
                title = title + ' (thousands)'
                value = float(value) / 1000
            next_val[title] = value
        ret.append(next_val)

    # After building the series, eliminate constants.  Iterate over a copy
    # of the first row's items because that row is modified in the loop.
    for key, first in list(ret[0].items()):
        if all(row[key] == first for row in ret):
            for row in ret:
                del row[key]

    return ret
+
def addPlotsToChart(chart, graph_data, wtstat_chart = False):
    """Add one line series to chart for every non-time field.

    graph_data is a non-empty list of dictionaries sharing the same keys,
    one of which is 'time'.  Series are added in sorted field order.  Unless
    wtstat_chart is true, fields without 'latency' in their name go on y
    axis "2"; everything else goes on y axis "1".
    """
    # The times are the same for all lines.
    times = [sample['time'] for sample in graph_data]

    for field in sorted(graph_data[0].keys()):
        if field == 'time':
            continue
        # Split the latency and non-latency measurements onto different
        # scales.
        on_first_axis = wtstat_chart or field.find('latency') != -1
        axis = "1" if on_first_axis else "2"
        ydata = [sample[field] for sample in graph_data]
        chart.add_serie(x=times, y=ydata, name=field, type="line", yaxis=axis)
+
# Input parameters are a chart populated with WiredTiger statistics and
# the directory where the wtperf monitor file can be found.
def addPlotsToStatsChart(chart, dirname, abstime):
    """Add the series from '<dirname>/monitor' to an existing chart.

    abstime is passed to munge_dict() to select absolute or relative times.
    Every series goes on y axis "1".  If the monitor file cannot be opened,
    a message is written to stderr and the program exits with status -1.
    """
    # Imported here because the error path needs sys and this module does
    # not import it at the top.
    import sys

    fname = os.path.join(dirname, 'monitor')
    try:
        with open(fname, 'rb') as csvfile:
            reader = csv.DictReader(csvfile)
            # Transform the data into something NVD3 can digest
            graph_data = munge_dict(reader, abstime)
    except IOError:
        sys.stderr.write("Could not open wtperf monitor file.\n")
        sys.exit(-1)
    addPlotsToChart(chart, graph_data, True)
+
def main():
    """Build an HTML chart from a single wtperf monitor file."""
    # Parse the command line
    import argparse
    # Imported here because this module does not import sys at the top.
    import sys

    parser = argparse.ArgumentParser(description='Create graphs from WiredTiger statistics.')
    parser.add_argument('--abstime', action='store_true',
        help='use absolute time on the x axis')
    parser.add_argument('--output', '-o', metavar='file',
        default='wtperf_stats.html', help='HTML output file')
    parser.add_argument('files', metavar='file', nargs='+',
        help='input monitor file generated by WiredTiger wtperf application')
    args = parser.parse_args()

    # Validate before touching the output file, so that a usage error does
    # not create or truncate it.
    if len(args.files) != 1:
        print('Script currently only supports a single monitor file')
        sys.exit(1)

    chart_extra = {}
    # Add in the x axis if the user wants time.
    if args.abstime:
        chart_extra['x_axis_format'] = '%H:%M:%S'

    for f in args.files:
        with open(f, 'rb') as csvfile:
            reader = csv.DictReader(csvfile)
            # Transform the data into something NVD3 can digest
            graph_data = munge_dict(reader, args.abstime)

    chart = multiChart(name='wtperf',
                       height=450 + 10*len(graph_data[0].keys()),
                       resize=True,
                       x_is_date=args.abstime,
                       assets_directory='http://source.wiredtiger.com/graphs/',
                       **chart_extra)

    addPlotsToChart(chart, graph_data)

    chart.buildhtml()

    # Open the output only once there is content to write; the with block
    # closes it even if the write fails.
    with open(args.output, 'w') as output_file:
        output_file.write(chart.htmlcontent)

if __name__ == '__main__':
    main()
+
diff --git a/src/third_party/wiredtiger/tools/wtstats.py b/src/third_party/wiredtiger/tools/wtstats.py
new file mode 100644
index 00000000000..371af6b4f1a
--- /dev/null
+++ b/src/third_party/wiredtiger/tools/wtstats.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+import fileinput, os, re, shutil, sys, textwrap
+from collections import defaultdict
+from time import mktime
+from subprocess import call
+
+try:
+ from stat_data import no_scale_per_second_list, no_clear_list
+except ImportError:
+ print >>sys.stderr, "Could not import stat_data.py, it should be\
+ in the same directory as %s" % sys.argv[0]
+ sys.exit(-1)
+
+try:
+ from wtperf_stats import addPlotsToStatsChart
+except ImportError:
+ print >>sys.stderr, "Could not import wtperf_stats.py, it should be\
+ in the same directory as %s" % sys.argv[0]
+ sys.exit(-1)
+
+try:
+ from wt_nvd3_util import multiChart, parsetime
+except ImportError:
+ print >>sys.stderr, "Could not import wt_nvd3_util.py, it should be\
+ in the same directory as %s" % sys.argv[0]
+ sys.exit(-1)
+
+try:
+ from nvd3 import lineChart, lineWithFocusChart
+except ImportError:
+ print >>sys.stderr, "Could not import nvd3. Please install it *from source* (other versions may be missing features that we rely on). Run these commands: git clone https://github.com/areski/python-nvd3.git ; cd python-nvd3 ; sudo python setup.py install"
+ sys.exit(-1)
+
# Convert the entries for a title into a series label and y-axis data.
def munge(title, values):
    """Convert one statistic's samples into a label and plottable data.

    title is the statistic's description; its first space-separated word is
    ignored when looking it up in the no-scale and no-clear lists.  values
    is a list of (timestamp, value) string pairs with at least two entries
    for statistics that are scaled per second.

    Returns (ylabel, ydata).  ylabel is the lower-cased title, with
    ' per second' appended when values are scaled by the interval between
    the first two samples.  ydata maps x-axis positions (milliseconds since
    the epoch if args.abstime is set, otherwise seconds since the earliest
    sample) to values.  Unless the statistics were gathered with clear set
    (args.clear) or the statistic is in no_clear_list, each value is the
    difference from the previous sample, never less than zero.
    """
    # Order the samples by parsed time.  Sorting the raw strings puts
    # "Apr 01" before "Mar 31", which corrupts the sample-to-sample
    # differences whenever a run crosses a month boundary.
    timeline = sorted(((parsetime(t), v) for t, v in values),
                      key=lambda sample: sample[0])
    start_time = timeline[0][0]

    ylabel = title.lower()
    if title.split(' ')[1] != 'spinlock' and \
            title.split(' ', 1)[1] in no_scale_per_second_list:
        seconds = 1
    else:
        # Use the full interval: timedelta.seconds alone discards days.
        seconds = int((timeline[1][0] - start_time).total_seconds())
        ylabel += ' per second'
        if seconds <= 0:
            seconds = 1

    stats_cleared = bool(
        args.clear or title.split(' ', 1)[1] in no_clear_list)

    # Split the values into a dictionary of y-axis values keyed by the x axis
    ydata = {}
    last_value = 0.0
    for when, v in timeline:
        if args.abstime:
            # Build the time series, milliseconds since the epoch
            x = int(mktime(when.timetuple())) * 1000
        else:
            # Build the time series as seconds since the start of the
            # data; timedelta.seconds alone would wrap after 24 hours and
            # later samples would overwrite earlier ones.
            x = int((when - start_time).total_seconds())

        float_v = float(v)
        if not stats_cleared:
            float_v = float_v - last_value
            # Sometimes WiredTiger stats go backwards without clear, assume
            # that means nothing happened
            if float_v < 0:
                float_v = 0.0
            last_value = float(v)
        ydata[x] = float_v / seconds

    return ylabel, ydata
+
# Parse the command line
import argparse

parser = argparse.ArgumentParser(description='Create graphs from WiredTiger statistics.')
parser.add_argument('--abstime', action='store_true',
    help='use absolute time on the x axis')
parser.add_argument('--clear', action='store_true',
    help='WiredTiger stats gathered with clear set')
parser.add_argument('--focus', action='store_true',
    help='generate a chart with focus slider')
parser.add_argument('--include', '-I', metavar='regexp',
    type=re.compile, action='append',
    help='include series with titles matching the specified regexp')
parser.add_argument('--list', action='store_true',
    help='list the series that would be displayed')
parser.add_argument('--output', '-o', metavar='file', default='wtstats.html',
    help='HTML output file')
parser.add_argument('--right', '-R', metavar='regexp',
    type=re.compile, action='append',
    help='use the right axis for series with titles matching the specified regexp')
parser.add_argument('--wtperf', '-w', action='store_true',
    help='Plot wtperf statistics on the same graph')
parser.add_argument('files', metavar='file', nargs='+',
    help='input files generated by WiredTiger statistics logging')
args = parser.parse_args()

# Reject --focus combined with --right: focus charts cannot have a
# right-hand y-axis.
if args.focus and args.right:
    sys.stderr.write("focus charts cannot have a right-hand y-axis\n")
    sys.exit(-1)

# Don't require users to specify regexps twice for right axis
if args.include and args.right:
    args.include += args.right
+
# Read the input file(s) into a dictionary of lists, keyed by title.  Each
# line is "month day time value title".  The with block closes every input
# file as soon as it has been read.
d = defaultdict(list)
for f in args.files:
    with open(f, 'rU') as statfile:
        for line in statfile:
            month, day, time, v, title = line.strip('\n').split(" ", 4)
            d[title].append((month + " " + day + " " + time, v))

# Process the series, eliminate constants.  Iterate over a sorted copy of
# the items because entries are deleted from the dictionary in the loop.
for title, values in sorted(d.items()):
    t0, v0 = values[0]
    if all(v == v0 for t, v in values):
        #print "Skipping", title
        del d[title]
+
# Common prefix / suffix elimination
prefix = suffix = None

def common_prefix(a, b):
    """Return the longest leading part of a that b also starts with."""
    keep = len(a)
    while not b.startswith(a[:keep]):
        keep -= 1
    return a[:keep]

def common_suffix(a, b):
    """Return the longest trailing part of b that a also ends with."""
    skip = 0
    while not a.endswith(b[skip:]):
        skip += 1
    return b[skip:]
+
# Split out the data, convert timestamps
results = []
for title, values in sorted(d.items()):
    title, ydata = munge(title, values)
    # Ignore entries if a list of regular expressions was given
    if args.include and not any(r.search(title) for r in args.include):
        continue
    # Truthy when the series belongs on the right-hand axis.
    yaxis = args.right and [r for r in args.right if r.search(title)]
    prefix = title if prefix is None else common_prefix(prefix, title)
    suffix = title if suffix is None else common_suffix(title, suffix)
    results.append((title, yaxis, ydata))

# Process titles, eliminate common prefixes and suffixes
if prefix or suffix:
    new_results = []
    for title, yaxis, ydata in results:
        short = title[len(prefix):]
        if suffix:
            short = short[:-len(suffix)]
        # Keep the full title if trimming would leave nothing.  With a
        # single series the title is both the prefix and the suffix, and
        # the series would otherwise end up with no name.
        new_results.append((short or title, yaxis, ydata))
    results = new_results
+
+# Dump the results as a CSV file
+#print '"time", ' + ', '.join('"%s"' % title for title, values in ydata)
+#for i in xrange(len(xdata)):
+# print '%d, %s' % (xdata[i], ', '.join('%g' % values[i] for title, values in ydata))
+
# Are we just listing the results?
if args.list:
    for title, yaxis, ydata in results:
        print(title)
    sys.exit(0)

# Figure out the full set of x axis values: the union of the x positions of
# every series.  The outer clause must iterate over the results and the
# inner one over that series' keys; in the opposite order only the keys of
# whichever series was processed last are collected.
xdata = sorted(set(x for title, yaxis, ydata in results for x in ydata))
+
# Pick the chart type: a right-hand axis needs the two-axis chart.
if args.right:
    charttype = multiChart
elif args.focus:
    charttype = lineWithFocusChart
else:
    charttype = lineChart

chart_extra = {}
# Add in the x axis if the user wants time.
if args.abstime:
    chart_extra['x_axis_format'] = '%H:%M:%S'

# Create the chart, add the series
chart = charttype(name='statlog', height=450+10*len(results), resize=True,
    x_is_date=args.abstime, y_axis_format='g',
    assets_directory='http://source.wiredtiger.com/graphs/', **chart_extra)

for title, yaxis, ydata in results:
    # A series has no sample at some x positions; plot those as zero.
    chart.add_serie(x=xdata, y=[ydata.get(x, 0) for x in xdata], name=title,
        type="line", yaxis="2" if yaxis else "1")

if args.wtperf:
    addPlotsToStatsChart(chart, os.path.dirname(args.files[0]), args.abstime)

chart.buildhtml()

# Open the output file only once there is content to write; the with block
# closes it even if the write fails.
with open(args.output, 'w') as output_file:
    output_file.write(chart.htmlcontent)