summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.hgtags1
-rw-r--r--NEWS60
-rw-r--r--README4
-rw-r--r--RELEASE4
-rw-r--r--bench/wtperf/wtperf.c42
-rw-r--r--build_posix/Make.subdirs1
-rw-r--r--build_posix/aclocal/ax_check_class.m4144
-rw-r--r--build_posix/aclocal/ax_check_junit.m472
-rw-r--r--build_posix/aclocal/ax_java_options.m448
-rw-r--r--build_posix/aclocal/ax_jni_include_dir.m4120
-rw-r--r--build_posix/aclocal/ax_prog_jar.m452
-rw-r--r--build_posix/aclocal/ax_prog_java.m4115
-rw-r--r--build_posix/aclocal/ax_prog_java_works.m4134
-rw-r--r--build_posix/aclocal/ax_prog_javac.m479
-rw-r--r--build_posix/aclocal/ax_prog_javac_works.m472
-rw-r--r--build_posix/aclocal/ax_try_compile_java.m455
-rw-r--r--build_posix/aclocal/options.m416
-rw-r--r--build_posix/aclocal/version-set.m48
-rw-r--r--build_posix/aclocal/version.m42
-rw-r--r--build_posix/configure.ac.in31
-rw-r--r--dist/api_config.py2
-rw-r--r--dist/api_data.py106
-rw-r--r--dist/filelist8
-rw-r--r--dist/flags.py2
-rw-r--r--dist/java_doc.py43
-rw-r--r--dist/s_all1
-rw-r--r--dist/s_copyright.list13
-rw-r--r--dist/s_define.list3
-rwxr-xr-xdist/s_docs23
-rw-r--r--dist/s_funcs.list9
-rwxr-xr-xdist/s_release16
-rw-r--r--dist/s_release.list1
-rw-r--r--dist/s_string.ok17
-rw-r--r--dist/s_style49
-rw-r--r--dist/s_symbols.list11
-rw-r--r--dist/s_typedef11
-rw-r--r--dist/stat.py22
-rw-r--r--dist/stat_data.py36
-rw-r--r--examples/c/ex_all.c147
-rw-r--r--examples/java/Makefile.am21
-rw-r--r--examples/java/com/wiredtiger/examples/ex_access.java53
-rw-r--r--ext/compressors/bzip2/bzip2_compress.c13
-rw-r--r--lang/java/Makefile.am75
-rw-r--r--lang/java/java_doc.i41
-rw-r--r--lang/java/src/com/wiredtiger/db/PackFormatInputStream.java165
-rw-r--r--lang/java/src/com/wiredtiger/db/PackInputStream.java320
-rw-r--r--lang/java/src/com/wiredtiger/db/PackOutputStream.java244
-rw-r--r--lang/java/src/com/wiredtiger/db/PackUtil.java49
-rw-r--r--lang/java/src/com/wiredtiger/db/WiredTigerException.java19
-rw-r--r--lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java21
-rw-r--r--lang/java/wiredtiger.i823
-rw-r--r--lang/python/setup.py10
-rw-r--r--lang/python/wiredtiger.i4
-rw-r--r--src/block/block_ckpt.c35
-rw-r--r--src/block/block_ext.c204
-rw-r--r--src/block/block_map.c63
-rw-r--r--src/block/block_mgr.c39
-rw-r--r--src/block/block_open.c58
-rw-r--r--src/block/block_read.c17
-rw-r--r--src/block/block_slvg.c2
-rw-r--r--src/block/block_vrfy.c6
-rw-r--r--src/block/block_write.c30
-rw-r--r--src/bloom/bloom.c4
-rw-r--r--src/btree/bt_curnext.c13
-rw-r--r--src/btree/bt_curprev.c13
-rw-r--r--src/btree/bt_discard.c19
-rw-r--r--src/btree/bt_evict.c375
-rw-r--r--src/btree/bt_handle.c223
-rw-r--r--src/btree/bt_huffman.c9
-rw-r--r--src/btree/bt_ovfl.c2
-rw-r--r--src/btree/bt_page.c278
-rw-r--r--src/btree/bt_read.c5
-rw-r--r--src/btree/bt_slvg.c44
-rw-r--r--src/btree/bt_stat.c68
-rw-r--r--src/btree/bt_sync.c5
-rw-r--r--src/btree/bt_walk.c115
-rw-r--r--src/btree/col_srch.c13
-rw-r--r--src/btree/rec_evict.c70
-rw-r--r--src/btree/rec_merge.c538
-rw-r--r--src/btree/rec_track.c28
-rw-r--r--src/btree/rec_write.c328
-rw-r--r--src/btree/row_key.c22
-rw-r--r--src/btree/row_modify.c16
-rw-r--r--src/btree/row_srch.c23
-rw-r--r--src/config/config.c38
-rw-r--r--src/config/config_check.c5
-rw-r--r--src/config/config_def.c68
-rw-r--r--src/conn/conn_api.c54
-rw-r--r--src/conn/conn_cache.c33
-rw-r--r--src/conn/conn_cache_pool.c50
-rw-r--r--src/conn/conn_ckpt.c160
-rw-r--r--src/conn/conn_dhandle.c115
-rw-r--r--src/conn/conn_handle.c7
-rw-r--r--src/conn/conn_open.c16
-rw-r--r--src/conn/conn_stat.c318
-rw-r--r--src/cursor/cur_file.c3
-rw-r--r--src/cursor/cur_index.c8
-rw-r--r--src/cursor/cur_stat.c9
-rw-r--r--src/cursor/cur_table.c11
-rw-r--r--src/docs/bulk-load.dox23
-rw-r--r--src/docs/cache-configuration.dox6
-rw-r--r--src/docs/community.dox23
-rw-r--r--src/docs/cursors.dox10
-rw-r--r--src/docs/data_sources.dox12
-rw-r--r--src/docs/install.dox2
-rw-r--r--src/docs/license.dox26
-rw-r--r--src/docs/namespace.dox2
-rw-r--r--src/docs/programming.dox16
-rw-r--r--src/docs/spell.ok9
-rw-r--r--src/docs/statistics.dox74
-rw-r--r--src/docs/style/DoxygenLayout.xml1
-rw-r--r--src/docs/top/main.dox5
-rw-r--r--src/docs/tuning.dox175
-rw-r--r--src/docs/upgrading.dox11
-rw-r--r--src/include/api.h2
-rw-r--r--src/include/block.h17
-rw-r--r--src/include/btmem.h67
-rw-r--r--src/include/btree.h18
-rw-r--r--src/include/btree.i139
-rw-r--r--src/include/cache.h8
-rw-r--r--src/include/cache.i3
-rw-r--r--src/include/cell.i19
-rw-r--r--src/include/config.h2
-rw-r--r--src/include/connection.h32
-rw-r--r--src/include/cursor.h2
-rw-r--r--src/include/cursor.i20
-rw-r--r--src/include/error.h26
-rw-r--r--src/include/extern.h95
-rw-r--r--src/include/flags.h12
-rw-r--r--src/include/lsm.h3
-rw-r--r--src/include/misc.h12
-rw-r--r--src/include/mutex.h2
-rw-r--r--src/include/os.h6
-rw-r--r--src/include/packing.i40
-rw-r--r--src/include/schema.h17
-rw-r--r--src/include/session.h3
-rw-r--r--src/include/stat.h62
-rw-r--r--src/include/txn.h11
-rw-r--r--src/include/txn.i2
-rw-r--r--src/include/wiredtiger.in433
-rw-r--r--src/include/wt_internal.h2
-rw-r--r--src/lsm/lsm_cursor.c41
-rw-r--r--src/lsm/lsm_meta.c8
-rw-r--r--src/lsm/lsm_stat.c7
-rw-r--r--src/lsm/lsm_tree.c25
-rw-r--r--src/lsm/lsm_worker.c2
-rw-r--r--src/meta/meta_table.c2
-rw-r--r--src/meta/meta_turtle.c2
-rw-r--r--src/os_posix/os_alloc.c69
-rw-r--r--src/os_posix/os_fsync.c6
-rw-r--r--src/os_posix/os_open.c17
-rw-r--r--src/os_posix/os_remove.c38
-rw-r--r--src/os_posix/os_rw.c24
-rw-r--r--src/packing/pack_api.c (renamed from src/packing/packing_api.c)0
-rw-r--r--src/packing/pack_impl.c (renamed from src/packing/packing.c)9
-rw-r--r--src/packing/pack_stream.c288
-rw-r--r--src/schema/schema_create.c40
-rw-r--r--src/schema/schema_drop.c9
-rw-r--r--src/schema/schema_list.c47
-rw-r--r--src/schema/schema_open.c21
-rw-r--r--src/schema/schema_rename.c15
-rw-r--r--src/schema/schema_stat.c6
-rw-r--r--src/schema/schema_truncate.c5
-rw-r--r--src/schema/schema_worker.c9
-rw-r--r--src/session/session_api.c9
-rw-r--r--src/support/filename.c38
-rw-r--r--src/support/hazard.c4
-rw-r--r--src/support/scratch.c2
-rw-r--r--src/support/stat.c77
-rw-r--r--src/txn/txn.c77
-rw-r--r--src/txn/txn_ckpt.c150
-rw-r--r--src/utilities/util.h2
-rw-r--r--test/format/wts.c13
-rw-r--r--test/java/com/wiredtiger/test/CursorTest.java117
-rw-r--r--test/java/com/wiredtiger/test/PackTest.java246
-rw-r--r--test/java/com/wiredtiger/test/WiredTigerSuite.java41
-rw-r--r--test/suite/test_cursor_random.py2
-rw-r--r--test/suite/test_drop_create.py23
-rw-r--r--test/suite/test_reconfig.py44
-rw-r--r--test/suite/test_shared_cache.py10
-rw-r--r--tools/statlog.py95
181 files changed, 8291 insertions, 1797 deletions
diff --git a/.hgtags b/.hgtags
index dcc6cb9953f..85afb6fc0b5 100644
--- a/.hgtags
+++ b/.hgtags
@@ -18,3 +18,4 @@ df87effe7cd3239e3666a76312bae77b92090d98 1.3.4
8b91f84675fd67259b1f513e3f84786501cbc16c 1.3.6
27cec73582030254a2752cc3213bb89825dc5183 1.3.7
edc4643f811d706cbbb6400d048bf56602aed963 1.4.2
+aff8aabe571be6db68e8bf44bf7670df5d55d1ff 1.5.0
diff --git a/NEWS b/NEWS
index e5be4422af9..8c425ff0873 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,63 @@
+WiredTiger release 1.5.0, 2013-03-14
+------------------------------------
+
+This release contains some major new features along with numerous bug fixes
+and performance improvements. The significant changes are highlighted
+below:
+
+* Add a Java API.
+
+* Create a thread to do automatic checkpoints, configured by passing
+ "checkpoint=(wait=X)" to wiredtiger_open.
+
+* Add support for periodically logging statistics to a file and a tool to
+ generate graphs based on those logs. Configured by passing
+ "statistics_log=(wait=X)" to wiredtiger_open.
+
+* Several changes to minimize the impact of checkpoints on other threads.
+
+* When reading from checkpoints, use mmap by default.
+
+* Enhance eviction so that internal pages take up less space.
+
+* Add maximum filesystem buffer cache settings to wiredtiger_open called
+ "os_cache_max" and "os_cache_dirty_max". After doing the specified
+ amount of reads or writes, WiredTiger will call fadvise and/or
+ sync_file_range to drop pages from the filesystem cache. This is an
+ alternative to direct I/O with less impact on performance.
+
+* Make run-time statistics optional, defaulted to "off".
+
+* Change how we detect if shared cache is used. It used to rely on a name;
+ now shared cache is used if the shared_cache configuration option is included.
+
+* Add the ability to specify a per-connection reserved size for cache
+ pools. Ensure cache pool reconfiguration is honoured quickly.
+
+* Rework hazard pointer coupling during cursor walks to be more efficient.
+
+* Add a cache_eviction_walk statistic to track the pages we walk and a
+ cache_eviction_force statistic to track the count of pages queued for
+ forced eviction.
+
+* Fixes to reduce the number of operations on shared data that were causing
+ bottlenecks in read-only workloads.
+
+* Add streaming pack / unpack to the API.
+
+* Add some basic reconciliation stats to the connection stats.
+
+* In LSM, keep trying to switch if there is an error: it may be transient.
+
+* Minor cleanup and enhancement of the reconciliation statistics, and add a
+ set of compression statistics; both are part of the data-source statistics.
+
+* Compaction cannot run at the same time as a checkpoint: the problem is
+ that checkpoints review page reconciliation information and checkpoints
+ update page reconciliation information. Lock out checkpoints while
+ compaction is running.
+
+
WiredTiger release 1.4.2, 2013-01-14
------------------------------------
diff --git a/README b/README
index 1bf68a5e568..c4f10627a7a 100644
--- a/README
+++ b/README
@@ -1,6 +1,6 @@
-WiredTiger 1.4.2: (January 14, 2013)
+WiredTiger 1.5.0: (March 14, 2013)
-This is version 1.4.2 of WiredTiger.
+This is version 1.5.0 of WiredTiger.
WiredTiger documentation can be found at:
diff --git a/RELEASE b/RELEASE
index 2cc103a08db..8dc0f6413cf 100644
--- a/RELEASE
+++ b/RELEASE
@@ -1,6 +1,6 @@
WIREDTIGER_VERSION_MAJOR=1
-WIREDTIGER_VERSION_MINOR=4
-WIREDTIGER_VERSION_PATCH=2
+WIREDTIGER_VERSION_MINOR=5
+WIREDTIGER_VERSION_PATCH=0
WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH"
WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"`
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 97fb4875c9d..8fff6000a26 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -41,6 +41,15 @@
#define ATOMIC_ADD(v, val) \
__sync_add_and_fetch(&(v), val)
+#ifndef F_CLR
+#define F_CLR(p, mask) ((p)->flags &= ~((uint32_t)(mask)))
+#endif
+#ifndef F_ISSET
+#define F_ISSET(p, mask) ((p)->flags & ((uint32_t)(mask)))
+#endif
+#ifndef F_SET
+#define F_SET(p, mask) ((p)->flags |= ((uint32_t)(mask)))
+#endif
typedef struct {
const char *home;
@@ -68,6 +77,8 @@ typedef struct {
#define WT_PERF_POP 0x01
#define WT_PERF_READ 0x02
uint32_t phase;
+#define WT_INSERT_RMW 0x01
+ uint32_t flags;
struct timeval phase_start_time;
} CONFIG;
@@ -99,9 +110,10 @@ void worker(CONFIG *, uint32_t);
"leaf_page_max=4kb,internal_page_max=64kb,allocation_size=4kb,"
/* Worker thread types. */
-#define WORKER_READ 0x01
-#define WORKER_INSERT 0x02
-#define WORKER_UPDATE 0x03
+#define WORKER_READ 0x01
+#define WORKER_INSERT 0x02
+#define WORKER_INSERT_RMW 0x03
+#define WORKER_UPDATE 0x04
/* Default values - these are tiny, we want the basic run to be fast. */
CONFIG default_cfg = {
@@ -127,6 +139,7 @@ CONFIG default_cfg = {
NULL, /* conn */
NULL, /* logf */
WT_PERF_INIT, /* phase */
+ 0, /* flags */
{0, 0} /* phase_start_time */
};
/* Small config values - these are small. */
@@ -154,6 +167,7 @@ CONFIG small_cfg = {
NULL, /* conn */
NULL, /* logf */
WT_PERF_INIT, /* phase */
+ 0, /* flags */
{0, 0} /* phase_start_time */
};
/* Default values - these are small, we want the basic run to be fast. */
@@ -181,6 +195,7 @@ CONFIG med_cfg = {
NULL, /* conn */
NULL, /* logf */
WT_PERF_INIT, /* phase */
+ 0, /* flags */
{0, 0} /* phase_start_time */
};
/* Default values - these are small, we want the basic run to be fast. */
@@ -208,6 +223,7 @@ CONFIG large_cfg = {
NULL, /* conn */
NULL, /* logf */
WT_PERF_INIT, /* phase */
+ 0, /* flags */
{0, 0} /* phase_start_time */
};
@@ -240,7 +256,11 @@ read_thread(void *arg)
void *
insert_thread(void *arg)
{
- worker((CONFIG *)arg, WORKER_INSERT);
+ CONFIG *config;
+
+ config = (CONFIG *)arg;
+ worker(config, F_ISSET(config, WT_INSERT_RMW) ?
+ WORKER_INSERT_RMW : WORKER_INSERT);
return (NULL);
}
@@ -314,6 +334,12 @@ worker(CONFIG *cfg, uint32_t worker_type)
if (op_ret == 0)
++g_nread_ops;
break;
+ case WORKER_INSERT_RMW:
+ op_name="insert_rmw";
+ op_ret = cursor->search(cursor);
+ if (op_ret != WT_NOTFOUND)
+ break;
+ /* Fall through */
case WORKER_INSERT:
op_name = "insert";
cursor->set_value(cursor, data_buf);
@@ -758,7 +784,7 @@ int main(int argc, char **argv)
CONFIG cfg;
WT_CONNECTION *conn;
const char *user_cconfig, *user_tconfig;
- const char *opts = "C:I:P:R:U:T:c:d:eh:i:k:l:r:s:t:u:v:SML";
+ const char *opts = "C:I:P:R:U:T:c:d:eh:i:jk:l:r:s:t:u:v:SML";
char *cc_buf, *tc_buf;
int ch, checkpoint_created, ret, stat_created;
pthread_t checkpoint, stat;
@@ -810,6 +836,9 @@ int main(int argc, char **argv)
case 'i':
cfg.icount = (uint32_t)atoi(optarg);
break;
+ case 'j':
+ F_SET(&cfg, WT_INSERT_RMW);
+ break;
case 'k':
cfg.key_sz = (uint32_t)atoi(optarg);
break;
@@ -1122,6 +1151,8 @@ void print_config(CONFIG *cfg)
printf("\t Workload period: %d\n", cfg->run_time);
printf("\t Number read threads: %d\n", cfg->read_threads);
printf("\t Number insert threads: %d\n", cfg->insert_threads);
+ if (F_ISSET(cfg, WT_INSERT_RMW))
+ printf("\t Insert operations are RMW.\n");
printf("\t Number update threads: %d\n", cfg->update_threads);
printf("\t Verbosity: %d\n", cfg->verbose);
}
@@ -1144,6 +1175,7 @@ void usage(void)
printf("\t-e use existing database (skip population phase)\n");
printf("\t-h <string> Wired Tiger home must exist, default WT_TEST \n");
printf("\t-i <int> number of records to insert\n");
+ printf("\t-j Execute a read prior to each insert in populate\n");
printf("\t-k <int> key item size\n");
printf("\t-l <int> log statistics every <int> report intervals."
"Default disabled.\n");
diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs
index 07bb8269a72..aeb2316d634 100644
--- a/build_posix/Make.subdirs
+++ b/build_posix/Make.subdirs
@@ -12,6 +12,7 @@ ext/collators/reverse
ext/compressors/bzip2 BZIP2
ext/compressors/nop
ext/compressors/snappy SNAPPY
+lang/java JAVA
lang/python PYTHON
test/bloom
test/fops
diff --git a/build_posix/aclocal/ax_check_class.m4 b/build_posix/aclocal/ax_check_class.m4
new file mode 100644
index 00000000000..098aa77290b
--- /dev/null
+++ b/build_posix/aclocal/ax_check_class.m4
@@ -0,0 +1,144 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_check_class.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_CHECK_CLASS
+#
+# DESCRIPTION
+#
+# AX_CHECK_CLASS tests the existence of a given Java class, either in a
+# jar or in a '.class' file.
+#
+# *Warning*: its success or failure can depend on a proper setting of the
+# CLASSPATH env. variable.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set, some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 7
+
+AU_ALIAS([AC_CHECK_CLASS], [AX_CHECK_CLASS])
+AC_DEFUN([AX_CHECK_CLASS],[
+AC_REQUIRE([AX_PROG_JAVA])
+ac_var_name=`echo $1 | sed 's/\./_/g'`
+dnl Normally I'd use an AC_CACHE_CHECK here, but since the variable name is
+dnl dynamic I need an extra level of extraction
+AC_MSG_CHECKING([for $1 class])
+AC_CACHE_VAL(ax_cv_class_$ac_var_name, [
+if test x$ac_cv_prog_uudecode_base64 = xyes; then
+dnl /**
+dnl * Test.java: used to test dynamically if a class exists.
+dnl */
+dnl public class Test
+dnl {
+dnl
+dnl public static void
+dnl main( String[] argv )
+dnl {
+dnl Class lib;
+dnl if (argv.length < 1)
+dnl {
+dnl System.err.println ("Missing argument");
+dnl System.exit (77);
+dnl }
+dnl try
+dnl {
+dnl lib = Class.forName (argv[0]);
+dnl }
+dnl catch (ClassNotFoundException e)
+dnl {
+dnl System.exit (1);
+dnl }
+dnl lib = null;
+dnl System.exit (0);
+dnl }
+dnl
+dnl }
+cat << \EOF > Test.uue
+begin-base64 644 Test.class
+yv66vgADAC0AKQcAAgEABFRlc3QHAAQBABBqYXZhL2xhbmcvT2JqZWN0AQAE
+bWFpbgEAFihbTGphdmEvbGFuZy9TdHJpbmc7KVYBAARDb2RlAQAPTGluZU51
+bWJlclRhYmxlDAAKAAsBAANlcnIBABVMamF2YS9pby9QcmludFN0cmVhbTsJ
+AA0ACQcADgEAEGphdmEvbGFuZy9TeXN0ZW0IABABABBNaXNzaW5nIGFyZ3Vt
+ZW50DAASABMBAAdwcmludGxuAQAVKExqYXZhL2xhbmcvU3RyaW5nOylWCgAV
+ABEHABYBABNqYXZhL2lvL1ByaW50U3RyZWFtDAAYABkBAARleGl0AQAEKEkp
+VgoADQAXDAAcAB0BAAdmb3JOYW1lAQAlKExqYXZhL2xhbmcvU3RyaW5nOylM
+amF2YS9sYW5nL0NsYXNzOwoAHwAbBwAgAQAPamF2YS9sYW5nL0NsYXNzBwAi
+AQAgamF2YS9sYW5nL0NsYXNzTm90Rm91bmRFeGNlcHRpb24BAAY8aW5pdD4B
+AAMoKVYMACMAJAoAAwAlAQAKU291cmNlRmlsZQEACVRlc3QuamF2YQAhAAEA
+AwAAAAAAAgAJAAUABgABAAcAAABtAAMAAwAAACkqvgSiABCyAAwSD7YAFBBN
+uAAaKgMyuAAeTKcACE0EuAAaAUwDuAAasQABABMAGgAdACEAAQAIAAAAKgAK
+AAAACgAAAAsABgANAA4ADgATABAAEwASAB4AFgAiABgAJAAZACgAGgABACMA
+JAABAAcAAAAhAAEAAQAAAAUqtwAmsQAAAAEACAAAAAoAAgAAAAQABAAEAAEA
+JwAAAAIAKA==
+====
+EOF
+ if $UUDECODE Test.uue; then
+ :
+ else
+ echo "configure: __oline__: uudecode had trouble decoding base 64 file 'Test.uue'" >&AS_MESSAGE_LOG_FD
+ echo "configure: failed file was:" >&AS_MESSAGE_LOG_FD
+ cat Test.uue >&AS_MESSAGE_LOG_FD
+ ac_cv_prog_uudecode_base64=no
+ fi
+ rm -f Test.uue
+ if AC_TRY_COMMAND($JAVA $JAVAFLAGS Test $1) >/dev/null 2>&1; then
+ eval "ac_cv_class_$ac_var_name=yes"
+ else
+ eval "ac_cv_class_$ac_var_name=no"
+ fi
+ rm -f Test.class
+else
+ AX_TRY_COMPILE_JAVA([$1], , [eval "ac_cv_class_$ac_var_name=yes"],
+ [eval "ac_cv_class_$ac_var_name=no"])
+fi
+eval "ac_var_val=$`eval echo ac_cv_class_$ac_var_name`"
+eval "HAVE_$ac_var_name=$`echo ac_cv_class_$ac_var_val`"
+HAVE_LAST_CLASS=$ac_var_val
+if test x$ac_var_val = xyes; then
+ ifelse([$2], , :, [$2])
+else
+ ifelse([$3], , :, [$3])
+fi
+])
+dnl for some reason the above statement didn't fall through here?
+dnl do scripts have variable scoping?
+eval "ac_var_val=$`eval echo ac_cv_class_$ac_var_name`"
+AC_MSG_RESULT($ac_var_val)
+])
diff --git a/build_posix/aclocal/ax_check_junit.m4 b/build_posix/aclocal/ax_check_junit.m4
new file mode 100644
index 00000000000..724e0e0814f
--- /dev/null
+++ b/build_posix/aclocal/ax_check_junit.m4
@@ -0,0 +1,72 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_check_junit.html
+# ===========================================================================
+#
+# WiredTiger: Updated to use JUnit 4 call semantics.
+#
+# SYNOPSIS
+#
+# AX_CHECK_JUNIT
+#
+# DESCRIPTION
+#
+# AX_CHECK_JUNIT tests the availability of the JUnit testing framework,
+# and sets some variables for conditional compilation of the test suite by
+# automake.
+#
+# If available, JUNIT is set to a command launching the text based user
+# interface of Junit, @JAVA_JUNIT@ is set to $JAVA_JUNIT and @TESTS_JUNIT@
+# is set to $TESTS_JUNIT, otherwise they are set to empty values.
+#
+# You can use these variables in your Makefile.am file like this :
+#
+# # Some of the following classes are built only if junit is available
+# JAVA_JUNIT = Class1Test.java Class2Test.java AllJunitTests.java
+#
+# noinst_JAVA = Example1.java Example2.java @JAVA_JUNIT@
+#
+# EXTRA_JAVA = $(JAVA_JUNIT)
+#
+# TESTS_JUNIT = AllJunitTests
+#
+# TESTS = StandaloneTest1 StandaloneTest2 @TESTS_JUNIT@
+#
+# EXTRA_TESTS = $(TESTS_JUNIT)
+#
+# AllJunitTests :
+# echo "#! /bin/sh" > $@
+# echo "exec @JUNIT@ my.package.name.AllJunitTests" >> $@
+# chmod +x $@
+#
+# LICENSE
+#
+# Copyright (c) 2008 Luc Maisonobe <luc@spaceroots.org>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 5
+
+AU_ALIAS([AC_CHECK_JUNIT], [AX_CHECK_JUNIT])
+AC_DEFUN([AX_CHECK_JUNIT],[
+AC_CACHE_VAL(ac_cv_prog_JUNIT,[
+AX_CHECK_CLASS(org.junit.runner.JUnitCore)
+if test x"`eval 'echo $ac_cv_class_org_junit_runner_JUnitCore'`" != xno ; then
+ ac_cv_prog_JUNIT='$(CLASSPATH_ENV) $(JAVA) $(JAVAFLAGS) org.junit.runner.JUnitCore'
+fi])
+AC_MSG_CHECKING([for junit])
+if test x"`eval 'echo $ac_cv_prog_JUNIT'`" != x ; then
+ JUNIT="$ac_cv_prog_JUNIT"
+ JAVA_JUNIT='$(JAVA_JUNIT)'
+ TESTS_JUNIT='$(TESTS_JUNIT)'
+else
+ JUNIT=
+ JAVA_JUNIT=
+ TESTS_JUNIT=
+fi
+AC_MSG_RESULT($JAVA_JUNIT)
+AC_SUBST(JUNIT)
+AC_SUBST(JAVA_JUNIT)
+AC_SUBST(TESTS_JUNIT)])
diff --git a/build_posix/aclocal/ax_java_options.m4 b/build_posix/aclocal/ax_java_options.m4
new file mode 100644
index 00000000000..36c10d922bd
--- /dev/null
+++ b/build_posix/aclocal/ax_java_options.m4
@@ -0,0 +1,48 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_java_options.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_JAVA_OPTIONS
+#
+# DESCRIPTION
+#
+# AX_JAVA_OPTIONS adds configure command line options used for Java m4
+# macros. This Macro is optional.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set, some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Devin Weaver <ktohg@tritarget.com>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 6
+
+AU_ALIAS([AC_JAVA_OPTIONS], [AX_JAVA_OPTIONS])
+AC_DEFUN([AX_JAVA_OPTIONS],[
+AC_ARG_WITH(java-prefix,
+ [ --with-java-prefix=PFX prefix where Java runtime is installed (optional)])
+AC_ARG_WITH(javac-flags,
+ [ --with-javac-flags=FLAGS flags to pass to the Java compiler (optional)])
+AC_ARG_WITH(java-flags,
+ [ --with-java-flags=FLAGS flags to pass to the Java VM (optional)])
+JAVAPREFIX=$with_java_prefix
+JAVACFLAGS=$with_javac_flags
+JAVAFLAGS=$with_java_flags
+AC_SUBST(JAVAPREFIX)dnl
+AC_SUBST(JAVACFLAGS)dnl
+AC_SUBST(JAVAFLAGS)dnl
+AC_SUBST(JAVA)dnl
+AC_SUBST(JAVAC)dnl
+])
diff --git a/build_posix/aclocal/ax_jni_include_dir.m4 b/build_posix/aclocal/ax_jni_include_dir.m4
new file mode 100644
index 00000000000..7ce12e10c82
--- /dev/null
+++ b/build_posix/aclocal/ax_jni_include_dir.m4
@@ -0,0 +1,120 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_jni_include_dir.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_JNI_INCLUDE_DIR
+#
+# DESCRIPTION
+#
+# AX_JNI_INCLUDE_DIR finds include directories needed for compiling
+# programs using the JNI interface.
+#
+# JNI include directories are usually in the Java distribution. This is
+# deduced from the value of JAVAC. When this macro completes, a list of
+# directories is left in the variable JNI_INCLUDE_DIRS.
+#
+# Example usage follows:
+#
+# AX_JNI_INCLUDE_DIR
+#
+# for JNI_INCLUDE_DIR in $JNI_INCLUDE_DIRS
+# do
+# CPPFLAGS="$CPPFLAGS -I$JNI_INCLUDE_DIR"
+# done
+#
+# If you want to force a specific compiler:
+#
+# - at the configure.in level, set JAVAC=yourcompiler before calling
+# AX_JNI_INCLUDE_DIR
+#
+# - at the configure level, setenv JAVAC
+#
+# Note: This macro can work with the autoconf M4 macros for Java programs.
+# This particular macro is not part of the original set of macros.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Don Anderson <dda@sleepycat.com>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 7
+
+AU_ALIAS([AC_JNI_INCLUDE_DIR], [AX_JNI_INCLUDE_DIR])
+AC_DEFUN([AX_JNI_INCLUDE_DIR],[
+
+JNI_INCLUDE_DIRS=""
+
+test "x$JAVAC" = x && AC_MSG_ERROR(['\$JAVAC' undefined])
+AC_PATH_PROG([_ACJNI_JAVAC], [$JAVAC], [no])
+test "x$_ACJNI_JAVAC" = xno && AC_MSG_ERROR([$JAVAC could not be found in path])
+
+_ACJNI_FOLLOW_SYMLINKS("$_ACJNI_JAVAC")
+_JTOPDIR=`echo "$_ACJNI_FOLLOWED" | sed -e 's://*:/:g' -e 's:/[[^/]]*$::'`
+case "$host_os" in
+ darwin*) _JTOPDIR=`echo "$_JTOPDIR" | sed -e 's:/[[^/]]*$::'`
+ _JINC="$_JTOPDIR/Headers";;
+ *) _JINC="$_JTOPDIR/include";;
+esac
+_AS_ECHO_LOG([_JTOPDIR=$_JTOPDIR])
+_AS_ECHO_LOG([_JINC=$_JINC])
+
+# On Mac OS X 10.6.4, jni.h is a symlink:
+# /System/Library/Frameworks/JavaVM.framework/Versions/Current/Headers/jni.h
+# -> ../../CurrentJDK/Headers/jni.h.
+if test -f "$_JINC/jni.h" || test -L "$_JINC/jni.h"; then
+ JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JINC"
+else
+ _JTOPDIR=`echo "$_JTOPDIR" | sed -e 's:/[[^/]]*$::'`
+ if test -f "$_JTOPDIR/include/jni.h"; then
+ JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JTOPDIR/include"
+ else
+ AC_MSG_ERROR([cannot find java include files])
+ fi
+fi
+
+# get the likely subdirectories for system specific java includes
+case "$host_os" in
+bsdi*) _JNI_INC_SUBDIRS="bsdos";;
+freebsd*) _JNI_INC_SUBDIRS="freebsd";;
+linux*) _JNI_INC_SUBDIRS="linux genunix";;
+osf*) _JNI_INC_SUBDIRS="alpha";;
+solaris*) _JNI_INC_SUBDIRS="solaris";;
+mingw*) _JNI_INC_SUBDIRS="win32";;
+cygwin*) _JNI_INC_SUBDIRS="win32";;
+*) _JNI_INC_SUBDIRS="genunix";;
+esac
+
+# add any subdirectories that are present
+for JINCSUBDIR in $_JNI_INC_SUBDIRS
+do
+ if test -d "$_JTOPDIR/include/$JINCSUBDIR"; then
+ JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JTOPDIR/include/$JINCSUBDIR"
+ fi
+done
+])
+
+# _ACJNI_FOLLOW_SYMLINKS <path>
+# Follows symbolic links on <path>,
+# finally setting variable _ACJNI_FOLLOWED
+# ----------------------------------------
+AC_DEFUN([_ACJNI_FOLLOW_SYMLINKS],[
+# find the include directory relative to the javac executable
+_cur="$1"
+while ls -ld "$_cur" 2>/dev/null | grep " -> " >/dev/null; do
+ AC_MSG_CHECKING([symlink for $_cur])
+ _slink=`ls -ld "$_cur" | sed 's/.* -> //'`
+ case "$_slink" in
+ /*) _cur="$_slink";;
+ # 'X' avoids triggering unwanted echo options.
+ *) _cur=`echo "X$_cur" | sed -e 's/^X//' -e 's:[[^/]]*$::'`"$_slink";;
+ esac
+ AC_MSG_RESULT([$_cur])
+done
+_ACJNI_FOLLOWED="$_cur"
+])# _ACJNI
diff --git a/build_posix/aclocal/ax_prog_jar.m4 b/build_posix/aclocal/ax_prog_jar.m4
new file mode 100644
index 00000000000..776e804ad9f
--- /dev/null
+++ b/build_posix/aclocal/ax_prog_jar.m4
@@ -0,0 +1,52 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_jar.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAR
+#
+# DESCRIPTION
+#
+# AX_PROG_JAR tests for an existing jar program. It uses the environment
+# variable JAR then tests in sequence various common jar programs.
+#
+# If you want to force a specific compiler:
+#
+# - at the configure.in level, set JAR=yourcompiler before calling
+# AX_PROG_JAR
+#
+# - at the configure level, setenv JAR
+#
+# You can use the JAR variable in your Makefile.in, with @JAR@.
+#
+# Note: This macro depends on the autoconf M4 macros for Java programs. It
+# is VERY IMPORTANT that you download that whole set, some macros depend
+# on others. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission.
+#
+# The general documentation of those macros, as well as the sample
+# configure.in, is included in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Egon Willighagen <e.willighagen@science.ru.nl>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 6
+
+AU_ALIAS([AC_PROG_JAR], [AX_PROG_JAR])
+AC_DEFUN([AX_PROG_JAR],[
+AC_REQUIRE([AC_EXEEXT])dnl
+if test "x$JAVAPREFIX" = x; then
+ test "x$JAR" = x && AC_CHECK_PROGS(JAR, jar$EXEEXT)
+else
+ test "x$JAR" = x && AC_CHECK_PROGS(JAR, jar, $JAVAPREFIX)
+fi
+test "x$JAR" = x && AC_MSG_ERROR([no acceptable jar program found in \$PATH])
+AC_PROVIDE([$0])dnl
+])
diff --git a/build_posix/aclocal/ax_prog_java.m4 b/build_posix/aclocal/ax_prog_java.m4
new file mode 100644
index 00000000000..5471f322d25
--- /dev/null
+++ b/build_posix/aclocal/ax_prog_java.m4
@@ -0,0 +1,115 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_java.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAVA
+#
+# DESCRIPTION
+#
+# Here is a summary of the main macros:
+#
+# AX_PROG_JAVAC: finds a Java compiler.
+#
+# AX_PROG_JAVA: finds a Java virtual machine.
+#
+# AX_CHECK_CLASS: finds if we have the given class (beware of CLASSPATH!).
+#
+# AX_CHECK_RQRD_CLASS: finds if we have the given class and stops
+# otherwise.
+#
+# AX_TRY_COMPILE_JAVA: attempt to compile user given source.
+#
+# AX_TRY_RUN_JAVA: attempt to compile and run user given source.
+#
+# AX_JAVA_OPTIONS: adds Java configure options.
+#
+# AX_PROG_JAVA tests an existing Java virtual machine. It uses the
+# environment variable JAVA then tests in sequence various common Java
+# virtual machines. For political reasons, it starts with the free ones.
+# You *must* call [AX_PROG_JAVAC] before.
+#
+# If you want to force a specific VM:
+#
+# - at the configure.in level, set JAVA=yourvm before calling AX_PROG_JAVA
+#
+# (but after AC_INIT)
+#
+# - at the configure level, setenv JAVA
+#
+# You can use the JAVA variable in your Makefile.in, with @JAVA@.
+#
+# *Warning*: its success or failure can depend on a proper setting of the
+# CLASSPATH env. variable.
+#
+# TODO: allow to exclude virtual machines (rationale: most Java programs
+# cannot run with some VM like kaffe).
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set, some macros depend
+# on other. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission.
+#
+# A Web page, with a link to the latest CVS snapshot is at
+# <http://www.internatif.org/bortzmeyer/autoconf-Java/>.
+#
+# This is a sample configure.in Process this file with autoconf to produce
+# a configure script.
+#
+# AC_INIT(UnTag.java)
+#
+# dnl Checks for programs.
+# AC_CHECK_CLASSPATH
+# AX_PROG_JAVAC
+# AX_PROG_JAVA
+#
+# dnl Checks for classes
+# AX_CHECK_RQRD_CLASS(org.xml.sax.Parser)
+# AX_CHECK_RQRD_CLASS(com.jclark.xml.sax.Driver)
+#
+# AC_OUTPUT(Makefile)
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+# AX_PROG_JAVA: set JAVA to a Java virtual machine unless the caller already
+# set it, abort configure when none can be found, then run
+# AX_PROG_JAVA_WORKS to verify it.
+AU_ALIAS([AC_PROG_JAVA], [AX_PROG_JAVA])
+AC_DEFUN([AX_PROG_JAVA],[
+dnl The expansions are quoted so that a JAVA or JAVAPREFIX value containing
+dnl whitespace does not break the test command.
+if test "x$JAVAPREFIX" = x; then
+  test "x$JAVA" = x && AC_CHECK_PROGS(JAVA, kaffe java)
+else
+dnl The third argument of AC_CHECK_PROGS is the value used when nothing is
+dnl found and the fourth is the search path. Passing JAVAPREFIX as the third
+dnl argument made JAVA silently become the prefix directory on failure and
+dnl never searched the prefix at all. Search its bin directory first and
+dnl then fall back to PATH so existing setups keep working.
+  test "x$JAVA" = x && AC_CHECK_PROGS([JAVA], [kaffe java], [], [$JAVAPREFIX/bin$PATH_SEPARATOR$PATH])
+fi
+test "x$JAVA" = x && AC_MSG_ERROR([no acceptable Java virtual machine found in \$PATH])
+AX_PROG_JAVA_WORKS
+AC_PROVIDE([$0])dnl
+])
diff --git a/build_posix/aclocal/ax_prog_java_works.m4 b/build_posix/aclocal/ax_prog_java_works.m4
new file mode 100644
index 00000000000..741bd561b62
--- /dev/null
+++ b/build_posix/aclocal/ax_prog_java_works.m4
@@ -0,0 +1,134 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_java_works.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAVA_WORKS
+#
+# DESCRIPTION
+#
+# Internal use ONLY.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set, some macros depend
+# on other. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+# AX_PROG_JAVA_WORKS: check that the virtual machine in JAVA can run a
+# trivial class named Test.
+#
+# The class file is obtained in one of two ways: when a uudecode program is
+# found and can decode the embedded base64 payload, Test.class comes from
+# that payload; otherwise Test.java is compiled with JAVAC (running
+# AX_PROG_JAVAC first when ac_cv_prog_javac_works is still unset).
+# The result is cached in ac_cv_prog_java_works; a compile or run failure
+# aborts configure through AC_MSG_ERROR.
+AU_ALIAS([AC_PROG_JAVA_WORKS], [AX_PROG_JAVA_WORKS])
+AC_DEFUN([AX_PROG_JAVA_WORKS], [
+AC_PATH_PROG(UUDECODE, uudecode, [no])
+if test x$UUDECODE != xno; then
+AC_CACHE_CHECK([if uudecode can decode base 64 file], ac_cv_prog_uudecode_base64, [
+dnl /**
+dnl * Test.java: used to test if java compiler works.
+dnl */
+dnl public class Test
+dnl {
+dnl
+dnl public static void
+dnl main( String[] argv )
+dnl {
+dnl System.exit (0);
+dnl }
+dnl
+dnl }
+cat << \EOF > Test.uue
+begin-base64 644 Test.class
+yv66vgADAC0AFQcAAgEABFRlc3QHAAQBABBqYXZhL2xhbmcvT2JqZWN0AQAE
+bWFpbgEAFihbTGphdmEvbGFuZy9TdHJpbmc7KVYBAARDb2RlAQAPTGluZU51
+bWJlclRhYmxlDAAKAAsBAARleGl0AQAEKEkpVgoADQAJBwAOAQAQamF2YS9s
+YW5nL1N5c3RlbQEABjxpbml0PgEAAygpVgwADwAQCgADABEBAApTb3VyY2VG
+aWxlAQAJVGVzdC5qYXZhACEAAQADAAAAAAACAAkABQAGAAEABwAAACEAAQAB
+AAAABQO4AAyxAAAAAQAIAAAACgACAAAACgAEAAsAAQAPABAAAQAHAAAAIQAB
+AAEAAAAFKrcAErEAAAABAAgAAAAKAAIAAAAEAAQABAABABMAAAACABQ=
+====
+EOF
+if $UUDECODE Test.uue; then
+ ac_cv_prog_uudecode_base64=yes
+else
+ echo "configure: __oline__: uudecode had trouble decoding base 64 file 'Test.uue'" >&AS_MESSAGE_LOG_FD
+ echo "configure: failed file was:" >&AS_MESSAGE_LOG_FD
+ cat Test.uue >&AS_MESSAGE_LOG_FD
+ ac_cv_prog_uudecode_base64=no
+fi
+rm -f Test.uue])
+fi
+dnl Without a decoded class file the test class has to be compiled, which
+dnl needs a working Java compiler.
+if test x$ac_cv_prog_uudecode_base64 != xyes; then
+ rm -f Test.class
+ AC_MSG_WARN([I have to compile Test.class from scratch])
+ if test x$ac_cv_prog_javac_works = xno; then
+ AC_MSG_ERROR([Cannot compile java source. $JAVAC does not work properly])
+ fi
+ if test x$ac_cv_prog_javac_works = x; then
+ AX_PROG_JAVAC
+ fi
+fi
+dnl Run the test class with the selected virtual machine and cache the verdict.
+AC_CACHE_CHECK(if $JAVA works, ac_cv_prog_java_works, [
+JAVA_TEST=Test.java
+CLASS_TEST=Test.class
+TEST=Test
+changequote(, )dnl
+cat << \EOF > $JAVA_TEST
+/* [#]line __oline__ "configure" */
+public class Test {
+public static void main (String args[]) {
+ System.exit (0);
+} }
+EOF
+changequote([, ])dnl
+if test x$ac_cv_prog_uudecode_base64 != xyes; then
+ if AC_TRY_COMMAND($JAVAC $JAVACFLAGS $JAVA_TEST) && test -s $CLASS_TEST; then
+ :
+ else
+ echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+ cat $JAVA_TEST >&AS_MESSAGE_LOG_FD
+ AC_MSG_ERROR(The Java compiler $JAVAC failed (see config.log, check the CLASSPATH?))
+ fi
+fi
+if AC_TRY_COMMAND($JAVA $JAVAFLAGS $TEST) >/dev/null 2>&1; then
+ ac_cv_prog_java_works=yes
+else
+ echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+ cat $JAVA_TEST >&AS_MESSAGE_LOG_FD
+ AC_MSG_ERROR(The Java VM $JAVA failed (see config.log, check the CLASSPATH?))
+fi
+rm -fr $JAVA_TEST $CLASS_TEST Test.uue
+])
+AC_PROVIDE([$0])dnl
+]
+)
diff --git a/build_posix/aclocal/ax_prog_javac.m4 b/build_posix/aclocal/ax_prog_javac.m4
new file mode 100644
index 00000000000..d9bcc2d7c34
--- /dev/null
+++ b/build_posix/aclocal/ax_prog_javac.m4
@@ -0,0 +1,79 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_javac.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAVAC
+#
+# DESCRIPTION
+#
+# AX_PROG_JAVAC tests an existing Java compiler. It uses the environment
+# variable JAVAC then tests in sequence various common Java compilers. For
+# political reasons, it starts with the free ones.
+#
+# If you want to force a specific compiler:
+#
+# - at the configure.in level, set JAVAC=yourcompiler before calling
+# AX_PROG_JAVAC
+#
+# - at the configure level, setenv JAVAC
+#
+# You can use the JAVAC variable in your Makefile.in, with @JAVAC@.
+#
+# *Warning*: its success or failure can depend on a proper setting of the
+# CLASSPATH env. variable.
+#
+# TODO: allow to exclude compilers (rationale: most Java programs cannot
+# compile with some compilers like guavac).
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set, some macros depend
+# on other. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 6
+
+# AX_PROG_JAVAC: set JAVAC to a Java compiler unless the caller already set
+# it, abort configure when none can be found, then run AX_PROG_JAVAC_WORKS
+# to verify it.
+AU_ALIAS([AC_PROG_JAVAC], [AX_PROG_JAVAC])
+AC_DEFUN([AX_PROG_JAVAC],[
+if test "x$JAVAPREFIX" = x; then
+  test "x$JAVAC" = x && AC_CHECK_PROGS(JAVAC, "gcj -C" guavac jikes javac)
+else
+dnl The third argument of AC_CHECK_PROGS is the value used when nothing is
+dnl found and the fourth is the search path. Passing JAVAPREFIX as the third
+dnl argument made JAVAC silently become the prefix directory on failure and
+dnl never searched the prefix at all. Search its bin directory first and
+dnl then fall back to PATH so existing setups keep working.
+  test "x$JAVAC" = x && AC_CHECK_PROGS([JAVAC], ["gcj -C" guavac jikes javac], [], [$JAVAPREFIX/bin$PATH_SEPARATOR$PATH])
+fi
+test "x$JAVAC" = x && AC_MSG_ERROR([no acceptable Java compiler found in \$PATH])
+AX_PROG_JAVAC_WORKS
+AC_PROVIDE([$0])dnl
+])
diff --git a/build_posix/aclocal/ax_prog_javac_works.m4 b/build_posix/aclocal/ax_prog_javac_works.m4
new file mode 100644
index 00000000000..7dfa1e37d89
--- /dev/null
+++ b/build_posix/aclocal/ax_prog_javac_works.m4
@@ -0,0 +1,72 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_prog_javac_works.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_JAVAC_WORKS
+#
+# DESCRIPTION
+#
+# Internal use ONLY.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set, some macros depend
+# on other. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 6
+
+# AX_PROG_JAVAC_WORKS: check that the compiler in JAVAC can compile an empty
+# class. The result is cached in ac_cv_prog_javac_works; a failure aborts
+# configure after the test program has been copied to the configure log.
+AU_ALIAS([AC_PROG_JAVAC_WORKS], [AX_PROG_JAVAC_WORKS])
+AC_DEFUN([AX_PROG_JAVAC_WORKS],[
+AC_CACHE_CHECK([if $JAVAC works], ac_cv_prog_javac_works, [
+JAVA_TEST=Test.java
+CLASS_TEST=Test.class
+cat << \EOF > $JAVA_TEST
+/* [#]line __oline__ "configure" */
+public class Test {
+}
+EOF
+if AC_TRY_COMMAND($JAVAC $JAVACFLAGS $JAVA_TEST) >/dev/null 2>&1; then
+  ac_cv_prog_javac_works=yes
+else
+dnl Log the failing source before calling AC_MSG_ERROR because that macro
+dnl exits configure and nothing placed after it would ever run.
+  echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+  cat $JAVA_TEST >&AS_MESSAGE_LOG_FD
+  AC_MSG_ERROR([The Java compiler $JAVAC failed (see config.log, check the CLASSPATH?)])
+fi
+rm -f $JAVA_TEST $CLASS_TEST
+])
+AC_PROVIDE([$0])dnl
+])
diff --git a/build_posix/aclocal/ax_try_compile_java.m4 b/build_posix/aclocal/ax_try_compile_java.m4
new file mode 100644
index 00000000000..8efd091c43b
--- /dev/null
+++ b/build_posix/aclocal/ax_try_compile_java.m4
@@ -0,0 +1,55 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_try_compile_java.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_TRY_COMPILE_JAVA
+#
+# DESCRIPTION
+#
+# AX_TRY_COMPILE_JAVA attempts to compile user-given source.
+#
+# *Warning*: its success or failure can depend on a proper setting of the
+# CLASSPATH env. variable.
+#
+# Note: This is part of the set of autoconf M4 macros for Java programs.
+# It is VERY IMPORTANT that you download the whole set, some macros depend
+# on other. Unfortunately, the autoconf archive does not support the
+# concept of set of macros, so I had to break it for submission. The
+# general documentation, as well as the sample configure.in, is included
+# in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Devin Weaver <ktohg@tritarget.com>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 7
+
+# AX_TRY_COMPILE_JAVA(IMPORT, CLASS-BODY, [ACTION-IF-OK], [ACTION-IF-FAILED])
+#
+# Writes Test.java in the current directory: an "import IMPORT;" line when
+# the first argument is non-empty, followed by a class named Test whose body
+# is the second argument. The file is compiled with JAVAC and JAVACFLAGS;
+# success also requires a non-empty Test.class.
+#
+# On success the third argument is run (a no-op when omitted). On failure
+# the source is copied to the configure log and, when a fourth argument is
+# given, the Test* files are removed before it is run.
+#
+# Note that cleanup uses the glob Test*, so any other file in the current
+# directory whose name starts with Test is removed as well.
+AU_ALIAS([AC_TRY_COMPILE_JAVA], [AX_TRY_COMPILE_JAVA])
+AC_DEFUN([AX_TRY_COMPILE_JAVA],[
+AC_REQUIRE([AX_PROG_JAVAC])dnl
+cat << \EOF > Test.java
+/* [#]line __oline__ "configure" */
+ifelse([$1], , , [import $1;])
+public class Test {
+[$2]
+}
+EOF
+if AC_TRY_COMMAND($JAVAC $JAVACFLAGS Test.java) && test -s Test.class
+then
+dnl Don't remove the temporary files here, so they can be examined.
+ ifelse([$3], , :, [$3])
+else
+ echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+ cat Test.java >&AS_MESSAGE_LOG_FD
+ifelse([$4], , , [ rm -fr Test*
+ $4
+])dnl
+fi
+rm -fr Test*])
diff --git a/build_posix/aclocal/options.m4 b/build_posix/aclocal/options.m4
index a65724c5e66..6856cdd0769 100644
--- a/build_posix/aclocal/options.m4
+++ b/build_posix/aclocal/options.m4
@@ -46,10 +46,24 @@ no) wt_cv_enable_diagnostic=no;;
esac
AC_MSG_RESULT($wt_cv_enable_diagnostic)
+AC_MSG_CHECKING(if --enable-java option specified)
+AC_ARG_ENABLE(java,
+ [AS_HELP_STRING([--enable-java],
+ [Configure the Java API.])], r=$enableval, r=no)
+case "$r" in
+no) wt_cv_enable_java=no;;
+*) if test "$enable_shared" = "no"; then
+ AC_MSG_ERROR([--enable-java requires shared libraries])
+ fi
+ wt_cv_enable_java=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_java)
+AM_CONDITIONAL([JAVA], [test x$wt_cv_enable_java = xyes])
+
AC_MSG_CHECKING(if --enable-python option specified)
AC_ARG_ENABLE(python,
[AS_HELP_STRING([--enable-python],
- [Configure for python symbols.])], r=$enableval, r=no)
+ [Configure the python API.])], r=$enableval, r=no)
case "$r" in
no) wt_cv_enable_python=no;;
*) if test "$enable_shared" = "no"; then
diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4
index 138d5a7cc94..762c30c464d 100644
--- a/build_posix/aclocal/version-set.m4
+++ b/build_posix/aclocal/version-set.m4
@@ -1,14 +1,14 @@
dnl build by dist/s_version
VERSION_MAJOR=1
-VERSION_MINOR=4
-VERSION_PATCH=2
-VERSION_STRING='"WiredTiger 1.4.2: (January 14, 2013)"'
+VERSION_MINOR=5
+VERSION_PATCH=0
+VERSION_STRING='"WiredTiger 1.5.0: (March 14, 2013)"'
AC_SUBST(VERSION_MAJOR)
AC_SUBST(VERSION_MINOR)
AC_SUBST(VERSION_PATCH)
AC_SUBST(VERSION_STRING)
-VERSION_NOPATCH=1.4
+VERSION_NOPATCH=1.5
AC_SUBST(VERSION_NOPATCH)
diff --git a/build_posix/aclocal/version.m4 b/build_posix/aclocal/version.m4
index 75cb73db9bd..55081c7412c 100644
--- a/build_posix/aclocal/version.m4
+++ b/build_posix/aclocal/version.m4
@@ -1,2 +1,2 @@
dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version
-1.4.2
+1.5.0
diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in
index 8fd08fd0877..eb689726494 100644
--- a/build_posix/configure.ac.in
+++ b/build_posix/configure.ac.in
@@ -50,17 +50,28 @@ fi
AM_CONDITIONAL([DEBUG], [test "$wt_cv_enable_debug" = "yes"])
-# Python API
+# Java and Python APIs
+if test "$wt_cv_enable_java" = "yes" -o "$wt_cv_enable_python" = "yes"; then
+ AX_PKG_SWIG(2.0.4, [],
+ [AC_MSG_WARN([SWIG is required to rebuild Java or Python APIs.])])
+fi
+
+if test "$wt_cv_enable_java" = "yes"; then
+ JAVAC=${JAVAC-javac}
+ AX_PROG_JAVAC
+ AX_PROG_JAR
+ AX_JNI_INCLUDE_DIR
+ if test "$wt_cv_enable_debug" = "yes"; then
+ AX_CHECK_JUNIT
+ fi
+ for JNI_INCLUDE_DIR in $JNI_INCLUDE_DIRS ; do
+ JNI_CPPFLAGS="$JNI_CPPFLAGS -I$JNI_INCLUDE_DIR"
+ done
+ AC_SUBST(JNI_CPPFLAGS)
+fi
+
if test "$wt_cv_enable_python" = "yes"; then
AM_PATH_PYTHON([2.6])
- AX_PKG_SWIG(2.0.4, [],
- [ AC_MSG_ERROR([SWIG is required to build Python support.]) ])
-
- # Check that SWIG supports Python.
- touch swigtest.i
- $SWIG -python -module swigtest swigtest.i > /dev/null 2>&1 || \
- AC_MSG_ERROR([$SWIG does not include Python support.])
- rm -f swigtest*
fi
AM_TYPES
@@ -73,7 +84,7 @@ AC_CHECK_LIB(dl, dlopen)
AC_CHECK_LIB(rt, sched_yield)
AC_CHECK_FUNCS([\
clock_gettime fcntl gettimeofday pthread_timedjoin_np posix_fadvise\
- posix_memalign strtouq])
+ posix_memalign strtouq sync_file_range])
AC_SYS_LARGEFILE
AC_C_BIGENDIAN
diff --git a/dist/api_config.py b/dist/api_config.py
index 37c2f022a77..ce36b082964 100644
--- a/dist/api_config.py
+++ b/dist/api_config.py
@@ -191,7 +191,7 @@ def get_default(c):
return '(%s)' % (','.join('%s=%s' % (subc.name, get_default(subc))
for subc in sorted(c.subconfig)))
elif (c.default or t == 'int') and c.default != 'true':
- return str(c.default)
+ return str(c.default).replace('"', '\\"')
else:
return ''
diff --git a/dist/api_data.py b/dist/api_data.py
index baea9560db4..163e2629c93 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -60,12 +60,17 @@ column_meta = [
source_meta = [
Config('source', '', r'''
- override the default data source URI derived from the object
- name'''),
+ set a custom data source URI for a column group, index or simple
+ table. By default, the data source URI is derived from the \c
+ type and the column group or index name. Applications can
+ create tables from existing data sources by supplying a \c
+ source configuration'''),
Config('type', 'file', r'''
- set the data source type. This setting overrides the URI
- prefix for the data source, if no \c source configuration
- setting is provided''',
+ set the type of data source used to store a column group, index
+ or simple table. By default, a \c "file:" URI is derived from
+ the object name. The \c type configuration can be used to
+ switch to a different storage format, such as LSM. Ignored if
+ an explicit URI is supplied with a \c source configuration''',
choices=['file', 'lsm']),
]
@@ -122,7 +127,7 @@ lsm_config = [
]
# Per-file configuration
-file_config = format_meta + lsm_config + [
+file_config = format_meta + [
Config('allocation_size', '512B', r'''
the file unit allocation size, in bytes, must a power-of-two;
smaller values decrease the file space required by overflow
@@ -214,6 +219,17 @@ file_config = format_meta + lsm_config + [
soft - it is possible for pages to be temporarily larger than
this value''',
min='512B', max='10TB'),
+ Config('os_cache_max', '0', r'''
+ maximum system buffer cache usage, in bytes. If non-zero, evict
+ object blocks from the system buffer cache after that many bytes
+ from this object are read or written into the buffer cache''',
+ min=0),
+ Config('os_cache_dirty_max', '0', r'''
+ maximum dirty system buffer cache usage, in bytes. If non-zero,
+ schedule writes for dirty blocks belonging to this object in the
+ system buffer cache after that many bytes from this object are
+ written into the buffer cache''',
+ min=0),
Config('prefix_compression', 'true', r'''
configure row-store format key prefix compression''',
type='boolean'),
@@ -262,8 +278,8 @@ connection_runtime_config = [
Config('reserve', '0', r'''
amount of cache this database is guaranteed to have available
from the shared cache. This setting is per database. Defaults
- to the chunk size'''),
- Config('name', '', r'''
+ to the chunk size''', type='int'),
+ Config('name', 'pool', r'''
name of a cache that is shared between databases'''),
Config('size', '500MB', r'''
maximum memory to allocate for the shared cache. Setting this
@@ -289,6 +305,9 @@ connection_runtime_config = [
trigger eviction when the cache becomes this full (as a
percentage)''',
min=10, max=99),
+ Config('statistics', 'false', r'''
+ Maintain database statistics that may impact performance''',
+ type='boolean'),
Config('verbose', '', r'''
enable messages for various events. Options are given as a
list, such as <code>"verbose=[evictserver,read]"</code>''',
@@ -337,7 +356,7 @@ methods = {
min='10', max='50'),
]),
-'session.create' : Method(table_only_meta + file_config + source_meta + [
+'session.create' : Method(table_only_meta + file_config + lsm_config + source_meta + [
Config('exclusive', 'false', r'''
fail if the object exists. When false (the default), if the
object exists, check that its settings match the specified
@@ -359,18 +378,19 @@ methods = {
number key; valid only for cursors with record number keys''',
type='boolean'),
Config('bulk', 'false', r'''
- configure the cursor for bulk loads, a fast, initial load
- path. Bulk load may only be used for newly created objects,
- and in the case of row-store objects, key/value items must
- be loaded in sorted order. Cursors configured for bulk load
- only support the WT_CURSOR::insert and WT_CURSOR::close
- methods. The value is usually a true/false flag, but the the
- special value \c "bitmap" is for use with fixed-length column
- stores, and allows chunks of a memory resident bitmap to be
- loaded directly into a file by passing a \c WT_ITEM to
- WT_CURSOR::set_value where the \c size field indicates the
- number of records in the bitmap (as specified by the file's
- \c value_format). Bulk load bitmap values must end on a byte
+ configure the cursor for bulk-loading, a fast, initial load
+ path (see @ref bulk_load for more information). Bulk-load
+ may only be used for newly created objects and cursors
+ configured for bulk-load only support the WT_CURSOR::insert
+ and WT_CURSOR::close methods. When bulk-loading row-store
+ objects, keys must be loaded in sorted order. The value is
+ usually a true/false flag; when bulk-loading fixed-length
+ column store objects, the special value \c bitmap allows
+ chunks of a memory resident bitmap to be loaded directly into
+ a file by passing a \c WT_ITEM to WT_CURSOR::set_value where
+ the \c size field indicates the number of records in the
+ bitmap (as specified by the object's \c value_format
+ configuration). Bulk-loaded bitmap values must end on a byte
boundary relative to the bit count (except for the last set
of values loaded)'''),
Config('checkpoint', '', r'''
@@ -499,10 +519,21 @@ methods = {
'wiredtiger_open' : Method(connection_runtime_config + [
Config('buffer_alignment', '-1', r'''
- in-memory alignment (in bytes) for buffers used for I/O. The default
- value of -1 indicates that a platform-specific alignment value should
- be used (512 bytes on Linux systems, zero elsewhere)''',
+ in-memory alignment (in bytes) for buffers used for I/O. The
+ default value of -1 indicates that a platform-specific
+ alignment value should be used (512 bytes on Linux systems,
+ zero elsewhere)''',
min='-1', max='1MB'),
+ Config('checkpoint', '', r'''
+ periodically checkpoint the database''',
+ type='category', subconfig=[
+ Config('name', '"WiredTigerCheckpoint"', r'''
+ the checkpoint name'''),
+ Config('wait', '0', r'''
+ seconds to wait between each checkpoint; setting this value
+ configures periodic checkpoints''',
+ min='1', max='100000'),
+ ]),
Config('create', 'false', r'''
create the database if it does not exist''',
type='boolean'),
@@ -539,6 +570,33 @@ methods = {
maximum expected number of sessions (including server
threads)''',
min='1'),
+ Config('statistics_log', '', r'''
+ log database connection statistics into a file when the
+ \c statistics configuration value is set to true. See
+ @ref statistics_log for more information''',
+ type='category', subconfig=[
+ Config('clear', 'true', r'''
+ reset statistics counters after each set of log records are
+ written''', type='boolean'),
+ Config('path', '"WiredTigerStat.%H"', r'''
+ the pathname to a file into which the log records are written,
+ may contain strftime conversion specifications. If the value
+ is not an absolute path name, the file is created relative to
+ the database home'''),
+ Config('sources', '', r'''
+ if non-empty, include statistics for the list of data source
+ URIs. No statistics that require traversing a tree are
+ reported, as if the \c statistics_fast configuration string
+ were set''',
+ type='list'),
+ Config('timestamp', '"%b %d %H:%M:%S"', r'''
+ a timestamp prepended to each log record, may contain strftime
+ conversion specifications'''),
+ Config('wait', '0', r'''
+ seconds to wait between each write of the log records; setting
+ this value configures \c statistics and statistics logging''',
+ min='5', max='100000'),
+ ]),
Config('sync', 'true', r'''
flush files to stable storage when closing or writing
checkpoints''',
diff --git a/dist/filelist b/dist/filelist
index 2c3b707e51c..12534da6d36 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -7,6 +7,7 @@ src/block/block_addr.c
src/block/block_ckpt.c
src/block/block_compact.c
src/block/block_ext.c
+src/block/block_map.c
src/block/block_mgr.c
src/block/block_open.c
src/block/block_read.c
@@ -40,6 +41,7 @@ src/btree/bt_walk.c
src/btree/col_modify.c
src/btree/col_srch.c
src/btree/rec_evict.c
+src/btree/rec_merge.c
src/btree/rec_track.c
src/btree/rec_write.c
src/btree/row_key.c
@@ -54,6 +56,7 @@ src/conn/conn_api.c
src/conn/conn_dhandle.c
src/conn/conn_cache.c
src/conn/conn_cache_pool.c
+src/conn/conn_ckpt.c
src/conn/conn_handle.c
src/conn/conn_open.c
src/conn/conn_stat.c
@@ -103,8 +106,9 @@ src/os_posix/os_strtouq.c
src/os_posix/os_thread.c
src/os_posix/os_time.c
src/os_posix/os_yield.c
-src/packing/packing.c
-src/packing/packing_api.c
+src/packing/pack_api.c
+src/packing/pack_impl.c
+src/packing/pack_stream.c
src/schema/schema_create.c
src/schema/schema_drop.c
src/schema/schema_list.c
diff --git a/dist/flags.py b/dist/flags.py
index 9f87fd1be17..32c1ecac1fe 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -13,6 +13,7 @@ flags = {
'SYNC_COMPACT',
'SYNC_DISCARD',
'SYNC_DISCARD_NOWRITE',
+ 'SYNC_WRITE_LEAVES',
],
'direct_io' : [
'DIRECTIO_DATA',
@@ -58,6 +59,7 @@ flags = {
###################################################
'conn' : [
'CONN_CACHE_POOL',
+ 'CONN_EVICTION_RUN',
'CONN_LSM_MERGE',
'CONN_PANIC',
'CONN_SERVER_RUN',
diff --git a/dist/java_doc.py b/dist/java_doc.py
new file mode 100644
index 00000000000..ce42a53c118
--- /dev/null
+++ b/dist/java_doc.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+# This program pulls the function names from wiredtiger.in and generates
+# an input file for Java SWIG that adds doxygen copydoc comments to functions.
+
+import os, re, sys
+import api_data
+from dist import compare_srcfile
+
+# Temporary file.
+tmp_file = '__tmp'
+
+#####################################################################
+# Update wiredtiger.in with doxygen comments
+#####################################################################
+f='../src/include/wiredtiger.in'
+o='../lang/java/java_doc.i'
+tfile = open(tmp_file, 'w')
+
+tfile.write('''/* DO NOT EDIT: automatically built by dist/java_doc.py. */
+
+''')
+
+cclass_re = re.compile('^struct __([a-z_]*) {')
+cfunc_re = re.compile('\t.*? __F\(([a-z_]*)\)')
+
+curr_class = ""
+for line in open(f, 'r'):
+
+ m = cclass_re.match(line)
+ if m:
+ curr_class = m.group(1)
+
+ if curr_class == "":
+ continue
+
+ m = cfunc_re.match(line)
+ if m:
+ tfile.write('COPYDOC(__' + curr_class.lower() + ', ' + curr_class.upper() + ', ' + m.group(1) + ')\n')
+
+tfile.close()
+compare_srcfile(tmp_file, o)
+
diff --git a/dist/s_all b/dist/s_all
index abb39c87903..d84711e1e62 100644
--- a/dist/s_all
+++ b/dist/s_all
@@ -54,6 +54,7 @@ run "python flags.py" "building flags"
run "python log.py" "building logging layer"
run "python serial.py" "building serial function support"
run "python stat.py" "building statistics support"
+run "python java_doc.py" "building Java documentation index"
run "sh ./s_typedef -b" "building standard typedefs"
run "sh ./s_prototypes" "building function prototypes"
diff --git a/dist/s_copyright.list b/dist/s_copyright.list
index 359e97f1aa8..d6ac12c588b 100644
--- a/dist/s_copyright.list
+++ b/dist/s_copyright.list
@@ -1,4 +1,5 @@
skip bench/tcbench/wttest.c
+skip build_posix/Test.java
skip build_posix/wiredtiger_config.h
skip dist/api_config.py
skip dist/api_data.py
@@ -6,12 +7,21 @@ skip dist/api_err.py
skip dist/db.py
skip dist/dist.py
skip dist/flags.py
+skip dist/java_doc.py
skip dist/log.py
skip dist/log_data.py
skip dist/serial.py
skip dist/stat.py
skip dist/stat_data.py
-skip docs/tools/doxypy.py
+skip lang/java/java_doc.i
+skip lang/java/src/com/wiredtiger/db/Connection.java
+skip lang/java/src/com/wiredtiger/db/Cursor.java
+skip lang/java/src/com/wiredtiger/db/SearchStatus.java
+skip lang/java/src/com/wiredtiger/db/Session.java
+skip lang/java/src/com/wiredtiger/db/wiredtiger.java
+skip lang/java/src/com/wiredtiger/db/wiredtigerConstants.java
+skip lang/java/src/com/wiredtiger/db/wiredtigerJNI.java
+skip lang/java/wiredtiger_wrap.c
skip lang/python/setup.py
skip lang/python/src/wiredtiger/service/WiredTiger.py
skip lang/python/src/wiredtiger/service/__init__.py
@@ -29,7 +39,6 @@ skip src/include/queue.h
skip src/include/serial_funcs.i
skip src/log/log_desc.c
skip src/support/stat.c
-skip test/config.i
skip test/packing/intpack-test.c
skip test/packing/intpack-test2.c
skip test/packing/packing-test.c
diff --git a/dist/s_define.list b/dist/s_define.list
index 13506d003f0..36308036fef 100644
--- a/dist/s_define.list
+++ b/dist/s_define.list
@@ -16,9 +16,10 @@ TXN_API_CALL_NOCONF
TXN_API_END
WT_BARRIER
WT_BLOCK_DESC_SIZE
+WT_CSTAT_SET
WT_DEBUG_BYTE
+WT_DSTAT_DECR
WT_READ_BARRIER
-WT_STAT_CHECK_SESSION
WT_STAT_DECR
__F
__WIREDTIGER_EXT_H_
diff --git a/dist/s_docs b/dist/s_docs
index d213f254a17..f2e673823d0 100755
--- a/dist/s_docs
+++ b/dist/s_docs
@@ -82,12 +82,19 @@ EOF
e=1
}
- # Run again to generate the full documentation set (with Python).
- [ "$python" -eq 1 ] && [ -f ../lang/python/wiredtiger.py ] && (
+ # Add optional extras
+ EXTRAS="../lang/java/src/com/wiredtiger/db ../lang/python/wiredtiger.py"
+ EXTRA_INPUT=""
+ for f in $EXTRAS ; do
+ [ -e "$f" ] && EXTRA_INPUT="$EXTRA_INPUT ../$f"
+ done
+
+ # Run again to generate the full doc set with Python and Java.
+ [ "$additional_languages" -eq 1 ] && [ "x$EXTRA_INPUT" != "x" ] && (
cd ../src/docs &&
(eval cat Doxyfile $filter ; cat <<EOF
QUIET=YES
-INPUT+=../../lang/python/wiredtiger.py
+INPUT+=$EXTRA_INPUT
EOF
) | doxygen -)
@@ -98,21 +105,21 @@ EOF
}
clean=0
-python=1
+additional_languages=1
filter="|sed '/PROJECT_NUMBER/s,=.*,=\"Version $WIREDTIGER_VERSION\",'"
while :
do case "$1" in
-a) # Build from scratch
clean=1
shift;;
- -l) # Generate the top-level landing page in ../docs/top
+ -l) # Generate the top-level landing page in ../docs/top
filter="$filter; cat top/Doxyfile"
- python=0
+ additional_languages=0
shift;;
- -p) # Generate PDFs
+ -p) # Generate PDFs
filter="$filter| sed '/GENERATE_LATEX/s,=.*,=YES,'"
shift;;
- -t) # Include the TODO list
+ -t) # Include the TODO list
filter="$filter| sed '/GENERATE_TODOLIST/s,=.*,=YES,'"
shift;;
*)
diff --git a/dist/s_funcs.list b/dist/s_funcs.list
index 5390a976764..2b71a693226 100644
--- a/dist/s_funcs.list
+++ b/dist/s_funcs.list
@@ -15,6 +15,15 @@ __wt_log_printf
__wt_nlpo2
__wt_nlpo2_round
__wt_print_huffman_code
+wiredtiger_pack_int
+wiredtiger_pack_item
+wiredtiger_pack_str
+wiredtiger_pack_uint
+wiredtiger_unpack_int
+wiredtiger_unpack_item
+wiredtiger_unpack_start
+wiredtiger_unpack_str
+wiredtiger_unpack_uint
wiredtiger_struct_pack
wiredtiger_struct_size
wiredtiger_struct_unpack
diff --git a/dist/s_release b/dist/s_release
index e7fd4633b4e..996bb3f151c 100755
--- a/dist/s_release
+++ b/dist/s_release
@@ -32,11 +32,15 @@ fi
echo "Running 'dist/s_all' in the release tree"
(cd "$DEST/dist" && env WT_RELEASE_BUILD=yes sh s_all -A > /dev/null)
-echo "Running swig to generate the Python API"
-(cd "$DEST/build_posix" && \
- ../configure --enable-python && \
- (cd lang/python && make ../../../lang/python/wiredtiger_wrap.c) && \
- make distclean) > /dev/null
+echo "Running swig to generate the Java and Python API"
+(cd "$DEST/build_posix" &&
+ ../configure --enable-java --enable-python &&
+ (cd lang/java && make ../../../lang/java/wiredtiger_wrap.c) &&
+ (cd lang/python && make ../../../lang/python/wiredtiger_wrap.c) &&
+ make distclean &&
+ find . -type d -a -empty | xargs rmdir &&
+ find . -type d -a -empty | xargs rmdir &&
+ find . -type d -a -empty | xargs rmdir) > /dev/null
echo "Building documentation"
(cd "$DEST/dist" && sh s_docs > /dev/null)
@@ -45,7 +49,7 @@ echo "Packing release into $RELEASE_DIR/$PKG.tar.bz2"
(cd "$RELEASE_DIR" && tar cf - $PKG | bzip2 -9 > $PKG.tar.bz2)
echo "Packing documentation into $RELEASE_DIR/$PKG-docs.tar.bz2"
-(cd "$RELEASE_DIR" && tar cf - $PKG/[A-Z][A-Z]* $PKG/docs | \
+(cd "$RELEASE_DIR" && tar cf - $PKG/LICENSE $PKG/NEWS $PKG/README $PKG/docs | \
bzip2 -9 > $PKG-docs.tar.bz2)
rm -r $DEST
diff --git a/dist/s_release.list b/dist/s_release.list
index 749aee885e9..4f67e4cdb5b 100644
--- a/dist/s_release.list
+++ b/dist/s_release.list
@@ -1,6 +1,5 @@
# Exclusions from release packages.
# Each non-comment line is passed as an "--exclude" argument to "hg archive".
-lang/java
lang/python/src
src/server
test/format
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 91c9024f801..0dc672c9907 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -174,6 +174,7 @@ SLIST
SLVG
SML
SQL
+SSD
SSq
STAILQ
SYS
@@ -210,11 +211,13 @@ Vv
VxWorks
WIREDTIGER
WeakHashLen
+Wformat
WinNT
WiredTiger
WiredTiger's
WiredTigerCheckpoint
WiredTigerHome
+WiredTigerStat
WithSeeds
Wuninitialized
XP
@@ -239,6 +242,7 @@ argc
args
argv
async
+autockpt
autocommit
bdb
bigram
@@ -376,6 +380,7 @@ exactp
extern
extlist
extlists
+fadvise
fblocks
fclose
fcntl
@@ -458,6 +463,7 @@ intpack
ints
inuse
io
+ip
ispo
kb
kcell
@@ -504,6 +510,8 @@ memcpy
memfree
memmove
memsize
+mergeable
+metaconf
metadata
metafile
mfence
@@ -530,10 +538,12 @@ negint
newbar
newname
nextprev
+nfilename
nhex
nl
nlpo
nocase
+nonliteral
noop
nop
notfound
@@ -557,6 +567,7 @@ os
ovfl
packv
patchp
+pathname
pathnames
perf
pfx
@@ -569,6 +580,8 @@ presize
printf
printlog
priv
+ps
+psp
pthread
putK
putV
@@ -625,6 +638,7 @@ sizep
sizev
skiplist
skiplists
+sl
slotsp
slvg
snaplen
@@ -637,6 +651,7 @@ srch
srvr
sset
startup
+statlog
stbar
stdarg
stderr
@@ -646,6 +661,7 @@ str
strcmp
strdup
strerror
+strftime
stringin
strncpy
strndup
@@ -738,6 +754,7 @@ untyped
upd
upg
uri
+usedp
usr
utf
util
diff --git a/dist/s_style b/dist/s_style
index c91518678ab..1964a8bf79d 100644
--- a/dist/s_style
+++ b/dist/s_style
@@ -30,30 +30,59 @@ for f in `find examples ext src test -name '*.[chisy]' -o -name '*.in' |
echo "$f: while (0) has trailing semi-colon"
cat $t
fi
+
if egrep '%l[diouxXn]|%[diouxXn]l' $f > $t; then
echo "$f: incorrect or dangerous printf format: %l[diouxXn]"
cat $t
fi
+
if grep "(unsigned)" $f > $t; then
echo "$f: (unsigned) cast is wrong"
cat $t
fi
+
egrep 'u_quad' $f | sed '/@u_quad_decl@/d' > $t
test -s $t && {
echo "$f: old-style type declaration: u_XXX_t or u_quad"
cat $t
}
- # Direct calls to strtouq, not __wt_strtouq
- if ! expr "$f" : '.*/os_strtouq.c' > /dev/null; then
- egrep strtouq $f | egrep -v __wt_strtouq | \
- egrep -v '^[[:space:]].*\*' > $t
- test -s $t && {
- echo "$f: explicit call to strtouq"
- cat $t
- }
+ # Common typos (Wikipedia's list).
+ egrep -w 'a a|an an|and and|are are|be be|by by|for for|from from|if if|in in|is is|it it|of of|the the|this this|to to|was was|were were|when when|with with|a an|an a|a the|the a' $f > $t
+ test -s $t && {
+ echo "$f: paired typo"
+ cat $t
+ }
+
+ # Direct calls to functions we're not supposed to use in the library.
+ # We don't check for all of them, just a few of the common ones.
+ if ! expr "$f" : 'examples/.*' > /dev/null &&
+ ! expr "$f" : 'ext/.*' > /dev/null &&
+ ! expr "$f" : 'test/.*' > /dev/null &&
+ ! expr "$f" : '.*/utilities/.*' > /dev/null; then
+ if ! expr "$f" : '.*/os_alloc.c' > /dev/null &&
+ egrep '[[:space:]]free[(]|[[:space:]]strdup[(]|[[:space:]]strndup[(]|[[:space:]]malloc[(]|[[:space:]]calloc[(]|[[:space:]]realloc[(]' $f > $t; then
+ test -s $t && {
+ echo "$f: call to illegal function"
+ cat $t
+ }
+ fi
+ if ! expr "$f" : '.*/os_strtouq.c' > /dev/null &&
+ egrep '[[:space:]]strtouq[(]' $f > $t; then
+ test -s $t && {
+ echo "$f: call to illegal function"
+ cat $t
+ }
+ fi
+ if egrep '[[:space:]]exit[(]' $f > $t; then
+ test -s $t && {
+ echo "$f: call to illegal function"
+ cat $t
+ }
+ fi
fi
+ # Declaration of an integer return variable.
if ! expr "$f" : 'examples/.*' > /dev/null &&
! expr "$f" : 'test/.*' > /dev/null &&
! expr "$f" : 'ext/.*' > /dev/null; then
@@ -71,13 +100,9 @@ for f in `find examples ext src test -name '*.[chisy]' -o -name '*.in' |
egrep 'return|WT_RET' | \
sed -e "s,^,$f:," -e 's/$/ [return skips API_END call]/'
- # Bad code we can't easily fix
- grep -Hn 'bzero|exit[ ]*\(1\)|^[ ]+[|&=+-]' $f
-
tr -cd '[:alnum:][:space:][:punct:]' < $f |
unexpand |
sed -e 's/){/) {/' \
- -e 's/\([ ]\)exit (/\1exit(/g' \
-e 's/\([ ]\)for(/\1for (/' \
-e 's/\([ ]\)if(/\1if (/' \
-e 's/\([ ]\)index(/\1strchr(/' \
diff --git a/dist/s_symbols.list b/dist/s_symbols.list
index 0bc487e25b3..565a82c6413 100644
--- a/dist/s_symbols.list
+++ b/dist/s_symbols.list
@@ -1,5 +1,16 @@
# List of OK external symbols.
wiredtiger_open
+wiredtiger_pack_start
+wiredtiger_pack_close
+wiredtiger_pack_int
+wiredtiger_pack_item
+wiredtiger_pack_str
+wiredtiger_pack_uint
+wiredtiger_unpack_start
+wiredtiger_unpack_int
+wiredtiger_unpack_item
+wiredtiger_unpack_str
+wiredtiger_unpack_uint
wiredtiger_strerror
wiredtiger_struct_pack
wiredtiger_struct_size
diff --git a/dist/s_typedef b/dist/s_typedef
index 0969a3b1b89..78081be7160 100644
--- a/dist/s_typedef
+++ b/dist/s_typedef
@@ -54,6 +54,12 @@ check() {
test -s $t && cat $t
}
+usage()
+{
+ echo 'usage: s_typedef [-bc]' >&2
+ exit 1
+}
+test "$#" -eq 1 || usage
while :
do case "$1" in
-b) # -b builds the typedefs
@@ -63,12 +69,9 @@ while :
check
shift;;
*)
+ test "$#" -eq 0 || usage
break;;
esac
done
-test "$#" -eq 0 || {
- echo 'usage: s_typedef [-bc]' >&2
- exit 1
-}
exit 0
diff --git a/dist/stat.py b/dist/stat.py
index 5d294f51fce..dfeaef82935 100644
--- a/dist/stat.py
+++ b/dist/stat.py
@@ -1,4 +1,5 @@
-# Read the source files and output the statistics #defines and allocation code.
+# Read the source files and output the statistics #defines plus the
+# initialization and clear code.
import re, string, sys, textwrap
from dist import compare_srcfile
@@ -90,30 +91,21 @@ compare_srcfile(tmp_file, '../src/include/wiredtiger.in')
def print_func(name, list):
'''Print the functions for the stat.c file.'''
f.write('''
-int
-__wt_stat_alloc_''' + name + '''_stats(WT_SESSION_IMPL *session, WT_''' +
- name.upper() + '''_STATS **statsp)
+void
+__wt_stat_init_''' + name + '''_stats(WT_''' + name.upper() + '''_STATS *stats)
{
-\tWT_''' + name.upper() + '''_STATS *stats;
-
-\tWT_RET(__wt_calloc_def(session, 1, &stats));
-
''')
-
for l in sorted(list):
o = '\tstats->' + l.name + '.desc = "' + l.desc + '";\n'
if len(o) + 7 > 80:
o = o.replace('= ', '=\n\t ')
f.write(o)
- f.write('''
-\t*statsp = stats;
-\treturn (0);
-}
+ f.write('''}
''')
f.write('''
void
-__wt_stat_clear_''' + name + '''_stats(WT_STATS *stats_arg)
+__wt_stat_clear_''' + name + '''_stats(void *stats_arg)
{
\tWT_''' + name.upper() + '''_STATS *stats;
@@ -126,7 +118,7 @@ __wt_stat_clear_''' + name + '''_stats(WT_STATS *stats_arg)
f.write('\tstats->' + l.name + '.v = 0;\n');
f.write('}\n')
-# Write the stat allocation and clear routines to the stat.c file.
+# Write the stat initialization and clear routines to the stat.c file.
f = open(tmp_file, 'w')
f.write('/* DO NOT EDIT: automatically built by dist/stat.py. */\n\n')
f.write('#include "wt_internal.h"\n')
diff --git a/dist/stat_data.py b/dist/stat_data.py
index e5eafa0bc29..682941b4132 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -27,6 +27,7 @@ connection_stats = [
Stat('file_open', 'files currently open'),
Stat('memory_allocation', 'total heap memory allocations'),
Stat('memory_free', 'total heap memory frees'),
+ Stat('memory_grow', 'total heap memory re-allocations'),
Stat('read_io', 'total read I/Os'),
Stat('rwlock_read', 'pthread mutex shared lock read-lock calls'),
Stat('rwlock_write', 'pthread mutex shared lock write-lock calls'),
@@ -53,13 +54,23 @@ connection_stats = [
Stat('cache_bytes_write', 'cache: bytes written from cache'),
Stat('cache_eviction_clean', 'cache: unmodified pages evicted'),
Stat('cache_eviction_dirty', 'cache: modified pages evicted'),
+ Stat('cache_eviction_checkpoint',
+ 'cache: checkpoint blocked page eviction'),
Stat('cache_eviction_fail',
'cache: pages selected for eviction unable to be evicted'),
+ Stat('cache_eviction_force', 'cache: pages queued for forced eviction'),
Stat('cache_eviction_hazard',
- 'cache: eviction unable to acquire hazard pointer'),
+ 'cache: hazard pointer blocked page eviction'),
Stat('cache_eviction_internal', 'cache: internal pages evicted'),
+ Stat('cache_eviction_merge',
+ 'cache: internal page merge operations completed'),
+ Stat('cache_eviction_merge_fail',
+ 'cache: internal page merge attempts that could not complete'),
+ Stat('cache_eviction_merge_levels',
+ 'cache: internal levels merged'),
Stat('cache_eviction_slow',
'cache: eviction server unable to reach eviction goal'),
+ Stat('cache_eviction_walk', 'cache: pages walked for eviction'),
Stat('cache_pages_dirty', 'cache: tracked dirty pages in the cache'),
Stat('cache_pages_inuse',
'cache: pages currently held in the cache', perm=1),
@@ -67,6 +78,14 @@ connection_stats = [
Stat('cache_write', 'cache: pages written from cache'),
##########################################
+ # Reconciliation statistics
+ ##########################################
+ Stat('rec_pages', 'page reconciliation calls'),
+ Stat('rec_pages_eviction', 'page reconciliation calls for eviction'),
+ Stat('rec_skipped_update',
+ 'reconciliation failed because an update could not be included'),
+
+ ##########################################
# Transaction statistics
##########################################
Stat('txn_ancient', 'ancient transactions'),
@@ -157,12 +176,21 @@ dsrc_stats = [
Stat('cache_bytes_read', 'bytes read into cache'),
Stat('cache_bytes_write', 'bytes written from cache'),
Stat('cache_eviction_clean', 'unmodified pages evicted'),
+ Stat('cache_eviction_checkpoint',
+ 'cache: checkpoint blocked page eviction'),
Stat('cache_eviction_dirty', 'modified pages evicted'),
Stat('cache_eviction_fail',
'data source pages selected for eviction unable to be evicted'),
+ Stat('cache_eviction_force', 'cache: pages queued for forced eviction'),
Stat('cache_eviction_hazard',
- 'eviction unable to acquire hazard pointer'),
+ 'cache: hazard pointer blocked page eviction'),
Stat('cache_eviction_internal', 'internal pages evicted'),
+ Stat('cache_eviction_merge',
+ 'cache: internal page merge operations completed'),
+ Stat('cache_eviction_merge_fail',
+ 'cache: internal page merge attempts that could not complete'),
+ Stat('cache_eviction_merge_levels',
+ 'cache: internal levels merged'),
Stat('cache_overflow_value', 'overflow values cached in memory'),
Stat('cache_read', 'pages read into cache'),
Stat('cache_read_overflow', 'overflow pages read into cache'),
@@ -193,9 +221,11 @@ dsrc_stats = [
Stat('rec_pages', 'page reconciliation calls'),
Stat('rec_pages_eviction', 'page reconciliation calls for eviction'),
Stat('rec_skipped_update',
- 'page reconciliation failed when an update could not be included'),
+ 'reconciliation failed because an update could not be included'),
Stat('rec_split_intl', 'reconciliation internal pages split'),
Stat('rec_split_leaf', 'reconciliation leaf pages split'),
+ Stat('rec_split_max',
+ 'reconciliation maximum number of splits created by for a page'),
##########################################
# Transaction statistics
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 1386e7ea037..540fc973c04 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -447,11 +447,13 @@ session_ops(WT_SESSION *session)
ret = session->create(session,
"table:mytable", "key_format=S,value_format=S");
/*! [Create a table] */
+ ret = session->drop(session, "table:mytable", NULL);
/*! [Create a column-store table] */
ret = session->create(session,
"table:mytable", "key_format=r,value_format=S");
/*! [Create a column-store table] */
+ ret = session->drop(session, "table:mytable", NULL);
/*! [Create a table with columns] */
/*
@@ -459,9 +461,10 @@ session_ops(WT_SESSION *session)
* (string, signed 32-bit integer, unsigned 16-bit integer).
*/
ret = session->create(session, "table:mytable",
- "key_format=r,value_format=SiH"
+ "key_format=r,value_format=SiH,"
"columns=(id,department,salary,year-started)");
/*! [Create a table with columns] */
+ ret = session->drop(session, "table:mytable", NULL);
/*
* This example code gets run, and the compression libraries might not
@@ -474,42 +477,64 @@ session_ops(WT_SESSION *session)
"table:mytable",
"block_compressor=bzip2,key_format=S,value_format=S");
/*! [Create a bzip2 compressed table] */
+ ret = session->drop(session, "table:mytable", NULL);
/*! [Create a snappy compressed table] */
ret = session->create(session,
"table:mytable",
"block_compressor=snappy,key_format=S,value_format=S");
/*! [Create a snappy compressed table] */
+ ret = session->drop(session, "table:mytable", NULL);
#endif
/*! [Configure checksums to uncompressed] */
ret = session->create(session, "table:mytable",
"key_format=S,value_format=S,checksum=uncompressed");
/*! [Configure checksums to uncompressed] */
+ ret = session->drop(session, "table:mytable", NULL);
- /*! [Configure dictionary compression off] */
+ /*! [Configure dictionary compression on] */
ret = session->create(session, "table:mytable",
- "key_format=S,value_format=S,dictionary=false");
- /*! [Configure dictionary compression off] */
+ "key_format=S,value_format=S,dictionary=1000");
+ /*! [Configure dictionary compression on] */
+ ret = session->drop(session, "table:mytable", NULL);
/*! [Configure key prefix compression off] */
ret = session->create(session, "table:mytable",
"key_format=S,value_format=S,prefix_compression=false");
/*! [Configure key prefix compression off] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+#ifdef MIGHT_NOT_RUN
+ /* Requires sync_file_range */
+ /*! [os_cache_dirty_max configuration] */
+ ret = session->create(
+ session, "table:mytable", "os_cache_dirty_max=500MB");
+ /*! [os_cache_dirty_max configuration] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /* Requires posix_fadvise */
+ /*! [os_cache_max configuration] */
+ ret = session->create(session, "table:mytable", "os_cache_max=1GB");
+ /*! [os_cache_max configuration] */
+ ret = session->drop(session, "table:mytable", NULL);
+#endif
/*! [Create a cache-resident object] */
ret = session->create(session,
"table:mytable", "key_format=r,value_format=S,cache_resident=true");
/*! [Create a cache-resident object] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ {
+ /* Create a table for the session operations. */
+ ret = session->create(
+ session, "table:mytable", "key_format=S,value_format=S");
/*! [Compact a table] */
ret = session->compact(session, "table:mytable", NULL);
/*! [Compact a table] */
- /*! [Drop a table] */
- ret = session->drop(session, "table:mytable", NULL);
- /*! [Drop a table] */
-
/*! [Print to the message stream] */
ret = session->msg_printf(
session, "process ID %" PRIuMAX, (uintmax_t)getpid());
@@ -553,6 +578,11 @@ session_ops(WT_SESSION *session)
ret = session->verify(session, "table:mytable", NULL);
/*! [Verify a table] */
+ /*! [Drop a table] */
+ ret = session->drop(session, "table:mytable", NULL);
+ /*! [Drop a table] */
+ }
+
/*! [Close a session] */
ret = session->close(session, NULL);
/*! [Close a session] */
@@ -826,6 +856,31 @@ my_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session,
}
/*! [WT_COMPRESSOR presize] */
+static int
+my_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ size_t page_max, u_int split_pct, size_t extra,
+ uint8_t *src, uint32_t *offsets, uint32_t slots,
+ uint8_t *dst, size_t dst_len, int final,
+ size_t *result_lenp, uint32_t *result_slotsp)
+{
+ /* Unused parameters */
+ (void)compressor;
+ (void)session;
+ (void)page_max;
+ (void)split_pct;
+ (void)extra;
+ (void)src;
+ (void)offsets;
+ (void)slots;
+ (void)dst;
+ (void)dst_len;
+ (void)final;
+ (void)result_lenp;
+ (void)result_slotsp;
+
+ return (0);
+}
+
int
add_compressor(WT_CONNECTION *conn)
{
@@ -833,7 +888,11 @@ add_compressor(WT_CONNECTION *conn)
/*! [WT_COMPRESSOR register] */
static WT_COMPRESSOR my_compressor = {
- my_compress, NULL, my_decompress, my_pre_size };
+ my_compress,
+ my_compress_raw, /* NULL, if no raw compression */
+ my_decompress,
+ my_pre_size /* NULL, if pre-sizing not needed */
+ };
ret = conn->add_compressor(conn, "my_compress", &my_compressor, NULL);
/*! [WT_COMPRESSOR register] */
@@ -875,9 +934,11 @@ connection_ops(WT_CONNECTION *conn)
{
int ret;
+#ifdef MIGHT_NOT_RUN
/*! [Load an extension] */
ret = conn->load_extension(conn, "my_extension.dll", NULL);
/*! [Load an extension] */
+#endif
add_collator(conn);
add_data_source(conn);
@@ -991,14 +1052,18 @@ hot_backup(WT_SESSION *session)
int
main(void)
{
+ WT_CONNECTION *conn;
int ret;
- {
- WT_CONNECTION *conn;
/*! [Open a connection] */
ret = wiredtiger_open(home, NULL, "create,cache_size=500M", &conn);
/*! [Open a connection] */
- }
+
+ if (ret == 0)
+ connection_ops(conn);
+ /*
+ * The connection has been closed.
+ */
#ifdef MIGHT_NOT_RUN
/*
@@ -1006,34 +1071,70 @@ main(void)
* be installed, causing the open to fail. The documentation requires
* the code snippets, use #ifdef's to avoid running it.
*/
- {
/*! [Configure bzip2 extension] */
- WT_CONNECTION *conn;
-
ret = wiredtiger_open(home, NULL,
"create,"
"extensions=[\"/usr/local/lib/wiredtiger_bzip2.so\"]", &conn);
/*! [Configure bzip2 extension] */
- }
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
- {
/*! [Configure snappy extension] */
- WT_CONNECTION *conn;
-
ret = wiredtiger_open(home, NULL,
"create,"
"extensions=[\"/usr/local/lib/wiredtiger_snappy.so\"]", &conn);
/*! [Configure snappy extension] */
- }
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
/*
- * We're not allowed to open multiple connections, don't run more than
- * one wiredtiger_open call.
+ * This example code gets run, and direct I/O might not be available,
+ * causing the open to fail. The documentation requires code snippets,
+ * use #ifdef's to avoid running it.
*/
- {
+ /* Might Not Run: direct I/O may not be available. */
/*! [Configure direct_io for data files] */
ret = wiredtiger_open(home, NULL, "create,direct_io=[data]", &conn);
/*! [Configure direct_io for data files] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+#endif
+
+ /*! [Statistics configuration] */
+ ret = wiredtiger_open(home, NULL, "create,statistics=true", &conn);
+ /*! [Statistics configuration] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Statistics logging] */
+ ret = wiredtiger_open(
+ home, NULL, "create,statistics_log=(wait=30)", &conn);
+ /*! [Statistics logging] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+ /*! [Statistics logging with objects] */
+ ret = wiredtiger_open(home, NULL,
+ "create,"
+ "statistics_log=(sources=(\"table:table1\",\"table:table2\"))",
+ &conn);
+ /*! [Statistics logging with objects] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
+#ifdef MIGHT_NOT_RUN
+ /*
+ * This example code gets run, and a non-existent log file path might
+ * cause the open to fail. The documentation requires code snippets,
+ * use #ifdef's to avoid running it.
+ */
+ /*! [Statistics logging with path] */
+ ret = wiredtiger_open(home, NULL,
+ "create,"
+ "statistics_log=(wait=120,path=\"/log/log.%m.%d.%y\")", &conn);
+ /*! [Statistics logging with path] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
#endif
/*! [Get the WiredTiger library version #1] */
diff --git a/examples/java/Makefile.am b/examples/java/Makefile.am
new file mode 100644
index 00000000000..c7fbfffa48c
--- /dev/null
+++ b/examples/java/Makefile.am
@@ -0,0 +1,21 @@
+AM_CPPFLAGS = -I$(abs_top_builddir)
+
+JAVAEXAMPLES = $(top_srcdir)/examples/java/com/wiredtiger/examples
+
+# TODO: How to add to existing Javadoc from main API?
+# JDOCDIR = $(top_srcdir)/docs/java
+# java_DATA = $(JDOCDIR)/index.html
+
+javadir = $(datadir)/java
+dist_java_JAVA = \
+ $(JAVAEXAMPLES)/ex_access.java
+
+all-local: wiredtiger.jar
+
+$(JDOCDIR)/index.html: $(dist_java_JAVA)
+ mkdir -p $(JDOCDIR)
+ javadoc -public -d $(JDOCDIR) -link http://docs.oracle.com/javase/6/docs/api $(JAVAEXAMPLES)/[A-Z]*.java
+
+wiredtiger.jar: $(dist_java_JAVA)
+ (cd $(top_builddir) && \
+ $(JAR) -cf wiredtiger.jar com/)
diff --git a/examples/java/com/wiredtiger/examples/ex_access.java b/examples/java/com/wiredtiger/examples/ex_access.java
new file mode 100644
index 00000000000..9a681546c59
--- /dev/null
+++ b/examples/java/com/wiredtiger/examples/ex_access.java
@@ -0,0 +1,53 @@
+/*-
+ * Public Domain 2008-2013 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_access.java
+ * demonstrates how to create and access a simple table.
+ */
+package com.wiredtiger.examples;
+import com.wiredtiger.db.*;
+
+public class ex_access {
+ public static void main(String[] args) {
+ Connection conn = wiredtiger.open("WT_HOME", "create");
+ Session s = conn.open_session(null);
+ s.create("table:t", "key_format=S,value_format=u");
+ Cursor c = s.open_cursor("table:t", null, null);
+ System.out.println("Key format: " + c.getKeyFormat());
+ System.out.println("Value format: " + c.getValueFormat());
+ try {
+ c.putKeyString("foo");
+ c.putValueByteArray("bar".getBytes());
+ c.insert();
+ c.reset();
+ while (c.next() == 0) {
+ System.out.println("Got: " + c.getKeyString());
+ }
+ } catch (WiredTigerPackingException wtpe) {
+ }
+ conn.close(null);
+ }
+}
diff --git a/ext/compressors/bzip2/bzip2_compress.c b/ext/compressors/bzip2/bzip2_compress.c
index 16efaa7aa3e..95d0490262c 100644
--- a/ext/compressors/bzip2/bzip2_compress.c
+++ b/ext/compressors/bzip2/bzip2_compress.c
@@ -43,8 +43,8 @@ bzip2_decompress(WT_COMPRESSOR *, WT_SESSION *,
uint8_t *, size_t, uint8_t *, size_t, size_t *);
#ifdef WIREDTIGER_TEST_COMPRESS_RAW
static int
-bzip2_compress_raw(WT_COMPRESSOR *, WT_SESSION *,
- size_t, size_t, uint8_t *, uint32_t *, uint32_t, uint8_t *, size_t, int,
+bzip2_compress_raw(WT_COMPRESSOR *, WT_SESSION *, size_t, u_int,
+ size_t, uint8_t *, uint32_t *, uint32_t, uint8_t *, size_t, int,
size_t *, uint32_t *);
#endif
@@ -209,7 +209,7 @@ __bzip2_compress_raw_random(void)
*/
static int
bzip2_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
- size_t page_max, size_t extra,
+ size_t page_max, u_int split_pct, size_t extra,
uint8_t *src, uint32_t *offsets, uint32_t slots,
uint8_t *dst, size_t dst_len, int final,
size_t *result_lenp, uint32_t *result_slotsp)
@@ -218,6 +218,7 @@ bzip2_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
int compression_failed, ret;
__UNUSED(page_max);
+ __UNUSED(split_pct);
__UNUSED(extra);
__UNUSED(final);
@@ -264,14 +265,14 @@ bzip2_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
#if 0
fprintf(stderr,
- "bzip2_compress_raw (%s): page_max %" PRIuMAX ", extra %" PRIuMAX
+ "bzip2_compress_raw (%s): page_max %" PRIuMAX
+ ", split_pct %u, extra %" PRIuMAX
", slots %" PRIu32 ", take %" PRIu32 ": %" PRIu32 " -> %"
PRIuMAX "\n",
final ? "final" : "not final",
- (uintmax_t)page_max, (uintmax_t)extra,
+ (uintmax_t)page_max, split_pct, (uintmax_t)extra,
slots, take, offsets[take], (uintmax_t)*result_lenp);
#endif
-
return (0);
}
#endif
diff --git a/lang/java/Makefile.am b/lang/java/Makefile.am
new file mode 100644
index 00000000000..75bd3aaf0bc
--- /dev/null
+++ b/lang/java/Makefile.am
@@ -0,0 +1,75 @@
+AM_CPPFLAGS = -I$(abs_top_builddir)
+
+JAVASRC = $(top_srcdir)/lang/java
+JAVADEST = src/com/wiredtiger/db
+JAVADESTFULL = $(JAVASRC)/$(JAVADEST)
+JAVAEXAMPLES = $(top_srcdir)/examples/java/com/wiredtiger/examples
+JAVATEST = $(top_srcdir)/test/java/com/wiredtiger/test
+BUILT_SOURCES = $(JAVASRC)/wiredtiger_wrap.c
+SWIG_SOURCES = $(JAVASRC)/wiredtiger.i
+
+JDOCDIR = $(top_srcdir)/docs/java
+# The Java documentation is currently generated by Doxygen - disable javadoc
+#java_DATA = $(JDOCDIR)/index.html
+
+javadir = $(datadir)/java/$(PACKAGE)-$(PACKAGE_VERSION)
+JAVA_SRC = \
+ $(JAVADESTFULL)/Connection.java \
+ $(JAVADESTFULL)/Cursor.java \
+ $(JAVADESTFULL)/SearchStatus.java \
+ $(JAVADESTFULL)/PackFormatInputStream.java \
+ $(JAVADESTFULL)/PackInputStream.java \
+ $(JAVADESTFULL)/PackOutputStream.java \
+ $(JAVADESTFULL)/PackUtil.java \
+ $(JAVADESTFULL)/Session.java \
+ $(JAVADESTFULL)/WiredTigerException.java \
+ $(JAVADESTFULL)/WiredTigerPackingException.java \
+ $(JAVADESTFULL)/wiredtiger.java \
+ $(JAVADESTFULL)/wiredtigerConstants.java \
+ $(JAVADESTFULL)/wiredtigerJNI.java \
+ $(JAVAEXAMPLES)/ex_access.java
+
+JAVA_JUNIT = \
+ $(JAVATEST)/CursorTest.java \
+ $(JAVATEST)/PackTest.java \
+ $(JAVATEST)/WiredTigerSuite.java
+
+dist_java_JAVA = $(JAVA_SRC) @JAVA_JUNIT@
+dist_java_DATA = wiredtiger.jar
+
+EXTRA_JAVA = $(JAVA_JUNIT)
+
+java_LTLIBRARIES = libwiredtiger_java.la
+
+TESTS_JUNIT = AllJunitTests
+
+TESTS = @TESTS_JUNIT@
+
+AllJunitTests:
+ echo "#! /bin/sh" > $@
+ echo 'SCRIPT_DIR=`dirname $$0`' >> $@
+ echo 'env LD_LIBRARY_PATH=$$SCRIPT_DIR/../../.libs:$$SCRIPT_DIR/.libs DYLD_LIBRARY_PATH=$$SCRIPT_DIR/../../.libs JAVA_LIBRARY_PATH=$$SCRIPT_DIR/.libs @JUNIT@ com.wiredtiger.test.WiredTigerSuite' >> $@
+ chmod +x $@
+ mkdir -p WT_HOME
+
+CPPFLAGS += $(JNI_CPPFLAGS)
+# Some warnings when compiling the generated code are unavoidable
+CFLAGS += -w
+libwiredtiger_java_la_SOURCES = $(BUILT_SOURCES) $(SWIG_SOURCES)
+#libwiredtiger_java_la_LDFLAGS = -module
+libwiredtiger_java_la_LIBADD = $(abs_top_builddir)/libwiredtiger.la
+
+all-local: wiredtiger.jar
+
+$(JAVASRC)/wiredtiger_wrap.c: $(top_srcdir)/src/include/wiredtiger.in $(SWIG_SOURCES)
+ (cd $(JAVASRC) && \
+ $(SWIG) -Wall -v -java -nodefaultctor -nodefaultdtor -package com.wiredtiger.db -I$(abs_top_builddir) -outdir $(JAVADEST) -o wiredtiger_wrap.c wiredtiger.i)
+
+$(JDOCDIR)/index.html: $(dist_java_JAVA)
+ mkdir -p $(JDOCDIR)
+ javadoc -public -d $(JDOCDIR) -link http://docs.oracle.com/javase/6/docs/api $(JAVADESTFULL)/wiredtiger.java $(JAVADESTFULL)/wiredtigerConstants.java $(JAVADESTFULL)/[A-Z]*.java
+
+wiredtiger.jar: $(dist_java_JAVA) classjava.stamp
+ (cd $(top_builddir) && \
+ $(JAR) -cf wiredtiger.jar com/)
+ cp $(top_builddir)/wiredtiger.jar .
diff --git a/lang/java/java_doc.i b/lang/java/java_doc.i
new file mode 100644
index 00000000000..da2e9fecb94
--- /dev/null
+++ b/lang/java/java_doc.i
@@ -0,0 +1,41 @@
+/* DO NOT EDIT: automatically built by dist/java_doc.py. */
+
+COPYDOC(__wt_cursor, WT_CURSOR, get_key)
+COPYDOC(__wt_cursor, WT_CURSOR, get_value)
+COPYDOC(__wt_cursor, WT_CURSOR, set_key)
+COPYDOC(__wt_cursor, WT_CURSOR, set_value)
+COPYDOC(__wt_cursor, WT_CURSOR, compare)
+COPYDOC(__wt_cursor, WT_CURSOR, next)
+COPYDOC(__wt_cursor, WT_CURSOR, prev)
+COPYDOC(__wt_cursor, WT_CURSOR, reset)
+COPYDOC(__wt_cursor, WT_CURSOR, search)
+COPYDOC(__wt_cursor, WT_CURSOR, search_near)
+COPYDOC(__wt_cursor, WT_CURSOR, insert)
+COPYDOC(__wt_cursor, WT_CURSOR, update)
+COPYDOC(__wt_cursor, WT_CURSOR, remove)
+COPYDOC(__wt_cursor, WT_CURSOR, close)
+COPYDOC(__wt_session, WT_SESSION, close)
+COPYDOC(__wt_session, WT_SESSION, reconfigure)
+COPYDOC(__wt_session, WT_SESSION, open_cursor)
+COPYDOC(__wt_session, WT_SESSION, create)
+COPYDOC(__wt_session, WT_SESSION, compact)
+COPYDOC(__wt_session, WT_SESSION, drop)
+COPYDOC(__wt_session, WT_SESSION, rename)
+COPYDOC(__wt_session, WT_SESSION, salvage)
+COPYDOC(__wt_session, WT_SESSION, truncate)
+COPYDOC(__wt_session, WT_SESSION, upgrade)
+COPYDOC(__wt_session, WT_SESSION, verify)
+COPYDOC(__wt_session, WT_SESSION, begin_transaction)
+COPYDOC(__wt_session, WT_SESSION, commit_transaction)
+COPYDOC(__wt_session, WT_SESSION, rollback_transaction)
+COPYDOC(__wt_session, WT_SESSION, checkpoint)
+COPYDOC(__wt_session, WT_SESSION, msg_printf)
+COPYDOC(__wt_connection, WT_CONNECTION, close)
+COPYDOC(__wt_connection, WT_CONNECTION, reconfigure)
+COPYDOC(__wt_connection, WT_CONNECTION, is_new)
+COPYDOC(__wt_connection, WT_CONNECTION, open_session)
+COPYDOC(__wt_connection, WT_CONNECTION, load_extension)
+COPYDOC(__wt_connection, WT_CONNECTION, add_data_source)
+COPYDOC(__wt_connection, WT_CONNECTION, add_collator)
+COPYDOC(__wt_connection, WT_CONNECTION, add_compressor)
+COPYDOC(__wt_connection, WT_CONNECTION, add_extractor)
diff --git a/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java b/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java
new file mode 100644
index 00000000000..607717ee95a
--- /dev/null
+++ b/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java
@@ -0,0 +1,165 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+package com.wiredtiger.db;
+
+import java.io.ByteArrayInputStream;
+import java.lang.StringBuffer;
+import com.wiredtiger.db.PackUtil;
+import com.wiredtiger.db.WiredTigerPackingException;
+
+/**
+ * An internal helper class for consuming pack format strings.
+ *
+ * Applications should not need to use this class.
+ */
+public class PackFormatInputStream {
+
+    /** The complete format string this stream walks over. */
+    protected String format;
+    /** Index into format of the entry currently being processed. */
+    protected int formatOff;
+    /** Repeat count still pending for the current entry, zero if none. */
+    protected int formatRepeatCount;
+
+    /**
+     * Constructor for a format stream.
+     *
+     * \param format the encoded format backing string.
+     */
+    protected PackFormatInputStream(String format) {
+        this.format = format;
+        formatOff = 0;
+        formatRepeatCount = 0;
+    }
+
+    /**
+     * Standard toString - returns the string used during construction.
+     */
+    public String toString() {
+        return format;
+    }
+
+    /**
+     * Returns the approximate count of elements left in the format.
+     * This method does not account for repeat counts or string length
+     * encodings - so should be used as a guide only.
+     */
+    public int available() {
+        return format.length() - formatOff + formatRepeatCount;
+    }
+
+    /**
+     * Reset the current stream position.
+     */
+    public void reset() {
+        formatOff = 0;
+        formatRepeatCount = 0;
+    }
+
+    /**
+     * Return the decoded type for the next entry in the format stream. Does
+     * not adjust the position of the stream.
+     *
+     * \return the type character of the next entry.
+     * \throws WiredTigerPackingException if no type character remains,
+     *         including when only modifiers or digits are left.
+     */
+    protected char getType()
+    throws WiredTigerPackingException {
+        final int end = format.length();
+        int pos = formatOff;
+
+        // Skip byte order and padding modifiers.
+        while (pos < end && PackUtil.PackSpecialCharacters.indexOf(
+            format.charAt(pos)) != -1) {
+            pos++;
+        }
+        // Skip repeat counts and sizes.
+        while (pos < end && Character.isDigit(format.charAt(pos))) {
+            pos++;
+        }
+        // Checked after scanning so that a format ending in modifiers or
+        // digits reports a packing error rather than running off the string.
+        if (pos >= end) {
+            throw new WiredTigerPackingException(
+                "No more fields in format.");
+        }
+        return format.charAt(pos);
+    }
+
+    /**
+     * Check to see if the next entry is compatible with the requested type.
+     * The comparison ignores case.
+     *
+     * \param asking the format type to match.
+     * \param consume indicates whether to update the stream position.
+     * \throws WiredTigerPackingException if the types do not match or the
+     *         format is exhausted.
+     */
+    protected void checkType(char asking, boolean consume)
+    throws WiredTigerPackingException {
+
+        char expected = getType();
+        if (Character.toLowerCase(expected) != Character.toLowerCase(asking))
+            throw new WiredTigerPackingException(
+                "Format mismatch. Wanted: " + asking + ", got: " + expected);
+        if (consume) {
+            consume();
+        }
+    }
+
+    /**
+     * Move the format stream position ahead one position.
+     *
+     * The position is never moved beyond the end of the format string.
+     *
+     * NOTE(review): an entry with an explicit repeat count n currently takes
+     * n + 1 calls to pass (the call that parses the count does not use one
+     * up); confirm whether that is intended.
+     */
+    protected void consume() {
+        if (formatRepeatCount > 1) {
+            --formatRepeatCount;
+        } else if (formatRepeatCount == 1) {
+            formatRepeatCount = 0;
+            ++formatOff;
+        } else {
+            while (formatOff < format.length() &&
+                PackUtil.PackSpecialCharacters.indexOf(
+                format.charAt(formatOff)) != -1) {
+                ++formatOff;
+            }
+
+            // Don't need to worry about String or byte array size counts
+            // since they have already been consumed.
+            formatRepeatCount = getIntFromFormat(true);
+            if (formatRepeatCount == 0 && formatOff < format.length()) {
+                ++formatOff;
+            }
+        }
+    }
+
+    /**
+     * Decode an integer from the format string, return zero if not starting
+     * on a digit (or if the format is exhausted).
+     *
+     * \param advance whether to move the stream position.
+     * \return the decoded decimal value, or zero.
+     */
+    private int getIntFromFormat(boolean advance) {
+        final int end = format.length();
+        int valueLen = 0;
+        int countOff = 0;
+        while (formatOff + countOff < end &&
+            Character.isDigit(format.charAt(formatOff + countOff))) {
+            valueLen *= 10;
+            valueLen +=
+                Character.digit(format.charAt(formatOff + countOff), 10);
+            countOff++;
+        }
+        if (advance) {
+            formatOff += countOff;
+        }
+        return valueLen;
+    }
+
+    /**
+     * Retrieve a length from the format string. Either for a repeat count
+     * or a string length. Return one if no explicit repeat count.
+     *
+     * \param advance whether to move the stream position.
+     * \return the explicit length, or one if none is present.
+     */
+    protected int getLengthFromFormat(boolean advance) {
+        int valueLen = getIntFromFormat(advance);
+        if (valueLen == 0) {
+            valueLen = 1;
+        }
+        return valueLen;
+    }
+}
+
diff --git a/lang/java/src/com/wiredtiger/db/PackInputStream.java b/lang/java/src/com/wiredtiger/db/PackInputStream.java
new file mode 100644
index 00000000000..6082684a8bf
--- /dev/null
+++ b/lang/java/src/com/wiredtiger/db/PackInputStream.java
@@ -0,0 +1,320 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+package com.wiredtiger.db;
+
+import java.io.ByteArrayInputStream;
+import java.lang.StringBuffer;
+import com.wiredtiger.db.PackUtil;
+import com.wiredtiger.db.WiredTigerPackingException;
+
+/**
+ * An internal helper class for decoding WiredTiger packed values.
+ *
+ * Applications should not need to use this class.
+ */
+public class PackInputStream {
+
+    protected PackFormatInputStream format;
+    protected byte[] value;
+    protected int valueOff;
+    protected int valueLen;
+    // Index one past the final byte of the stream within value (off + len).
+    private int valueEnd;
+
+    /**
+     * Constructor.
+     *
+     * \param format A String that contains the WiredTiger format that
+     *               defines the layout of this packed value.
+     * \param value The raw bytes that back the stream.
+     */
+    public PackInputStream(String format, byte[] value) {
+        this(format, value, 0, value.length);
+    }
+
+    /**
+     * Constructor.
+     *
+     * \param format A String that contains the WiredTiger format that
+     *               defines the layout of this packed value.
+     * \param value The raw bytes that back the stream.
+     * \param off Offset into the value array at which the stream begins.
+     * \param len Length of the value array that forms the stream.
+     */
+    public PackInputStream(String format, byte[] value, int off, int len) {
+        this.format = new PackFormatInputStream(format);
+        this.value = value;
+        this.valueOff = off;
+        this.valueLen = len;
+        this.valueEnd = off + len;
+    }
+
+    /**
+     * Returns the raw packing format string.
+     */
+    public String getFormat() {
+        return format.toString();
+    }
+
+    /**
+     * Returns the raw value byte array.
+     */
+    public byte[] getValue() {
+        return value;
+    }
+
+    /**
+     * Retrieves a byte field from the stream.
+     *
+     * \throws WiredTigerPackingException if the next field is not a byte.
+     */
+    public byte getByte()
+    throws WiredTigerPackingException {
+        format.checkType('b', false);
+        format.consume();
+        // Undo the 0x80 translation applied when the byte was packed.
+        return (byte)(value[valueOff++] - 0x80);
+    }
+
+    /**
+     * Retrieves a byte array field from the stream.
+     *
+     * \param dest The byte array where the returned value will be stored. The
+     *             array should be large enough to store the entire data item,
+     *             if it is not, a truncated value will be returned.
+     */
+    public void getByteArray(byte[] dest)
+    throws WiredTigerPackingException {
+        this.getByteArray(dest, 0, dest.length);
+    }
+
+    /**
+     * Retrieves a byte array field from the stream.
+     *
+     * \param dest The byte array where the returned value will be stored.
+     * \param off Offset into the destination buffer to start copying into.
+     * \param len The length should be large enough to store the entire data
+     *            item, if it is not, a truncated value will be returned.
+     */
+    public void getByteArray(byte[] dest, int off, int len)
+    throws WiredTigerPackingException {
+        format.checkType('U', false);
+        getByteArrayInternal(getByteArrayLength(), dest, off, len);
+    }
+
+    /**
+     * Retrieves a byte array field from the stream. Creates a new byte array
+     * that is the size of the object being retrieved.
+     *
+     * \return a newly allocated array holding the field.
+     */
+    public byte[] getByteArray()
+    throws WiredTigerPackingException {
+        // Verify the field type, as the other getByteArray variants do.
+        format.checkType('U', false);
+        int itemLen = getByteArrayLength();
+        byte[] unpacked = new byte[itemLen];
+        getByteArrayInternal(itemLen, unpacked, 0, itemLen);
+        return unpacked;
+    }
+
+    /**
+     * Finds the length of a byte array. Either by decoding the length from
+     * the stream or using the remaining size of the stream.
+     */
+    private int getByteArrayLength()
+    throws WiredTigerPackingException {
+        /* The rest of the buffer is a byte array. */
+        if (format.available() == 1) {
+            // Measure against the end of the stream so that streams
+            // constructed with a non-zero offset are handled.
+            return valueEnd - valueOff;
+        }
+        return unpackInt(false);
+    }
+
+    /**
+     * Do the work of retrieving a byte array.
+     *
+     * \param itemLen Length of the packed item; the stream always advances
+     *                by this amount.
+     * \param dest Destination buffer.
+     * \param off Offset into dest to start copying into.
+     * \param destLen Maximum number of bytes to copy.
+     */
+    private void getByteArrayInternal(
+        int itemLen, byte[] dest, int off, int destLen)
+    throws WiredTigerPackingException {
+        /* TODO: padding. */
+        int copyLen = itemLen;
+        if (itemLen > destLen) {
+            copyLen = destLen;
+        }
+        format.consume();
+        System.arraycopy(value, valueOff, dest, off, copyLen);
+        valueOff += itemLen;
+    }
+
+    /**
+     * Retrieves an integer field from the stream.
+     *
+     * \throws WiredTigerPackingException on a type mismatch or if the value
+     *         does not fit in a Java int.
+     */
+    public int getInt()
+    throws WiredTigerPackingException {
+        format.checkType('i', false);
+        char type = format.getType();
+        // Upper case type codes denote unsigned fields.
+        boolean signed = !(type == 'I' || type == 'L');
+        format.consume();
+        return unpackInt(signed);
+    }
+
+    /**
+     * Retrieves a long field from the stream.
+     *
+     * \throws WiredTigerPackingException on a type mismatch or if an
+     *         unsigned value does not fit in a Java long.
+     */
+    public long getLong()
+    throws WiredTigerPackingException {
+        format.checkType('q', false);
+        // Upper case 'Q' denotes an unsigned field.
+        boolean signed = format.getType() != 'Q';
+        format.consume();
+        return unpackLong(signed);
+    }
+
+    /**
+     * Retrieves a record field from the stream.
+     */
+    public long getRecord()
+    throws WiredTigerPackingException {
+        format.checkType('r', false);
+        format.consume();
+        return unpackLong(false);
+    }
+
+    /**
+     * Retrieves a short field from the stream.
+     *
+     * \throws WiredTigerPackingException on a type mismatch or if the value
+     *         does not fit in a Java short.
+     */
+    public short getShort()
+    throws WiredTigerPackingException {
+        format.checkType('h', false);
+        // Upper case 'H' denotes an unsigned field.
+        boolean signed = format.getType() != 'H';
+        format.consume();
+        return unpackShort(signed);
+    }
+
+    /**
+     * Retrieves a string field from the stream.
+     */
+    public String getString()
+    throws WiredTigerPackingException {
+        int stringLength = 0;
+        format.checkType('S', false);
+        boolean nulTerminated = format.getType() == 'S';
+        if (!nulTerminated) {
+            // Get the length for a fixed length string
+            stringLength = format.getLengthFromFormat(true);
+        } else {
+            // The string is null terminated, but we need to know how many
+            // bytes are consumed - which won't necessarily match up to the
+            // string length.
+            for (; valueOff + stringLength < value.length &&
+                value[valueOff + stringLength] != 0; stringLength++) {}
+        }
+        format.consume();
+        String result = new String(value, valueOff, stringLength);
+        // Only the variable length encoding is followed by a terminator;
+        // fixed length strings occupy exactly stringLength bytes.
+        valueOff += stringLength + (nulTerminated ? 1 : 0);
+        return result;
+    }
+
+    /**
+     * Decodes an encoded short from the stream. This method does bounds
+     * checking, to ensure values fit, since some values may be encoded as
+     * unsigned values, and Java types are all signed.
+     *
+     * \param signed false if the field is unsigned, in which case negative
+     *               values are rejected as well.
+     */
+    private short unpackShort(boolean signed)
+    throws WiredTigerPackingException {
+        long ret = unpackLong(true);
+        if (ret > Short.MAX_VALUE || ret < Short.MIN_VALUE ||
+            (!signed && ret < 0)) {
+            throw new WiredTigerPackingException("Overflow unpacking short.");
+        }
+        return (short)ret;
+    }
+
+    /**
+     * Decodes an encoded integer from the stream. This method does bounds
+     * checking, to ensure values fit, since some values may be encoded as
+     * unsigned values, and Java types are all signed.
+     *
+     * \param signed false if the field is unsigned, in which case negative
+     *               values are rejected as well.
+     */
+    private int unpackInt(boolean signed)
+    throws WiredTigerPackingException {
+        long ret = unpackLong(true);
+        if (ret > Integer.MAX_VALUE || ret < Integer.MIN_VALUE ||
+            (!signed && ret < 0)) {
+            throw new WiredTigerPackingException("Overflow unpacking integer.");
+        }
+        return (int)ret;
+    }
+
+    /**
+     * Decodes an encoded long from the stream. This method does bounds
+     * checking, to ensure values fit, since some values may be encoded as
+     * unsigned values, and Java types are all signed.
+     * The packing format is defined in the WiredTiger C integer packing
+     * implementation, which is at src/include/intpack.i
+     *
+     * \param signed when false, a negative decoded value is reported as an
+     *               overflow.
+     */
+    private long unpackLong(boolean signed)
+    throws WiredTigerPackingException {
+        int len;
+        long unpacked = 0;
+        // The upper four bits of the first byte select the encoding.
+        switch (value[valueOff] & 0xf0) {
+        case PackUtil.NEG_MULTI_MARKER & 0xff:
+            // The low four bits hold the number of leading 0xff bytes that
+            // were dropped when packing.
+            len = (int)PackUtil.SIZEOF_LONG - (value[valueOff++] & 0xf);
+
+            // Start from all one bits (the int literal widens to -1L).
+            for (unpacked = 0xffffffff; len != 0; --len) {
+                unpacked = (unpacked << 8) | value[valueOff++] & 0xff;
+            }
+            break;
+        case PackUtil.NEG_2BYTE_MARKER & 0xff:
+        case (PackUtil.NEG_2BYTE_MARKER | 0x10) & 0xff:
+            unpacked = PackUtil.GET_BITS(value[valueOff++], 5, 0) << 8;
+            unpacked |= value[valueOff++] & 0xff;
+            unpacked += PackUtil.NEG_2BYTE_MIN;
+            break;
+        case PackUtil.NEG_1BYTE_MARKER & 0xff:
+        case (PackUtil.NEG_1BYTE_MARKER | 0x10) & 0xff:
+        case (PackUtil.NEG_1BYTE_MARKER | 0x20) & 0xff:
+        case (PackUtil.NEG_1BYTE_MARKER | 0x30) & 0xff:
+            unpacked = PackUtil.NEG_1BYTE_MIN +
+                PackUtil.GET_BITS(value[valueOff++], 6, 0);
+            break;
+        case PackUtil.POS_1BYTE_MARKER & 0xff:
+        case (PackUtil.POS_1BYTE_MARKER | 0x10) & 0xff:
+        case (PackUtil.POS_1BYTE_MARKER | 0x20) & 0xff:
+        case (PackUtil.POS_1BYTE_MARKER | 0x30) & 0xff:
+            unpacked = PackUtil.GET_BITS(value[valueOff++], 6, 0);
+            break;
+        case PackUtil.POS_2BYTE_MARKER & 0xff:
+        case (PackUtil.POS_2BYTE_MARKER | 0x10) & 0xff:
+            unpacked = PackUtil.GET_BITS(value[valueOff++], 5, 0) << 8;
+            unpacked |= value[valueOff++] & 0xff;
+            unpacked += PackUtil.POS_1BYTE_MAX + 1;
+            break;
+        case PackUtil.POS_MULTI_MARKER & 0xff:
+            // There are four length bits in the first byte.
+            len = (value[valueOff++] & 0xf);
+
+            for (unpacked = 0; len != 0; --len) {
+                unpacked = (unpacked << 8) | value[valueOff++] & 0xff;
+            }
+            unpacked += PackUtil.POS_2BYTE_MAX + 1;
+            break;
+        default:
+            throw new WiredTigerPackingException(
+                "Error decoding packed value.");
+        }
+        // Check for overflow if decoding an unsigned value - since Java only
+        // supports signed values.
+        if (!signed && unpacked < 0) {
+            throw new WiredTigerPackingException("Overflow unpacking long.");
+        }
+
+        return (unpacked);
+    }
+}
+
diff --git a/lang/java/src/com/wiredtiger/db/PackOutputStream.java b/lang/java/src/com/wiredtiger/db/PackOutputStream.java
new file mode 100644
index 00000000000..693f79c3ff2
--- /dev/null
+++ b/lang/java/src/com/wiredtiger/db/PackOutputStream.java
@@ -0,0 +1,244 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+package com.wiredtiger.db;
+
+import java.io.ByteArrayOutputStream;
+import java.lang.StringBuffer;
+import com.wiredtiger.db.WiredTigerPackingException;
+
+/**
+ * An internal helper class for encoding WiredTiger packed values.
+ *
+ * Applications should not need to use this class.
+ */
+public class PackOutputStream {
+
+    final static int MAX_INT_BYTES = 21;
+    protected PackFormatInputStream format;
+    protected ByteArrayOutputStream packed;
+    // Scratch buffer reused by packLong for each encoded integer.
+    protected byte[] intBuf;
+
+    /**
+     * Constructor.
+     *
+     * \param format A String that contains the WiredTiger format that
+     *               defines the layout of this packed value.
+     */
+    public PackOutputStream(String format) {
+        this.format = new PackFormatInputStream(format);
+        intBuf = new byte[MAX_INT_BYTES];
+        packed = new ByteArrayOutputStream(100);
+    }
+
+    /**
+     * Returns the raw packing format string.
+     */
+    public String getFormat() {
+        return format.toString();
+    }
+
+    /**
+     * Returns the current packed value.
+     */
+    public byte[] getValue() {
+        return packed.toByteArray();
+    }
+
+    /**
+     * Reset the stream position.
+     */
+    public void reset() {
+        format.reset();
+        packed.reset();
+    }
+
+    /**
+     * Add a byte field to the stream.
+     *
+     * \param value The byte value to be added.
+     */
+    public void addByte(byte value)
+    throws WiredTigerPackingException {
+        format.checkType('b', true);
+        /* Translate to maintain ordering with the sign bit. */
+        byte input = (byte)(value + 0x80);
+        packed.write(input);
+    }
+
+    /**
+     * Add a byte array field to the stream.
+     *
+     * \param value The byte array value to be added.
+     */
+    public void addByteArray(byte[] value)
+    throws WiredTigerPackingException {
+        this.addByteArray(value, 0, value.length);
+    }
+
+    /**
+     * Add a byte array field to the stream.
+     *
+     * \param value The byte array value to be added.
+     * \param off The offset from the start of value to begin using the array.
+     * \param len The length of the value to encode.
+     */
+    public void addByteArray(byte[] value, int off, int len)
+    throws WiredTigerPackingException {
+        format.checkType('U', true);
+        // If this is not the last item, store the size.
+        if (format.available() > 0) {
+            packLong(len, false);
+        }
+
+        packed.write(value, off, len);
+        /* TODO: padding. */
+    }
+
+    /**
+     * Add an integer field to the stream.
+     *
+     * \param value The integer value to be added.
+     */
+    public void addInt(int value)
+    throws WiredTigerPackingException {
+        format.checkType('i', true);
+        packLong(value, true);
+    }
+
+    /**
+     * Add a long field to the stream.
+     *
+     * \param value The long value to be added.
+     */
+    public void addLong(long value)
+    throws WiredTigerPackingException {
+        format.checkType('q', true);
+        packLong(value, true);
+    }
+
+    /**
+     * Add a record field to the stream.
+     *
+     * \param value The record value to be added.
+     */
+    public void addRecord(long value)
+    throws WiredTigerPackingException {
+        format.checkType('r', true);
+        packLong(value, true);
+    }
+
+    /**
+     * Add a short field to the stream.
+     *
+     * \param value The short value to be added.
+     */
+    public void addShort(short value)
+    throws WiredTigerPackingException {
+        format.checkType('h', true);
+        packLong(value, true);
+    }
+
+    /**
+     * Add a string field to the stream.
+     *
+     * Lengths are measured in encoded bytes. A value longer than a fixed
+     * length field is truncated; a shorter one is padded with zero bytes.
+     *
+     * \param value The string value to be added.
+     */
+    public void addString(String value)
+    throws WiredTigerPackingException {
+        format.checkType('s', false);
+        char fieldFormat = format.getType();
+        // Use the default Charset.
+        byte[] encoded = value.getBytes();
+        int copyLen;
+        int padBytes;
+        // Strings have two possible encodings. A lower case 's' is not null
+        // terminated, and has a length defined in the format (default 1). An
+        // upper case 'S' is variable length and has a null terminator.
+        if (fieldFormat == 's') {
+            int fieldLen = format.getLengthFromFormat(true);
+            copyLen = Math.min(fieldLen, encoded.length);
+            padBytes = fieldLen - copyLen;
+        } else {
+            copyLen = encoded.length;
+            padBytes = 1; // Null terminator
+        }
+        // We're done pulling information from the field now.
+        format.consume();
+
+        packed.write(encoded, 0, copyLen);
+        while (padBytes-- > 0) {
+            packed.write(0);
+        }
+    }
+
+    /**
+     * Add a long field to the stream.
+     * The packing format is defined in the WiredTiger C integer packing
+     * implementation, which is at src/include/intpack.i
+     *
+     * \param x The long value to be added.
+     * \param signed Whether the value is signed or unsigned.
+     * \throws WiredTigerPackingException if an unsigned value is negative.
+     */
+    private void packLong(long x, boolean signed)
+    throws WiredTigerPackingException {
+        int offset = 0;
+
+        if (!signed && x < 0) {
+            throw new WiredTigerPackingException("Overflow packing long.");
+        }
+
+        if (x < PackUtil.NEG_2BYTE_MIN) {
+            intBuf[offset] = PackUtil.NEG_MULTI_MARKER;
+            int lz = Long.numberOfLeadingZeros(~x) / 8;
+            int len = PackUtil.SIZEOF_LONG - lz;
+
+            //
+            // There are four size bits we can use in the first
+            // byte. For negative numbers, we store the number of
+            // leading 0xff bytes to maintain ordering (if this is
+            // not obvious, it may help to remember that -1 is the
+            // largest negative number).
+            intBuf[offset++] |= (lz & 0xf);
+
+            for (int shift = (len - 1) << 3;
+                len != 0; shift -= 8, --len) {
+                intBuf[offset++] = (byte)(x >> shift);
+            }
+        } else if (x < PackUtil.NEG_1BYTE_MIN) {
+            x -= PackUtil.NEG_2BYTE_MIN;
+            intBuf[offset++] =
+                (byte)(PackUtil.NEG_2BYTE_MARKER | PackUtil.GET_BITS(x, 13, 8));
+            intBuf[offset++] = PackUtil.GET_BITS(x, 8, 0);
+        } else if (x < 0) {
+            x -= PackUtil.NEG_1BYTE_MIN;
+            intBuf[offset++] =
+                (byte)(PackUtil.NEG_1BYTE_MARKER | PackUtil.GET_BITS(x, 6, 0));
+        } else if (x <= PackUtil.POS_1BYTE_MAX) {
+            intBuf[offset++] =
+                (byte)(PackUtil.POS_1BYTE_MARKER | PackUtil.GET_BITS(x, 6, 0));
+        } else if (x <= PackUtil.POS_2BYTE_MAX) {
+            x -= PackUtil.POS_1BYTE_MAX + 1;
+            intBuf[offset++] =
+                (byte)(PackUtil.POS_2BYTE_MARKER | PackUtil.GET_BITS(x, 13, 8));
+            intBuf[offset++] = PackUtil.GET_BITS(x, 8, 0);
+        } else {
+            x -= PackUtil.POS_2BYTE_MAX + 1;
+            intBuf[offset] = PackUtil.POS_MULTI_MARKER;
+            int lz = Long.numberOfLeadingZeros(x) / 8;
+            int len = PackUtil.SIZEOF_LONG - lz;
+
+            // There are four bits we can use in the first byte.
+            intBuf[offset++] |= (len & 0xf);
+
+            for (int shift = (len - 1) << 3;
+                len != 0; --len, shift -= 8) {
+                intBuf[offset++] = (byte)(x >> shift);
+            }
+        }
+        packed.write(intBuf, 0, offset);
+    }
+}
diff --git a/lang/java/src/com/wiredtiger/db/PackUtil.java b/lang/java/src/com/wiredtiger/db/PackUtil.java
new file mode 100644
index 00000000000..b82e0294073
--- /dev/null
+++ b/lang/java/src/com/wiredtiger/db/PackUtil.java
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+package com.wiredtiger.db;
+
+import java.lang.String;
+
+/**
+ * An internal helper class with utilities for packing and unpacking values.
+ *
+ * Applications should not need to use this class.
+ */
+class PackUtil {
+    /* Constants. */
+    // Marker values stored in the upper bits of the first byte of a packed
+    // integer; they select the encoding used for the rest of the value.
+    final static byte NEG_MULTI_MARKER = (byte)0x10;
+    final static byte NEG_2BYTE_MARKER = (byte)0x20;
+    final static byte NEG_1BYTE_MARKER = (byte)0x40;
+    final static byte POS_1BYTE_MARKER = (byte)0x80;
+    final static byte POS_2BYTE_MARKER = (byte)0xc0;
+    final static byte POS_MULTI_MARKER = (byte)0xe0;
+
+    // Boundaries of the value ranges covered by the one and two byte
+    // encodings.
+    final static int NEG_1BYTE_MIN = ((-1) << 6);
+    final static int NEG_2BYTE_MIN = (((-1) << 13) + NEG_1BYTE_MIN);
+    final static int POS_1BYTE_MAX = ((1 << 6) - 1);
+    final static int POS_2BYTE_MAX = ((1 << 13) + POS_1BYTE_MAX);
+
+    // See: http://docs.python.org/2/library/struct.html for an explanation
+    // of what these special characters mean.
+    // TODO: Care about byte ordering and padding in packed formats.
+    final static String PackSpecialCharacters = "@=<>!x";
+
+    final static int SIZEOF_LONG = 8;
+
+    /**
+     * Extract bits from a value, counting from LSB == 0.
+     *
+     * Bits at index start and above are masked off and the remainder is
+     * shifted right by end, so the result holds bits [end, start) of x.
+     * The result is narrowed to a byte.
+     *
+     * \param x The value to extract bits from.
+     * \param start One past the index of the highest bit to keep.
+     * \param end The index of the lowest bit to keep.
+     * \return the selected bits, moved down to bit zero.
+     */
+    public static byte GET_BITS(long x, int start, int end) {
+        // Build the mask with a long shift; an int shift wraps once start
+        // reaches 31.
+        return (byte)((x & ((1L << start) - 1)) >> end);
+    }
+
+
+}
diff --git a/lang/java/src/com/wiredtiger/db/WiredTigerException.java b/lang/java/src/com/wiredtiger/db/WiredTigerException.java
new file mode 100644
index 00000000000..6424cb3a92e
--- /dev/null
+++ b/lang/java/src/com/wiredtiger/db/WiredTigerException.java
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+package com.wiredtiger.db;
+
+/**
+ * An exception that is generated by the WiredTiger application.
+ */
+public class WiredTigerException extends Exception {
+    // Exception is Serializable; pin the serialized form's version.
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Constructor.
+     *
+     * \param msg The detail message describing the failure.
+     */
+    public WiredTigerException(String msg) {
+        super(msg);
+    }
+}
diff --git a/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java b/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java
new file mode 100644
index 00000000000..1c4ab079748
--- /dev/null
+++ b/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java
@@ -0,0 +1,21 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+package com.wiredtiger.db;
+
+/**
+ * An exception that is generated by the WiredTiger application during
+ * encoding or decoding of packed values.
+ */
+public class WiredTigerPackingException extends WiredTigerException {
+    // Exceptions are Serializable; pin the serialized form's version.
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Constructor.
+     *
+     * \param msg The detail message describing the packing failure.
+     */
+    public WiredTigerPackingException(String msg) {
+        super(msg);
+    }
+}
+
diff --git a/lang/java/wiredtiger.i b/lang/java/wiredtiger.i
new file mode 100644
index 00000000000..f93a45c2581
--- /dev/null
+++ b/lang/java/wiredtiger.i
@@ -0,0 +1,823 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ *
+ * wiredtiger.i
+ * The SWIG interface file defining the wiredtiger Java API.
+ */
+
+%module wiredtiger
+
+%include "enums.swg"
+%include "typemaps.i"
+
+/*
+ * Static initializer added to the generated JNI class: load the
+ * "wiredtiger_java" native library, and if it cannot be linked, print the
+ * error and exit the JVM with status 1.
+ */
+%pragma(java) jniclasscode=%{
+ static {
+ try {
+ System.loadLibrary("wiredtiger_java");
+ } catch (UnsatisfiedLinkError e) {
+ System.err.println("Native code library failed to load. \n" + e);
+ System.exit(1);
+ }
+ }
+%}
+
+%{
+typedef int bool;
+
+/*
+ * throwWiredTigerException --
+ *	Raise a com.wiredtiger.db.WiredTigerException with the given message in
+ * the calling Java thread.  Nothing is thrown here if the exception class
+ * cannot be found.
+ */
+static void throwWiredTigerException(JNIEnv *jenv, const char *msg) {
+ jclass excep = (*jenv)->FindClass(jenv, "com/wiredtiger/db/WiredTigerException");
+ if (excep)
+ (*jenv)->ThrowNew(jenv, excep, msg);
+}
+%}
+
+/* No finalizers */
+%typemap(javafinalize) SWIGTYPE ""
+
+/* Event handlers are not supported in Java. */
+%typemap(in, numinputs=0) WT_EVENT_HANDLER * %{ $1 = NULL; %}
+
+/* Allow silently passing the Java object and JNIEnv into our code. */
+%typemap(in, numinputs=0) jobject *jthis %{ $1 = jarg1_; %}
+%typemap(in, numinputs=0) JNIEnv * %{ $1 = jenv; %}
+
+/* 64 bit typemaps. */
+%typemap(jni) uint64_t "jlong"
+%typemap(jtype) uint64_t "long"
+%typemap(jstype) uint64_t "long"
+
+%typemap(javain) uint64_t "$javainput"
+%typemap(javaout) uint64_t {
+ return $jnicall;
+}
+
+/* Map WT_ITEM * to a Java byte[], both as an argument and a return value. */
+%typemap(jni) WT_ITEM * "jbyteArray"
+%typemap(jtype) WT_ITEM * "byte[]"
+%typemap(jstype) WT_ITEM * "byte[]"
+
+%typemap(javain) WT_ITEM * "$javainput"
+%typemap(javaout) WT_ITEM * {
+ return $jnicall;
+}
+
+/*
+ * Input: point a stack-allocated WT_ITEM at the elements and length of the
+ * Java byte array.
+ */
+%typemap(in) WT_ITEM * (WT_ITEM item) %{
+ $1 = &item;
+ $1->data = (*jenv)->GetByteArrayElements(jenv, $input, 0);
+ $1->size = (*jenv)->GetArrayLength(jenv, $input);
+%}
+
+/* After the call: release the array elements obtained by the input typemap. */
+%typemap(argout) WT_ITEM * %{
+ (*jenv)->ReleaseByteArrayElements(jenv, $input, $1->data, 0);
+%}
+
+/*
+ * Output: copy the item's data into a newly allocated Java byte array; a NULL
+ * item yields a NULL result.
+ */
+%typemap(out) WT_ITEM * %{
+ if ($1 == NULL)
+ $result = NULL;
+ else if (($result = (*jenv)->NewByteArray(jenv, $1->size)) != NULL) {
+ (*jenv)->SetByteArrayRegion(jenv,
+ $result, 0, $1->size, $1->data);
+ }
+%}
+
+/* Don't require empty config strings. */
+%typemap(default) const char *config %{ $1 = NULL; %}
+
+/*
+ * Treat int return values as WiredTiger error codes: anything other than
+ * zero or WT_NOTFOUND is thrown as a WiredTigerException carrying the
+ * wiredtiger_strerror text; zero and WT_NOTFOUND are returned to Java as-is.
+ */
+%typemap(out) int %{
+ if ($1 != 0 && $1 != WT_NOTFOUND) {
+ throwWiredTigerException(jenv, wiredtiger_strerror($1));
+ return $null;
+ }
+ $result = $1;
+%}
+
+/*
+ * Extra 'self' elimination.
+ * The methods we're wrapping look like this:
+ * struct __wt_xxx {
+ * int method(WT_XXX *, ...otherargs...);
+ * };
+ * To SWIG, that is equivalent to:
+ * int method(struct __wt_xxx *self, WT_XXX *, ...otherargs...);
+ * and we use consecutive argument matching of typemaps to convert two args to
+ * one.
+ */
+%define WT_CLASS(type, class, name)
+%typemap(in, numinputs=0) type *name "$1 = *(type **)&jarg1;"
+%typemap(javaimports) type "
+/**
+ * @copydoc class
+ * @ingroup wt_java
+ */"
+%enddef
+
+%pragma(java) moduleimports=%{
+/**
+ * @defgroup wt_java WiredTiger Java API
+ *
+ * Java wrappers around the WiredTiger C API.
+ */
+
+/**
+ * @ingroup wt_java
+ */
+%}
+
+WT_CLASS(struct __wt_connection, WT_CONNECTION, connection)
+WT_CLASS(struct __wt_session, WT_SESSION, session)
+WT_CLASS(struct __wt_cursor, WT_CURSOR, cursor)
+
+/*
+ * COPYDOC --
+ *	Set the Java method modifiers of SIGNATURE_CLASS::METHOD to a Javadoc
+ * comment containing "@copydoc CLASS::METHOD" followed by "public".
+ */
+%define COPYDOC(SIGNATURE_CLASS, CLASS, METHOD)
+%javamethodmodifiers SIGNATURE_CLASS::METHOD "
+ /**
+ * @copydoc CLASS::METHOD
+ */
+ public ";
+%enddef
+
+%include "java_doc.i"
+
+/* WT_CURSOR customization. */
+/* First, replace the varargs get / set methods with Java equivalents. */
+%ignore __wt_cursor::get_key;
+%ignore __wt_cursor::get_value;
+%ignore __wt_cursor::set_key;
+%ignore __wt_cursor::set_value;
+%ignore __wt_cursor::insert;
+%ignore __wt_cursor::remove;
+%ignore __wt_cursor::search;
+%ignore __wt_cursor::search_near;
+%ignore __wt_cursor::update;
+%javamethodmodifiers __wt_cursor::next "protected";
+%rename (next_wrap) __wt_cursor::next;
+%javamethodmodifiers __wt_cursor::prev "protected";
+%rename (prev_wrap) __wt_cursor::prev;
+%javamethodmodifiers __wt_cursor::key_format "protected";
+%javamethodmodifiers __wt_cursor::value_format "protected";
+
+%ignore __wt_cursor::compare(WT_CURSOR *, WT_CURSOR *, int *);
+%rename (compare_wrap) __wt_cursor::compare;
+
+/* SWIG magic to turn Java byte strings into data / size. */
+%apply (char *STRING, int LENGTH) { (char *data, int size) };
+
+/* Status from search_near */
+%javaconst(1);
+%inline %{
+enum SearchStatus { FOUND, NOTFOUND, SMALLER, LARGER };
+%}
+
+%extend __wt_cursor {
+
+    /*
+     * Fetch the cursor's key as a WT_ITEM.  Throws a WiredTigerException
+     * and returns NULL if get_key fails.  The returned pointer is the
+     * cursor's own key field, not the local item passed to get_key.
+     */
+    %javamethodmodifiers get_key_wrap "protected";
+    WT_ITEM *get_key_wrap(JNIEnv *jenv) {
+        WT_ITEM k;
+        int ret;
+        if ((ret = $self->get_key($self, &k)) != 0) {
+            throwWiredTigerException(jenv, wiredtiger_strerror(ret));
+            return NULL;
+        }
+        return &$self->key;
+    }
+
+    /*
+     * Fetch the cursor's value as a WT_ITEM.  Throws a WiredTigerException
+     * and returns NULL if get_value fails.  The returned pointer is the
+     * cursor's own value field, not the local item passed to get_value.
+     */
+    %javamethodmodifiers get_value_wrap "protected";
+    WT_ITEM *get_value_wrap(JNIEnv *jenv) {
+        WT_ITEM v;
+        int ret;
+        if ((ret = $self->get_value($self, &v)) != 0) {
+            throwWiredTigerException(jenv, wiredtiger_strerror(ret));
+            return NULL;
+        }
+        return &$self->value;
+    }
+
+    /* Set the key and value, then insert; returns insert's result. */
+    %javamethodmodifiers insert_wrap "protected";
+    int insert_wrap(WT_ITEM *k, WT_ITEM *v) {
+        $self->set_key($self, k);
+        $self->set_value($self, v);
+        return $self->insert($self);
+    }
+
+    /* Set the key, then remove; returns remove's result. */
+    %javamethodmodifiers remove_wrap "protected";
+    int remove_wrap(WT_ITEM *k) {
+        $self->set_key($self, k);
+        return $self->remove($self);
+    }
+
+    /* Set the key, then search; returns search's result. */
+    %javamethodmodifiers search_wrap "protected";
+    int search_wrap(WT_ITEM *k) {
+        $self->set_key($self, k);
+        return $self->search($self);
+    }
+
+    /*
+     * Set the key, then search_near.  On success the comparison result is
+     * mapped to FOUND, SMALLER or LARGER.  Any failure returns NOTFOUND,
+     * and failures other than WT_NOTFOUND also throw a
+     * WiredTigerException.
+     */
+    %javamethodmodifiers search_near_wrap "protected";
+    enum SearchStatus search_near_wrap(JNIEnv *jenv, WT_ITEM *k) {
+        int cmp = 0, ret;
+
+        $self->set_key($self, k);
+        ret = $self->search_near($self, &cmp);
+        if (ret != 0 && ret != WT_NOTFOUND)
+            throwWiredTigerException(jenv, wiredtiger_strerror(ret));
+        if (ret == 0)
+            return (cmp == 0 ? FOUND : cmp < 0 ? SMALLER : LARGER);
+        return (NOTFOUND);
+    }
+
+    /* Set the key and value, then update; returns update's result. */
+    %javamethodmodifiers update_wrap "protected";
+    int update_wrap(WT_ITEM *k, WT_ITEM *v) {
+        $self->set_key($self, k);
+        $self->set_value($self, v);
+        return $self->update($self);
+    }
+
+    /*
+     * Compare this cursor's position with another cursor's.  Throws a
+     * WiredTigerException if the comparison fails, in which case zero is
+     * returned.
+     *
+     * NOTE(review): the return value is a comparison result, yet it looks
+     * like it still passes through the generic int out typemap, which
+     * treats non-zero values as error codes -- confirm.
+     */
+    int compare_wrap(JNIEnv *jenv, WT_CURSOR *other) {
+        /* Initialized so a failed compare never returns garbage. */
+        int cmp = 0, ret;
+
+        ret = $self->compare($self, other, &cmp);
+        if (ret != 0)
+            throwWiredTigerException(jenv, wiredtiger_strerror(ret));
+        return cmp;
+    }
+}
+
+/* Cache key/value formats in Cursor */
+%typemap(javabody) struct __wt_cursor %{
+ // Native cursor handle and ownership flag, set from the constructor arguments.
+ private long swigCPtr;
+ protected boolean swigCMemOwn;
+ // Key/value format strings, read once in the constructor and cached.
+ protected String keyFormat;
+ protected String valueFormat;
+ // Packers that accumulate the key/value supplied through the put* methods.
+ protected PackOutputStream keyPacker;
+ protected PackOutputStream valuePacker;
+ // Unpackers over the current record. The constructor does not assign
+ // them, so they stay null until a positioning call creates them.
+ protected PackInputStream keyUnpacker;
+ protected PackInputStream valueUnpacker;
+
+ /**
+ * Wrap a native cursor and cache its key/value formats.
+ *
+ * \param cPtr The native cursor handle.
+ * \param cMemoryOwn Stored in swigCMemOwn.
+ */
+ protected $javaclassname(long cPtr, boolean cMemoryOwn) {
+ swigCMemOwn = cMemoryOwn;
+ // swigCPtr is assigned before the format getters are called below.
+ // NOTE(review): presumably the getters read the native cursor through
+ // swigCPtr, which would make this ordering required -- confirm.
+ swigCPtr = cPtr;
+ keyFormat = getKey_format();
+ valueFormat = getValue_format();
+ keyPacker = new PackOutputStream(keyFormat);
+ valuePacker = new PackOutputStream(valueFormat);
+ }
+
+ /**
+ * Return the native handle of a cursor, or 0 when obj is null.
+ */
+ protected static long getCPtr($javaclassname obj) {
+ return (obj == null) ? 0 : obj.swigCPtr;
+ }
+%}
+
+%typemap(javacode) struct __wt_cursor %{
+
+ /**
+ * Retrieve the format string for this cursor's key.
+ */
+ public String getKeyFormat() {
+ return keyFormat;
+ }
+
+ /**
+ * Retrieve the format string for this cursor's value.
+ */
+ public String getValueFormat() {
+ return valueFormat;
+ }
+
+ /**
+ * Append a byte to the cursor's key.
+ *
+ * \param value The value to append.
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyByte(byte value)
+ throws WiredTigerPackingException {
+ keyPacker.addByte(value);
+ return this;
+ }
+
+ /**
+ * Append a byte array to the cursor's key.
+ *
+ * \param value The value to append.
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyByteArray(byte[] value)
+ throws WiredTigerPackingException {
+ this.putKeyByteArray(value, 0, value.length);
+ return this;
+ }
+
+ /**
+ * Append a byte array to the cursor's key.
+ *
+ * \param value The value to append.
+ * \param off The offset into value at which to start.
+ * \param len The length of the byte array.
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyByteArray(byte[] value, int off, int len)
+ throws WiredTigerPackingException {
+ keyPacker.addByteArray(value, off, len);
+ return this;
+ }
+
+ /**
+ * Append an integer to the cursor's key.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyInt(int value)
+ throws WiredTigerPackingException {
+ keyPacker.addInt(value);
+ return this;
+ }
+
+ /**
+ * Append a long to the cursor's key.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyLong(long value)
+ throws WiredTigerPackingException {
+ keyPacker.addLong(value);
+ return this;
+ }
+
+ /**
+ * Append a short integer to the cursor's key.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyShort(short value)
+ throws WiredTigerPackingException {
+ keyPacker.addShort(value);
+ return this;
+ }
+
+ /**
+ * Append a string to the cursor's key.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putKeyString(String value)
+ throws WiredTigerPackingException {
+ keyPacker.addString(value);
+ return this;
+ }
+
+ /**
+ * Append a byte to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueByte(byte value)
+ throws WiredTigerPackingException {
+ valuePacker.addByte(value);
+ return this;
+ }
+
+ /**
+ * Append a byte array to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueByteArray(byte[] value)
+ throws WiredTigerPackingException {
+ this.putValueByteArray(value, 0, value.length);
+ return this;
+ }
+
+ /**
+ * Append a byte array to the cursor's value.
+ *
+ * \param value The value to append
+ * \param off The offset into value at which to start.
+ * \param len The length of the byte array.
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueByteArray(byte[] value, int off, int len)
+ throws WiredTigerPackingException {
+ valuePacker.addByteArray(value, off, len);
+ return this;
+ }
+
+ /**
+ * Append an integer to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueInt(int value)
+ throws WiredTigerPackingException {
+ valuePacker.addInt(value);
+ return this;
+ }
+
+ /**
+ * Append a long to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueLong(long value)
+ throws WiredTigerPackingException {
+ valuePacker.addLong(value);
+ return this;
+ }
+
+ /**
+ * Append a short integer to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueShort(short value)
+ throws WiredTigerPackingException {
+ valuePacker.addShort(value);
+ return this;
+ }
+
+ /**
+ * Append a string to the cursor's value.
+ *
+ * \param value The value to append
+ * \return This cursor object, so put calls can be chained.
+ */
+ public Cursor putValueString(String value)
+ throws WiredTigerPackingException {
+ valuePacker.addString(value);
+ return this;
+ }
+
+ /**
+ * Retrieve a byte from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public byte getKeyByte()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getByte();
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's key.
+ *
+ * \param output The byte array where the returned value will be stored.
+ * The array should be large enough to store the entire
+ * data item, if not a truncated value will be returned.
+ */
+ public void getKeyByteArray(byte[] output)
+ throws WiredTigerPackingException {
+ this.getKeyByteArray(output, 0, output.length);
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's key.
+ *
+ * \param output The byte array where the returned value will be stored.
+ * \param off Offset into the destination buffer to start copying into.
+ * \param len The length should be large enough to store the entire
+ * data item, if not a truncated value will be returned.
+ */
+ public void getKeyByteArray(byte[] output, int off, int len)
+ throws WiredTigerPackingException {
+ keyUnpacker.getByteArray(output, off, len);
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public byte[] getKeyByteArray()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getByteArray();
+ }
+
+ /**
+ * Retrieve an integer from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public int getKeyInt()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getInt();
+ }
+
+ /**
+ * Retrieve a long from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public long getKeyLong()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getLong();
+ }
+
+ /**
+ * Retrieve a short integer from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public short getKeyShort()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getShort();
+ }
+
+ /**
+ * Retrieve a string from the cursor's key.
+ *
+ * \return The requested value.
+ */
+ public String getKeyString()
+ throws WiredTigerPackingException {
+ return keyUnpacker.getString();
+ }
+
+ /**
+ * Retrieve a byte from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public byte getValueByte()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getByte();
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's value.
+ *
+ * \param output The byte array where the returned value will be stored.
+ * The array should be large enough to store the entire
+ * data item, if not a truncated value will be returned.
+ */
+ public void getValueByteArray(byte[] output)
+ throws WiredTigerPackingException {
+ this.getValueByteArray(output, 0, output.length);
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's value.
+ *
+ * \param output The byte array where the returned value will be stored.
+ * \param off Offset into the destination buffer to start copying into.
+ * \param len The length should be large enough to store the entire
+ * data item, if not a truncated value will be returned.
+ */
+ public void getValueByteArray(byte[] output, int off, int len)
+ throws WiredTigerPackingException {
+ valueUnpacker.getByteArray(output, off, len);
+ }
+
+ /**
+ * Retrieve a byte array from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public byte[] getValueByteArray()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getByteArray();
+ }
+
+ /**
+ * Retrieve an integer from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public int getValueInt()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getInt();
+ }
+
+ /**
+ * Retrieve a long from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public long getValueLong()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getLong();
+ }
+
+ /**
+ * Retrieve a short integer from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public short getValueShort()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getShort();
+ }
+
+ /**
+ * Retrieve a string from the cursor's value.
+ *
+ * \return The requested value.
+ */
+ public String getValueString()
+ throws WiredTigerPackingException {
+ return valueUnpacker.getString();
+ }
+
+	/**
+	 * Insert the key/value accumulated through the put* calls.
+	 *
+	 * Both packers are reset before the native call is made, whatever
+	 * status that call then returns.
+	 *
+	 * \return The status of the operation.
+	 */
+	public int insert() {
+		final byte[] packedKey = keyPacker.getValue();
+		final byte[] packedValue = valuePacker.getValue();
+
+		keyPacker.reset();
+		valuePacker.reset();
+
+		final int status = insert_wrap(packedKey, packedValue);
+		return status;
+	}
+
+	/**
+	 * Update the table with the key/value accumulated through the put*
+	 * calls.
+	 *
+	 * Both packers are reset before the native call is made, whatever
+	 * status that call then returns.
+	 *
+	 * \return The status of the operation.
+	 */
+	public int update() {
+		final byte[] packedKey = keyPacker.getValue();
+		final byte[] packedValue = valuePacker.getValue();
+
+		keyPacker.reset();
+		valuePacker.reset();
+
+		final int status = update_wrap(packedKey, packedValue);
+		return status;
+	}
+
+	/**
+	 * Remove the entry for the key accumulated through the putKey* calls
+	 * from the table.
+	 *
+	 * Only the key packer is read and reset; the value packer is left
+	 * untouched.
+	 *
+	 * \return The status of the operation.
+	 */
+	public int remove() {
+		final byte[] packedKey = keyPacker.getValue();
+
+		keyPacker.reset();
+
+		final int status = remove_wrap(packedKey);
+		return status;
+	}
+
+	/**
+	 * Compare this cursor's position to that of another Cursor.
+	 *
+	 * \param other The cursor to compare against.
+	 * \return The result of the comparison.
+	 */
+	public int compare(Cursor other) {
+		final int comparison = compare_wrap(other);
+		return comparison;
+	}
+
+	/**
+	 * Move to the next item in the table.
+	 *
+	 * Pending put* data is discarded. On success (status 0) the key and
+	 * value unpackers are rebuilt over the new record; otherwise both are
+	 * set to null.
+	 *
+	 * \return The status of the operation.
+	 */
+	public int next() {
+		final int status = next_wrap();
+
+		keyPacker.reset();
+		valuePacker.reset();
+
+		if (status == 0) {
+			keyUnpacker =
+			    new PackInputStream(keyFormat, get_key_wrap());
+			valueUnpacker =
+			    new PackInputStream(valueFormat, get_value_wrap());
+		} else {
+			keyUnpacker = null;
+			valueUnpacker = null;
+		}
+		return status;
+	}
+
+	/**
+	 * Move to the previous item in the table.
+	 *
+	 * Pending put* data is discarded. On success (status 0) the key and
+	 * value unpackers are rebuilt over the new record; otherwise both are
+	 * set to null.
+	 *
+	 * \return The status of the operation.
+	 */
+	public int prev() {
+		final int status = prev_wrap();
+
+		keyPacker.reset();
+		valuePacker.reset();
+
+		if (status == 0) {
+			keyUnpacker =
+			    new PackInputStream(keyFormat, get_key_wrap());
+			valueUnpacker =
+			    new PackInputStream(valueFormat, get_value_wrap());
+		} else {
+			keyUnpacker = null;
+			valueUnpacker = null;
+		}
+		return status;
+	}
+
+	/**
+	 * Search the table for the key accumulated through the putKey* calls.
+	 *
+	 * Both packers are reset after the search. On success (status 0) the
+	 * key and value unpackers are rebuilt over the matching record;
+	 * otherwise both are set to null.
+	 *
+	 * \return The status of the operation.
+	 */
+	public int search() {
+		final int status = search_wrap(keyPacker.getValue());
+
+		keyPacker.reset();
+		valuePacker.reset();
+
+		if (status == 0) {
+			keyUnpacker =
+			    new PackInputStream(keyFormat, get_key_wrap());
+			valueUnpacker =
+			    new PackInputStream(valueFormat, get_value_wrap());
+		} else {
+			keyUnpacker = null;
+			valueUnpacker = null;
+		}
+		return status;
+	}
+
+	/**
+	 * Search the table for the key accumulated through the putKey* calls,
+	 * accepting a neighbouring record when there is no exact match.
+	 *
+	 * Both packers are reset after the search. Unless the result is
+	 * NOTFOUND, the key and value unpackers are rebuilt over the record
+	 * the cursor landed on; otherwise both are set to null.
+	 *
+	 * \return FOUND, SMALLER or LARGER describing the record the cursor is
+	 *         positioned on, or NOTFOUND.
+	 */
+	public SearchStatus search_near() {
+		final SearchStatus status =
+		    search_near_wrap(keyPacker.getValue());
+
+		keyPacker.reset();
+		valuePacker.reset();
+
+		if (status != SearchStatus.NOTFOUND) {
+			keyUnpacker =
+			    new PackInputStream(keyFormat, get_key_wrap());
+			valueUnpacker =
+			    new PackInputStream(valueFormat, get_value_wrap());
+		} else {
+			keyUnpacker = null;
+			valueUnpacker = null;
+		}
+		return status;
+	}
+%}
+
+/* Remove / rename parts of the C API that we don't want in Java. */
+%immutable __wt_cursor::session;
+%immutable __wt_cursor::uri;
+%immutable __wt_cursor::key_format;
+%immutable __wt_cursor::value_format;
+%immutable __wt_session::connection;
+
+%ignore __wt_collator;
+%ignore __wt_connection::add_collator;
+%ignore __wt_compressor;
+%ignore __wt_connection::add_compressor;
+%ignore __wt_data_source;
+%ignore __wt_connection::add_data_source;
+%ignore __wt_event_handler;
+%ignore __wt_extractor;
+%ignore __wt_connection::add_extractor;
+%ignore __wt_item;
+%ignore __wt_session::msg_printf;
+
+%ignore wiredtiger_struct_pack;
+%ignore wiredtiger_struct_packv;
+%ignore wiredtiger_struct_size;
+%ignore wiredtiger_struct_sizev;
+%ignore wiredtiger_struct_unpack;
+%ignore wiredtiger_struct_unpackv;
+
+%ignore wiredtiger_version;
+
+%ignore wiredtiger_extension_init;
+
+%ignore wiredtiger_open;
+%javamethodmodifiers wiredtiger_open_wrap "
+ /**
+ * @copydoc ::wiredtiger_open
+ */
+ public ";
+
+%rename(open) wiredtiger_open_wrap;
+%ignore __wt_connection::open_session;
+%rename(open_session) __wt_connection::open_session_wrap;
+%ignore __wt_session::open_cursor;
+%javamethodmodifiers __wt_session::open_cursor_wrap "
+ /**
+ * @copydoc WT_SESSION::open_cursor
+ */
+ public ";
+%rename(open_cursor) __wt_session::open_cursor_wrap;
+
+%rename(Cursor) __wt_cursor;
+%rename(Session) __wt_session;
+%rename(Connection) __wt_connection;
+
+%include "wiredtiger.h"
+
+/* Return new connections, sessions and cursors. */
+%inline {
+/*
+ * wiredtiger_open_wrap --
+ *	Open a connection with no event handler. A failure is reported through
+ *	throwWiredTigerException and NULL is returned.
+ */
+WT_CONNECTION *wiredtiger_open_wrap(JNIEnv *jenv, const char *home, const char *config) {
+	WT_CONNECTION *conn = NULL;
+	int ret = wiredtiger_open(home, NULL, config, &conn);
+
+	if (ret != 0)
+		throwWiredTigerException(jenv, wiredtiger_strerror(ret));
+	return conn;
+}
+}
+
+%extend __wt_connection {
+	/*
+	 * Open a session with no event handler. A failure is reported through
+	 * throwWiredTigerException and NULL is returned.
+	 */
+	WT_SESSION *open_session_wrap(JNIEnv *jenv, const char *config) {
+		WT_SESSION *session = NULL;
+		int ret = $self->open_session($self, NULL, config, &session);
+
+		if (ret != 0)
+			throwWiredTigerException(jenv, wiredtiger_strerror(ret));
+		return session;
+	}
+}
+
+%extend __wt_session {
+	/*
+	 * Open a cursor and, on success, set WT_CURSTD_RAW in its flags. A
+	 * failure is reported through throwWiredTigerException.
+	 */
+	WT_CURSOR *open_cursor_wrap(JNIEnv *jenv, const char *uri, WT_CURSOR *to_dup, const char *config) {
+		WT_CURSOR *cursor = NULL;
+		int ret = $self->open_cursor($self, uri, to_dup, config, &cursor);
+
+		if (ret == 0)
+			cursor->flags |= WT_CURSTD_RAW;
+		else
+			throwWiredTigerException(jenv, wiredtiger_strerror(ret));
+		return cursor;
+	}
+}
diff --git a/lang/python/setup.py b/lang/python/setup.py
index 995175d6ad2..5634fe7744f 100644
--- a/lang/python/setup.py
+++ b/lang/python/setup.py
@@ -15,16 +15,8 @@ if not 'ARCHFLAGS' in os.environ:
# Suppress warnings building SWIG generated code
extra_cflags = [
- '-Wno-unused-value',
+ '-Wno-error',
]
-if sys.platform == 'darwin':
- kernel_version = os.uname()[2] # e.g. 12.0.0 is Mountain Lion
- major_version = int(kernel_version.split('.')[0])
- if major_version >= 12:
- extra_cflags += [
- '-Wno-self-assign',
- '-Qunused-arguments',
- ]
dir = os.path.dirname(__file__)
diff --git a/lang/python/wiredtiger.i b/lang/python/wiredtiger.i
index a6a8ed532ae..8c381d5aae5 100644
--- a/lang/python/wiredtiger.i
+++ b/lang/python/wiredtiger.i
@@ -442,8 +442,6 @@ typedef int int_void;
%include "wiredtiger.h"
%pythoncode %{
-## @}
-
class stat:
'''keys for statistics cursors'''
@@ -455,6 +453,8 @@ class stat:
'''keys for cursors on data source statistics'''
pass
+## @}
+
import sys
# All names starting with 'WT_STAT_DSRC_' are renamed to
# the wiredtiger.stat.dsrc class, those starting with 'WT_STAT_CONN' are
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index b2535110a36..61728cfad15 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -209,17 +209,7 @@ __wt_block_checkpoint(WT_SESSION_IMPL *session,
data_cksum, 0));
/* Process the checkpoint list, deleting and updating as required. */
- WT_RET(__ckpt_process(session, block, ckptbase));
-
- /*
- * Checkpoints have to hit disk (it would be reasonable to configure for
- * lazy checkpoints, but we don't support them yet). Regardless, we're
- * not holding any locks, other writers can proceed while we wait.
- */
- if (F_ISSET(S2C(session), WT_CONN_SYNC))
- WT_RET(__wt_fsync(session, block->fh));
-
- return (0);
+ return (__ckpt_process(session, block, ckptbase));
}
/*
@@ -275,7 +265,7 @@ __ckpt_extlist_fblocks(
* file that contains a previous checkpoint's extents.
*/
return (__wt_block_insert_ext(
- session, &block->live.ckpt_avail, el->offset, el->size));
+ session, block, &block->live.ckpt_avail, el->offset, el->size));
}
/*
@@ -419,7 +409,7 @@ __ckpt_process(
* must be paired in the checkpoint.
*/
if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
- WT_ERR(__wt_block_insert_ext(session,
+ WT_ERR(__wt_block_insert_ext(session, block,
&a->discard, a->root_offset, a->root_size));
/*
@@ -436,10 +426,10 @@ __ckpt_process(
*/
if (a->alloc.entries != 0)
WT_ERR(__wt_block_extlist_merge(
- session, &a->alloc, &b->alloc));
+ session, block, &a->alloc, &b->alloc));
if (a->discard.entries != 0)
WT_ERR(__wt_block_extlist_merge(
- session, &a->discard, &b->discard));
+ session, block, &a->discard, &b->discard));
/*
* If the "to" checkpoint is also being deleted, we're done with
@@ -576,12 +566,12 @@ __ckpt_update(
alloc = &block->live.alloc;
WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET)
- WT_RET(__wt_block_off_remove_overlap(
- session, alloc, ci->alloc.offset, ci->alloc.size));
+ WT_RET(__wt_block_off_remove_overlap(session,
+ block, alloc, ci->alloc.offset, ci->alloc.size));
WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET)
- WT_RET(__wt_block_off_remove_overlap(
- session, alloc, ci->discard.offset, ci->discard.size));
+ WT_RET(__wt_block_off_remove_overlap(session,
+ block, alloc, ci->discard.offset, ci->discard.size));
/*
* We only write an avail list for the live system, other checkpoint's
@@ -599,8 +589,8 @@ __ckpt_update(
WT_RET(__wt_block_extlist_write(
session, block, &ci->avail, &ci->ckpt_avail));
if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET)
- WT_RET(__wt_block_off_remove_overlap(
- session, alloc, ci->avail.offset, ci->avail.size));
+ WT_RET(__wt_block_off_remove_overlap(session,
+ block, alloc, ci->avail.offset, ci->avail.size));
}
/*
@@ -679,7 +669,8 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
* available list.
*/
__wt_spin_lock(session, &block->live_lock);
- ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail);
+ ret = __wt_block_extlist_merge(
+ session, block, &ci->ckpt_avail, &ci->avail);
__wt_spin_unlock(session, &block->live_lock);
/* Discard the list. */
diff --git a/src/block/block_ext.c b/src/block/block_ext.c
index 7758730f57a..204fd418c81 100644
--- a/src/block/block_ext.c
+++ b/src/block/block_ext.c
@@ -11,7 +11,71 @@ static int __block_ext_overlap(WT_SESSION_IMPL *,
WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **);
static int __block_extlist_dump(
WT_SESSION_IMPL *, const char *, WT_EXTLIST *, int);
-static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, off_t, off_t);
+static int __block_merge(
+ WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, off_t, off_t);
+
+/*
+ * __block_ext_alloc --
+ *	Hand back a WT_EXT structure: pop one from the block's cached list when
+ *	the list is non-empty, otherwise allocate a new one.
+ */
+static inline int
+__block_ext_alloc(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXT **extp)
+{
+	WT_EXT *cached;
+	u_int skipdepth;
+
+	*extp = NULL;
+
+	/* Re-use a cached structure if there is one; its depth is kept. */
+	if ((cached = block->free_ext) != NULL) {
+		block->free_ext = cached->next[0];
+		--block->free_ext_cnt;
+
+		*extp = cached;
+		return (0);
+	}
+
+	/*
+	 * Nothing cached. The skiplist depth has to be chosen up front because
+	 * it determines how many pointers are allocated after the structure;
+	 * record it in the new structure.
+	 */
+	skipdepth = __wt_skip_choose_depth();
+	WT_RET(__wt_calloc(session, 1,
+	    sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), extp));
+	(*extp)->depth = (uint8_t)skipdepth;
+	return (0);
+}
+
+/*
+ * __block_ext_free --
+ *	Push a WT_EXT structure onto the block's cached list, or free it when
+ *	the cache already holds 100 entries.
+ */
+static inline void
+__block_ext_free(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXT *ext)
+{
+	/* Room in the cache: link the structure in at the head. */
+	if (block->free_ext_cnt < 100) {
+		ext->next[0] = block->free_ext;
+		block->free_ext = ext;
+
+		++block->free_ext_cnt;
+		return;
+	}
+
+	/* Cache is full. */
+	__wt_free(session, ext);
+}
+
+/*
+ * __wt_block_ext_cleanup --
+ *	Discard every cached WT_EXT structure from the block's list, leaving
+ *	the list empty.
+ */
+void
+__wt_block_ext_cleanup(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+	WT_EXT *ext, *next;
+
+	for (ext = block->free_ext; ext != NULL; ext = next) {
+		--block->free_ext_cnt;
+		/* Save the link before the structure is freed. */
+		next = ext->next[0];
+		__wt_free(session, ext);
+	}
+
+	/*
+	 * Every node has been freed: clear the list head so a later allocation
+	 * or a second cleanup call cannot touch freed memory.
+	 */
+	block->free_ext = NULL;
+	WT_ASSERT(session, block->free_ext_cnt == 0);
+}
/*
* __block_off_srch --
@@ -183,20 +247,16 @@ __block_ext_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
* Insert a file range into an extent list.
*/
static int
-__block_off_insert(
- WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
+__block_off_insert(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size)
{
WT_EXT *ext;
- u_int skipdepth;
- /* Allocate a new WT_EXT structure. */
- skipdepth = __wt_skip_choose_depth();
- WT_RET(__wt_calloc(session, 1,
- sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext));
+ WT_RET(__block_ext_alloc(session, block, &ext));
ext->off = off;
ext->size = size;
- ext->depth = (uint8_t)skipdepth;
+
return (__block_ext_insert(session, el, ext));
}
@@ -247,11 +307,12 @@ __wt_block_misplaced(WT_SESSION_IMPL *session,
else if (__block_off_match(&block->live.discard, offset, size))
name = "discard";
__wt_spin_unlock(session, &block->live_lock);
- if (name != NULL)
- WT_RET_MSG(session, WT_ERROR,
+ if (name != NULL) {
+ __wt_errx(session,
"%s failed: %" PRIuMAX "/%" PRIu32 " is on the %s list",
tag, (uintmax_t)offset, size, name);
-
+ return (__wt_panic(session));
+ }
return (0);
}
#endif
@@ -261,8 +322,8 @@ __wt_block_misplaced(WT_SESSION_IMPL *session,
* Remove a record from an extent list.
*/
static int
-__block_off_remove(
- WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, WT_EXT **extp)
+__block_off_remove(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, off_t off, WT_EXT **extp)
{
WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
@@ -301,7 +362,7 @@ __block_off_remove(
/* Return the record if our caller wants it, otherwise free it. */
if (extp == NULL)
- __wt_free(session, ext);
+ __block_ext_free(session, block, ext);
else
*extp = ext;
@@ -318,8 +379,8 @@ corrupt:
* overlapping entry.
*/
int
-__wt_block_off_remove_overlap(
- WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
+__wt_block_off_remove_overlap(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size)
{
WT_EXT *before, *after, *ext;
off_t a_off, a_size, b_off, b_size;
@@ -331,7 +392,8 @@ __wt_block_off_remove_overlap(
/* If "before" or "after" overlaps, retrieve the overlapping entry. */
if (before != NULL && before->off + before->size > off) {
- WT_RET(__block_off_remove(session, el, before->off, &ext));
+ WT_RET(
+ __block_off_remove(session, block, el, before->off, &ext));
/* Calculate overlapping extents. */
a_off = ext->off;
@@ -339,7 +401,8 @@ __wt_block_off_remove_overlap(
b_off = off + size;
b_size = ext->size - (a_size + size);
} else if (after != NULL && off + size > after->off) {
- WT_RET(__block_off_remove(session, el, after->off, &ext));
+ WT_RET(
+ __block_off_remove(session, block, el, after->off, &ext));
/*
* Calculate overlapping extents. There's no initial overlap
@@ -364,7 +427,8 @@ __wt_block_off_remove_overlap(
}
if (b_size != 0) {
if (ext == NULL)
- WT_RET(__block_off_insert(session, el, b_off, b_size));
+ WT_RET(__block_off_insert(
+ session, block, el, b_off, b_size));
else {
ext->off = b_off;
ext->size = b_size;
@@ -373,7 +437,7 @@ __wt_block_off_remove_overlap(
}
}
if (ext != NULL)
- __wt_free(session, ext);
+ __block_ext_free(session, block, ext);
return (0);
}
@@ -455,7 +519,8 @@ __wt_block_alloc(
/* Remove the first record, and set the returned offset. */
ext = szp->off[0];
- WT_RET(__block_off_remove(session, &block->live.avail, ext->off, &ext));
+ WT_RET(__block_off_remove(
+ session, block, &block->live.avail, ext->off, &ext));
*offp = ext->off;
/* If doing a partial allocation, adjust the record and put it back. */
@@ -476,11 +541,12 @@ __wt_block_alloc(
"allocate range %" PRIdMAX "-%" PRIdMAX,
(intmax_t)ext->off, (intmax_t)(ext->off + ext->size));
- __wt_free(session, ext);
+ __block_ext_free(session, block, ext);
}
done: /* Add the newly allocated extent to the list of allocations. */
- WT_RET(__block_merge(session, &block->live.alloc, *offp, (off_t)size));
+ WT_RET(__block_merge(
+ session, block, &block->live.alloc, *offp, (off_t)size));
return (0);
}
@@ -536,12 +602,12 @@ __wt_block_off_free(
* list.
*/
if ((ret = __wt_block_off_remove_overlap(
- session, &block->live.alloc, offset, size)) == 0)
+ session, block, &block->live.alloc, offset, size)) == 0)
ret = __block_merge(
- session, &block->live.avail, offset, (off_t)size);
+ session, block, &block->live.avail, offset, (off_t)size);
else if (ret == WT_NOTFOUND)
ret = __block_merge(
- session, &block->live.discard, offset, (off_t)size);
+ session, block, &block->live.discard, offset, (off_t)size);
return (ret);
}
@@ -637,7 +703,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* We can think of the overlap possibilities as 11 different cases:
*
* AAAAAAAAAAAAAAAAAA
- * #1 BBBBBBBBBBBBBBBBBB ranges are are the same
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
* #2 BBBBBBBBBBBBB overlaps the beginning
* #3 BBBBBBBBBBBBBBBB overlaps the end
* #4 BBBBB B is a prefix of A
@@ -658,7 +724,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* eliminate cases #2, #8, #10 and #11, and only handle 7 cases:
*
* AAAAAAAAAAAAAAAAAA
- * #1 BBBBBBBBBBBBBBBBBB ranges are are the same
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
* #3 BBBBBBBBBBBBBBBB overlaps the end
* #4 BBBBB B is a prefix of A
* #5 BBBBBB B is middle of A
@@ -688,9 +754,12 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
*/
*ap = (*ap)->next[0];
*bp = (*bp)->next[0];
- WT_RET(__block_merge(session, avail, b->off, b->size));
- WT_RET(__block_off_remove(session, ael, a->off, NULL));
- WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ WT_RET(__block_merge(
+ session, block, avail, b->off, b->size));
+ WT_RET(__block_off_remove(
+ session, block, ael, a->off, NULL));
+ WT_RET(__block_off_remove(
+ session, block, bel, b->off, NULL));
}
else if (a->size > b->size) { /* Case #4 */
/*
@@ -698,7 +767,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* Increment/Decrement A's offset/size by the size of B
* Insert A on its list
*/
- WT_RET(__block_off_remove(session, ael, a->off, &a));
+ WT_RET(__block_off_remove(
+ session, block, ael, a->off, &a));
a->off += b->size;
a->size -= b->size;
WT_RET(__block_ext_insert(session, ael, a));
@@ -709,15 +779,18 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* Delete B
*/
*bp = (*bp)->next[0];
- WT_RET(__block_merge(session, avail, b->off, b->size));
- WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ WT_RET(__block_merge(
+ session, block, avail, b->off, b->size));
+ WT_RET(__block_off_remove(
+ session, block, bel, b->off, NULL));
} else { /* Case #9 */
/*
* Remove B from its list
* Increment/Decrement B's offset/size by the size of A
* Insert B on its list
*/
- WT_RET(__block_off_remove(session, bel, b->off, &b));
+ WT_RET(__block_off_remove(
+ session, block, bel, b->off, &b));
b->off += a->size;
b->size -= a->size;
WT_RET(__block_ext_insert(session, bel, b));
@@ -728,8 +801,10 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* Delete A
*/
*ap = (*ap)->next[0];
- WT_RET(__block_merge(session, avail, a->off, a->size));
- WT_RET(__block_off_remove(session, ael, a->off, NULL));
+ WT_RET(__block_merge(
+ session, block, avail, a->off, a->size));
+ WT_RET(__block_off_remove(
+ session, block, ael, a->off, NULL));
} /* Case #6 */
} else if (a->off + a->size == b->off + b->size) {
/*
@@ -737,7 +812,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* Decrement A's size by the size of B
* Insert A on its list
*/
- WT_RET(__block_off_remove(session, ael, a->off, &a));
+ WT_RET(__block_off_remove(session, block, ael, a->off, &a));
a->size -= b->size;
WT_RET(__block_ext_insert(session, ael, a));
@@ -747,8 +822,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* Delete B
*/
*bp = (*bp)->next[0];
- WT_RET(__block_merge(session, avail, b->off, b->size));
- WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ WT_RET(__block_merge(session, block, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, block, bel, b->off, NULL));
} else if /* Case #3, #7 */
(a->off + a->size < b->off + b->size) {
/*
@@ -756,14 +831,14 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
*/
off = b->off;
size = (a->off + a->size) - b->off;
- WT_RET(__block_merge(session, avail, off, size));
+ WT_RET(__block_merge(session, block, avail, off, size));
/*
* Remove A from its list
* Decrement A's size by the overlap
* Insert A on its list
*/
- WT_RET(__block_off_remove(session, ael, a->off, &a));
+ WT_RET(__block_off_remove(session, block, ael, a->off, &a));
a->size -= size;
WT_RET(__block_ext_insert(session, ael, a));
@@ -772,7 +847,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* Increment/Decrement B's offset/size by the overlap
* Insert B on its list
*/
- WT_RET(__block_off_remove(session, bel, b->off, &b));
+ WT_RET(__block_off_remove(session, block, bel, b->off, &b));
b->off += size;
b->size -= size;
WT_RET(__block_ext_insert(session, bel, b));
@@ -786,12 +861,12 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* Decrement A's size by trailing part of A plus B's size
* Insert A on its list
*/
- WT_RET(__block_off_remove(session, ael, a->off, &a));
+ WT_RET(__block_off_remove(session, block, ael, a->off, &a));
a->size = b->off - a->off;
WT_RET(__block_ext_insert(session, ael, a));
/* Add trailing part of A to A's list as a new element. */
- WT_RET(__block_merge(session, ael, off, size));
+ WT_RET(__block_merge(session, block, ael, off, size));
/*
* Move caller's B to the next element
@@ -799,8 +874,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* Delete B
*/
*bp = (*bp)->next[0];
- WT_RET(__block_merge(session, avail, b->off, b->size));
- WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ WT_RET(__block_merge(session, block, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, block, bel, b->off, NULL));
}
return (0);
@@ -811,14 +886,15 @@ __block_ext_overlap(WT_SESSION_IMPL *session,
* Merge one extent list into another.
*/
int
-__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b)
+__wt_block_extlist_merge(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *a, WT_EXTLIST *b)
{
WT_EXT *ext;
WT_VERBOSE_RET(session, block, "merging %s into %s", a->name, b->name);
WT_EXT_FOREACH(ext, a->off)
- WT_RET(__block_merge(session, b, ext->off, ext->size));
+ WT_RET(__block_merge(session, block, b, ext->off, ext->size));
return (0);
}
@@ -828,8 +904,8 @@ __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b)
* Insert an extent into an extent list, merging if possible.
*/
int
-__wt_block_insert_ext(
- WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
+__wt_block_insert_ext(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size)
{
/*
* There are currently two copies of this function (this code is a one-
@@ -842,10 +918,11 @@ __wt_block_insert_ext(
* Callers of this function are expected to have already acquired any
* locks required to manipulate the extent list.
*/
- return (__block_merge(session, el, off, size));
+ return (__block_merge(session, block, el, off, size));
}
static int
-__block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
+__block_merge(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size)
{
WT_EXT *ext, *after, *before;
@@ -884,7 +961,7 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
"%s: insert range %" PRIdMAX "-%" PRIdMAX,
el->name, (intmax_t)off, (intmax_t)(off + size));
- return (__block_off_insert(session, el, off, size));
+ return (__block_off_insert(session, block, el, off, size));
}
/*
@@ -895,7 +972,8 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
* the record we're going to use, adjust it and re-insert it.
*/
if (before == NULL) {
- WT_RET(__block_off_remove(session, el, after->off, &ext));
+ WT_RET(
+ __block_off_remove(session, block, el, after->off, &ext));
WT_VERBOSE_RET(session, block,
"%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
@@ -909,10 +987,11 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
} else {
if (after != NULL) {
size += after->size;
- WT_RET(
- __block_off_remove(session, el, after->off, NULL));
+ WT_RET(__block_off_remove(
+ session, block, el, after->off, NULL));
}
- WT_RET(__block_off_remove(session, el, before->off, &ext));
+ WT_RET(
+ __block_off_remove(session, block, el, before->off, &ext));
WT_VERBOSE_RET(session, block,
"%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
@@ -946,7 +1025,8 @@ __wt_block_extlist_read_avail(
* avail list, the extent blocks might be included, remove them.
*/
WT_RET_NOTFOUND_OK(
- __wt_block_off_remove_overlap(session, el, el->offset, el->size));
+ __wt_block_off_remove_overlap(
+ session, block, el, el->offset, el->size));
return (0);
}
@@ -1013,7 +1093,7 @@ corrupted: WT_ERR_MSG(session, WT_ERROR,
* list and crashed, and rolled back to a corrupted checkpoint,
* this might save us?)
*/
- WT_ERR(__block_merge(session, el, off, size));
+ WT_ERR(__block_merge(session, block, el, off, size));
}
if (WT_VERBOSE_ISSET(session, block))
@@ -1147,7 +1227,7 @@ __wt_block_extlist_truncate(
*/
file_size = ext->off;
WT_RET(__wt_ftruncate(session, fh, file_size));
- WT_RET(__block_off_remove(session, el, file_size, NULL));
+ WT_RET(__block_off_remove(session, block, el, file_size, NULL));
fh->file_size = file_size;
return (0);
diff --git a/src/block/block_map.c b/src/block/block_map.c
new file mode 100644
index 00000000000..93dcc4bec6c
--- /dev/null
+++ b/src/block/block_map.c
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_map --
+ * Map a segment of the file in, if possible.
+ *
+ * The output arguments are reset first (*mapp to NULL, *maplenp to 0), so
+ * every early return that declines to map leaves them describing "no
+ * mapping". Note mapp is declared void * but is written through as a
+ * void **.
+ *
+ * Always returns 0: mapping is skipped when the block is being verified,
+ * when the file handle uses direct I/O, or when os_cache_max is non-zero,
+ * and any error from __wt_mmap is deliberately discarded.
+ */
+int
+__wt_block_map(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp)
+{
+ *(void **)mapp = NULL;
+ *maplenp = 0;
+
+ /*
+ * Turn off mapping when verifying the file, because we can't perform
+ * checksum validation of mapped segments, and verify has to checksum
+ * pages.
+ */
+ if (block->verify)
+ return (0);
+
+ /*
+ * Turn off mapping when direct I/O is configured for the file, the
+ * Linux open(2) documentation says applications should avoid mixing
+ * mmap(2) of files with direct I/O to the same files.
+ */
+ if (block->fh->direct_io)
+ return (0);
+
+ /*
+ * Turn off mapping if the application configured a cache size maximum,
+ * we can't control how much of the cache size we use in that case.
+ */
+ if (block->os_cache_max != 0)
+ return (0);
+
+ /*
+ * Map the file into memory.
+ * Ignore errors, we'll read the file through the cache if map fails.
+ *
+ * NOTE(review): this relies on __wt_mmap leaving *mapp and *maplenp
+ * as reset above when it fails -- confirm against __wt_mmap.
+ */
+ (void)__wt_mmap(session, block->fh, mapp, maplenp);
+
+ return (0);
+}
+
+/*
+ * __wt_block_unmap --
+ * Unmap any mapped-in segment of the file.
+ *
+ * Passes map and maplen, together with the block's file handle, straight
+ * to __wt_munmap and returns its result.
+ */
+int
+__wt_block_unmap(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen)
+{
+ /* Unmap the file from memory. */
+ return (__wt_munmap(session, block->fh, map, maplen));
+}
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index 307263d5779..ddf23a97866 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -67,6 +67,16 @@ __bm_checkpoint(WT_BM *bm,
}
/*
+ * __bm_sync --
+ * Flush a file to disk.
+ *
+ * Returns the result of __wt_fsync on the block manager's underlying
+ * file handle (bm->block->fh).
+ */
+static int
+__bm_sync(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_fsync(session, bm->block->fh));
+}
+
+/*
* __bm_checkpoint_load --
* Load a checkpoint point.
*/
@@ -86,17 +96,12 @@ __bm_checkpoint_load(WT_BM *bm, WT_SESSION_IMPL *session,
if (checkpoint) {
/*
- * Read-only objects are mapped into memory instead of being
- * read into cache buffers. Ignore errors, with no mapping
- * we'll read into the cache.
- *
- * Turn off mapping when verifying the file, because we can't
- * perform checksum validation of mapped segments, and verify
- * has to checksum pages.
+ * Read-only objects are optionally mapped into memory instead
+ * of being read into cache buffers.
*/
- if (conn->mmap && !bm->block->verify)
- (void)__wt_mmap(
- session, bm->block->fh, &bm->map, &bm->maplen);
+ if (conn->mmap)
+ WT_RET(__wt_block_map(
+ session, bm->block, &bm->map, &bm->maplen));
/*
* If this handle is for a checkpoint, that is, read-only, there
@@ -132,7 +137,7 @@ __bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session)
/* Unmap any mapped segment. */
if (bm->map != NULL)
WT_TRET(
- __wt_munmap(session, bm->block->fh, bm->map, bm->maplen));
+ __wt_block_unmap(session, bm->block, bm->map, bm->maplen));
/* Unload the checkpoint. */
WT_TRET(__wt_block_checkpoint_unload(session, bm->block, !bm->is_live));
@@ -197,9 +202,9 @@ __bm_free(WT_BM *bm,
* Block-manager statistics.
*/
static int
-__bm_stat(WT_BM *bm, WT_SESSION_IMPL *session)
+__bm_stat(WT_BM *bm, WT_SESSION_IMPL *session, WT_DSRC_STATS *stats)
{
- __wt_block_stat(session, bm->block);
+ __wt_block_stat(session, bm->block, stats);
return (0);
}
@@ -333,6 +338,7 @@ __bm_method_set(WT_BM *bm, int readonly)
bm->salvage_valid = (int (*)(WT_BM *,
WT_SESSION_IMPL *, uint8_t *, uint32_t))__bm_readonly;
bm->stat = __bm_stat;
+ bm->sync = (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
bm->verify_addr = __bm_verify_addr;
bm->verify_end = __bm_verify_end;
bm->verify_start = __bm_verify_start;
@@ -358,6 +364,7 @@ __bm_method_set(WT_BM *bm, int readonly)
bm->salvage_start = __bm_salvage_start;
bm->salvage_valid = __bm_salvage_valid;
bm->stat = __bm_stat;
+ bm->sync = __bm_sync;
bm->verify_addr = __bm_verify_addr;
bm->verify_end = __bm_verify_end;
bm->verify_start = __bm_verify_start;
@@ -371,8 +378,8 @@ __bm_method_set(WT_BM *bm, int readonly)
* Open a file.
*/
int
-__wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename,
- const char *config, const char *cfg[], int forced_salvage, WT_BM **bmp)
+__wt_block_manager_open(WT_SESSION_IMPL *session,
+ const char *filename, const char *cfg[], int forced_salvage, WT_BM **bmp)
{
WT_BM *bm;
WT_DECL_RET;
@@ -383,7 +390,7 @@ __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename,
__bm_method_set(bm, 0);
WT_ERR(__wt_block_open(
- session, filename, config, cfg, forced_salvage, &bm->block));
+ session, filename, cfg, forced_salvage, &bm->block));
*bmp = bm;
return (0);
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 6a542a29e7e..973df7e2250 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -71,6 +71,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
WT_DECL_RET;
conn = S2C(session);
+ TAILQ_REMOVE(&conn->blockqh, block, q);
if (block->name != NULL)
__wt_free(session, block->name);
@@ -80,7 +81,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
__wt_spin_destroy(session, &block->live_lock);
- TAILQ_REMOVE(&conn->blockqh, block, q);
+ __wt_block_ext_cleanup(session, block);
__wt_overwrite_and_free(session, block);
@@ -93,8 +94,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
*/
int
__wt_block_open(WT_SESSION_IMPL *session, const char *filename,
- const char *config, const char *cfg[], int forced_salvage,
- WT_BLOCK **blockp)
+ const char *cfg[], int forced_salvage, WT_BLOCK **blockp)
{
WT_BLOCK *block;
WT_CONFIG_ITEM cval;
@@ -124,9 +124,38 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename,
WT_ERR(__wt_strdup(session, filename, &block->name));
/* Get the allocation size. */
- WT_ERR(__wt_config_getones(session, config, "allocation_size", &cval));
+ WT_ERR(__wt_config_gets(session, cfg, "allocation_size", &cval));
block->allocsize = (uint32_t)cval.val;
+ /* Configuration: optional OS buffer cache maximum size. */
+ WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval));
+ block->os_cache_max = cval.val;
+#ifdef HAVE_POSIX_FADVISE
+ if (conn->direct_io && block->os_cache_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_max not supported in combination with direct_io");
+#else
+ if (block->os_cache_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_max not supported if posix_fadvise not "
+ "available");
+#endif
+
+ /* Configuration: optional threshold for scheduling dirty OS cache writes. */
+ WT_ERR(__wt_config_gets(session, cfg, "os_cache_dirty_max", &cval));
+ block->os_cache_dirty_max = cval.val;
+#ifdef HAVE_SYNC_FILE_RANGE
+ if (conn->direct_io && block->os_cache_dirty_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_dirty_max not supported in combination with "
+ "direct_io");
+#else
+ if (block->os_cache_dirty_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_dirty_max not supported if sync_file_range not "
+ "available");
+#endif
+
/* Open the underlying file handle. */
WT_ERR(__wt_open(session, filename, 0, 0, 1, &block->fh));
@@ -258,8 +287,11 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
(desc->majorv == WT_BLOCK_MAJOR_VERSION &&
desc->minorv > WT_BLOCK_MINOR_VERSION))
WT_ERR_MSG(session, WT_ERROR,
- "%s is an unsupported version of a WiredTiger file",
- block->name);
+ "unsupported WiredTiger file version: this build only "
+ "supports major/minor versions up to %d/%d, and the file "
+ "is version %d/%d",
+ WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION,
+ desc->majorv, desc->minorv);
err: __wt_scr_free(&buf);
return (ret);
@@ -270,7 +302,7 @@ err: __wt_scr_free(&buf);
* Block statistics
*/
void
-__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block)
+__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
{
/*
* We're looking inside the live system's structure, which normally
@@ -279,11 +311,11 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block)
* isn't like this is a common function for an application to call.
*/
__wt_spin_lock(session, &block->live_lock);
- WT_DSTAT_SET(session, block_allocsize, block->allocsize);
- WT_DSTAT_SET(session, block_checkpoint_size, block->live.ckpt_size);
- WT_DSTAT_SET(session, block_magic, WT_BLOCK_MAGIC);
- WT_DSTAT_SET(session, block_major, WT_BLOCK_MAJOR_VERSION);
- WT_DSTAT_SET(session, block_minor, WT_BLOCK_MINOR_VERSION);
- WT_DSTAT_SET(session, block_size, block->fh->file_size);
+ WT_STAT_SET(stats, block_allocsize, block->allocsize);
+ WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size);
+ WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC);
+ WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION);
+ WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION);
+ WT_STAT_SET(stats, block_size, block->fh->file_size);
__wt_spin_unlock(session, &block->live_lock);
}
diff --git a/src/block/block_read.c b/src/block/block_read.c
index 4a5ba4c4478..20bd7c17b31 100644
--- a/src/block/block_read.c
+++ b/src/block/block_read.c
@@ -55,7 +55,22 @@ __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
}
/* Read the block. */
- return (__wt_block_read_off(session, block, buf, offset, size, cksum));
+ WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));
+
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system's buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += size) > block->os_cache_max) {
+ WT_DECL_RET;
+
+ block->os_cache = 0;
+ if ((ret = posix_fadvise(block->fh->fd,
+ (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
+ return (0);
}
/*
diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c
index 46df9dd210b..488278fd41a 100644
--- a/src/block/block_slvg.c
+++ b/src/block/block_slvg.c
@@ -52,7 +52,7 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
* Start with the entire file on the allocation list, we'll "free"
* any blocks we don't want as we process the file.
*/
- WT_RET(__wt_block_insert_ext(session, &block->live.alloc,
+ WT_RET(__wt_block_insert_ext(session, block, &block->live.alloc,
WT_BLOCK_DESC_SECTOR, len - WT_BLOCK_DESC_SECTOR));
return (0);
diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c
index d88f6d67fdd..7c06bb8b193 100644
--- a/src/block/block_vrfy.c
+++ b/src/block/block_vrfy.c
@@ -226,7 +226,7 @@ __wt_verify_ckpt_load(
WT_RET(__wt_block_extlist_read(
session, block, el, ci->file_size));
WT_RET(__wt_block_extlist_merge(
- session, el, &block->verify_alloc));
+ session, block, el, &block->verify_alloc));
__wt_block_extlist_free(session, el);
}
el = &ci->discard;
@@ -235,7 +235,7 @@ __wt_verify_ckpt_load(
session, block, el, ci->file_size));
WT_EXT_FOREACH(ext, el->off)
WT_RET(__wt_block_off_remove_overlap(session,
- &block->verify_alloc, ext->off, ext->size));
+ block, &block->verify_alloc, ext->off, ext->size));
__wt_block_extlist_free(session, el);
}
@@ -247,7 +247,7 @@ __wt_verify_ckpt_load(
* checkpoints.
*/
if (ci->root_offset != WT_BLOCK_INVALID_OFFSET)
- WT_RET(__wt_block_off_remove_overlap(session,
+ WT_RET(__wt_block_off_remove_overlap(session, block,
&block->verify_alloc, ci->root_offset, ci->root_size));
/*
diff --git a/src/block/block_write.c b/src/block/block_write.c
index ce07bd6ae57..13cb0f25f0e 100644
--- a/src/block/block_write.c
+++ b/src/block/block_write.c
@@ -28,7 +28,8 @@ __wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep)
{
WT_UNUSED(session);
- *sizep = WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize);
+ *sizep = (size_t)
+ WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize);
return (0);
}
@@ -87,7 +88,7 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
* boundary, this is one of the reasons the btree layer must find out
* from the block-manager layer the maximum size of the eventual write.
*/
- align_size = WT_ALIGN32(buf->size, block->allocsize);
+ align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize);
if (align_size > buf->memsize) {
WT_ASSERT(session, align_size <= buf->memsize);
WT_RET_MSG(session, EINVAL,
@@ -141,6 +142,31 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
WT_RET(ret);
}
+#ifdef HAVE_SYNC_FILE_RANGE
+ /*
+ * Optionally schedule writes for dirty pages in the system buffer
+ * cache.
+ */
+ if (block->os_cache_dirty_max != 0 &&
+ (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) {
+ block->os_cache_dirty = 0;
+ if ((ret = sync_file_range(block->fh->fd,
+ (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: sync_file_range", block->name);
+ }
+#endif
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += align_size) > block->os_cache_max) {
+ block->os_cache = 0;
+ if ((ret = posix_fadvise(block->fh->fd,
+ (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
WT_CSTAT_INCR(session, block_write);
WT_CSTAT_INCRV(session, block_byte_write, align_size);
diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c
index df16aa39c55..9e0a2817a11 100644
--- a/src/bloom/bloom.c
+++ b/src/bloom/bloom.c
@@ -65,7 +65,8 @@ static int
__bloom_setup(
WT_BLOOM *bloom, uint64_t n, uint64_t m, uint32_t factor, uint32_t k)
{
- WT_ASSERT(bloom->session, k > 1);
+ if (k < 2)
+ return (EINVAL);
bloom->k = k;
bloom->factor = factor;
@@ -240,6 +241,7 @@ __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash)
uint64_t h1, h2;
uint8_t bit;
+ /* Get operations are only supported by finalized bloom filters. */
WT_ASSERT(bloom->session, bloom->bitstring == NULL);
wt_session = (WT_SESSION *)bloom->session;
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index b2123d0c0f0..c390d7347d9 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -402,7 +402,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, int discard)
session = (WT_SESSION_IMPL *)cbt->iface.session;
WT_DSTAT_INCR(session, cursor_next);
- flags = 0; /* Tree walk flags. */
+ flags = WT_TREE_SKIP_INTL; /* Tree walk flags. */
if (discard)
LF_SET(WT_TREE_DISCARD);
@@ -476,12 +476,11 @@ retry: WT_RET(__cursor_func_init(cbt, 0));
}
cbt->page = NULL;
- do {
- WT_ERR(__wt_tree_walk(session, &page, flags));
- WT_ERR_TEST(page == NULL, WT_NOTFOUND);
- } while (
- page->type == WT_PAGE_COL_INT ||
- page->type == WT_PAGE_ROW_INT);
+ WT_ERR(__wt_tree_walk(session, &page, flags));
+ WT_ERR_TEST(page == NULL, WT_NOTFOUND);
+ WT_ASSERT(session,
+ page->type != WT_PAGE_COL_INT &&
+ page->type != WT_PAGE_ROW_INT);
cbt->page = page;
/* Initialize the page's modification information */
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index a0591937fa6..418e89dcecd 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -494,7 +494,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard)
session = (WT_SESSION_IMPL *)cbt->iface.session;
WT_DSTAT_INCR(session, cursor_prev);
- flags = WT_TREE_PREV; /* Tree walk flags. */
+ flags = WT_TREE_SKIP_INTL | WT_TREE_PREV; /* Tree walk flags. */
if (discard)
LF_SET(WT_TREE_DISCARD);
@@ -559,12 +559,11 @@ retry: WT_RET(__cursor_func_init(cbt, 0));
}
cbt->page = NULL;
- do {
- WT_ERR(__wt_tree_walk(session, &page, flags));
- WT_ERR_TEST(page == NULL, WT_NOTFOUND);
- } while (
- page->type == WT_PAGE_COL_INT ||
- page->type == WT_PAGE_ROW_INT);
+ WT_ERR(__wt_tree_walk(session, &page, flags));
+ WT_ERR_TEST(page == NULL, WT_NOTFOUND);
+ WT_ASSERT(session,
+ page->type != WT_PAGE_COL_INT &&
+ page->type != WT_PAGE_ROW_INT);
cbt->page = page;
/* Initialize the page's modification information */
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index eeb1a6dd1e1..277f46a76c1 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -48,13 +48,8 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
hp->page, hp->file, hp->line);
}
#endif
- /*
- * Pages without a memory footprint aren't associated with the cache
- * and were never counted as "pages read". If the page has a memory
- * footprint, update the cache information based on the discard.
- */
- if (page->memory_footprint != 0)
- __wt_cache_page_evict(session, page);
+ /* Update the cache's information. */
+ __wt_cache_page_evict(session, page);
/* Free the page modification information. */
if (page->modify != NULL)
@@ -153,9 +148,6 @@ __free_page_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
__wt_free(session, ref->addr);
}
-
- /* Free the subtree-reference array. */
- __wt_free(session, page->u.intl.t);
}
/*
@@ -165,9 +157,6 @@ __free_page_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
static void
__free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- /* Free the in-memory index array. */
- __wt_free(session, page->u.col_var.d);
-
/* Free the RLE lookup array. */
__wt_free(session, page->u.col_var.repeats);
}
@@ -198,9 +187,6 @@ __free_page_row_int(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, ref->addr);
}
}
-
- /* Free the subtree-reference array. */
- __wt_free(session, page->u.intl.t);
}
/*
@@ -226,7 +212,6 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
if (ikey != NULL && __wt_off_page(page, ikey))
__wt_free(session, ikey);
}
- __wt_free(session, page->u.row.d);
/*
* Free the insert array.
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index df36f609369..38fca0db2d1 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -10,10 +10,11 @@
static void __evict_dirty_validate(WT_CONNECTION_IMPL *);
static int __evict_file(WT_SESSION_IMPL *, int);
static int __evict_file_request_walk(WT_SESSION_IMPL *);
-static int __evict_init_candidate(
+static void __evict_init_candidate(
WT_SESSION_IMPL *, WT_EVICT_ENTRY *, WT_PAGE *);
static int __evict_lru(WT_SESSION_IMPL *, int);
static int __evict_lru_cmp(const void *, const void *);
+static int __evict_page(WT_SESSION_IMPL *, WT_PAGE *);
static int __evict_walk(WT_SESSION_IMPL *, uint32_t *, int);
static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, int);
static int __evict_worker(WT_SESSION_IMPL *);
@@ -22,10 +23,10 @@ static int __evict_worker(WT_SESSION_IMPL *);
* Tuning constants: I hesitate to call this tuning, but we want to review some
* number of pages from each file's in-memory tree for each page we evict.
*/
-#define WT_EVICT_INT_SKEW (1<<20) /* Prefer leaf pages over internal
+#define WT_EVICT_INT_SKEW (1<<12) /* Prefer leaf pages over internal
pages by this many increments of the
read generation. */
-#define WT_EVICT_WALK_PER_FILE 5 /* Pages to visit per file */
+#define WT_EVICT_WALK_PER_FILE 10 /* Pages to visit per file */
#define WT_EVICT_WALK_BASE 100 /* Pages tracked across file visits */
#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
@@ -50,8 +51,15 @@ __evict_read_gen(const WT_EVICT_ENTRY *entry)
return (0);
read_gen = page->read_gen + entry->btree->evict_priority;
- if (page->type == WT_PAGE_ROW_INT ||
- page->type == WT_PAGE_COL_INT)
+
+ /*
+ * Skew the read generation for internal pages that aren't split merge
+ * pages. We want to consider leaf pages in preference to real internal
+ * pages, but merges are relatively cheap in-memory operations that make
+ * reads faster, so don't make them too unlikely.
+ */
+ if ((page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) &&
+ !__wt_btree_mergeable(page))
read_gen += WT_EVICT_INT_SKEW;
return (read_gen);
@@ -151,73 +159,72 @@ __wt_evict_list_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* __wt_evict_forced_page --
- * If a page matches the force criteria add it to the eviction queue and
- * trigger the eviction server.
+ * If a page matches the force criteria, try to add it to the eviction
+ * queue and trigger the eviction server. Best effort only, so no error
+ * is returned if the page is busy.
*/
-int
+void
__wt_evict_forced_page(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- u_int count;
+ WT_PAGE *top;
+ u_int levels;
conn = S2C(session);
cache = conn->cache;
+ /* Don't queue a page for forced eviction if we already have one. */
+ if (F_ISSET(cache, WT_EVICT_FORCE_PASS))
+ return;
+
+ /*
+ * Check if the page we have been asked to forcefully evict is at the
+ * bottom of a stack of split-merge pages. If so, lock the top of the
+ * stack instead.
+ */
+ for (top = page, levels = 0;
+ __wt_btree_mergeable(top->parent);
+ top = top->parent, ++levels)
+ ;
+
+ if (levels >= WT_MERGE_STACK_MIN)
+ page = top;
+
/*
* Try to lock the page. If this succeeds, we're going to queue
* it for forced eviction. We don't go right to the EVICT_FORCED
* state, because that is cleared by __wt_evict_list_clr_page.
*/
if (!WT_ATOMIC_CAS(page->ref->state, WT_REF_MEM, WT_REF_LOCKED))
- return (EBUSY);
+ return;
/* If the page is already queued for ordinary eviction, clear it. */
__wt_evict_list_clr_page(session, page);
__wt_spin_lock(session, &cache->evict_lock);
- /*
- * Add the page to the head of the eviction queue. Initialize the
- * eviction array if necessary.
- */
- if (cache->evict_allocated == 0) {
- count = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
- WT_ERR(__wt_realloc(session, &cache->evict_allocated,
- count * sizeof(WT_EVICT_ENTRY), &cache->evict));
- cache->evict_entries = count;
- }
- WT_ERR(__evict_init_candidate(session, cache->evict, page));
+ /* Add the page to the head of the eviction queue. */
+ __evict_init_candidate(session, cache->evict, page);
+
/* Set the location in the eviction queue to the new entry. */
cache->evict_current = cache->evict;
- /*
- * If the candidate list was empty we are adding a candidate, in all
- * other cases we are replacing an existing candidate.
- */
- if (cache->evict_candidates == 0)
- cache->evict_candidates++;
/*
* Lock the page so other threads cannot get new read locks on the
* page - which makes it more likely that the next pass of the eviction
* server will successfully evict the page.
*/
- if (!WT_ATOMIC_CAS(page->ref->state, WT_REF_LOCKED, WT_REF_EVICT_FORCE))
- WT_ERR(EBUSY);
+ WT_PUBLISH(page->ref->state, WT_REF_EVICT_FORCE);
-err: __wt_spin_unlock(session, &cache->evict_lock);
+ F_SET(cache, WT_EVICT_FORCE_PASS);
+ __wt_spin_unlock(session, &cache->evict_lock);
- /*
- * Only wake the server if the page was successfully queued.
- * Otherwise, unlock it.
- */
- if (ret == 0) {
- F_SET(S2C(session)->cache, WT_EVICT_FORCE_PASS);
- ret = __wt_evict_server_wake(session);
- } else
- page->ref->state = WT_REF_MEM;
- return (ret);
+ WT_CSTAT_INCR(session, cache_eviction_force);
+ WT_DSTAT_INCR(session, cache_eviction_force);
+
+ /* Try to wake the server, but don't worry if that fails. */
+ (void)__wt_evict_server_wake(session);
}
/*
@@ -288,11 +295,14 @@ __wt_cache_evict_server(void *arg)
conn = S2C(session);
cache = conn->cache;
- while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+ cache->evict_entries = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
+ WT_ERR(__wt_calloc_def(session, cache->evict_entries, &cache->evict));
+
+ while (F_ISSET(conn, WT_CONN_EVICTION_RUN)) {
/* Evict pages from the cache as needed. */
WT_ERR(__evict_worker(session));
- if (!F_ISSET(conn, WT_CONN_SERVER_RUN))
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
break;
WT_VERBOSE_ERR(session, evictserver, "sleeping");
@@ -304,13 +314,16 @@ __wt_cache_evict_server(void *arg)
WT_VERBOSE_ERR(session, evictserver, "exiting");
if (ret == 0) {
- if (__wt_cache_bytes_inuse(cache) != 0) {
+ if (cache->pages_inmem != cache->pages_evict)
__wt_errx(session,
- "cache server: exiting with %" PRIu64 " pages, "
- "%" PRIu64 " bytes in use",
- __wt_cache_pages_inuse(cache),
- __wt_cache_bytes_inuse(cache));
- }
+ "cache server: exiting with %" PRIu64 " pages in "
+ "memory and %" PRIu64 " pages evicted",
+ cache->pages_inmem, cache->pages_evict);
+ if (cache->bytes_inmem != cache->bytes_evict)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64 " bytes in "
+ "memory and %" PRIu64 " bytes evicted",
+ cache->bytes_inmem, cache->bytes_evict);
} else
err: WT_PANIC_ERR(session, ret, "eviction server error");
@@ -330,17 +343,21 @@ err: WT_PANIC_ERR(session, ret, "eviction server error");
static int
__evict_worker(WT_SESSION_IMPL *session)
{
+ WT_BTREE *force_btree;
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ WT_PAGE *force_page;
uint64_t bytes_inuse, bytes_max, dirty_inuse;
- int clean, force, loop;
+ int clean, loop;
conn = S2C(session);
cache = conn->cache;
/* Evict pages from the cache. */
for (loop = 0;; loop++) {
+ force_page = NULL;
+
/*
* Block out concurrent eviction while we are handling requests.
*/
@@ -350,22 +367,31 @@ __evict_worker(WT_SESSION_IMPL *session)
while (ret == 0 && cache->sync_complete != cache->sync_request)
ret = __evict_file_request_walk(session);
- /* Check for forced eviction while we hold the lock. */
- force = F_ISSET(cache, WT_EVICT_FORCE_PASS) ? 1 : 0;
- F_CLR(cache, WT_EVICT_FORCE_PASS);
-
- __wt_spin_unlock(session, &cache->evict_lock);
- WT_RET(ret);
-
/*
* If we've been awoken for forced eviction, just try to evict
* the first page in the queue: don't do a walk and sort first.
- * Sometimes the page won't be available for eviction because
- * there is a reader still holding a hazard reference. Give up
- * in that case, the application thread can add it again.
*/
- if (force)
- (void)__wt_evict_lru_page(session, 0);
+ force_btree = NULL;
+ force_page = NULL;
+ if (ret == 0 && F_ISSET(cache, WT_EVICT_FORCE_PASS)) {
+ if (cache->evict->page != NULL &&
+ WT_ATOMIC_CAS(cache->evict->page->ref->state,
+ WT_REF_EVICT_FORCE, WT_REF_LOCKED)) {
+ force_btree = cache->evict->btree;
+ force_page = cache->evict->page;
+ __evict_list_clr(session, cache->evict);
+ }
+ F_CLR(cache, WT_EVICT_FORCE_PASS);
+ }
+
+ __wt_spin_unlock(session, &cache->evict_lock);
+ WT_RET(ret);
+
+ if (force_page != NULL) {
+ WT_SET_BTREE_IN_SESSION(session, force_btree);
+ (void)__evict_page(session, force_page);
+ WT_CLEAR_BTREE_IN_SESSION(session);
+ }
/*
* Keep evicting until we hit the target cache usage and the
@@ -393,16 +419,23 @@ __evict_worker(WT_SESSION_IMPL *session)
if (bytes_inuse > (cache->eviction_target * bytes_max) / 100)
clean = 1;
+ /*
+ * Track whether pages are being evicted. This will be cleared
+ * by the next thread to successfully evict a page.
+ */
+ F_SET(cache, WT_EVICT_NO_PROGRESS);
WT_RET(__evict_lru(session, clean));
__evict_dirty_validate(conn);
+
/*
* If we're making progress, keep going; if we're not making
- * any progress at all, go back to sleep, it's not something
- * we can fix.
+ * any progress at all, mark the cache "stuck" and go back to
+ * sleep, it's not something we can fix.
*/
- if (clean && __wt_cache_bytes_inuse(cache) >= bytes_inuse) {
+ if (F_ISSET(cache, WT_EVICT_NO_PROGRESS)) {
if (loop == 10) {
+ F_SET(cache, WT_EVICT_STUCK);
WT_CSTAT_INCR(session, cache_eviction_slow);
WT_VERBOSE_RET(session, evictserver,
"unable to reach eviction goal");
@@ -474,13 +507,9 @@ __evict_page(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_RET(__wt_txn_init(session));
__wt_txn_get_evict_snapshot(session);
- saved_txn.oldest_snap_min = txn->oldest_snap_min;
txn->isolation = TXN_ISO_READ_COMMITTED;
ret = __wt_rec_evict(session, page, 0);
- /* Keep count of any failures. */
- saved_txn.eviction_fails = txn->eviction_fails;
-
if (was_running) {
WT_ASSERT(session, txn->snapshot == NULL ||
txn->snapshot != saved_txn.snapshot);
@@ -488,6 +517,9 @@ __evict_page(WT_SESSION_IMPL *session, WT_PAGE *page)
} else
__wt_txn_release_snapshot(session);
+ /* If the oldest transaction was updated, keep the newer value. */
+ saved_txn.oldest_snap_min = txn->oldest_snap_min;
+
*txn = saved_txn;
return (ret);
}
@@ -668,28 +700,44 @@ __wt_sync_file(WT_SESSION_IMPL *session, int syncop)
WT_CACHE *cache;
WT_DECL_RET;
WT_PAGE *page;
+ WT_TXN *txn;
uint32_t flags;
btree = S2BT(session);
cache = S2C(session)->cache;
page = NULL;
+ txn = &session->txn;
switch (syncop) {
case WT_SYNC_CHECKPOINT:
+ case WT_SYNC_WRITE_LEAVES:
/*
* The first pass walks all cache leaf pages, waiting for
* concurrent activity in a page to be resolved, acquiring
* hazard references to prevent eviction.
*/
- flags = WT_TREE_CACHE | WT_TREE_SKIP_INTL | WT_TREE_WAIT;
+ flags = WT_TREE_CACHE | WT_TREE_SKIP_INTL;
+ if (syncop == WT_SYNC_CHECKPOINT)
+ flags |= WT_TREE_WAIT;
WT_ERR(__wt_tree_walk(session, &page, flags));
while (page != NULL) {
- /* Write dirty pages. */
- if (__wt_page_is_modified(page))
- WT_ERR(__wt_rec_write(session, page, NULL, 0));
+ /* Write dirty pages if nobody beat us to it. */
+ if (__wt_page_is_modified(page)) {
+ if (txn->isolation == TXN_ISO_READ_COMMITTED)
+ __wt_txn_get_snapshot(session,
+ WT_TXN_NONE, WT_TXN_NONE, 0);
+ ret = __wt_rec_write(session, page, NULL, 0);
+ if (txn->isolation == TXN_ISO_READ_COMMITTED)
+ __wt_txn_release_snapshot(session);
+ WT_ERR(ret);
+ }
+
WT_ERR(__wt_tree_walk(session, &page, flags));
}
+ if (syncop == WT_SYNC_WRITE_LEAVES)
+ break;
+
/*
* Pages cannot disappear from underneath internal pages when
* internal pages are being reconciled by checkpoint; also,
@@ -793,20 +841,20 @@ __evict_lru(WT_SESSION_IMPL *session, int clean)
qsort(cache->evict,
candidates, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
- /* Find the bottom 25% */
while (candidates > 0 && cache->evict[candidates - 1].page == NULL)
--candidates;
+ /* Find the bottom 25% of read generations. */
cutoff = (3 * __evict_read_gen(&cache->evict[0]) +
__evict_read_gen(&cache->evict[candidates - 1])) / 4;
/*
- * Don't take more than half, regardless. That said, if there is only
- * one candidate page, which is normal when populating an empty file,
- * don't exclude it.
+ * Don't take less than 10% or more than 50% of candidates, regardless.
+ * That said, if there is only one candidate page, which is normal when
+ * populating an empty file, don't exclude it.
*/
- for (i = 0; i < candidates / 2; i++)
- if (cache->evict[i].page->read_gen > cutoff)
+ for (i = candidates / 10; i < candidates / 2; i++)
+ if (__evict_read_gen(&cache->evict[i]) > cutoff)
break;
cache->evict_candidates = i + 1;
@@ -836,29 +884,14 @@ __evict_walk(WT_SESSION_IMPL *session, u_int *entriesp, int clean)
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- u_int elem, file_count, i, retries;
+ u_int file_count, i, retries;
conn = S2C(session);
cache = S2C(session)->cache;
retries = 0;
- /*
- * Resize the array in which we're tracking pages, as necessary, then
- * get some pages from each underlying file. In practice, a realloc
- * is rarely needed, so it is worth avoiding the LRU lock.
- */
- elem = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
- if (elem > cache->evict_entries) {
- __wt_spin_lock(session, &cache->evict_lock);
- /* Save the offset of the eviction point. */
- i = (u_int)(cache->evict_current - cache->evict);
- WT_ERR(__wt_realloc(session, &cache->evict_allocated,
- elem * sizeof(WT_EVICT_ENTRY), &cache->evict));
- cache->evict_entries = elem;
- if (cache->evict_current != NULL)
- cache->evict_current = cache->evict + i;
- __wt_spin_unlock(session, &cache->evict_lock);
- }
+ /* Update the oldest transaction ID -- we use it to filter pages. */
+ __wt_txn_get_oldest(session);
/*
* NOTE: we don't hold the schema lock: files can't be removed without
@@ -903,15 +936,11 @@ retry: file_count = 0;
}
cache->evict_file_next = (btree == NULL) ? 0 : file_count;
- /* In the extreme case, all of the pages have to come from one file. */
- if (ret == 0 && i < cache->evict_entries &&
- retries++ < WT_EVICT_WALK_INCR / WT_EVICT_WALK_PER_FILE)
+ /* Walk the files a few times if we don't find enough pages. */
+ if (ret == 0 && i < cache->evict_entries && retries++ < 10)
goto retry;
*entriesp = i;
- if (0) {
-err: __wt_spin_unlock(session, &cache->evict_lock);
- }
return (ret);
}
@@ -919,7 +948,7 @@ err: __wt_spin_unlock(session, &cache->evict_lock);
* __evict_init_candidate --
* Initialize a WT_EVICT_ENTRY structure with a given page.
*/
-static int
+static void
__evict_init_candidate(
WT_SESSION_IMPL *session, WT_EVICT_ENTRY *evict, WT_PAGE *page)
{
@@ -930,7 +959,6 @@ __evict_init_candidate(
/* Mark the page on the list */
F_SET_ATOMIC(page, WT_PAGE_EVICT_LRU);
- return (0);
}
/*
@@ -945,7 +973,8 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean)
WT_DECL_RET;
WT_EVICT_ENTRY *end, *evict, *start;
WT_PAGE *page;
- int modified, restarts;
+ wt_txnid_t oldest_txn;
+ int modified, restarts, levels;
btree = S2BT(session);
cache = S2C(session)->cache;
@@ -953,6 +982,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean)
end = start + WT_EVICT_WALK_PER_FILE;
if (end > cache->evict + cache->evict_entries)
end = cache->evict + cache->evict_entries;
+ oldest_txn = session->txn.oldest_snap_min;
/*
* Get some more eviction candidate pages.
@@ -974,47 +1004,93 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean)
continue;
}
+ WT_CSTAT_INCR(session, cache_eviction_walk);
+
+ /* Ignore root pages entirely. */
+ if (WT_PAGE_IS_ROOT(page))
+ continue;
+
+ /* Look for a split-merge (grand)parent page to merge. */
+ levels = 0;
+ if (__wt_btree_mergeable(page))
+ for (levels = 1;
+ levels < WT_MERGE_STACK_MIN &&
+ __wt_btree_mergeable(page->parent);
+ page = page->parent, levels++)
+ ;
+ else if (page->modify != NULL &&
+ F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE))
+ continue;
+
/*
- * Skip root pages and split-merge pages: they can't be evicted.
- * (Split-merge pages are always merged into their parents.)
- * Don't skip empty or split pages: updates after their last
- * reconciliation may have changed their state and only the
- * reconciliation/eviction code can confirm if they should be
- * skipped.
+ * Only look for a parent at exactly the right height above: if
+ * the stack is deep enough, we'll find it eventually, and we
+ * don't want to do too much work on every level.
*
- * Use the EVICT_LRU flag to avoid putting pages onto the list
- * multiple times.
+ * !!!
+ * Don't restrict ourselves to only the top-most page (that is,
+ * don't require that page->parent is not mergeable). If there
+ * is a big, busy enough split-merge tree, the top-level merge
+ * will only happen if we can lock the whole subtree
+ * exclusively. Consider smaller merges in case locking the
+ * whole tree fails.
*/
- if (WT_PAGE_IS_ROOT(page) ||
- (page->modify != NULL &&
- F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)) ||
- F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ if (levels != 0 && levels != WT_MERGE_STACK_MIN)
continue;
/*
- * If the file is being checkpointed, there's a period of time
- * where we can't discard any page with a modification
- * structure because it might race with the checkpointing
- * thread.
- *
- * During this phase, there is little point trying to evict
- * dirty pages: we might be lucky and find an internal page
- * that has not yet been checkpointed, but much more likely is
- * that we will waste effort considering dirty leaf pages that
- * cannot be evicted because they have modifications more
- * recent than the checkpoint.
+ * If this page has never been considered for eviction, set its
+ * read generation to a little bit in the future and move on,
+ * giving readers a chance to start updating the read generation.
*/
- modified = __wt_page_is_modified(page);
- if (modified && btree->checkpointing)
+ if (page->read_gen == WT_READ_GEN_NOTSET) {
+ page->read_gen = __wt_cache_read_gen_set(session);
continue;
+ }
- /* Optionally ignore clean pages. */
- if (!modified && !clean)
+ /*
+ * Use the EVICT_LRU flag to avoid putting pages onto the list
+ * multiple times.
+ */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
continue;
+ /* The following checks apply to eviction but not merges. */
+ if (levels == 0) {
+ /*
+ * If the file is being checkpointed, there's a period
+ * of time where we can't discard any page with a
+ * modification structure because it might race with
+ * the checkpointing thread.
+ *
+ * During this phase, there is little point trying to
+ * evict dirty pages: we might be lucky and find an
+ * internal page that has not yet been checkpointed,
+ * but much more likely is that we will waste effort
+ * considering dirty leaf pages that cannot be evicted
+ * because they have modifications more recent than the
+ * checkpoint.
+ */
+ modified = __wt_page_is_modified(page);
+ if (modified && btree->checkpointing)
+ continue;
+
+ /* Optionally ignore clean pages. */
+ if (!modified && !clean)
+ continue;
+
+ /*
+ * If the oldest transaction hasn't changed since the
+ * last time this page was written, there's no chance
+ * to make progress...
+ */
+ if (modified &&
+ TXNID_LE(oldest_txn, page->modify->disk_txn))
+ continue;
+ }
+
WT_ASSERT(session, evict->page == NULL);
- if (__evict_init_candidate(session, evict, page) != 0)
- continue;
+ __evict_init_candidate(session, evict, page);
++evict;
WT_VERBOSE_RET(session, evictserver,
@@ -1029,7 +1105,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean)
* __evict_get_page --
* Get a page for eviction.
*/
-static void
+static int
__evict_get_page(
WT_SESSION_IMPL *session, int is_app, WT_BTREE **btreep, WT_PAGE **pagep)
{
@@ -1042,6 +1118,18 @@ __evict_get_page(
*btreep = NULL;
*pagep = NULL;
+ /*
+ * A pathological case: if we're the oldest transaction in the system
+ * and the eviction server is stuck trying to find space, abort the
+ * transaction to give up all hazard references before trying again.
+ */
+ if (is_app && F_ISSET(cache, WT_EVICT_STUCK) &&
+ __wt_txn_am_oldest(session)) {
+ F_CLR(cache, WT_EVICT_STUCK);
+ WT_CSTAT_INCR(session, txn_fail_cache);
+ return (WT_DEADLOCK);
+ }
+
candidates = cache->evict_candidates;
/* The eviction server only considers half of the entries. */
if (!is_app && candidates > 1)
@@ -1057,7 +1145,7 @@ __evict_get_page(
for (;;) {
if (cache->evict_current == NULL ||
cache->evict_current >= cache->evict + candidates)
- return;
+ return (WT_NOTFOUND);
if (__wt_spin_trylock(session, &cache->evict_lock) == 0)
break;
__wt_yield();
@@ -1081,7 +1169,7 @@ __evict_get_page(
* unlocked the page and some other thread may have evicted it
* by the time we look at it.
*/
- evict->page->read_gen = __wt_cache_read_gen(session);
+ evict->page->read_gen = __wt_cache_read_gen_set(session);
/*
* Lock the page while holding the eviction mutex to prevent
@@ -1124,6 +1212,8 @@ __evict_get_page(
if (is_app && *pagep == NULL)
cache->evict_current = NULL;
__wt_spin_unlock(session, &cache->evict_lock);
+
+ return ((*pagep == NULL) ? WT_NOTFOUND : 0);
}
/*
@@ -1135,12 +1225,11 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
{
WT_BTREE *btree;
WT_DATA_HANDLE *saved_dhandle;
+ WT_CACHE *cache;
WT_DECL_RET;
WT_PAGE *page;
- __evict_get_page(session, is_app, &btree, &page);
- if (page == NULL)
- return (WT_NOTFOUND);
+ WT_RET(__evict_get_page(session, is_app, &btree, &page));
WT_ASSERT(session, page->ref->state == WT_REF_LOCKED);
@@ -1155,6 +1244,10 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
WT_CLEAR_BTREE_IN_SESSION(session);
session->dhandle = saved_dhandle;
+ cache = S2C(session)->cache;
+ if (ret == 0 && F_ISSET(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK))
+ F_CLR(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK);
+
return (ret);
}
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 6723d177f6c..33788f248e1 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -9,38 +9,18 @@
static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
static int __btree_get_last_recno(WT_SESSION_IMPL *);
-static int __btree_page_sizes(WT_SESSION_IMPL *, const char *);
+static int __btree_page_sizes(WT_SESSION_IMPL *);
static int __btree_tree_open_empty(WT_SESSION_IMPL *, int);
static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t);
static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, uint32_t);
/*
- * __wt_btree_create --
- * Create a Btree.
- */
-int
-__wt_btree_create(WT_SESSION_IMPL *session, const char *filename)
-{
- return (__wt_block_manager_create(session, filename));
-}
-
-/*
- * __wt_btree_truncate --
- * Truncate a Btree.
- */
-int
-__wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename)
-{
- return (__wt_block_manager_truncate(session, filename));
-}
-
-/*
* __wt_btree_open --
* Open a Btree.
*/
int
-__wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[])
+__wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
{
WT_BM *bm;
WT_BTREE *btree;
@@ -75,12 +55,9 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[])
/* Handle salvage configuration. */
forced_salvage = 0;
- if (F_ISSET(btree, WT_BTREE_SALVAGE) && cfg != NULL) {
- ret = __wt_config_gets(session, cfg, "force", &cval);
- if (ret != 0 && ret != WT_NOTFOUND)
- WT_ERR(ret);
- if (ret == 0 && cval.val != 0)
- forced_salvage = 1;
+ if (F_ISSET(btree, WT_BTREE_SALVAGE)) {
+ WT_ERR(__wt_config_gets(session, op_cfg, "force", &cval));
+ forced_salvage = (cval.val != 0);
}
/* Initialize and configure the WT_BTREE structure. */
@@ -91,8 +68,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[])
if (!WT_PREFIX_SKIP(filename, "file:"))
WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");
- WT_ERR(__wt_block_manager_open(session,
- filename, dhandle->config, cfg, forced_salvage, &btree->bm));
+ WT_ERR(__wt_block_manager_open(
+ session, filename, dhandle->cfg, forced_salvage, &btree->bm));
bm = btree->bm;
/*
@@ -179,7 +156,6 @@ __wt_btree_close(WT_SESSION_IMPL *session)
__wt_free(session, btree->value_format);
if (btree->val_ovfl_lock != NULL)
WT_TRET(__wt_rwlock_destroy(session, &btree->val_ovfl_lock));
- __wt_free(session, dhandle->stats);
btree->bulk_load_ok = 0;
@@ -200,14 +176,14 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
WT_NAMED_COMPRESSOR *ncomp;
uint32_t bitcnt;
int fixed;
- const char *config;
+ const char **cfg;
btree = S2BT(session);
conn = S2C(session);
- config = btree->dhandle->config;
+ cfg = btree->dhandle->cfg;
/* Validate file types and check the data format plan. */
- WT_RET(__wt_config_getones(session, config, "key_format", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
if (WT_STRING_MATCH("r", cval.str, cval.len))
btree->type = BTREE_COL_VAR;
@@ -215,12 +191,12 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
btree->type = BTREE_ROW;
WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->key_format));
- WT_RET(__wt_config_getones(session, config, "value_format", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format));
/* Row-store key comparison and key gap for prefix compression. */
if (btree->type == BTREE_ROW) {
- WT_RET(__wt_config_getones(session, config, "collator", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "collator", &cval));
if (cval.len > 0) {
TAILQ_FOREACH(ncoll, &conn->collqh, q) {
if (WT_STRING_MATCH(
@@ -234,7 +210,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
"unknown collator '%.*s'",
(int)cval.len, cval.str);
}
- WT_RET(__wt_config_getones(session, config, "key_gap", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval));
btree->key_gap = (uint32_t)cval.val;
}
/* Check for fixed-size data. */
@@ -252,14 +228,13 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
}
/* Page sizes */
- WT_RET(__btree_page_sizes(session, config));
+ WT_RET(__btree_page_sizes(session));
/* Eviction; the metadata file is never evicted. */
if (WT_IS_METADATA(btree->dhandle))
F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
else {
- WT_RET(__wt_config_getones(
- session, config, "cache_resident", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
if (cval.val)
F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
else
@@ -267,7 +242,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
}
/* Checksums */
- WT_RET(__wt_config_getones(session, config, "checksum", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
if (WT_STRING_MATCH("on", cval.str, cval.len))
btree->checksum = CKSUM_ON;
else if (WT_STRING_MATCH("off", cval.str, cval.len))
@@ -276,7 +251,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
btree->checksum = CKSUM_UNCOMPRESSED;
/* Huffman encoding */
- WT_RET(__wt_btree_huffman_open(session, config));
+ WT_RET(__wt_btree_huffman_open(session));
/*
* Reconciliation configuration:
@@ -290,25 +265,21 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
case BTREE_COL_FIX:
break;
case BTREE_ROW:
- WT_RET(__wt_config_getones(
- session, config, "internal_key_truncate", &cval));
+ WT_RET(__wt_config_gets(
+ session, cfg, "internal_key_truncate", &cval));
btree->internal_key_truncate = cval.val == 0 ? 0 : 1;
- WT_RET(__wt_config_getones(
- session, config, "prefix_compression", &cval));
+ WT_RET(__wt_config_gets(
+ session, cfg, "prefix_compression", &cval));
btree->prefix_compression = cval.val == 0 ? 0 : 1;
/* FALLTHROUGH */
case BTREE_COL_VAR:
- WT_RET(
- __wt_config_getones(session, config, "dictionary", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "dictionary", &cval));
btree->dictionary = (u_int)cval.val;
break;
}
- WT_RET(__wt_config_getones(session, config, "split_pct", &cval));
- btree->split_pct = (u_int)cval.val;
-
- WT_RET(__wt_config_getones(session, config, "block_compressor", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "block_compressor", &cval));
if (cval.len > 0) {
TAILQ_FOREACH(ncomp, &conn->compqh, q)
if (WT_STRING_MATCH(ncomp->name, cval.str, cval.len)) {
@@ -325,7 +296,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
WT_RET(__wt_rwlock_alloc(
session, "btree overflow lock", &btree->val_ovfl_lock));
- WT_RET(__wt_stat_alloc_dsrc_stats(session, &btree->dhandle->stats));
+ __wt_stat_init_dsrc_stats(&btree->dhandle->stats);
btree->write_gen = ckpt->write_gen; /* Write generation */
btree->modified = 0; /* Clean */
@@ -403,29 +374,25 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation)
* __wt_page_out on error, we require a correct page setup at each point
* where we might fail.
*/
- WT_ERR(__wt_calloc_def(session, 1, &root));
switch (btree->type) {
case BTREE_COL_FIX:
case BTREE_COL_VAR:
- root->type = WT_PAGE_COL_INT;
+ WT_ERR(__wt_page_alloc(session, WT_PAGE_COL_INT, 1, &root));
root->u.intl.recno = 1;
- WT_ERR(__wt_calloc_def(session, 1, &root->u.intl.t));
ref = root->u.intl.t;
WT_ERR(__wt_btree_leaf_create(session, root, ref, &leaf));
- ref->page = leaf;
ref->addr = NULL;
ref->state = WT_REF_MEM;
ref->u.recno = 1;
break;
case BTREE_ROW:
- root->type = WT_PAGE_ROW_INT;
- WT_ERR(__wt_calloc_def(session, 1, &root->u.intl.t));
+ WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_INT, 1, &root));
ref = root->u.intl.t;
WT_ERR(__wt_btree_leaf_create(session, root, ref, &leaf));
- ref->page = leaf;
ref->addr = NULL;
ref->state = WT_REF_MEM;
- WT_ERR(__wt_row_ikey_alloc(session, 0, "", 1, &ref->u.key));
+ WT_ERR(
+ __wt_row_ikey_incr(session, root, 0, "", 1, &ref->u.key));
break;
WT_ILLEGAL_VALUE_ERR(session);
}
@@ -474,7 +441,7 @@ err: if (leaf != NULL)
/*
* __wt_btree_leaf_create --
- * Create an empty leaf page.
+ * Create an empty leaf page and link it into a reference in its parent.
*/
int
__wt_btree_leaf_create(
@@ -485,62 +452,38 @@ __wt_btree_leaf_create(
btree = S2BT(session);
- WT_RET(__wt_calloc_def(session, 1, &leaf));
switch (btree->type) {
case BTREE_COL_FIX:
+ WT_RET(__wt_page_alloc(session, WT_PAGE_COL_FIX, 0, &leaf));
leaf->u.col_fix.recno = 1;
- leaf->type = WT_PAGE_COL_FIX;
break;
case BTREE_COL_VAR:
+ WT_RET(__wt_page_alloc(session, WT_PAGE_COL_VAR, 0, &leaf));
leaf->u.col_var.recno = 1;
- leaf->type = WT_PAGE_COL_VAR;
break;
case BTREE_ROW:
- leaf->type = WT_PAGE_ROW_LEAF;
+ WT_RET(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, &leaf));
break;
+ WT_ILLEGAL_VALUE(session);
}
leaf->entries = 0;
- leaf->ref = ref;
- leaf->parent = parent;
+ WT_LINK_PAGE(parent, ref, leaf);
*pagep = leaf;
return (0);
}
/*
- * __wt_btree_get_memsize --
- * Access the size of an in-memory tree with a single leaf page.
+ * __wt_btree_evictable --
+ * Setup or release a cache-resident tree.
*/
-int
-__wt_btree_get_memsize(
- WT_SESSION_IMPL *session, WT_BTREE *btree, uint32_t **memsizep)
+void
+__wt_btree_evictable(WT_SESSION_IMPL *session, int on)
{
- WT_PAGE *root, *child;
-
- WT_UNUSED(session);
- root = btree->root_page;
- child = root->u.intl.t->page;
-
- if (root->entries != 1 || child == NULL) {
- *memsizep = NULL;
- return (WT_ERROR);
- }
-
- *memsizep = &child->memory_footprint;
- F_SET(btree, WT_BTREE_NO_EVICTION);
- return (0);
-}
-
-/*
- * __wt_btree_release_memsize --
- * Release a cache-resident tree.
- */
-int
-__wt_btree_release_memsize(WT_SESSION_IMPL *session, WT_BTREE *btree)
-{
- WT_UNUSED(session);
- F_CLR(btree, WT_BTREE_NO_EVICTION);
- return (0);
+ if (on)
+ F_CLR(S2BT(session), WT_BTREE_NO_EVICTION);
+ else
+ F_SET(S2BT(session), WT_BTREE_NO_EVICTION);
}
/*
@@ -566,50 +509,41 @@ __btree_get_last_recno(WT_SESSION_IMPL *session)
/*
* __btree_page_sizes --
- * Verify the page sizes.
+ * Verify the page sizes. Some of these sizes are automatically checked
+ * using limits defined in the API, don't duplicate the logic here.
*/
static int
-__btree_page_sizes(WT_SESSION_IMPL *session, const char *config)
+__btree_page_sizes(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CONFIG_ITEM cval;
- uint32_t intl_split_size, leaf_split_size, split_pct;
+ uint32_t intl_split_size, leaf_split_size;
+ const char **cfg;
btree = S2BT(session);
+ cfg = btree->dhandle->cfg;
- WT_RET(__wt_config_getones(session, config, "allocation_size", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "allocation_size", &cval));
btree->allocsize = (uint32_t)cval.val;
- WT_RET(
- __wt_config_getones(session, config, "internal_page_max", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "internal_page_max", &cval));
btree->maxintlpage = (uint32_t)cval.val;
- WT_RET(__wt_config_getones(
- session, config, "internal_item_max", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "internal_item_max", &cval));
btree->maxintlitem = (uint32_t)cval.val;
- WT_RET(__wt_config_getones(session, config, "leaf_page_max", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval));
btree->maxleafpage = (uint32_t)cval.val;
- WT_RET(__wt_config_getones(
- session, config, "leaf_item_max", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval));
btree->maxleafitem = (uint32_t)cval.val;
+ WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
+ btree->split_pct = (u_int)cval.val;
+
/*
* When a page is forced to split, we want at least 50 entries on its
* parent.
*/
- WT_RET(__wt_config_getones(session, config, "memory_page_max", &cval));
+ WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * btree->maxleafpage);
- /*
- * Limit allocation units to 128MB, and page sizes to 512MB. There's no
- * reason we couldn't support larger values (any value up to the smaller
- * of an off_t and a size_t should work), but an application specifying
- * larger allocation units or page sizes is likely making a mistake. The
- * API checked this, but we assert it anyway.
- */
- WT_ASSERT(session, btree->allocsize >= WT_BTREE_ALLOCATION_SIZE_MIN);
- WT_ASSERT(session, btree->allocsize <= WT_BTREE_ALLOCATION_SIZE_MAX);
- WT_ASSERT(session, btree->maxintlpage <= WT_BTREE_PAGE_SIZE_MAX);
- WT_ASSERT(session, btree->maxleafpage <= WT_BTREE_PAGE_SIZE_MAX);
-
/* Allocation sizes must be a power-of-two, nothing else makes sense. */
if (!__wt_ispo2(btree->allocsize))
WT_RET_MSG(session,
@@ -628,12 +562,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session, const char *config)
* Set the split percentage: reconciliation splits to a smaller-than-
* maximum page size so we don't split every time a new entry is added.
*/
- WT_RET(__wt_config_getones(session, config, "split_pct", &cval));
- split_pct = (uint32_t)cval.val;
- intl_split_size = WT_SPLIT_PAGE_SIZE(
- btree->maxintlpage, btree->allocsize, split_pct);
- leaf_split_size = WT_SPLIT_PAGE_SIZE(
- btree->maxleafpage, btree->allocsize, split_pct);
+ intl_split_size = __wt_split_page_size(btree, btree->maxintlpage);
+ leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage);
/*
* Default values for internal and leaf page items: make sure at least
@@ -667,14 +597,45 @@ __btree_page_sizes(WT_SESSION_IMPL *session, const char *config)
*/
if (btree->maxintlitem > intl_split_size / 2)
return (pse2(session, "internal",
- btree->maxintlpage, btree->maxintlitem, split_pct));
+ btree->maxintlpage, btree->maxintlitem, btree->split_pct));
if (btree->maxleafitem > leaf_split_size / 2)
return (pse2(session, "leaf",
- btree->maxleafpage, btree->maxleafitem, split_pct));
+ btree->maxleafpage, btree->maxleafitem, btree->split_pct));
return (0);
}
+/*
+ * __wt_split_page_size --
+ * Split page size calculation: we don't want to repeatedly split every
+ * time a new entry is added, so we split to a smaller-than-maximum page size.
+ */
+uint32_t
+__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
+{
+ uintmax_t a;
+ uint32_t split_size;
+
+ /*
+ * Ideally, the split page size is some percentage of the maximum page
+ * size rounded to an allocation unit (round to an allocation unit so
+ * we don't waste space when we write).
+ */
+ a = maxpagesize; /* Don't overflow. */
+ split_size =
+ (uint32_t)WT_ALIGN((a * btree->split_pct) / 100, btree->allocsize);
+
+ /*
+ * If the result of that calculation is the same as the allocation unit
+ * (that happens if the maximum size is the same size as an allocation
+ * unit), use a percentage of the maximum page size.
+ */
+ if (split_size == btree->allocsize)
+ split_size = (uint32_t)((a * btree->split_pct) / 100);
+
+ return (split_size);
+}
+
static int
pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl)
{
diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c
index 6385524f2f1..867fcdfe93d 100644
--- a/src/btree/bt_huffman.c
+++ b/src/btree/bt_huffman.c
@@ -132,19 +132,20 @@ static int __wt_huffman_read(WT_SESSION_IMPL *,
* Configure Huffman encoding for the tree.
*/
int
-__wt_btree_huffman_open(WT_SESSION_IMPL *session, const char *config)
+__wt_btree_huffman_open(WT_SESSION_IMPL *session)
{
struct __wt_huffman_table *table;
WT_BTREE *btree;
WT_CONFIG_ITEM key_conf, value_conf;
WT_DECL_RET;
+ const char **cfg;
u_int entries, numbytes;
btree = S2BT(session);
+ cfg = btree->dhandle->cfg;
- WT_RET(__wt_config_getones(session, config, "huffman_key", &key_conf));
- WT_RET(__wt_config_getones(
- session, config, "huffman_value", &value_conf));
+ WT_RET(__wt_config_gets(session, cfg, "huffman_key", &key_conf));
+ WT_RET(__wt_config_gets(session, cfg, "huffman_value", &value_conf));
if (key_conf.len == 0 && value_conf.len == 0)
return (0);
diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c
index 1bbae2b0ebb..dee629aea0b 100644
--- a/src/btree/bt_ovfl.c
+++ b/src/btree/bt_ovfl.c
@@ -275,7 +275,7 @@ __wt_val_ovfl_cache(WT_SESSION_IMPL *session,
* a snapshot transaction after the item was deleted from a page that's
* subsequently been checkpointed, where the checkpoint must know about
* the freed blocks. We don't have any way to delay a free of the
- * underlying blocks until a particular set of transactions exit(and
+ * underlying blocks until a particular set of transactions exit (and
* this isn't a common scenario), so cache the overflow value in memory.
*
* This gets hard because the snapshot transaction reader might:
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 4622cae269b..3d326a238da 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -7,11 +7,13 @@
#include "wt_internal.h"
-static int __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *);
-static int __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *);
+static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *);
static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
-static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
+static int __inmem_row_leaf_entries(
+ WT_SESSION_IMPL *, WT_PAGE_HEADER *, uint32_t *);
/*
* __wt_page_in --
@@ -68,7 +70,8 @@ __wt_page_in_func(
break;
page = ref->page;
- WT_ASSERT(session, !WT_PAGE_IS_ROOT(page));
+ WT_ASSERT(session,
+ page != NULL && !WT_PAGE_IS_ROOT(page));
/*
* Ensure the page doesn't have ancient updates on it.
@@ -79,7 +82,7 @@ __wt_page_in_func(
*/
if (page->modify != NULL &&
__wt_txn_ancient(session, page->modify->first_id)) {
- page->read_gen = 0;
+ page->read_gen = WT_READ_GEN_OLDEST;
WT_RET(__wt_hazard_clear(session, page));
WT_RET(__wt_evict_server_wake(session));
break;
@@ -91,7 +94,15 @@ __wt_page_in_func(
return (ret);
}
- page->read_gen = __wt_cache_read_gen(session);
+ /*
+ * If this page has ever been considered for eviction,
+ * and its generation is aging, update it.
+ */
+ if (page->read_gen != WT_READ_GEN_NOTSET &&
+ page->read_gen < __wt_cache_read_gen(session))
+ page->read_gen =
+ __wt_cache_read_gen_set(session);
+
return (0);
WT_ILLEGAL_VALUE(session);
}
@@ -102,6 +113,74 @@ __wt_page_in_func(
}
/*
+ * __wt_page_alloc --
+ * Allocate an in-memory page along with its per-entry array, if any.
+ */
+int
+__wt_page_alloc(WT_SESSION_IMPL *session,
+ uint8_t type, uint32_t alloc_entries, WT_PAGE **pagep)
+{
+ WT_CACHE *cache;
+ WT_PAGE *page;
+ size_t size;
+ void *p;
+
+ *pagep = NULL;
+
+ cache = S2C(session)->cache;
+
+ /*
+ * Allocate a page, and for most page types, the additional information
+ * it needs to describe the disk image.
+ */
+ size = sizeof(WT_PAGE);
+ switch (type) {
+ case WT_PAGE_COL_FIX:
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ size += alloc_entries * sizeof(WT_REF);
+ break;
+ case WT_PAGE_COL_VAR:
+ size += alloc_entries * sizeof(WT_COL);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ size += alloc_entries * sizeof(WT_ROW);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__wt_calloc(session, 1, size, &page));
+ p = (uint8_t *)page + sizeof(WT_PAGE);
+
+ switch (type) {
+ case WT_PAGE_COL_FIX:
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ page->u.intl.t = p;
+ break;
+ case WT_PAGE_COL_VAR:
+ page->u.col_var.d = p;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ page->u.row.d = p;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Increment the cache statistics. */
+ __wt_cache_page_inmem_incr(session, page, size);
+ (void)WT_ATOMIC_ADD(cache->pages_inmem, 1);
+
+ /* The one page field we set is the type. */
+ page->type = type;
+
+ *pagep = page;
+ return (0);
+}
+
+/*
* __wt_page_inmem --
* Build in-memory page information.
*/
@@ -112,54 +191,100 @@ __wt_page_inmem(
{
WT_DECL_RET;
WT_PAGE *page;
- size_t inmem_size;
-
- WT_ASSERT_RET(session, dsk->u.entries > 0);
+ uint32_t alloc_entries;
+ size_t size;
+ alloc_entries = 0;
*pagep = NULL;
/*
- * Allocate and initialize the WT_PAGE.
- * Set the LRU so the page is not immediately selected for eviction.
- * Set the read generation (which can't match a search where the write
- * generation wasn't set, that is, remained 0).
+ * Figure out how many underlying objects the page references so
+ * we can allocate them along with the page.
*/
- WT_RET(__wt_calloc_def(session, 1, &page));
- page->parent = parent;
- page->ref = parent_ref;
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ break;
+ case WT_PAGE_COL_INT:
+ /*
+ * Column-store internal page entries map one-to-one to the
+ * number of physical entries on the page (each physical entry
+ * is an offset object).
+ */
+ alloc_entries = dsk->u.entries;
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store leaf page entries map one-to-one to the number
+ * of physical entries on the page (each physical entry is a
+ * data item).
+ */
+ alloc_entries = dsk->u.entries;
+ break;
+ case WT_PAGE_ROW_INT:
+ /*
+ * Row-store internal page entries map one-to-two to the number
+ * of physical entries on the page (each in-memory entry is a
+ * key item and location cookie).
+ */
+ alloc_entries = dsk->u.entries / 2;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Row-store leaf page entries map in an indeterminate way to
+ * the physical entries on the page, we have to walk the page
+ * to figure it out.
+ */
+ WT_RET(__inmem_row_leaf_entries(session, dsk, &alloc_entries));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Allocate and initialize a new WT_PAGE. */
+ WT_RET(__wt_page_alloc(session, dsk->type, alloc_entries, &page));
page->dsk = dsk;
- page->read_gen = __wt_cache_read_gen(session);
- page->type = dsk->type;
+ page->read_gen = WT_READ_GEN_NOTSET;
if (disk_not_alloc)
F_SET_ATOMIC(page, WT_PAGE_DISK_NOT_ALLOC);
- inmem_size = sizeof(WT_PAGE);
- if (!disk_not_alloc)
- inmem_size += dsk->mem_size;
+ /*
+ * Track the memory allocated to build this page so we can update the
+ * cache statistics in a single call.
+ */
+ size = disk_not_alloc ? 0 : dsk->mem_size;
switch (page->type) {
case WT_PAGE_COL_FIX:
+ page->entries = dsk->u.entries;
page->u.col_fix.recno = dsk->recno;
- WT_ERR(__inmem_col_fix(session, page));
+ __inmem_col_fix(session, page);
break;
case WT_PAGE_COL_INT:
+ page->entries = dsk->u.entries;
page->u.intl.recno = dsk->recno;
- WT_ERR(__inmem_col_int(session, page, &inmem_size));
+ __inmem_col_int(session, page);
break;
case WT_PAGE_COL_VAR:
+ page->entries = dsk->u.entries;
page->u.col_var.recno = dsk->recno;
- WT_ERR(__inmem_col_var(session, page, &inmem_size));
+ WT_ERR(__inmem_col_var(session, page, &size));
break;
case WT_PAGE_ROW_INT:
- WT_ERR(__inmem_row_int(session, page, &inmem_size));
+ page->entries = dsk->u.entries / 2;
+ WT_ERR(__inmem_row_int(session, page, &size));
break;
case WT_PAGE_ROW_LEAF:
- WT_ERR(__inmem_row_leaf(session, page, &inmem_size));
+ page->entries = alloc_entries;
+ WT_ERR(__inmem_row_leaf(session, page));
break;
WT_ILLEGAL_VALUE_ERR(session);
}
- __wt_cache_page_read(session, page, inmem_size);
+ /* Update the page's in-memory size and the cache statistics. */
+ __wt_cache_page_inmem_incr(session, page, size);
+
+ /* Link the new page into the parent. */
+ if (parent_ref != NULL)
+ WT_LINK_PAGE(parent, parent_ref, page);
*pagep = page;
return (0);
@@ -172,7 +297,7 @@ err: __wt_page_out(session, &page);
* __inmem_col_fix --
* Build in-memory index for fixed-length column-store leaf pages.
*/
-static int
+static void
__inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
@@ -182,16 +307,14 @@ __inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
dsk = page->dsk;
page->u.col_fix.bitf = WT_PAGE_HEADER_BYTE(btree, dsk);
- page->entries = dsk->u.entries;
- return (0);
}
/*
* __inmem_col_int --
* Build in-memory index for column-store internal pages.
*/
-static int
-__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
+static void
+__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -205,14 +328,6 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
unpack = &_unpack;
/*
- * Column-store page entries map one-to-one to the number of physical
- * entries on the page (each physical entry is a offset object).
- */
- WT_RET(__wt_calloc_def(
- session, (size_t)dsk->u.entries, &page->u.intl.t));
- *inmem_sizep += dsk->u.entries * sizeof(*page->u.intl.t);
-
- /*
* Walk the page, building references: the page contains value items.
* The value items are on-page items (WT_CELL_VALUE).
*/
@@ -223,9 +338,6 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
ref->u.recno = unpack->v;
++ref;
}
-
- page->entries = dsk->u.entries;
- return (0);
}
/*
@@ -234,7 +346,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
* column-store trees.
*/
static int
-__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
+__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
{
WT_BTREE *btree;
WT_COL *cip;
@@ -254,20 +366,12 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
recno = page->u.col_var.recno;
/*
- * Column-store page entries map one-to-one to the number of physical
- * entries on the page (each physical entry is a data item).
- */
- WT_RET(__wt_calloc_def(
- session, (size_t)dsk->u.entries, &page->u.col_var.d));
- *inmem_sizep += dsk->u.entries * sizeof(*page->u.col_var.d);
-
- /*
* Walk the page, building references: the page contains unsorted value
* items. The value items are on-page (WT_CELL_VALUE), overflow items
* (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL).
*/
- cip = page->u.col_var.d;
indx = 0;
+ cip = page->u.col_var.d;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
__wt_cell_unpack(cell, unpack);
(cip++)->__value = WT_PAGE_DISK_OFFSET(page, cell);
@@ -292,11 +396,10 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
indx++;
recno += rle;
}
+ *sizep += bytes_allocated;
page->u.col_var.repeats = repeats;
page->u.col_var.nrepeats = nrepeats;
- page->entries = dsk->u.entries;
- *inmem_sizep += bytes_allocated;
return (0);
}
@@ -305,7 +408,7 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
* Build in-memory index for row-store internal pages.
*/
static int
-__inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
+__inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -316,7 +419,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
WT_ITEM *tmp;
WT_PAGE_HEADER *dsk;
WT_REF *ref;
- uint32_t i, nindx, prefix;
+ uint32_t i, prefix;
void *huffman;
btree = S2BT(session);
@@ -328,22 +431,6 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
WT_ERR(__wt_scr_alloc(session, 0, &last));
/*
- * Internal row-store page entries map one-to-two to the number of
- * physical entries on the page (each in-memory entry is a key item
- * and location cookie).
- */
- nindx = dsk->u.entries / 2;
- WT_ERR((__wt_calloc_def(session, (size_t)nindx, &page->u.intl.t)));
- *inmem_sizep += nindx * sizeof(*page->u.intl.t);
-
- /*
- * Set the number of elements now -- we're about to allocate memory,
- * and if we fail in the middle of the page, we want to discard that
- * memory properly.
- */
- page->entries = nindx;
-
- /*
* Walk the page, instantiating keys: the page contains sorted key and
* location cookie pairs. Keys are on-page/overflow items and location
* cookies are WT_CELL_ADDR items.
@@ -446,10 +533,10 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
* for reconciliation, the row-store reconciliation function
* depends on keys always be instantiated.
*/
- WT_ERR(__wt_row_ikey_alloc(session,
+ WT_ERR(__wt_row_ikey(session,
WT_PAGE_DISK_OFFSET(page, cell),
current->data, current->size, &ref->u.key));
- *inmem_sizep += sizeof(WT_IKEY) + current->size;
+ *sizep += sizeof(WT_IKEY) + current->size;
/*
* Swap buffers if it's not an overflow key, we have a new
@@ -468,21 +555,19 @@ err: __wt_scr_free(&current);
}
/*
- * __inmem_row_leaf --
- * Build in-memory index for row-store leaf pages.
+ * __inmem_row_leaf_entries --
+ * Return the number of entries for row-store leaf pages.
*/
static int
-__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
+__inmem_row_leaf_entries(
+ WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, uint32_t *nindxp)
{
WT_BTREE *btree;
WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
- WT_PAGE_HEADER *dsk;
- WT_ROW *rip;
uint32_t i, nindx;
btree = S2BT(session);
- dsk = page->dsk;
unpack = &_unpack;
/*
@@ -519,10 +604,29 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
*/
WT_ASSERT(session, cell == (WT_CELL *)((uint8_t *)dsk + dsk->mem_size));
- WT_RET((__wt_calloc_def(session, (size_t)nindx, &page->u.row.d)));
- *inmem_sizep += nindx * sizeof(*page->u.row.d);
+ *nindxp = nindx;
+ return (0);
+}
- /* Walk the page again, building indices. */
+/*
+ * __inmem_row_leaf --
+ * Build in-memory index for row-store leaf pages.
+ */
+static int
+__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_PAGE_HEADER *dsk;
+ WT_ROW *rip;
+ uint32_t i;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ unpack = &_unpack;
+
+ /* Walk the page, building indices. */
rip = page->u.row.d;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
__wt_cell_unpack(cell, unpack);
@@ -539,14 +643,12 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
}
}
- page->entries = nindx;
-
/*
* If the keys are Huffman encoded, instantiate some set of them. It
* doesn't matter if we are randomly searching the page or scanning a
* cursor through it, there isn't a fast-path to getting keys off the
* page.
*/
- return (btree->huffman_key == NULL ?
- 0 : __wt_row_leaf_keys(session, page));
+ return (
+ btree->huffman_key == NULL ? 0 : __wt_row_leaf_keys(session, page));
}
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 3c599a2c129..37155beb0dd 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -49,8 +49,8 @@ __cache_read_row_deleted(
upd->txnid = ref->txnid;
}
- __wt_cache_page_inmem_incr(
- session, page, sizeof(WT_UPDATE) * page->entries);
+ __wt_cache_page_inmem_incr(session, page,
+ page->entries * (sizeof(WT_UPDATE *) + sizeof(WT_UPDATE)));
return (0);
}
@@ -116,7 +116,6 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref)
WT_VERBOSE_ERR(session, read,
"page %p: %s", page, __wt_page_type_string(page->type));
- ref->page = page;
WT_PUBLISH(ref->state, WT_REF_MEM);
return (0);
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 04268162073..d13be782394 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -376,7 +376,7 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
* checksum and still be broken, but paranoia is healthy in
* salvage. Regardless, verify does return failure because
* it detects failures we'd expect to see in a corrupted file,
- * like overflow references past the the end of the file or
+ * like overflow references past the end of the file or
* overflow references to non-existent pages, might as well
* discard these pages now.
*/
@@ -1087,17 +1087,13 @@ __slvg_col_build_internal(
WT_TRACK *trk;
uint32_t i;
- /* Allocate a column-store internal page. */
- WT_RET(__wt_calloc_def(session, 1, &page));
- WT_ERR(__wt_calloc_def(session, (size_t)leaf_cnt, &page->u.intl.t));
-
- /* Fill it in. */
+ /* Allocate a column-store root (internal) page and fill it in. */
+ WT_RET(__wt_page_alloc(session, WT_PAGE_COL_INT, leaf_cnt, &page));
page->parent = NULL; /* Root page */
page->ref = NULL;
- page->read_gen = 0;
+ page->read_gen = WT_READ_GEN_NOTSET;
page->u.intl.recno = 1;
page->entries = leaf_cnt;
- page->type = WT_PAGE_COL_INT;
WT_ERR(__slvg_modify_init(session, page));
for (ref = page->u.intl.t, i = 0; i < ss->pages_next; ++i) {
@@ -1665,16 +1661,12 @@ __slvg_row_build_internal(
WT_TRACK *trk;
uint32_t i;
- /* Allocate a row-store internal page. */
- WT_RET(__wt_calloc_def(session, 1, &page));
- WT_ERR(__wt_calloc_def(session, (size_t)leaf_cnt, &page->u.intl.t));
-
- /* Fill it in. */
- page->parent = NULL; /* Root page */
+ /* Allocate a row-store root (internal) page and fill it in. */
+ WT_RET(__wt_page_alloc(session, WT_PAGE_ROW_INT, leaf_cnt, &page));
+ page->parent = NULL;
page->ref = NULL;
- page->read_gen = 0;
+ page->read_gen = WT_READ_GEN_NOTSET;
page->entries = leaf_cnt;
- page->type = WT_PAGE_ROW_INT;
WT_ERR(__slvg_modify_init(session, page));
for (ref = page->u.intl.t, i = 0; i < ss->pages_next; ++i) {
@@ -1706,9 +1698,8 @@ __slvg_row_build_internal(
WT_ERR(__slvg_row_build_leaf(
session, trk, page, ref, ss));
} else
- WT_ERR(__wt_row_ikey_alloc(session, 0,
- trk->row_start.data,
- trk->row_start.size,
+ WT_ERR(__wt_row_ikey_incr(session, page, 0,
+ trk->row_start.data, trk->row_start.size,
&ref->u.key));
++ref;
}
@@ -1814,13 +1805,10 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session,
++skip_stop;
}
- /*
- * I believe it's no longer possible for a salvaged page to be entirely
- * empty, that is, if we selected the page for salvage, there is at
- * least one cell on the page we want. This is a change from previous
- * behavior, so I'm asserting it.
- */
- WT_ASSERT_ERR(session, skip_start + skip_stop < page->entries);
+ /* We should have selected some entries, but not the entire page. */
+ WT_ASSERT(session,
+ skip_start + skip_stop > 0 &&
+ skip_start + skip_stop < page->entries);
/*
* Take a copy of this page's first key to define the start of
@@ -1829,8 +1817,8 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session,
*/
rip = page->u.row.d + skip_start;
WT_ERR(__wt_row_key(session, page, rip, key, 0));
- WT_ERR(
- __wt_row_ikey_alloc(session, 0, key->data, key->size, &ref->u.key));
+ WT_ERR(__wt_row_ikey_incr(
+ session, parent, 0, key->data, key->size, &ref->u.key));
/*
* Discard backing overflow pages for any items being discarded that
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 36854553397..337cce7983a 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -7,9 +7,9 @@
#include "wt_internal.h"
-static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *);
-static int __stat_page_col_var(WT_SESSION_IMPL *, WT_PAGE *);
-static int __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
+static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page_row_leaf(WT_PAGE *, WT_DSRC_STATS *);
/*
* __wt_btree_stat_init --
@@ -21,26 +21,28 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, uint32_t flags)
WT_BM *bm;
WT_BTREE *btree;
WT_DECL_RET;
+ WT_DSRC_STATS *stats;
WT_PAGE *page;
btree = S2BT(session);
bm = btree->bm;
+ stats = &btree->dhandle->stats;
- WT_RET(bm->stat(bm, session));
+ WT_RET(bm->stat(bm, session, stats));
- WT_DSTAT_SET(session, btree_fixed_len, btree->bitcnt);
- WT_DSTAT_SET(session, btree_maximum_depth, btree->maximum_depth);
- WT_DSTAT_SET(session, btree_maxintlitem, btree->maxintlitem);
- WT_DSTAT_SET(session, btree_maxintlpage, btree->maxintlpage);
- WT_DSTAT_SET(session, btree_maxleafitem, btree->maxleafitem);
- WT_DSTAT_SET(session, btree_maxleafpage, btree->maxleafpage);
+ WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
+ WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
+ WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem);
+ WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
+ WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem);
+ WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);
page = NULL;
if (LF_ISSET(WT_STATISTICS_FAST))
return (0);
while ((ret = __wt_tree_walk(session, &page, 0)) == 0 && page != NULL)
- WT_RET(__stat_page(session, page));
+ WT_RET(__stat_page(session, page, stats));
return (ret == WT_NOTFOUND ? 0 : ret);
}
@@ -49,7 +51,7 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, uint32_t flags)
* Stat any Btree page.
*/
static int
-__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page)
+__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
{
/*
* All internal pages and overflow pages are trivial, all we track is
@@ -57,25 +59,25 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
switch (page->type) {
case WT_PAGE_COL_FIX:
- WT_DSTAT_INCR(session, btree_column_fix);
- WT_DSTAT_INCRV(session, btree_entries, page->entries);
+ WT_STAT_INCR(stats, btree_column_fix);
+ WT_STAT_INCRV(stats, btree_entries, page->entries);
break;
case WT_PAGE_COL_INT:
- WT_DSTAT_INCR(session, btree_column_internal);
- WT_DSTAT_INCRV(session, btree_entries, page->entries);
+ WT_STAT_INCR(stats, btree_column_internal);
+ WT_STAT_INCRV(stats, btree_entries, page->entries);
break;
case WT_PAGE_COL_VAR:
- WT_RET(__stat_page_col_var(session, page));
+ WT_RET(__stat_page_col_var(page, stats));
break;
case WT_PAGE_OVFL:
- WT_DSTAT_INCR(session, btree_overflow);
+ WT_STAT_INCR(stats, btree_overflow);
break;
case WT_PAGE_ROW_INT:
- WT_DSTAT_INCR(session, btree_row_internal);
- WT_DSTAT_INCRV(session, btree_entries, page->entries);
+ WT_STAT_INCR(stats, btree_row_internal);
+ WT_STAT_INCRV(stats, btree_entries, page->entries);
break;
case WT_PAGE_ROW_LEAF:
- WT_RET(__stat_page_row_leaf(session, page));
+ WT_RET(__stat_page_row_leaf(page, stats));
break;
WT_ILLEGAL_VALUE(session);
}
@@ -87,7 +89,7 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page)
* Stat a WT_PAGE_COL_VAR page.
*/
static int
-__stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
+__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
{
WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
@@ -99,7 +101,7 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
unpack = &_unpack;
- WT_DSTAT_INCR(session, btree_column_variable);
+ WT_STAT_INCR(stats, btree_column_variable);
/*
* Walk the page, counting regular and overflow data items, and checking
@@ -111,12 +113,12 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_COL_FOREACH(page, cip, i) {
if ((cell = WT_COL_PTR(page, cip)) == NULL) {
orig_deleted = 1;
- WT_DSTAT_INCR(session, btree_column_deleted);
+ WT_STAT_INCR(stats, btree_column_deleted);
} else {
orig_deleted = 0;
__wt_cell_unpack(cell, unpack);
- WT_DSTAT_INCRV(
- session, btree_entries, __wt_cell_rle(unpack));
+ WT_STAT_INCRV(
+ stats, btree_entries, __wt_cell_rle(unpack));
}
/*
@@ -128,13 +130,13 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
if (WT_UPDATE_DELETED_ISSET(upd)) {
if (orig_deleted)
continue;
- WT_DSTAT_INCR(session, btree_column_deleted);
- WT_DSTAT_DECR(session, btree_entries);
+ WT_STAT_INCR(stats, btree_column_deleted);
+ WT_STAT_DECR(stats, btree_entries);
} else {
if (!orig_deleted)
continue;
- WT_DSTAT_DECR(session, btree_column_deleted);
- WT_DSTAT_INCR(session, btree_entries);
+ WT_STAT_DECR(stats, btree_column_deleted);
+ WT_STAT_INCR(stats, btree_entries);
}
}
}
@@ -146,14 +148,14 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
* Stat a WT_PAGE_ROW_LEAF page.
*/
static int
-__stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
+__stat_page_row_leaf(WT_PAGE *page, WT_DSRC_STATS *stats)
{
WT_INSERT *ins;
WT_ROW *rip;
WT_UPDATE *upd;
uint32_t cnt, i;
- WT_DSTAT_INCR(session, btree_row_leaf);
+ WT_STAT_INCR(stats, btree_row_leaf);
/*
* Stat any K/V pairs inserted into the page before the first from-disk
@@ -176,7 +178,7 @@ __stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
++cnt;
}
- WT_DSTAT_INCRV(session, btree_entries, cnt);
+ WT_STAT_INCRV(stats, btree_entries, cnt);
return (0);
}
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 12ffc0bba34..3fdfd8b7b56 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -62,10 +62,9 @@ __wt_bt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
switch (op) {
case WT_SYNC_CHECKPOINT:
- WT_ERR(__wt_sync_file(session, WT_SYNC_CHECKPOINT));
- break;
case WT_SYNC_COMPACT:
- WT_ERR(__wt_sync_file(session, WT_SYNC_COMPACT));
+ case WT_SYNC_WRITE_LEAVES:
+ WT_ERR(__wt_sync_file(session, op));
break;
case WT_SYNC_DISCARD:
case WT_SYNC_DISCARD_NOWRITE:
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 0cc9a7ce3f6..323876d8c04 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -162,7 +162,7 @@ int
__wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t flags)
{
WT_BTREE *btree;
- WT_PAGE *page, *parent;
+ WT_PAGE *couple, *page;
WT_REF *ref;
uint32_t slot;
int cache, compact, discard, eviction, prev, set_read_gen;
@@ -184,6 +184,29 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t flags)
page = *pagep;
*pagep = NULL;
+ /*
+ * If not the eviction thread, we're hazard-pointer coupling through the
+ * tree and that's OK (hazard pointers can't deadlock, so there's none
+ * of the usual problems found when logically locking up a btree). If
+ * the eviction thread tries to evict the active page, it fails because
+ * of our hazard pointer. If eviction tries to evict our parent, that
+ * fails because the parent has a child page that can't be discarded.
+ * We do play one game: don't couple up to our parent and then back down
+ * to a new leaf, couple to the next page to which we're descending, it
+ * saves a hazard-pointer swap for each cursor page movement.
+ *
+ * !!!
+ * NOTE: we don't bother checking if we're hazard-pointer coupling when
+ * setting the variable couple in this code. We never actually use the
+ * variable couple if the variable eviction is true.
+ *
+ * NOTE: we depend on the fact it's OK to release a page we don't hold,
+ * that is, it's OK to release couple, when couple is set to NULL.
+ *
+ * Remember the hazard pointer we're currently holding.
+ */
+ couple = page;
+
/* If no page is active, begin a walk from the start of the tree. */
if (page == NULL) {
if ((page = btree->root_page) == NULL)
@@ -192,13 +215,12 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t flags)
goto descend;
}
-ascend: /* If the active page was the root, we've reached the walk's end. */
+ascend: /*
+ * If the active page was the root, we've reached the walk's end.
+ * Release any hazard-pointer we're holding.
+ */
if (WT_PAGE_IS_ROOT(page))
- return (0);
-
- /* Figure out the current slot in the parent page's WT_REF array. */
- parent = page->parent;
- slot = (uint32_t)(page->ref - parent->u.intl.t);
+ return (eviction ? 0 : __wt_page_release(session, couple));
/* If the eviction thread, clear the page's walk status. */
if (eviction)
@@ -206,41 +228,43 @@ ascend: /* If the active page was the root, we've reached the walk's end. */
page->ref->state = WT_REF_MEM;
/*
- * Move to the parent.
- *
- * If not the eviction thread, swap our hazard pointer for the hazard
- * pointer of our parent, if it's not the root page (we could access
- * it directly because we know it's in memory, but we need a hazard as
- * we climb the tree). Don't leave a hazard pointer dangling on error.
- *
- * We're hazard-pointer coupling up the tree and that's OK: first,
- * hazard pointers can't deadlock, so there's none of the usual
- * problems found when logically locking up a Btree; second, we don't
- * release our current hazard pointer until we have our parent's
- * hazard pointer. If the eviction thread tries to evict the active
- * page, that fails because of our hazard pointer. If eviction tries
- * to evict our parent, that fails because the parent has a child page
- * that can't be discarded.
+ * Figure out the current slot in the parent page's WT_REF array and
+ * switch to the parent.
*/
- if (!eviction) {
- if (WT_PAGE_IS_ROOT(parent))
- WT_RET(__wt_page_release(session, page));
- else
- WT_RET(
- __wt_page_swap(session, page, parent, parent->ref));
- }
- page = parent;
+ slot = (uint32_t)(page->ref - page->parent->u.intl.t);
+ page = page->parent;
- /*
- * If we're at the last/first slot on the page, return this page in
- * post-order traversal. Otherwise we move to the next/prev slot
- * and left/right-most element in its subtree.
- */
for (;;) {
+ /*
+ * If we're at the last/first slot on the page, return this
+ * page in post-order traversal. Otherwise we move to the
+ * next/prev slot and left/right-most element in its subtree.
+ */
if ((prev && slot == 0) ||
(!prev && slot == page->entries - 1)) {
+ /* Optionally skip internal pages. */
if (skip_intl)
goto ascend;
+
+ /*
+ * We've ascended the tree and are returning an internal
+ * page. If it's the root, discard any hazard pointer
+ * we have, otherwise, swap any hazard pointer we have
+ * for the page we'll return. We could keep the hazard
+ * pointer we have as it's sufficient to pin any page in
+ * our page stack, but we have no place to store it and
+ * it's simpler if callers just know they hold a hazard
+ * pointer on any page they're using.
+ */
+ if (!eviction) {
+ if (WT_PAGE_IS_ROOT(page))
+ WT_RET(
+ __wt_page_release(session, couple));
+ else
+ WT_RET(__wt_page_swap(
+ session, couple, page, page->ref));
+ }
+
*pagep = page;
return (0);
}
@@ -275,7 +299,6 @@ descend: for (;;) {
* another thread. The other cases get hazard pointers
* and protect the page from eviction that way.
*/
- set_read_gen = 0;
if (eviction) {
retry: if (ref->state != WT_REF_MEM ||
!WT_ATOMIC_CAS(ref->state,
@@ -315,7 +338,7 @@ retry: if (ref->state != WT_REF_MEM ||
ref->state == WT_REF_DISK)
break;
WT_RET(
- __wt_page_swap(session, page, page, ref));
+ __wt_page_swap(session, couple, page, ref));
} else if (discard) {
/*
* If deleting a range, try to delete the page
@@ -326,7 +349,7 @@ retry: if (ref->state != WT_REF_MEM ||
if (skip)
break;
WT_RET(
- __wt_page_swap(session, page, page, ref));
+ __wt_page_swap(session, couple, page, ref));
} else {
/*
* If iterating a cursor (or doing compaction),
@@ -341,11 +364,11 @@ retry: if (ref->state != WT_REF_MEM ||
* we don't want to read it if it won't help.
*
* Pages read for compaction aren't "useful";
- * reset the page generation to 0 so the page
- * is quickly chosen for eviction. (This can
- * race of course, but it's unlikely and will
- * only result in an incorrectly low page read
- * generation.)
+ * reset the page generation to a low value so
+ * the page is quickly chosen for eviction.
+ * (This can race of course, but it's unlikely
+ * and will only result in an incorrectly low
+ * page read generation and possible eviction.)
*/
set_read_gen = 0;
if (compact) {
@@ -357,12 +380,12 @@ retry: if (ref->state != WT_REF_MEM ||
ref->state == WT_REF_DISK ? 1 : 0;
}
WT_RET(
- __wt_page_swap(session, page, page, ref));
+ __wt_page_swap(session, couple, page, ref));
if (set_read_gen)
- page->read_gen = 0;
+ page->read_gen = WT_READ_GEN_OLDEST;
}
- page = ref->page;
+ couple = page = ref->page;
slot = prev ? page->entries - 1 : 0;
}
}
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index 0e3e130062d..b05c2281c30 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -38,9 +38,15 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
WT_ASSERT(session, ref == NULL ||
ref->u.recno == page->u.intl.recno);
+ /* Fast path appends. */
+ base = page->entries;
+ ref = &page->u.intl.t[base - 1];
+ if (recno >= ref->u.recno)
+ goto descend;
+
/* Binary search of internal pages. */
- for (base = 0,
- limit = page->entries; limit != 0; limit >>= 1) {
+ for (base = 0, ref = NULL,
+ limit = page->entries - 1; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
ref = page->u.intl.t + indx;
@@ -51,7 +57,8 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
base = indx + 1;
--limit;
}
- WT_ASSERT(session, ref != NULL);
+
+descend: WT_ASSERT(session, ref != NULL);
/*
* Reference the slot used for next step down the tree.
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index b82140b2a6d..5f35327201f 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -13,7 +13,7 @@ static void __rec_discard_tree(WT_SESSION_IMPL *, WT_PAGE *, int);
static void __rec_excl_clear(WT_SESSION_IMPL *);
static void __rec_page_clean_update(WT_SESSION_IMPL *, WT_PAGE *);
static int __rec_page_dirty_update(WT_SESSION_IMPL *, WT_PAGE *);
-static int __rec_review(WT_SESSION_IMPL *, WT_REF *, WT_PAGE *, int, int);
+static int __rec_review(WT_SESSION_IMPL *, WT_REF *, WT_PAGE *, int, int, int);
static void __rec_root_update(WT_SESSION_IMPL *);
/*
@@ -25,6 +25,7 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
{
WT_DECL_RET;
WT_PAGE_MODIFY *mod;
+ int merge;
WT_VERBOSE_RET(session, evict,
"page %p (%s)", page, __wt_page_type_string(page->type));
@@ -32,19 +33,17 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
WT_ASSERT(session, session->excl_next == 0);
/*
- * Split-merge pages cannot be evicted, they're always merged into their
- * parent; split-merge pages are ignored by the eviction thread, we
- * never get a split-merge page to evict. Check out of sheer paranoia.
- * Split pages are NOT included in this test, because a split page can
- * be separately evicted, at which point it's replaced in its parent by
- * a reference to a split-merge page. That's a normal part of the leaf
- * page life-cycle if it grows too large and must be pushed out of the
- * cache.
+ * If we get a split-merge page during normal eviction, try to collapse
+ * it. During close, it will be merged into its parent.
*/
mod = page->modify;
- if (mod != NULL && F_ISSET(mod, WT_PM_REC_SPLIT_MERGE))
+ merge = __wt_btree_mergeable(page);
+ if (merge && exclusive)
return (EBUSY);
+ WT_ASSERT(session, merge || mod == NULL ||
+ !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE));
+
/*
* Get exclusive access to the page and review the page and its subtree
* for conditions that would block our eviction of the page. If the
@@ -54,10 +53,14 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
* not disallowed anywhere.
*
* Note that page->ref may be NULL in some cases (e.g., for root pages
- * or during salvage). That's OK if WT_REC_SINGLE is set: we won't
- * check hazard pointers in that case.
+ * or during salvage). That's OK if exclusive is set: we won't check
+ * hazard pointers in that case.
*/
- WT_ERR(__rec_review(session, page->ref, page, exclusive, 1));
+ WT_ERR(__rec_review(session, page->ref, page, exclusive, merge, 1));
+
+ /* Try to merge internal pages. */
+ if (merge)
+ WT_ERR(__wt_merge_tree(session, page));
/*
* Update the page's modification reference, reconciliation might have
@@ -66,7 +69,7 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
mod = page->modify;
/* Count evictions of internal pages during normal operation. */
- if (!exclusive &&
+ if (!exclusive && !merge &&
(page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
WT_CSTAT_INCR(session, cache_eviction_internal);
WT_DSTAT_INCR(session, cache_eviction_internal);
@@ -291,17 +294,15 @@ __rec_discard_page(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
*/
static int
__rec_review(WT_SESSION_IMPL *session,
- WT_REF *ref, WT_PAGE *page, int exclusive, int top)
+ WT_REF *ref, WT_PAGE *page, int exclusive, int merge, int top)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE_MODIFY *mod;
WT_PAGE *t;
- WT_TXN *txn;
uint32_t i;
btree = S2BT(session);
- txn = &session->txn;
/*
* Get exclusive access to the page if our caller doesn't have the tree
@@ -322,8 +323,8 @@ __rec_review(WT_SESSION_IMPL *session,
case WT_REF_DELETED: /* On-disk, deleted */
break;
case WT_REF_MEM: /* In-memory */
- WT_RET(__rec_review(
- session, ref, ref->page, exclusive, 0));
+ WT_RET(__rec_review(session,
+ ref, ref->page, exclusive, merge, 0));
break;
case WT_REF_EVICT_WALK: /* Walk point */
case WT_REF_EVICT_FORCE: /* Forced evict */
@@ -374,15 +375,18 @@ __rec_review(WT_SESSION_IMPL *session,
* we find a page which can't be merged into its parent, and failing if
* we never find such a page.
*/
- if (btree->checkpointing && __wt_page_is_modified(page))
+ if (btree->checkpointing && !merge && __wt_page_is_modified(page)) {
+ckpt: WT_CSTAT_INCR(session, cache_eviction_checkpoint);
+ WT_DSTAT_INCR(session, cache_eviction_checkpoint);
return (EBUSY);
+ }
if (btree->checkpointing && top)
for (t = page->parent;; t = t->parent) {
if (t == NULL || t->ref == NULL) /* root */
- return (EBUSY);
+ goto ckpt;
if (t->ref->state != WT_REF_MEM) /* scary */
- return (EBUSY);
+ goto ckpt;
if (t->modify == NULL || /* not merged */
!F_ISSET(t->modify, WT_PM_REC_EMPTY |
WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
@@ -390,6 +394,13 @@ __rec_review(WT_SESSION_IMPL *session,
}
/*
+ * If we are merging internal pages, we just need exclusive access, we
+ * don't need to write everything.
+ */
+ if (merge)
+ return (0);
+
+ /*
* Fail if any page in the top-level page's subtree won't be merged into
* its parent, the page that cannot be merged must be evicted first.
* The test is necessary but should not fire much: the eviction code is
@@ -435,20 +446,6 @@ __rec_review(WT_SESSION_IMPL *session,
WT_VERBOSE_RET(session, evict,
"eviction failed, reconciled page not clean");
- /*
- * A pathological case: if we're the oldest transaction
- * in the system and we're stuck trying to find space,
- * abort the transaction to give up all hazard
- * references before trying again.
- */
- if (F_ISSET(txn, TXN_RUNNING) &&
- __wt_txn_am_oldest(session) &&
- ++txn->eviction_fails >= 100) {
- txn->eviction_fails = 0;
- ret = WT_DEADLOCK;
- WT_CSTAT_INCR(session, txn_fail_cache);
- }
-
/*
* We may be able to discard any "update" memory the
* page no longer needs.
@@ -466,7 +463,6 @@ __rec_review(WT_SESSION_IMPL *session,
WT_RET(ret);
WT_ASSERT(session, __wt_page_is_modified(page) == 0);
- txn->eviction_fails = 0;
}
/*
diff --git a/src/btree/rec_merge.c b/src/btree/rec_merge.c
new file mode 100644
index 00000000000..caac7c77215
--- /dev/null
+++ b/src/btree/rec_merge.c
@@ -0,0 +1,538 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_VISIT_STATE --
+ * The state maintained across calls to the "visit" callback functions:
+ * the number of refs visited, the maximum depth, and the current page and
+ * reference when moving references into the new tree.
+ *
+ * The structure is reused for every pass over the locked subtree. The
+ * counting pass fills in refcnt, maxdepth and the first/last live
+ * positions; the copy and switch passes restart refcnt from zero and use
+ * split, page and ref to decide where the next reference is placed.
+ */
+typedef struct {
+ WT_SESSION_IMPL *session;
+ WT_PAGE *first, *page, *second; /* New pages to be populated. */
+ WT_REF *ref, *second_ref; /* Insert and split point. */
+
+ uint64_t refcnt, split; /* Ref count, split point. */
+ uint64_t first_live, last_live; /* First/last in-memory ref. */
+ u_int maxdepth; /* Maximum subtree depth. */
+ int seen_live; /* Has a ref been live? */
+} WT_VISIT_STATE;
+
+/*
+ * __merge_walk --
+ * Visit all of the child references in a locked subtree and apply a
+ * callback function to them.
+ *
+ * A locked child of the same page type that is mergeable is descended
+ * into (one level deeper) instead of being passed to the callback, so the
+ * callback only sees the references at the bottom of the stack: deleted
+ * or on-disk references, and locked children that are not themselves
+ * merge candidates. The deepest level reached is recorded in
+ * state->maxdepth. Any other reference state falls into WT_ILLEGAL_VALUE.
+ */
+static int
+__merge_walk(WT_SESSION_IMPL *session, WT_PAGE *page, u_int depth,
+ void (*visit)(WT_PAGE *, WT_REF *, WT_VISIT_STATE *),
+ WT_VISIT_STATE *state)
+{
+ WT_PAGE *child;
+ WT_REF *ref;
+ uint32_t i;
+
+ if (depth > state->maxdepth)
+ state->maxdepth = depth;
+
+ WT_REF_FOREACH(page, ref, i)
+ switch (ref->state) {
+ case WT_REF_LOCKED:
+ child = ref->page;
+
+ /*
+ * Visit internal pages recursively. This must match
+ * the walk in __rec_review: if the merge succeeds, we
+ * have to unlock everything.
+ */
+ if (child->type == page->type &&
+ __wt_btree_mergeable(child)) {
+ WT_RET(__merge_walk(
+ session, child, depth + 1, visit, state));
+ break;
+ }
+ /* FALLTHROUGH */
+
+ case WT_REF_DELETED:
+ case WT_REF_DISK:
+ (*visit)(page, ref, state);
+ break;
+
+ case WT_REF_EVICT_FORCE:
+ case WT_REF_EVICT_WALK:
+ case WT_REF_MEM:
+ case WT_REF_READING:
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * __merge_count --
+ * A callback function that counts the number of references as well as
+ * the first/last "live" reference.
+ *
+ * A reference is "live" when its state is WT_REF_LOCKED. The first/last
+ * live values are zero-based positions in visit order: they are taken
+ * from the running count before it is incremented for this reference.
+ */
+static void
+__merge_count(WT_PAGE *parent, WT_REF *ref, WT_VISIT_STATE *state)
+{
+ WT_UNUSED(parent);
+
+ if (ref->state == WT_REF_LOCKED) {
+ /* Prevent eviction until it is hooked into the new tree. */
+ __wt_evict_list_clr_page(state->session, ref->page);
+
+ if (!state->seen_live) {
+ state->first_live = state->refcnt;
+ state->seen_live = 1;
+ }
+ state->last_live = state->refcnt;
+ }
+
+ /*
+ * Count every reference visited. The count is 64 bits wide and is
+ * not checked for overflow here: __wt_merge_tree rejects a total
+ * larger than UINT32_MAX once the walk completes. We can't put more
+ * than 2**32 keys on one page anyway.
+ */
+ ++state->refcnt;
+}
+
+/*
+ * __merge_copy_ref --
+ * Copy a child reference from the locked subtree to a new page.
+ *
+ * The reference is copied by value into the current insert position,
+ * which is then advanced; the original reference is not modified. If a
+ * split point is set and the running count reaches it, the insert
+ * position first jumps to the start of the second page. The running
+ * count only advances when a split point is set.
+ */
+static void
+__merge_copy_ref(WT_PAGE *parent, WT_REF *ref, WT_VISIT_STATE *state)
+{
+ WT_REF *newref;
+
+ WT_UNUSED(parent);
+
+ if (state->split != 0 && state->refcnt++ == state->split)
+ state->ref = state->second_ref;
+
+ newref = state->ref++;
+ *newref = *ref;
+}
+
+/*
+ * __merge_unlock --
+ * Unlock all pages under an internal page being merged.
+ *
+ * Only references in the WT_REF_LOCKED state are touched. Locked
+ * children that are row- or column-store internal pages are unlocked
+ * recursively before the reference to them is published as WT_REF_MEM,
+ * so a subtree is unlocked bottom-up.
+ */
+static void
+__merge_unlock(WT_PAGE *page)
+{
+ WT_REF *ref;
+ uint32_t i;
+
+ WT_REF_FOREACH(page, ref, i)
+ if (ref->state == WT_REF_LOCKED) {
+ if (ref->page->type == WT_PAGE_ROW_INT ||
+ ref->page->type == WT_PAGE_COL_INT)
+ __merge_unlock(ref->page);
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+ }
+}
+
+/*
+ * __merge_transfer_footprint --
+ * Transfer the size of references from an old page to a new page.
+ *
+ * Note that both pages are locked and there is no net change, so avoid
+ * __wt_cache_page_inmem_incr.
+ *
+ * The amount moved is asserted to be strictly smaller than the old
+ * page's current footprint; the subtraction itself is not guarded.
+ */
+static void
+__merge_transfer_footprint(WT_SESSION_IMPL *session,
+ WT_PAGE *newpage, WT_PAGE *oldpage, uint32_t size)
+{
+ WT_ASSERT(session, size < oldpage->memory_footprint);
+ oldpage->memory_footprint -= size;
+ newpage->memory_footprint += size;
+}
+
+/*
+ * __merge_switch_page --
+ * Switch a page from the locked tree into the new tree.
+ *
+ * This is the final pass of __wt_merge_tree, run after the references
+ * have been copied by __merge_copy_ref. For each reference it advances
+ * the insert position (moving to the second page at the split point),
+ * moves the footprint of the reference's address and, for row-store
+ * parents, its key from the old parent to the new page, and for a locked
+ * child links the child to its new parent and marks the new reference
+ * WT_REF_MEM. The old reference is cleared on the way out.
+ */
+static void
+__merge_switch_page(WT_PAGE *parent, WT_REF *ref, WT_VISIT_STATE *state)
+{
+ WT_PAGE *child;
+ WT_PAGE_MODIFY *modify;
+ WT_REF *newref;
+
+ if (state->split != 0 && state->refcnt++ == state->split) {
+ state->page = state->second;
+ state->ref = state->second_ref;
+ }
+
+ newref = state->ref++;
+
+ if (ref->addr != NULL)
+ __merge_transfer_footprint(
+ state->session, state->page, parent,
+ (uint32_t)sizeof(WT_ADDR) + ((WT_ADDR *)ref->addr)->size);
+
+ if (parent->type == WT_PAGE_ROW_INT)
+ __merge_transfer_footprint(
+ state->session, state->page, parent,
+ (uint32_t)sizeof(WT_IKEY) + ((WT_IKEY *)ref->u.key)->size);
+
+ if (ref->state == WT_REF_LOCKED) {
+ child = ref->page;
+
+ /*
+ * If the child has been split, update the split page to point
+ * into the new tree. That way, if the split-merge page is
+ * later swapped into place, it will point to the new parent.
+ *
+ * The order here is important: the parent page should point to
+ * the original child page, so we link that in last.
+ */
+ if ((modify = child->modify) != NULL &&
+ F_ISSET(modify, WT_PM_REC_SPLIT))
+ WT_LINK_PAGE(state->page, newref, modify->u.split);
+
+ WT_LINK_PAGE(state->page, newref, child);
+
+ /*
+ * If we have a child that is a live internal page, its subtree
+ * was locked by __rec_review. We're swapping it into the new
+ * tree, unlock it now.
+ */
+ if (child->type == WT_PAGE_ROW_INT ||
+ child->type == WT_PAGE_COL_INT)
+ __merge_unlock(child);
+
+ newref->state = WT_REF_MEM;
+ }
+
+ WT_CLEAR(*ref);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __merge_check_discard --
+ * Make sure we are only discarding split-merge pages.
+ *
+ * Compiled only when HAVE_DIAGNOSTIC is defined. Asserts that the page
+ * is a row- or column-store internal page flagged WT_PM_REC_SPLIT_MERGE,
+ * skips on-disk and deleted references, asserts that every remaining
+ * reference is still locked, and recurses into the page it references.
+ */
+static void
+__merge_check_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_REF *ref;
+ uint32_t i;
+
+ WT_ASSERT(session, page->type == WT_PAGE_ROW_INT ||
+ page->type == WT_PAGE_COL_INT);
+ WT_ASSERT(session, page->modify != NULL &&
+ F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE));
+
+ WT_REF_FOREACH(page, ref, i) {
+ if (ref->state == WT_REF_DISK ||
+ ref->state == WT_REF_DELETED)
+ continue;
+
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+ __merge_check_discard(session, ref->page);
+ }
+}
+#endif
+
+/*
+ * __merge_new_page --
+ * Create a new in-memory internal page.
+ *
+ * The page is allocated with the given type and number of entries and
+ * its read generation is left as WT_READ_GEN_NOTSET. If merge is
+ * non-zero the page is flagged WT_PM_REC_SPLIT_MERGE, otherwise it is
+ * marked modified with __wt_page_modify_set. On success the page is
+ * returned through pagep; if initializing the modify structure fails,
+ * the page is released with __wt_page_out and the error is returned.
+ */
+static int
+__merge_new_page(WT_SESSION_IMPL *session,
+ uint8_t type, uint32_t entries, int merge, WT_PAGE **pagep)
+{
+ WT_DECL_RET;
+ WT_PAGE *newpage;
+
+ /* Allocate a new internal page and fill it in. */
+ WT_RET(__wt_page_alloc(session, type, entries, &newpage));
+ newpage->read_gen = WT_READ_GEN_NOTSET;
+ newpage->entries = entries;
+
+ WT_ERR(__wt_page_modify_init(session, newpage));
+ if (merge)
+ F_SET(newpage->modify, WT_PM_REC_SPLIT_MERGE);
+ else
+ __wt_page_modify_set(session, newpage);
+
+ *pagep = newpage;
+ return (0);
+
+err: __wt_page_out(session, &newpage);
+ return (ret);
+}
+
+/*
+ * __merge_promote_key --
+ * Copy a key from a child page into the reference in its parent, so it
+ * can be found by searches.
+ *
+ * The key is always taken from the first reference on the child page.
+ * For column-store, the first reference's record number is stored both
+ * as the child page's starting record number and in the parent
+ * reference. For row-store, the first reference's key bytes are handed
+ * to __wt_row_ikey_incr to fill in the parent reference's key; that
+ * call's return value is returned. Any other page type is an illegal
+ * value.
+ */
+static int
+__merge_promote_key(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_IKEY *ikey;
+ WT_PAGE *page;
+ WT_REF *child_ref;
+
+ page = ref->page;
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ child_ref = &page->u.intl.t[0];
+ ref->u.recno = page->u.intl.recno = child_ref->u.recno;
+ return (0);
+
+ case WT_PAGE_ROW_INT:
+ child_ref = &page->u.intl.t[0];
+ ikey = child_ref->u.key;
+ WT_ASSERT(session, ikey != NULL);
+ return (__wt_row_ikey_incr(session,
+ page, 0, WT_IKEY_DATA(ikey), ikey->size, &ref->u.key));
+
+ WT_ILLEGAL_VALUE(session);
+ }
+}
+
+/*
+ * __wt_merge_tree --
+ * Attempt to collapse a stack of split-merge pages in memory into a
+ * shallow tree. If enough keys are found, create a real internal node
+ * that can be evicted (and, if necessary, split further).
+ *
+ * This code is designed to deal with workloads that otherwise create
+ * arbitrarily deep (and slow) trees in memory.
+ *
+ * The top-level page must be mergeable and its reference locked (both
+ * are asserted). Returns EBUSY if the stack is shallower than
+ * WT_MERGE_STACK_MIN and ENOMEM if the reference count does not fit in
+ * 32 bits. On success the new tree is not linked into the parent here:
+ * it is attached to the old top-level page as a split
+ * (WT_PM_REC_SPLIT), for the caller to swap into place. On any later
+ * failure the newly allocated pages are released and the error returned.
+ */
+int
+__wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
+{
+ WT_DECL_RET;
+ WT_PAGE *lchild, *newtop, *rchild;
+ WT_REF *newref;
+ WT_VISIT_STATE visit_state;
+ uint32_t refcnt, split;
+ int promote;
+ u_int levels;
+ uint8_t page_type;
+
+ WT_CLEAR(visit_state);
+ visit_state.session = session;
+ lchild = newtop = rchild = NULL;
+ page_type = top->type;
+
+ WT_ASSERT(session, __wt_btree_mergeable(top));
+ WT_ASSERT(session, top->ref->state == WT_REF_LOCKED);
+
+ /*
+ * Walk the subtree, count the references at the bottom level and
+ * calculate the maximum depth.
+ */
+ WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state));
+
+ /* If there aren't enough useful levels, give up. */
+ if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
+ return (EBUSY);
+
+ /* Pages cannot grow larger than 2**32, but that should never happen. */
+ if (visit_state.refcnt > UINT32_MAX)
+ return (ENOMEM);
+
+ /* Make sure the top page isn't queued for eviction. */
+ __wt_evict_list_clr_page(session, top);
+
+ /* Clear the eviction walk: it may be in our subtree. */
+ __wt_evict_clear_tree_walk(session, NULL);
+
+ /*
+ * Now we either collapse the internal pages into one split-merge page,
+ * or if there are "enough" keys, we split into two equal internal
+ * pages, each of which can be evicted independently.
+ *
+ * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it
+ * isn't big enough to justify the cost of evicting it. If splits
+ * continue, it will be merged again until it gets over this limit.
+ */
+ promote = 0;
+ refcnt = (uint32_t)visit_state.refcnt;
+ if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) {
+ /*
+ * In the normal case where there are live children spread
+ * through the subtree, create two child pages.
+ *
+ * Handle the case where the only live child is first / last
+ * specially: put the live child into the top-level page.
+ *
+ * Set SPLIT_MERGE on the internal pages if there are any live
+ * children: they can't be evicted, so there is no point
+ * permanently deepening the tree.
+ */
+ if (visit_state.first_live == visit_state.last_live &&
+ (visit_state.first_live == 0 ||
+ visit_state.first_live == refcnt - 1))
+ split = (visit_state.first_live == 0) ? 1 : refcnt - 1;
+ else
+ split = (refcnt + 1) / 2;
+
+ /* Only promote if we can create a real page. */
+ if (split == 1 || split == refcnt - 1)
+ promote = 1;
+ else if (split >= WT_MERGE_FULL_PAGE &&
+ visit_state.first_live >= split)
+ promote = 1;
+ else if (refcnt - split >= WT_MERGE_FULL_PAGE &&
+ visit_state.last_live < split)
+ promote = 1;
+ }
+
+ if (promote) {
+ /* Create a new top-level split-merge page with two entries. */
+ WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop));
+
+ visit_state.split = split;
+
+ /* Left split. */
+ if (split == 1)
+ visit_state.first = newtop;
+ else {
+ WT_ERR(__merge_new_page(session, page_type, split,
+ visit_state.first_live < split, &lchild));
+ visit_state.first = lchild;
+ }
+
+ /* Right split. */
+ if (split == refcnt - 1) {
+ visit_state.second = newtop;
+ visit_state.second_ref = &newtop->u.intl.t[1];
+ } else {
+ WT_ERR(__merge_new_page(session, page_type,
+ refcnt - split, visit_state.last_live >= split,
+ &rchild));
+ visit_state.second = rchild;
+ visit_state.second_ref =
+ &visit_state.second->u.intl.t[0];
+ }
+ } else {
+ /*
+ * Create a new split-merge page for small merges, or if the
+ * page above is a split merge page. When we do a big enough
+ * merge, we create a real page at the top and don't consider
+ * it as a merge candidate again. Over time with an insert
+ * workload the tree will grow deeper, but that's inevitable,
+ * and this keeps individual merges small.
+ */
+ WT_ERR(__merge_new_page(session, page_type, refcnt,
+ refcnt < WT_MERGE_FULL_PAGE ||
+ __wt_btree_mergeable(top->parent),
+ &newtop));
+
+ visit_state.first = newtop;
+ }
+
+ /*
+ * Copy the references into the new tree, but don't update anything in
+ * the locked tree in case there is an error and we need to back out.
+ * We do this in a separate pass so that we can figure out the key for
+ * the split point: that allocates memory and so it could still fail.
+ */
+ visit_state.page = visit_state.first;
+ visit_state.ref = visit_state.page->u.intl.t;
+ visit_state.refcnt = 0;
+ WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state));
+
+ if (promote) {
+ /* Promote keys into the top-level page. */
+ if (lchild != NULL) {
+ newref = &newtop->u.intl.t[0];
+ WT_LINK_PAGE(newtop, newref, lchild);
+ newref->state = WT_REF_MEM;
+ WT_ERR(__merge_promote_key(session, newref));
+ }
+
+ if (rchild != NULL) {
+ newref = &newtop->u.intl.t[1];
+ WT_LINK_PAGE(newtop, newref, rchild);
+ newref->state = WT_REF_MEM;
+ WT_ERR(__merge_promote_key(session, newref));
+ }
+ }
+
+ /*
+ * We have copied everything into place and allocated all of the memory
+ * we need. Now link all pages into the new tree and unlock them.
+ *
+ * The only way this could fail is if a reference state has been
+ * changed by another thread since they were locked. Panic in that
+ * case: that should never happen.
+ */
+ visit_state.page = visit_state.first;
+ visit_state.ref = visit_state.page->u.intl.t;
+ visit_state.refcnt = 0;
+ ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state);
+
+ if (ret != 0)
+ WT_ERR(__wt_illegal_value(session, "__wt_merge_tree"));
+
+ newtop->u.intl.recno = top->u.intl.recno;
+ newtop->parent = top->parent;
+ newtop->ref = top->ref;
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Before swapping in the new tree, walk the pages we are discarding,
+ * check that everything looks right.
+ */
+ __merge_check_discard(session, top);
+#endif
+
+ /*
+ * Set up the new top-level page as a split so that it will be swapped
+ * into place by our caller.
+ */
+ top->modify->flags = WT_PM_REC_SPLIT;
+ top->modify->u.split = newtop;
+
+ WT_VERBOSE_ERR(session, evict,
+ "Successfully %s %" PRIu32
+ " split-merge pages containing %" PRIu32 " keys\n",
+ promote ? "promoted" : "merged", visit_state.maxdepth, refcnt);
+
+ /* Queue new child pages for forced eviction, if possible. */
+ if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE))
+ __wt_evict_forced_page(session, lchild);
+ if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE))
+ __wt_evict_forced_page(session, rchild);
+
+ /* Update statistics. */
+ WT_CSTAT_INCR(session, cache_eviction_merge);
+ WT_DSTAT_INCR(session, cache_eviction_merge);
+
+ /* How many levels did we remove? */
+ levels = visit_state.maxdepth - (promote ? 2 : 1);
+ WT_CSTAT_INCRV(session, cache_eviction_merge_levels, levels);
+ WT_DSTAT_INCRV(session, cache_eviction_merge_levels, levels);
+
+ return (0);
+
+err: WT_VERBOSE_TRET(session, evict,
+ "Failed to merge %" PRIu32
+ " split-merge pages containing %" PRIu32 " keys\n",
+ visit_state.maxdepth, refcnt);
+
+ WT_CSTAT_INCR(session, cache_eviction_merge_fail);
+ WT_DSTAT_INCR(session, cache_eviction_merge_fail);
+
+ if (newtop != NULL)
+ __wt_page_out(session, &newtop);
+ if (lchild != NULL)
+ __wt_page_out(session, &lchild);
+ if (rchild != NULL)
+ __wt_page_out(session, &rchild);
+ return (ret);
+}
diff --git a/src/btree/rec_track.c b/src/btree/rec_track.c
index beeeeb43461..2f0f284fbd1 100644
--- a/src/btree/rec_track.c
+++ b/src/btree/rec_track.c
@@ -237,17 +237,8 @@ __wt_rec_track_onpage_srch(
/*
* __wt_rec_track_onpage_addr --
- * Search for a permanently tracked object (based on an addr/size pair),
- * and add it if it isn't already tracked.
- *
- * __wt_rec_track_onpage_ref --
- * Search for a permanently tracked object (based on a page and ref),
- * and add it if it isn't already tracked.
- *
- * These functions are short-hand for "search the on-page records, and if the
- * address is not already listed as an object, add it". Note there is no
- * possibility of object re-use, the object is discarded when reconciliation
- * completes.
+ * Search the on-page records for a permanently tracked object (based on
+ * an addr/size pair), and add it if it isn't already tracked.
*/
int
__wt_rec_track_onpage_addr(WT_SESSION_IMPL *session,
@@ -256,21 +247,14 @@ __wt_rec_track_onpage_addr(WT_SESSION_IMPL *session,
if (__wt_rec_track_onpage_srch(page, addr, addr_size))
return (0);
+ /*
+ * Note there is no possibility of object re-use, the object is
+ * discarded when reconciliation completes.
+ */
return (__wt_rec_track(
session, page, addr, addr_size, NULL, 0, WT_TRK_ONPAGE));
}
-int
-__wt_rec_track_onpage_ref(
- WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE *refpage, WT_REF *ref)
-{
- uint32_t size;
- const uint8_t *addr;
-
- __wt_get_addr(refpage, ref, &addr, &size);
- return (__wt_rec_track_onpage_addr(session, page, addr, size));
-}
-
/*
* __wt_rec_track_ovfl_reuse --
* Search for a matching overflow record and reactivate it.
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 9116679e98e..8809ee9e8c6 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -133,6 +133,7 @@ typedef struct {
int already_compressed;
} *bnd; /* Saved boundaries */
uint32_t bnd_next; /* Next boundary slot */
+ uint32_t bnd_next_max; /* Maximum boundary slots used */
uint32_t bnd_entries; /* Total boundary slots */
size_t bnd_allocated; /* Bytes allocated */
@@ -177,7 +178,6 @@ typedef struct {
* these fields work.
*/
int cell_zero; /* Row-store internal page 0th key */
- WT_REF *merge_ref; /* Row-store merge correction key */
/*
* WT_DICTIONARY --
@@ -278,7 +278,9 @@ __wt_rec_write(WT_SESSION_IMPL *session,
WT_DECL_RET;
 /* We shouldn't get called with a clean page, that's an error. */
- WT_ASSERT_RET(session, __wt_page_is_modified(page));
+ if (!__wt_page_is_modified(page))
+ WT_RET_MSG(session, WT_ERROR,
+ "Attempt to reconcile a clean page.");
/*
* We can't do anything with a split-merge page, it must be merged into
@@ -289,9 +291,12 @@ __wt_rec_write(WT_SESSION_IMPL *session,
WT_VERBOSE_RET(
session, reconcile, "%s", __wt_page_type_string(page->type));
+ WT_CSTAT_INCR(session, rec_pages);
WT_DSTAT_INCR(session, rec_pages);
- if (LF_ISSET(WT_EVICTION_SERVER_LOCKED))
+ if (LF_ISSET(WT_EVICTION_SERVER_LOCKED)) {
+ WT_CSTAT_INCR(session, rec_pages_eviction);
WT_DSTAT_INCR(session, rec_pages_eviction);
+ }
/* Initialize the reconciliation structure for each new run. */
WT_RET(__rec_write_init(session, page, flags, &session->reconcile));
@@ -574,6 +579,7 @@ __rec_txn_skip_chk(WT_SESSION_IMPL *session, WT_RECONCILE *r)
WT_PANIC_RETX(
session, "reconciliation illegally skipped an update");
case WT_SKIP_UPDATE_QUIT:
+ WT_CSTAT_INCR(session, rec_skipped_update);
WT_DSTAT_INCR(session, rec_skipped_update);
return (EBUSY);
case 0:
@@ -808,10 +814,11 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
* If no such transactions exist, we can discard the leaf page to the
* block manager and no cell needs to be written at all. We do this
* outside of the underlying tracking routines because this action is
- * permanent and irrevocable. (Setting the WT_REF.addr value to NULL
- * means we've lost track of the disk address in a permanent way. If
- * we ever read into this chunk of the name space again, the cache read
- * function instantiates a new page.)
+ * permanent and irrevocable. (Clearing the address means we've lost
+ * track of the disk address in a permanent way. This is safe because
+ * there's no path to reading the leaf page again: if reconciliation
+ * fails, and we ever read into this part of the name space again, the
+ * cache read function instantiates a new page.)
*
* One final note: if the WT_REF transaction ID is set to WT_TXN_NONE,
* it means this WT_REF is the re-creation of a deleted node (we wrote
@@ -992,16 +999,22 @@ __rec_key_state_update(WT_RECONCILE *r, int ovfl_key)
static int
__rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
+ uint32_t incr;
+
/*
* Make sure there's enough room in which to save another boundary.
*
* The calculation is actually +1, because we save the start point one
- * past the current entry -- make it +20 so we don't grow slot-by-slot.
+ * past the current entry; normal reconciliation generally doesn't use
+ * a lot of buffers, but we grow aggressively anyway, bulk load eats up
+ * a lot of these entries because we have an entry for each page that's
+ * created by the bulk load.
*/
if (r->bnd_next + 1 >= r->bnd_entries) {
- WT_RET(__wt_realloc(session, &r->bnd_allocated,
- (r->bnd_entries + 20) * sizeof(*r->bnd), &r->bnd));
- r->bnd_entries += 20;
+ incr = r->bnd_entries + r->bnd_entries / 2 + 20;
+ WT_RET(__wt_realloc(session,
+ &r->bnd_allocated, incr * sizeof(*r->bnd), &r->bnd));
+ r->bnd_entries = incr;
}
return (0);
}
@@ -1055,9 +1068,8 @@ __rec_split_init(WT_SESSION_IMPL *session,
* split pages, because otherwise we could end up splitting one large
* packed page over and over. We don't want to pick the minimum size
* either, because that penalizes an application that did a bulk load
- * and subsequently inserted a few items into packed pages. Currently,
- * I'm using 75%, but I have no empirical evidence that's a good value.
- * We should leave this as a tuning variable, but probably undocumented.
+ * and subsequently inserted a few items into packed pages. Currently
+ * defaulted to 75%, but I have no empirical evidence that's "correct".
*
* The maximum page size may be a multiple of the split page size (for
* example, there's a maximum page size of 128KB, but because the table
@@ -1083,10 +1095,9 @@ __rec_split_init(WT_SESSION_IMPL *session,
if (r->raw_compression)
r->split_size = 0;
else if (page->type == WT_PAGE_COL_FIX)
- r->split_size = r->page_size;
+ r->split_size = r->page_size_max;
else
- r->split_size = WT_SPLIT_PAGE_SIZE(
- r->page_size, btree->allocsize, btree->split_pct);
+ r->split_size = __wt_split_page_size(btree, r->page_size_max);
/*
* If the maximum page size is the same as the split page size, either
@@ -1181,8 +1192,9 @@ __rec_split_row_promote_cell(
*/
cell = WT_PAGE_HEADER_BYTE(btree, dsk);
__wt_cell_unpack(cell, unpack);
- WT_ASSERT_RET(session,
- unpack->raw != WT_CELL_VALUE_COPY && unpack->prefix == 0);
+ WT_ASSERT(session,
+ unpack->prefix == 0 && unpack->raw != WT_CELL_VALUE_COPY);
+
WT_RET(__wt_cell_unpack_copy(session, unpack, copy));
return (0);
}
@@ -1203,18 +1215,11 @@ __rec_split_row_promote(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint8_t type)
* length byte string, get a copy.
*
* This function is called from the split code at each split boundary,
- * but that means we're not called before the first boundary. When we
- * do the split work at the second boundary, we need to copy the key
- * for the first boundary from the page we're building. Alternatively,
- * we could store a copy of the first key we put on a page somewhere,
- * perhaps while building the keys for a page, but that's likely to be
- * even uglier.
- */
- if (r->bnd_next == 1)
- WT_RET(__rec_split_row_promote_cell(
- session, r->dsk.mem, &r->bnd[0].key));
-
- /*
+ * but that means we're not called before the first boundary. It's OK
+ * we never do the work for the first boundary because that key cannot
+ * come from the page, it has to come from the parent. See the comment
+ * in the code that creates the row-store split-merge page for details.
+ *
* For the current slot, take the last key we built, after doing suffix
* compression. The "last key we built" describes some process: before
* calling the split code, we must place the last key on the page before
@@ -1446,8 +1451,14 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final)
* Set the promotion key for the chunk. Repeated each time we
* try and split, which might be wasted work, but detecting
* repeated key-building is probably more complicated than it's
- * worth.
+ * worth. Don't bother doing the work for the first boundary,
+ * that key cannot come from the page, it has to come from the
+ * parent. See the comment in the code that creates the row-
+ * store split-merge page for details.
*/
+ if (r->bnd_next == 0)
+ break;
+
WT_RET(__rec_split_row_promote_cell(session, dsk, &bnd->key));
break;
WT_ILLEGAL_VALUE(session);
@@ -1579,8 +1590,8 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final)
*/
memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP);
WT_ERR(compressor->compress_raw(compressor, wt_session,
- r->page_size_max, WT_BLOCK_COMPRESS_SKIP,
- (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ r->page_size_max, btree->split_pct,
+ WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
r->raw_offsets, slots,
(uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
result_len, final, &result_len, &result_slots));
@@ -1787,10 +1798,8 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* entries to 0, is because there's another entry to write, which then
* sets entries to 1). If the page was empty, we eventually delete it.
*/
- if (r->entries == 0) {
- WT_ASSERT_RET(session, r->bnd_next == 0);
+ if (r->entries == 0)
return (0);
- }
return (r->raw_compression ?
__rec_split_finish_raw(session, r) :
@@ -1827,7 +1836,7 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* WT_PAGE_HEADER header onto the scratch buffer, most of the header
* information remains unchanged between the pages.
*/
- WT_RET(__wt_scr_alloc(session, r->split_size, &tmp));
+ WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp));
dsk = tmp->mem;
memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE);
@@ -1859,8 +1868,9 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* Fix up our caller's information.
*/
len = WT_PTRDIFF32(r->first_free, bnd->start);
- WT_ASSERT_ERR(
- session, len < r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree));
+ if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree))
+ WT_PANIC_ERR(session, ret = WT_PANIC,
+ "Reconciliation remnant too large for the split buffer");
dsk = r->dsk.mem;
dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
@@ -2969,37 +2979,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* Modified child.
* The page may be emptied or internally created during a split.
* Deleted/split pages are merged into the parent and discarded.
- *
- * There's one special case we have to handle here: the internal
- * page being merged has a potentially incorrect first key and
- * we need to replace it with the one we have. The problem is
- * caused by the fact that the page search algorithm coerces the
- * 0th key on any internal page to be smaller than any search
- * key. We do that because we don't want to have to update the
- * internal pages every time a new "smallest" key is inserted
- * into the tree. But, if a new "smallest" key is inserted into
- * our split-created subtree, and we don't update the internal
- * page, when we merge that internal page into its parent page,
- * the key may be incorrect (or more likely, have been coerced
- * to a single byte because it's an internal page's 0th key).
- * Imagine the following tree:
- *
- * 2 5 40 internal page
- * |
- * 10 | 20 split-created internal page
- * |
- * 6 inserted smallest key
- *
- * after a simple merge, we'd have corruption:
- *
- * 2 10 20 40 merged internal page
- * |
- * 6 key sorts before parent's key
- *
- * To fix this problem, we take the higher-level page's key as
- * our first key, because that key sorts before any possible
- * key inserted into the subtree, and discard whatever 0th key
- * is on the split-created internal page.
*/
if (state == WT_CHILD_MODIFIED)
switch (F_ISSET(rp->modify, WT_PM_REC_MASK)) {
@@ -3040,7 +3019,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
session, page,
kpack->data, kpack->size));
- r->merge_ref = ref;
WT_RET(__rec_row_merge(session, r,
F_ISSET(rp->modify, WT_PM_REC_SPLIT_MERGE) ?
rp : rp->modify->u.split));
@@ -3082,7 +3060,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/*
* Build key cell.
- *
* Truncate any 0th key, internal pages don't need 0th keys.
*/
if (onpage_ovfl) {
@@ -3202,14 +3179,6 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
break;
case WT_PM_REC_SPLIT:
case WT_PM_REC_SPLIT_MERGE:
- /*
- * If we have a merge key set, we're working our
- * way down a merge tree. If we have not set a
- * merge key, we're starting descent of a new
- * merge tree, set the merge key.
- */
- if (r->merge_ref == NULL)
- r->merge_ref = ref;
WT_RET(__rec_row_merge(session, r,
F_ISSET(rp->modify, WT_PM_REC_SPLIT_MERGE) ?
rp : rp->modify->u.split));
@@ -3240,14 +3209,10 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
__rec_cell_build_addr(r, p, size, vtype, 0);
/*
- * Build the key cell. If this is the first key in a "to be
- * merged" subtree, use the merge correction key saved in the
- * top-level parent page when this function was called.
- *
+ * Build the key cell.
* Truncate any 0th key, internal pages don't need 0th keys.
*/
- ikey = r->merge_ref == NULL ? ref->u.key : r->merge_ref->u.key;
- r->merge_ref = NULL;
+ ikey = ref->u.key;
WT_RET(__rec_cell_build_key(session, r, WT_IKEY_DATA(ikey),
r->cell_zero ? 1 : ikey->size, 1, &ovfl_key));
r->cell_zero = 0;
@@ -3764,8 +3729,10 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_BTREE *btree;
WT_BOUNDARY *bnd;
WT_PAGE_MODIFY *mod;
- uint32_t page_size;
+ WT_REF *ref;
+ uint32_t size;
int was_modified;
+ const uint8_t *addr;
btree = S2BT(session);
bm = btree->bm;
@@ -3779,17 +3746,34 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
case 0: /*
- * The page has never been reconciled before, track the original
- * address blocks (if any). The "if any" is for empty trees we
- * create when a new tree is opened, and for previously deleted
- * pages that are instantiated in memory.
+ * The page has never been reconciled before, free the original
+ * address blocks (if any). The "if any" is for empty trees
+ * created when a new tree is opened, previously deleted pages
+ * instantiated in memory, or pages reconciled into split-merge
+ * pages and then replaced by other pages because the tree grew
+ * too deep.
*
* The exception is root pages are never tracked or free'd, they
* are checkpoints, and must be explicitly dropped.
*/
- if (!WT_PAGE_IS_ROOT(page) && page->ref->addr != NULL)
- WT_RET(__wt_rec_track_onpage_ref(
- session, page, page->parent, page->ref));
+ if (WT_PAGE_IS_ROOT(page))
+ break;
+
+ ref = page->ref;
+ if (ref->addr != NULL) {
+ /*
+ * Free the page and clear the address (so we don't free
+ * it twice). Logically, this is the same as adding the
+ * address to the reconciliation tracking information
+ * and freeing it when reconciliation ends as part of
+ * cleaning up the track information, but that is going
+ * to happen right at the end of this switch statement,
+ * might as well save the work.
+ */
+ __wt_get_addr(page->parent, ref, &addr, &size);
+ WT_RET(bm->free(bm, session, addr, size));
+ ref->addr = NULL;
+ }
break;
case WT_PM_REC_EMPTY: /* Page deleted */
break;
@@ -3807,7 +3791,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Discard the replacement page's address. */
__wt_free(session, mod->u.replace.addr);
- mod->u.replace.addr = NULL;
mod->u.replace.size = 0;
break;
case WT_PM_REC_SPLIT: /* Page split */
@@ -3928,6 +3911,11 @@ err: __wt_scr_free(&tkey);
WT_RET(ret);
}
+ if (r->bnd_next > r->bnd_next_max) {
+ r->bnd_next_max = r->bnd_next;
+ WT_DSTAT_SET(session, rec_split_max, r->bnd_next_max);
+ }
+
switch (page->type) {
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
@@ -3967,12 +3955,15 @@ err: __wt_scr_free(&tkey);
*/
if (!r->upd_skipped) {
was_modified = __wt_page_is_modified(page);
- WT_ORDERED_READ(page_size, page->memory_footprint);
+ WT_ORDERED_READ(size, page->memory_footprint);
mod->disk_gen = r->orig_write_gen;
if (was_modified && !__wt_page_is_modified(page))
- __wt_cache_dirty_decr(session, page_size);
+ __wt_cache_dirty_decr(session, size);
}
+ /* Record the most recent transaction ID we could have written. */
+ mod->disk_txn = session->txn.snap_min;
+
return (0);
}
@@ -4000,36 +3991,36 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
if (bnd->addr.addr != NULL) {
WT_TRET(bm->free(
bm, session, bnd->addr.addr, bnd->addr.size));
- bnd->addr.addr = NULL;
+ __wt_free(session, bnd->addr.addr);
}
return (ret);
}
/*
- * __rec_split_row --
- * Split a row-store page, creating a new internal page.
+ * __rec_split_merge_new --
+ * Create a split-merge page.
*/
static int
-__rec_split_row(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *orig, WT_PAGE **splitp)
+__rec_split_merge_new(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *orig, WT_PAGE **pagep, uint8_t type)
{
- WT_ADDR *addr;
- WT_BOUNDARY *bnd;
- WT_DECL_RET;
WT_PAGE *page;
- WT_REF *ref;
- uint32_t i;
- /* Allocate a row-store internal page. */
- WT_RET(__wt_calloc_def(session, 1, &page));
- WT_ERR(__wt_calloc_def(session, (size_t)r->bnd_next, &page->u.intl.t));
-
- /* Fill it in. */
+ /*
+ * Allocate a new internal page and fill it in.
+ *
+ * Our caller cleans up, make sure we return a valid page reference,
+ * even on error.
+ */
+ WT_RET(__wt_page_alloc(session, type, r->bnd_next, pagep));
+ page = *pagep;
page->parent = orig->parent;
page->ref = orig->ref;
- page->read_gen = __wt_cache_read_gen(session);
+ if (type == WT_PAGE_COL_INT)
+ page->u.intl.recno = r->bnd[0].recno;
+ page->read_gen = WT_READ_GEN_NOTSET;
page->entries = r->bnd_next;
- page->type = WT_PAGE_ROW_INT;
+ page->flags_atomic = WT_PAGE_DISK_NOT_ALLOC;
/*
* We don't re-write parent pages when child pages split, which means
@@ -4050,27 +4041,100 @@ __rec_split_row(
* its memory discarded, but the newly created split page cannot be
* evicted, it can only be merged into its parent.
*/
- WT_ERR(__wt_page_modify_init(session, page));
+ WT_RET(__wt_page_modify_init(session, page));
F_SET(page->modify, WT_PM_REC_SPLIT_MERGE);
- /* Enter each split page into the new, internal page. */
+ return (0);
+}
+
+/*
+ * __rec_split_row --
+ * Split a row-store page, creating a new internal page.
+ */
+static int
+__rec_split_row(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *orig, WT_PAGE **splitp)
+{
+ WT_ADDR *addr;
+ WT_BOUNDARY *bnd;
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_PAGE *page;
+ WT_REF *ref;
+ size_t size;
+ uint32_t i;
+
+ /* Allocate a split-merge page. */
+ WT_ERR(__rec_split_merge_new(session, r, orig, &page, WT_PAGE_ROW_INT));
+
+ /*
+ * The "parent" key for each split chunk is the first key on the chunk,
+ * except for the 0th chunk, which cannot come from the page itself as
+ * it might not be small enough. If the existing key for the page is
+ * smaller than the first key on the chunk, a merge can corrupt the tree.
+ * Imagine the following tree, where an internal page has keys 2, 5 and
+ * 40. The page with key 5 splits into two chunks, and 10 is the first
+ * key in the first chunk.
+ *
+ * 2 5 40 internal page
+ * |
+ * 10 | 20 split-created internal page
+ *
+ * If we subsequently insert a key 6, it works because the page search
+ * algorithm coerces the 0th key of an internal page to be smaller than
+ * any search key. (We do that because we don't want to have to update
+ * internal pages every time a new "smallest" key is inserted into the
+ * tree.) Anyway, that results in the following tree:
+ *
+ * 2 5 40 internal page
+ * |
+ * 10 | 20 split-created internal page
+ * |
+ * 6 inserted smallest key
+ *
+ * after a simple merge where we replace page 5 with pages 10 and 20,
+ * we'd have corruption:
+ *
+ * 2 10 20 40 merged internal page
+ * |
+ * 6 key sorts before parent's key
+ *
+ * To fix this problem, we take the original parent page's key as the
+ * first chunk's key because that key sorts before any possible key
+ * inserted into the subtree.
+ */
+ if (WT_PAGE_IS_ROOT(orig))
+ WT_ERR(__wt_buf_set(session, &r->bnd[0].key, "", 1));
+ else {
+ ikey = orig->ref->u.key;
+ WT_ERR(__wt_buf_set(
+ session, &r->bnd[0].key, WT_IKEY_DATA(ikey), ikey->size));
+ }
+
+ /* Enter each split child page into the new internal page. */
+ size = 0;
for (ref = page->u.intl.t,
bnd = r->bnd, i = 0; i < r->bnd_next; ++ref, ++bnd, ++i) {
WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
*addr = bnd->addr;
bnd->addr.addr = NULL;
+ size += bnd->addr.size;
ref->page = NULL;
- WT_ERR(__wt_row_ikey_alloc(session, 0,
+ WT_ERR(__wt_row_ikey(session, 0,
bnd->key.data, bnd->key.size, &ref->u.key));
+ size += sizeof(WT_IKEY) + bnd->key.size;
ref->addr = addr;
ref->state = WT_REF_DISK;
}
+ __wt_cache_page_inmem_incr(
+ session, page, r->bnd_next * sizeof(WT_ADDR) + size);
*splitp = page;
return (0);
-err: __wt_page_out(session, &page);
+err: if (page != NULL)
+ __wt_page_out(session, &page);
return (ret);
}
@@ -4089,25 +4153,10 @@ __rec_split_col(
WT_REF *ref;
uint32_t i;
- /* Allocate a column-store internal page. */
- WT_RET(__wt_calloc_def(session, 1, &page));
- WT_ERR(__wt_calloc_def(session, (size_t)r->bnd_next, &page->u.intl.t));
-
- /* Fill it in. */
- page->parent = orig->parent;
- page->ref = orig->ref;
- page->read_gen = __wt_cache_read_gen(session);
- page->u.intl.recno = r->bnd[0].recno;
- page->entries = r->bnd_next;
- page->type = WT_PAGE_COL_INT;
-
- /*
- * See the comment above in __rec_split_row().
- */
- WT_ERR(__wt_page_modify_init(session, page));
- F_SET(page->modify, WT_PM_REC_SPLIT_MERGE);
+ /* Allocate a split-merge page. */
+ WT_ERR(__rec_split_merge_new(session, r, orig, &page, WT_PAGE_COL_INT));
- /* Enter each split page into the new, internal page. */
+ /* Enter each split child page into the new internal page. */
for (ref = page->u.intl.t,
bnd = r->bnd, i = 0; i < r->bnd_next; ++ref, ++bnd, ++i) {
WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
@@ -4119,11 +4168,14 @@ __rec_split_col(
ref->addr = addr;
ref->state = WT_REF_DISK;
}
+ __wt_cache_page_inmem_incr(
+ session, page, r->bnd_next * sizeof(WT_ADDR));
*splitp = page;
return (0);
-err: __wt_page_out(session, &page);
+err: if (page != NULL)
+ __wt_page_out(session, &page);
return (ret);
}
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index 043cbace5ee..1142bffbdcc 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -309,7 +309,7 @@ next: switch (direction) {
/* If still needed, instantiate the key. */
key = WT_ROW_KEY_COPY(rip_arg);
if (!__wt_off_page(page, key)) {
- WT_ERR(__wt_row_ikey_alloc(session,
+ WT_ERR(__wt_row_ikey(session,
WT_PAGE_DISK_OFFSET(page, key),
retb->data, retb->size, &ikey));
@@ -380,11 +380,27 @@ __wt_row_value(WT_PAGE *page, WT_ROW *rip)
}
/*
- * __wt_row_ikey_alloc --
+ * __wt_row_ikey_incr --
+ * Instantiate a key in a WT_IKEY structure and increment the page's
+ * memory footprint.
+ */
+int
+__wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page,
+ uint32_t cell_offset, const void *key, uint32_t size, void *ikeyp)
+{
+ WT_RET(__wt_row_ikey(session, cell_offset, key, size, ikeyp));
+
+ __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size);
+
+ return (0);
+}
+
+/*
+ * __wt_row_ikey --
* Instantiate a key in a WT_IKEY structure.
*/
int
-__wt_row_ikey_alloc(WT_SESSION_IMPL *session,
+__wt_row_ikey(WT_SESSION_IMPL *session,
uint32_t cell_offset, const void *key, uint32_t size, void *ikeyp)
{
WT_IKEY *ikey;
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index aa3f6fc1c58..d031315e7bb 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -223,7 +223,8 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session, void *args)
/*
* Check the page's write-generation: if that fails, check whether we
* are still in the expected position, and no item has been added where
- * our insert belongs.
+ * our insert belongs. Take extra care at the beginning and end of the
+ * list (at each level): retry if we race there.
*/
WT_RET(__wt_page_write_gen_wrapped_check(page));
@@ -233,8 +234,8 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session, void *args)
*ins_stack[i] != next_stack[i])
return (WT_RESTART);
if (next_stack[i] == NULL &&
- inshead->tail[i] != NULL &&
- ins_stack[i] != &inshead->tail[i]->next[i])
+ (inshead->tail[i] == NULL ||
+ ins_stack[i] != &inshead->tail[i]->next[i]))
return (WT_RESTART);
}
}
@@ -352,7 +353,6 @@ __wt_update_alloc(WT_SESSION_IMPL *session,
WT_UPDATE *
__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
- WT_TXN *txn;
WT_UPDATE *next;
/*
@@ -360,13 +360,7 @@ __wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
* the rest of the chain; because this routine is called from inside
* a serialization function, the caller has responsibility for actually
* freeing the memory.
- */
- txn = &session->txn;
- if (txn->isolation != TXN_ISO_SNAPSHOT &&
- txn->isolation != TXN_ISO_READ_COMMITTED)
- return (NULL);
-
- /*
+ *
* Walk the list of updates, looking for obsolete updates. If we find
* an update no session will ever move past, we can discard any updates
* that appear after it.
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 23860831451..d8de7f118a9 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -129,9 +129,27 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
item = &_item;
for (depth = 2,
page = btree->root_page; page->type == WT_PAGE_ROW_INT; ++depth) {
+ /*
+ * Fast-path internal pages with one child, a common case for
+ * the root page in new trees.
+ */
+ base = page->entries;
+ ref = &page->u.intl.t[base - 1];
+ if (base == 1)
+ goto descend;
+
+ /* Fast-path appends. */
+ ikey = ref->u.key;
+ item->data = WT_IKEY_DATA(ikey);
+ item->size = ikey->size;
+
+ WT_ERR(WT_BTREE_CMP(session, btree, srch_key, item, cmp));
+ if (cmp >= 0)
+ goto descend;
+
/* Binary search of internal pages. */
for (base = 0, ref = NULL,
- limit = page->entries; limit != 0; limit >>= 1) {
+ limit = page->entries - 1; limit != 0; limit >>= 1) {
indx = base + (limit >> 1);
ref = page->u.intl.t + indx;
@@ -157,7 +175,8 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
base = indx + 1;
--limit;
}
- WT_ASSERT(session, ref != NULL);
+
+descend: WT_ASSERT(session, ref != NULL);
/*
* Reference the slot used for next step down the tree.
diff --git a/src/config/config.c b/src/config/config.c
index 141a969895b..f0c413b0624 100644
--- a/src/config/config.c
+++ b/src/config/config.c
@@ -343,7 +343,7 @@ __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
WT_CONFIG_ITEM *out = key;
int utf8_remain = 0;
static const WT_CONFIG_ITEM true_value = {
- "", 0, 1, ITEM_NUM
+ "", 0, 1, ITEM_BOOL
};
key->len = 0;
@@ -499,10 +499,10 @@ __config_process_value(WT_CONFIG *conf, WT_CONFIG_ITEM *value)
if (value->type == ITEM_ID) {
if (strncasecmp(value->str, "true", value->len) == 0) {
- value->type = ITEM_NUM;
+ value->type = ITEM_BOOL;
value->val = 1;
} else if (strncasecmp(value->str, "false", value->len) == 0) {
- value->type = ITEM_NUM;
+ value->type = ITEM_BOOL;
value->val = 0;
}
} else if (value->type == ITEM_NUM) {
@@ -547,10 +547,10 @@ __config_process_value(WT_CONFIG *conf, WT_CONFIG_ITEM *value)
}
/*
- * If we parsed the the whole string but the number is out of
- * range, report an error. Don't report an error for strings
- * that aren't well-formed integers: if an integer is expected,
- * that will be caught by __wt_config_check.
+ * If we parsed the whole string but the number is out of range,
+ * report an error. Don't report an error for strings that
+ * aren't well-formed integers: if an integer is expected, that
+ * will be caught by __wt_config_check.
*/
if (value->type == ITEM_NUM && errno == ERANGE)
goto range;
@@ -574,12 +574,12 @@ __wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
}
/*
- * __wt_config_getraw --
+ * __config_getraw --
* Given a config parser, find the final value for a given key.
*/
-int
-__wt_config_getraw(
- WT_CONFIG *cparser, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+static int
+__config_getraw(
+ WT_CONFIG *cparser, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, int top)
{
WT_CONFIG sparser;
WT_CONFIG_ITEM k, v, subk;
@@ -601,14 +601,16 @@ __wt_config_getraw(
WT_RET(__wt_config_initn(
cparser->session, &sparser, v.str, v.len));
if ((ret =
- __wt_config_getraw(&sparser, &subk, value)) == 0)
+ __config_getraw(&sparser, &subk, value, 0)) == 0)
found = 1;
WT_RET_NOTFOUND_OK(ret);
}
}
+ WT_RET_NOTFOUND_OK(ret);
- return ((found && ret == WT_NOTFOUND) ?
- __config_process_value(cparser, value) : ret);
+ if (!found)
+ return (WT_NOTFOUND);
+ return (top ? __config_process_value(cparser, value) : 0);
}
/*
@@ -626,7 +628,7 @@ __wt_config_get(WT_SESSION_IMPL *session,
for (found = 0; *cfg != NULL; cfg++) {
WT_RET(__wt_config_init(session, &cparser, *cfg));
- if ((ret = __wt_config_getraw(&cparser, key, value)) == 0)
+ if ((ret = __config_getraw(&cparser, key, value, 1)) == 0)
found = 1;
else if (ret != WT_NOTFOUND)
return (ret);
@@ -660,7 +662,7 @@ __wt_config_getone(WT_SESSION_IMPL *session,
WT_CONFIG cparser;
WT_RET(__wt_config_init(session, &cparser, config));
- return (__wt_config_getraw(&cparser, key, value));
+ return (__config_getraw(&cparser, key, value, 1));
}
/*
@@ -675,7 +677,7 @@ __wt_config_getones(WT_SESSION_IMPL *session,
WT_CONFIG_ITEM key_item = { key, strlen(key), 0, ITEM_STRING };
WT_RET(__wt_config_init(session, &cparser, config));
- return (__wt_config_getraw(&cparser, &key_item, value));
+ return (__config_getraw(&cparser, &key_item, value, 1));
}
/*
@@ -723,7 +725,7 @@ __wt_config_subgetraw(WT_SESSION_IMPL *session,
WT_CONFIG cparser;
WT_RET(__wt_config_initn(session, &cparser, cfg->str, cfg->len));
- return (__wt_config_getraw(&cparser, key, value));
+ return (__config_getraw(&cparser, key, value, 1));
}
/*
diff --git a/src/config/config_check.c b/src/config/config_check.c
index 9f4628611dd..718547149c4 100644
--- a/src/config/config_check.c
+++ b/src/config/config_check.c
@@ -57,8 +57,9 @@ __wt_config_check(WT_SESSION_IMPL *session,
if (strcmp(checks[i].type, "int") == 0)
badtype = (v.type != ITEM_NUM);
else if (strcmp(checks[i].type, "boolean") == 0)
- badtype = (v.type != ITEM_NUM ||
- (v.val != 0 && v.val != 1));
+ badtype = (v.type != ITEM_BOOL &&
+ (v.type != ITEM_NUM ||
+ (v.val != 0 && v.val != 1)));
else if (strcmp(checks[i].type, "list") == 0)
badtype = (v.len > 0 && v.type != ITEM_STRUCT);
else if (strcmp(checks[i].type, "category") == 0) {
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 6ff187ab824..d99ac800bbb 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -86,13 +86,13 @@ const char *
__wt_confdfl_connection_reconfigure =
"cache_size=100MB,error_prefix=,eviction_dirty_target=80,"
"eviction_target=80,eviction_trigger=95,shared_cache=(chunk=10MB,"
- "name=,reserve=0,size=500MB),verbose=";
+ "name=pool,reserve=0,size=500MB),statistics=0,verbose=";
WT_CONFIG_CHECK
__wt_confchk_shared_cache_subconfigs[] = {
{ "chunk", "int", "min=1MB,max=10TB", NULL },
{ "name", "string", NULL, NULL },
- { "reserve", "string", NULL, NULL },
+ { "reserve", "int", NULL, NULL },
{ "size", "int", "min=1MB,max=10TB", NULL },
{ NULL, NULL, NULL, NULL }
};
@@ -106,6 +106,7 @@ __wt_confchk_connection_reconfigure[] = {
{ "eviction_trigger", "int", "min=10,max=99", NULL},
{ "shared_cache", "category", NULL,
__wt_confchk_shared_cache_subconfigs},
+ { "statistics", "boolean", NULL, NULL},
{ "verbose", "list",
"choices=[\"block\",\"shared_cache\",\"ckpt\",\"evict\","
"\"evictserver\",\"fileops\",\"hazard\",\"lsm\",\"mutex\",\"read\","
@@ -129,11 +130,9 @@ __wt_confdfl_file_meta =
"checksum=on,collator=,columns=,dictionary=0,format=btree,"
"huffman_key=,huffman_value=,internal_item_max=0,"
"internal_key_truncate=,internal_page_max=2KB,key_format=u,key_gap=10"
- ",leaf_item_max=0,leaf_page_max=1MB,lsm_bloom=,lsm_bloom_bit_count=8,"
- "lsm_bloom_config=,lsm_bloom_hash_count=4,lsm_bloom_newest=0,"
- "lsm_bloom_oldest=0,lsm_chunk_size=2MB,lsm_merge_max=15,"
- "lsm_merge_threads=1,memory_page_max=5MB,prefix_compression=,"
- "split_pct=75,value_format=u,version=(major=0,minor=0)";
+ ",leaf_item_max=0,leaf_page_max=1MB,memory_page_max=5MB,"
+ "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=,split_pct=75"
+ ",value_format=u,version=(major=0,minor=0)";
WT_CONFIG_CHECK
__wt_confchk_file_meta[] = {
@@ -157,16 +156,9 @@ __wt_confchk_file_meta[] = {
{ "key_gap", "int", "min=0", NULL},
{ "leaf_item_max", "int", "min=0", NULL},
{ "leaf_page_max", "int", "min=512B,max=512MB", NULL},
- { "lsm_bloom", "boolean", NULL, NULL},
- { "lsm_bloom_bit_count", "int", "min=2,max=1000", NULL},
- { "lsm_bloom_config", "string", NULL, NULL},
- { "lsm_bloom_hash_count", "int", "min=2,max=100", NULL},
- { "lsm_bloom_newest", "boolean", NULL, NULL},
- { "lsm_bloom_oldest", "boolean", NULL, NULL},
- { "lsm_chunk_size", "int", "min=512K,max=500MB", NULL},
- { "lsm_merge_max", "int", "min=2,max=100", NULL},
- { "lsm_merge_threads", "int", "min=1,max=10", NULL},
{ "memory_page_max", "int", "min=512B,max=10TB", NULL},
+ { "os_cache_dirty_max", "int", "min=0", NULL},
+ { "os_cache_max", "int", "min=0", NULL},
{ "prefix_compression", "boolean", NULL, NULL},
{ "split_pct", "int", "min=25,max=100", NULL},
{ "value_format", "format", NULL, NULL},
@@ -255,8 +247,9 @@ __wt_confdfl_session_create =
",leaf_item_max=0,leaf_page_max=1MB,lsm_bloom=,lsm_bloom_bit_count=8,"
"lsm_bloom_config=,lsm_bloom_hash_count=4,lsm_bloom_newest=0,"
"lsm_bloom_oldest=0,lsm_chunk_size=2MB,lsm_merge_max=15,"
- "lsm_merge_threads=1,memory_page_max=5MB,prefix_compression=,source=,"
- "split_pct=75,type=file,value_format=u";
+ "lsm_merge_threads=1,memory_page_max=5MB,os_cache_dirty_max=0,"
+ "os_cache_max=0,prefix_compression=,source=,split_pct=75,type=file,"
+ "value_format=u";
WT_CONFIG_CHECK
__wt_confchk_session_create[] = {
@@ -291,6 +284,8 @@ __wt_confchk_session_create[] = {
{ "lsm_merge_max", "int", "min=2,max=100", NULL},
{ "lsm_merge_threads", "int", "min=1,max=10", NULL},
{ "memory_page_max", "int", "min=512B,max=10TB", NULL},
+ { "os_cache_dirty_max", "int", "min=0", NULL},
+ { "os_cache_max", "int", "min=0", NULL},
{ "prefix_compression", "boolean", NULL, NULL},
{ "source", "string", NULL, NULL},
{ "split_pct", "int", "min=25,max=100", NULL},
@@ -423,17 +418,39 @@ __wt_confchk_table_meta[] = {
const char *
__wt_confdfl_wiredtiger_open =
- "buffer_alignment=-1,cache_size=100MB,create=0,direct_io=,"
- "error_prefix=,eviction_dirty_target=80,eviction_target=80,"
- "eviction_trigger=95,extensions=,hazard_max=1000,logging=0,lsm_merge="
- ",mmap=,multiprocess=0,session_max=50,shared_cache=(chunk=10MB,name=,"
- "reserve=0,size=500MB),sync=,transactional=,use_environment_priv=0,"
- "verbose=";
+ "buffer_alignment=-1,cache_size=100MB,"
+ "checkpoint=(name=\"WiredTigerCheckpoint\",wait=0),create=0,"
+ "direct_io=,error_prefix=,eviction_dirty_target=80,eviction_target=80"
+ ",eviction_trigger=95,extensions=,hazard_max=1000,logging=0,"
+ "lsm_merge=,mmap=,multiprocess=0,session_max=50,"
+ "shared_cache=(chunk=10MB,name=pool,reserve=0,size=500MB),"
+ "statistics=0,statistics_log=(clear=,path=\"WiredTigerStat.%H\","
+ "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),sync=,transactional=,"
+ "use_environment_priv=0,verbose=";
+
+WT_CONFIG_CHECK
+__wt_confchk_checkpoint_subconfigs[] = {
+ { "name", "string", NULL, NULL },
+ { "wait", "int", "min=1,max=100000", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+WT_CONFIG_CHECK
+__wt_confchk_statistics_log_subconfigs[] = {
+ { "clear", "boolean", NULL, NULL },
+ { "path", "string", NULL, NULL },
+ { "sources", "list", NULL, NULL },
+ { "timestamp", "string", NULL, NULL },
+ { "wait", "int", "min=5,max=100000", NULL },
+ { NULL, NULL, NULL, NULL }
+};
WT_CONFIG_CHECK
__wt_confchk_wiredtiger_open[] = {
{ "buffer_alignment", "int", "min=-1,max=1MB", NULL},
{ "cache_size", "int", "min=1MB,max=10TB", NULL},
+ { "checkpoint", "category", NULL,
+ __wt_confchk_checkpoint_subconfigs},
{ "create", "boolean", NULL, NULL},
{ "direct_io", "list", "choices=[\"data\",\"log\"]", NULL},
{ "error_prefix", "string", NULL, NULL},
@@ -449,6 +466,9 @@ __wt_confchk_wiredtiger_open[] = {
{ "session_max", "int", "min=1", NULL},
{ "shared_cache", "category", NULL,
__wt_confchk_shared_cache_subconfigs},
+ { "statistics", "boolean", NULL, NULL},
+ { "statistics_log", "category", NULL,
+ __wt_confchk_statistics_log_subconfigs},
{ "sync", "boolean", NULL, NULL},
{ "transactional", "boolean", NULL, NULL},
{ "use_environment_priv", "boolean", NULL, NULL},
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index fcb759f0fdf..3946fd93483 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -319,6 +319,16 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config)
if (!F_ISSET(s, WT_SESSION_INTERNAL))
__wt_free(session, s->hazard);
+ /*
+ * Shut down server threads other than the eviction server, which is
+ * needed later to close btree handles. Some of these threads access
+ * btree handles, so take care in ordering shutdown to make sure they
+ * exit before files are closed.
+ */
+ F_CLR(conn, WT_CONN_SERVER_RUN);
+ WT_TRET(__wt_checkpoint_destroy(conn));
+ WT_TRET(__wt_statlog_destroy(conn));
+
/* Clean up open LSM handles. */
WT_ERR(__wt_lsm_cleanup(&conn->iface));
@@ -338,6 +348,7 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config)
__conn_remove_data_source(conn, ndsrc);
WT_TRET(__wt_connection_close(conn));
+
/* We no longer have a session, don't try to update it. */
session = NULL;
@@ -351,23 +362,37 @@ err: API_END_NOTFOUND_MAP(session, ret);
static int
__conn_reconfigure(WT_CONNECTION *wt_conn, const char *config)
{
+ WT_CONFIG_ITEM cval;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+
+ /*
+ * Special version of cfg that doesn't include the default config: used
+ * to limit changes to values that the application sets explicitly.
+ * Note that any function using this value has to be prepared to handle
+ * not-found as a valid option return.
+ */
const char *raw_cfg[] = { config, NULL };
conn = (WT_CONNECTION_IMPL *)wt_conn;
CONNECTION_API_CALL(conn, session, reconfigure, config, cfg);
- WT_UNUSED(cfg);
- /*
- * Don't include the default config: only override values the
- * application sets explicitly.
- */
+ /* Turning on statistics clears any existing values. */
+ if ((ret =
+ __wt_config_gets(session, raw_cfg, "statistics", &cval)) == 0) {
+ conn->statistics = cval.val == 0 ? 0 : 1;
+ if (conn->statistics)
+ __wt_stat_clear_connection_stats(&conn->stats);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
WT_ERR(__wt_conn_cache_pool_config(session, cfg));
WT_ERR(__wt_cache_config(conn, raw_cfg));
- WT_ERR(__conn_verbose_config(session, cfg));
+
+ WT_ERR(__conn_verbose_config(session, raw_cfg));
+
/* Wake up the cache pool server so any changes are noticed. */
if (F_ISSET(conn, WT_CONN_CACHE_POOL))
WT_ERR(__wt_cond_signal(
@@ -526,7 +551,6 @@ __conn_config_file(WT_SESSION_IMPL *session, const char **cfg, WT_ITEM **cbufp)
#if 0
fprintf(stderr, "file config: {%s}\n", (const char *)cbuf->data);
- exit(0);
#endif
/* Check the configuration string. */
@@ -750,7 +774,8 @@ __conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
- WT_RET_NOTFOUND_OK(__wt_config_gets(session, cfg, "verbose", &cval));
+ if ((ret = __wt_config_gets(session, cfg, "verbose", &cval)) != 0)
+ return (ret == WT_NOTFOUND ? 0 : ret);
for (ft = verbtypes; ft->name != NULL; ft++) {
if ((ret = __wt_config_subgets(
session, &cval, ft->name, &sval)) == 0 && sval.val != 0)
@@ -882,6 +907,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
"buffer_alignment requires posix_memalign");
#endif
+ /*
+ * Configuration: direct_io, mmap, statistics.
+ */
WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
for (ft = directio_types; ft->name != NULL; ft++) {
ret = __wt_config_subgets(session, &cval, ft->name, &sval);
@@ -891,10 +919,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
} else if (ret != WT_NOTFOUND)
goto err;
}
-
- /* Configure mmap. */
WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
conn->mmap = cval.val == 0 ? 0 : 1;
+ WT_ERR(__wt_config_gets(session, cfg, "statistics", &cval));
+ conn->statistics = cval.val == 0 ? 0 : 1;
/* Load any extensions referenced in the config. */
WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
@@ -963,7 +991,11 @@ err: if (cbuf != NULL)
__wt_buf_free(session, &exconfig);
if (ret != 0 && conn != NULL)
- WT_TRET(__wt_connection_destroy(conn));
+ WT_TRET(__wt_connection_close(conn));
+
+ /* Let the server threads proceed. */
+ if (ret == 0)
+ conn->connection_initialized = 1;
return (ret);
}
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index 24b03592de0..53a0d4c33e5 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -32,7 +32,7 @@ __wt_cache_config(WT_CONNECTION_IMPL *conn, const char *cfg[])
if (F_ISSET(conn, WT_CONN_CACHE_POOL) &&
(ret = __wt_config_gets(session, cfg,
- "shared_cache.reserved", &cval)) == 0)
+ "shared_cache.reserve", &cval)) == 0 && cval.val != 0)
cache->cp_reserved = (uint64_t)cval.val;
else if ((ret = __wt_config_gets(session, cfg,
"shared_cache.chunk", &cval)) == 0)
@@ -98,10 +98,10 @@ __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[])
__wt_spin_init(session, &cache->evict_walk_lock);
/*
- * We pull some values from the cache statistics (rather than have two
- * copies). Set them.
+ * We get/set some values in the cache statistics (rather than have
+ * two copies), configure them.
*/
- __wt_cache_stats_update(conn, 0);
+ __wt_cache_stats_update(session);
return (0);
err: WT_RET(__wt_cache_destroy(conn));
@@ -113,22 +113,25 @@ err: WT_RET(__wt_cache_destroy(conn));
* Update the cache statistics for return to the application.
*/
void
-__wt_cache_stats_update(WT_CONNECTION_IMPL *conn, uint32_t flags)
+__wt_cache_stats_update(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS *stats;
- WT_UNUSED(flags);
+ conn = S2C(session);
cache = conn->cache;
+ stats = &conn->stats;
- WT_STAT_SET(conn->stats, cache_bytes_max, conn->cache_size);
- WT_STAT_SET(
- conn->stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache));
- WT_STAT_SET(
- conn->stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
- WT_STAT_SET(
- conn->stats, cache_bytes_dirty, __wt_cache_bytes_dirty(cache));
- WT_STAT_SET(
- conn->stats, cache_pages_dirty, __wt_cache_pages_dirty(cache));
+ /*
+ * Some statistics are always set, regardless of the configuration of
+ * run-time statistics in the system.
+ */
+ WT_STAT_SET(stats, cache_bytes_max, conn->cache_size);
+ WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache));
+ WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
+ WT_STAT_SET(stats, cache_bytes_dirty, __wt_cache_bytes_dirty(cache));
+ WT_STAT_SET(stats, cache_pages_dirty, __wt_cache_pages_dirty(cache));
}
/*
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index 76ea6914c17..02f54e8bad0 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -25,7 +25,7 @@ static int __cache_pool_balance(void);
/*
* __wt_conn_cache_pool_config --
- * Parse and setup and cache pool options.
+ * Parse and set up the cache pool options.
*/
int
__wt_conn_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
@@ -47,10 +47,12 @@ __wt_conn_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
if (F_ISSET(conn, WT_CONN_CACHE_POOL))
reconfiguring = 1;
else {
+ /* Only set up if a shared cache was explicitly configured. */
+ if (__wt_config_gets(session, WT_SKIP_DEFAULT_CONFIG(cfg),
+ "shared_cache", &cval) == WT_NOTFOUND)
+ return (0);
WT_RET_NOTFOUND_OK(
__wt_config_gets(session, cfg, "shared_cache.name", &cval));
- if (cval.len == 0)
- return (0);
/*
* NOTE: The allocations made when configuring and opening a
* cache pool don't really belong to the connection that
@@ -238,27 +240,39 @@ __wt_conn_cache_pool_destroy(WT_CONNECTION_IMPL *conn)
break;
}
- if (!found) {
+ /*
+ * If there was an error during open, we may not have made it onto the
+ * queue. We did increment the reference count, so proceed regardless.
+ */
+ if (found) {
+ WT_VERBOSE_TRET(session, shared_cache,
+ "Removing %s from cache pool.", entry->home);
+ TAILQ_REMOVE(&cp->cache_pool_qh, entry, cpq);
+
+ /* Give the connection's resources back to the pool. */
+ WT_ASSERT(session, cp->currently_used >= conn->cache_size);
+ cp->currently_used -= conn->cache_size;
+ }
+
+ /*
+ * If there are no references, we are cleaning up after a failed
+ * wiredtiger_open, there is nothing further to do.
+ */
+ if (cp->refs < 1) {
__wt_spin_unlock(session, &cp->cache_pool_lock);
return (0);
}
- WT_VERBOSE_TRET(session, shared_cache,
- "Removing %s from cache pool.", entry->home);
- TAILQ_REMOVE(&cp->cache_pool_qh, entry, cpq);
-
- /* Give the connection's resources back to the pool. */
- WT_ASSERT(session, cp->currently_used >= conn->cache_size);
- cp->currently_used -= conn->cache_size;
- --cp->refs;
- if (cp->refs == 0 && TAILQ_EMPTY(&cp->cache_pool_qh))
+ if (--cp->refs == 0) {
+ WT_ASSERT(session, TAILQ_EMPTY(&cp->cache_pool_qh));
F_CLR(cp, WT_CACHE_POOL_RUN);
+ }
/*
* Free the connection pool session if it was created by this
* connection. A new one will be created by the next balance pass.
*/
- if (cp->session != NULL && entry == S2C(cp->session)) {
+ if (cp->session != NULL && conn == S2C(cp->session)) {
WT_VERBOSE_TRET(cp->session, shared_cache,
"Freeing a cache pool session due to connection close.");
wt_session = &cp->session->iface;
@@ -287,9 +301,11 @@ __wt_conn_cache_pool_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_unlock(session, &__wt_process.spinlock);
__wt_spin_unlock(session, &cp->cache_pool_lock);
- /* Shut down the cache pool worker. */
- WT_TRET(__wt_cond_signal(session, cp->cache_pool_cond));
- WT_TRET(__wt_thread_join(session, cp->cache_pool_tid));
+ if (found) {
+ /* Shut down the cache pool worker. */
+ WT_TRET(__wt_cond_signal(session, cp->cache_pool_cond));
+ WT_TRET(__wt_thread_join(session, cp->cache_pool_tid));
+ }
/* Now free the pool. */
__wt_free(session, cp->name);
diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c
new file mode 100644
index 00000000000..41e0cd5c640
--- /dev/null
+++ b/src/conn/conn_ckpt.c
@@ -0,0 +1,160 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __ckpt_server_config --
+ * Parse and set up the checkpoint server options.
+ *
+ * Reads "checkpoint.wait" and "checkpoint.name" from cfg. If the wait
+ * value is zero, *runp is set to 0 and nothing else is configured.
+ * Otherwise *runp is set to 1, the wait value multiplied by 1000000 is
+ * stored in conn->ckpt_usecs and, if the configured name does not match
+ * WT_CHECKPOINT, a "name=<value>" string is stored in conn->ckpt_config.
+ * Returns 0 on success; helper failures are presumably propagated to the
+ * caller by WT_RET.
+ */
+static int
+__ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *runp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ /*
+ * The checkpoint configuration requires a wait time -- if it's not set,
+ * we're not running at all.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
+ if (cval.val == 0) {
+ *runp = 0;
+ return (0);
+ }
+ /*
+ * NOTE(review): the 1000000 multiplier suggests the wait is configured
+ * in seconds and stored in microseconds -- confirm.
+ */
+ conn->ckpt_usecs = (long)cval.val * 1000000;
+ *runp = 1;
+
+ WT_RET(__wt_config_gets(session, cfg, "checkpoint.name", &cval));
+
+ if (!WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
+ /*
+ * Build "name=<value>" in a scratch buffer: the extra 20 bytes
+ * leave room for the "name=" prefix and the trailing nul byte.
+ * The result is duplicated into conn->ckpt_config before the
+ * scratch buffer is released, and ret is only checked after the
+ * release so the buffer is given back on failure as well.
+ */
+ WT_RET(__wt_scr_alloc(session, cval.len + 20, &tmp));
+ strcpy((char *)tmp->data, "name=");
+ strncat((char *)tmp->data, cval.str, cval.len);
+ ret = __wt_strndup(session,
+ tmp->data, strlen("name=") + cval.len, &conn->ckpt_config);
+ __wt_scr_free(&tmp);
+ WT_RET(ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __ckpt_server --
+ * The checkpoint server thread.
+ *
+ * The argument is the server's WT_SESSION_IMPL. While WT_CONN_SERVER_RUN
+ * is set on the connection, each pass calls the session's checkpoint
+ * method with conn->ckpt_config and then waits on conn->ckpt_cond,
+ * passing conn->ckpt_usecs (presumably the timeout). Always returns
+ * NULL; a failure is reported with __wt_err.
+ */
+static void *
+__ckpt_server(void *arg)
+{
+ struct timespec ts;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+ wt_session = (WT_SESSION *)session;
+
+ /*
+ * The checkpoint server may be running before the database is created,
+ * and checkpoints would fail. Wait for the wiredtiger_open call.
+ */
+ while (!conn->connection_initialized)
+ __wt_sleep(1, 0);
+
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+ /*
+ * Get the current local time of day.
+ * NOTE(review): ts is not read again in this function.
+ */
+ WT_ERR(__wt_epoch(session, &ts));
+
+ /* Checkpoint the database. */
+ WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config));
+
+ /* Wait... */
+ WT_ERR(
+ __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs));
+ }
+
+ /*
+ * The if (0) body is only entered through the err label, which the
+ * WT_ERR calls above presumably jump to on failure.
+ */
+ if (0) {
+err: __wt_err(session, ret, "checkpoint server error");
+ }
+ return (NULL);
+}
+
+/*
+ * __wt_checkpoint_create --
+ * Start the checkpoint server thread.
+ *
+ * Parses the checkpoint configuration from cfg and, if the server is
+ * configured to run, opens a session for it, allocates its condition
+ * variable and creates the thread. conn->ckpt_tid_set records that the
+ * thread was created; __wt_checkpoint_destroy tests it before joining.
+ * Returns 0 without starting anything when the server is not configured.
+ */
+int
+__wt_checkpoint_create(WT_CONNECTION_IMPL *conn, const char *cfg[])
+{
+ WT_SESSION_IMPL *session;
+ int run;
+
+ session = conn->default_session;
+
+ /* Handle configuration. */
+ WT_RET(__ckpt_server_config(session, cfg, &run));
+
+ /* If not configured, we're done. */
+ if (!run)
+ return (0);
+
+ /*
+ * The checkpoint server gets its own session.
+ *
+ * NOTE(review): nothing allocated below is released here if a later
+ * step fails; __wt_checkpoint_destroy releases the session and the
+ * condition variable when they are non-NULL.
+ */
+ WT_RET(__wt_open_session(conn, 1, NULL, NULL, &conn->ckpt_session));
+ conn->ckpt_session->name = "checkpoint-server";
+
+ WT_RET(
+ __wt_cond_alloc(session, "checkpoint server", 0, &conn->ckpt_cond));
+
+ /*
+ * Start the thread.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->ckpt_tid, __ckpt_server, conn->ckpt_session));
+ conn->ckpt_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_checkpoint_destroy --
+ * Destroy the checkpoint server thread.
+ *
+ * The thread, condition variable and session steps are each guarded, so
+ * steps for resources that were never created are skipped. Every step
+ * is attempted and the function returns ret at the end; WT_TRET
+ * presumably keeps the first error seen.
+ */
+int
+__wt_checkpoint_destroy(WT_CONNECTION_IMPL *conn)
+{
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+
+ session = conn->default_session;
+
+ /*
+ * Signal the server's condition variable and join the thread.
+ *
+ * NOTE(review): this function does not clear WT_CONN_SERVER_RUN, the
+ * flag the server loop tests; presumably the caller clears it first.
+ */
+ if (conn->ckpt_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->ckpt_cond));
+ WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
+ conn->ckpt_tid_set = 0;
+ }
+ if (conn->ckpt_cond != NULL)
+ WT_TRET(__wt_cond_destroy(session, conn->ckpt_cond));
+
+ /* The "name=<value>" string built by __ckpt_server_config, if any. */
+ __wt_free(session, conn->ckpt_config);
+
+ /* Close the server thread's session, free its hazard array. */
+ if (conn->ckpt_session != NULL) {
+ wt_session = &conn->ckpt_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ __wt_free(session, conn->ckpt_session->hazard);
+ }
+
+ return (ret);
+}
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index 3a5bfa430eb..fdd0966232e 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -201,12 +201,78 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session)
}
/*
+ * __conn_btree_config_clear --
+ * Clear the underlying object's configuration information.
+ *
+ * Operates on session->dhandle: frees every string in the
+ * NULL-terminated dhandle->cfg array and then the array itself. Does
+ * nothing when dhandle->cfg is NULL.
+ */
+static void
+__conn_btree_config_clear(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+ const char **a;
+
+ dhandle = session->dhandle;
+
+ /* Nothing was ever configured for this handle. */
+ if (dhandle->cfg == NULL)
+ return;
+ /* Free the individual configuration strings, then the array. */
+ for (a = dhandle->cfg; *a != NULL; ++a)
+ __wt_free(session, *a);
+ __wt_free(session, dhandle->cfg);
+}
+
+/*
+ * __conn_btree_config_set --
+ * Set up a btree handle's configuration information.
+ *
+ * Operates on session->dhandle: allocates a three-slot array in
+ * dhandle->cfg, stores a copy of __wt_confdfl_file_meta in slot 0 and
+ * the metadata string read for dhandle->name in slot 1. Slot 2 is not
+ * written here (presumably left NULL by __wt_calloc_def as the array
+ * terminator). Returns ENOENT when the metadata read reports
+ * WT_NOTFOUND.
+ */
+static int
+__conn_btree_config_set(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ const char *metaconf;
+
+ dhandle = session->dhandle;
+
+ /*
+ * Read the object's entry from the metadata file, we're done if we
+ * don't find one.
+ */
+ if ((ret =
+ __wt_metadata_read(session, dhandle->name, &metaconf)) != 0) {
+ if (ret == WT_NOTFOUND)
+ ret = ENOENT;
+ WT_RET(ret);
+ }
+
+ /*
+ * The defaults are included because underlying objects have persistent
+ * configuration information stored in the metadata file. If defaults
+ * are included in the configuration, we can add new configuration
+ * strings without upgrading the metadata file or writing special code
+ * in case a configuration string isn't initialized, as long as the new
+ * configuration string has an appropriate default value.
+ *
+ * The error handling is a little odd, but be careful: we're holding a
+ * chunk of allocated memory in metaconf. If we fail before we copy a
+ * reference to it into the object's configuration array, we must free
+ * it, after the copy, we don't want to free it.
+ *
+ * NOTE(review): if the strdup fails after the array was allocated,
+ * dhandle->cfg is not released here; __conn_btree_config_clear frees
+ * it when it is next called for this handle.
+ */
+ WT_ERR(__wt_calloc_def(session, 3, &dhandle->cfg));
+ WT_ERR(__wt_strdup(session, __wt_confdfl_file_meta, &dhandle->cfg[0]));
+ /* The array now owns the metadata string. */
+ dhandle->cfg[1] = metaconf;
+ metaconf = NULL;
+ return (0);
+
+err: __wt_free(session, metaconf);
+ return (ret);
+}
+
+/*
* __conn_btree_open --
* Open the current btree handle.
*/
static int
-__conn_btree_open(WT_SESSION_IMPL *session,
- const char *config, const char *cfg[], uint32_t flags)
+__conn_btree_open(
+ WT_SESSION_IMPL *session, const char *op_cfg[], uint32_t flags)
{
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
@@ -219,10 +285,6 @@ __conn_btree_open(WT_SESSION_IMPL *session,
F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
!LF_ISSET(WT_DHANDLE_LOCK_ONLY));
- /* Open the underlying file, free any old config. */
- __wt_free(session, dhandle->config);
- dhandle->config = config;
-
/*
* If the handle is already open, it has to be closed so it can be
* reopened with a new configuration. We don't need to check again:
@@ -232,11 +294,15 @@ __conn_btree_open(WT_SESSION_IMPL *session,
if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
WT_RET(__wt_conn_btree_sync_and_close(session));
+ /* Discard any previous configuration, set up the new configuration. */
+ __conn_btree_config_clear(session);
+ WT_RET(__conn_btree_config_set(session));
+
/* Set any special flags on the handle. */
F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));
do {
- WT_ERR(__wt_btree_open(session, cfg));
+ WT_ERR(__wt_btree_open(session, op_cfg));
F_SET(dhandle, WT_DHANDLE_OPEN);
/*
* Checkpoint handles are read only, so eviction calculations
@@ -268,11 +334,10 @@ err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
*/
int
__wt_conn_btree_get(WT_SESSION_IMPL *session,
- const char *name, const char *ckpt, const char *cfg[], uint32_t flags)
+ const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags)
{
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- const char *treeconf;
WT_CSTAT_INCR(session, file_open);
@@ -281,19 +346,11 @@ __wt_conn_btree_get(WT_SESSION_IMPL *session,
if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
(!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
- LF_ISSET(WT_BTREE_SPECIAL_FLAGS))) {
- if ((ret = __wt_metadata_read(session, name, &treeconf)) != 0) {
- if (ret == WT_NOTFOUND)
- ret = ENOENT;
- goto err;
+ LF_ISSET(WT_BTREE_SPECIAL_FLAGS)))
+ if ((ret = __conn_btree_open(session, op_cfg, flags)) != 0) {
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_TRET(__wt_rwunlock(session, dhandle->rwlock));
}
- ret = __conn_btree_open(session, treeconf, cfg, flags);
- }
-
-err: if (ret != 0) {
- F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
- WT_TRET(__wt_rwunlock(session, dhandle->rwlock));
- }
WT_ASSERT(session, ret != 0 ||
LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
@@ -379,13 +436,11 @@ int
__wt_conn_btree_close(WT_SESSION_IMPL *session, int locked)
{
WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
int inuse;
btree = S2BT(session);
- conn = S2C(session);
dhandle = session->dhandle;
WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
@@ -413,7 +468,7 @@ __wt_conn_btree_close(WT_SESSION_IMPL *session, int locked)
*/
WT_ASSERT(session,
btree != session->metafile ||
- session == conn->default_session);
+ session == S2C(session)->default_session);
if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
WT_TRET(__wt_conn_btree_sync_and_close(session));
@@ -511,18 +566,20 @@ __wt_conn_dhandle_discard_single(
{
WT_DECL_RET;
- if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
- session->dhandle = dhandle;
+ session->dhandle = dhandle;
+
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
WT_TRET(__wt_conn_btree_sync_and_close(session));
- session->dhandle = NULL;
- }
+
WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
- __wt_free(session, dhandle->config);
__wt_free(session, dhandle->name);
__wt_free(session, dhandle->checkpoint);
+ __conn_btree_config_clear(session);
__wt_free(session, dhandle->handle);
__wt_overwrite_and_free(session, dhandle);
+ WT_CLEAR_BTREE_IN_SESSION(session);
+
return (ret);
}
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index df5bac8d0b9..fb59812d75d 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -28,7 +28,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
TAILQ_INIT(&conn->lsmqh); /* WT_LSM_TREE list */
/* Statistics. */
- WT_RET(__wt_stat_alloc_connection_stats(session, &conn->stats));
+ __wt_stat_init_connection_stats(&conn->stats);
/* Locks. */
__wt_spin_init(session, &conn->api_lock);
@@ -59,12 +59,12 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
WT_DECL_RET;
WT_SESSION_IMPL *session;
- session = conn->default_session;
-
/* Check there's something to destroy. */
if (conn == NULL)
return (0);
+ session = conn->default_session;
+
/*
* Close remaining open files (before discarding the mutex, the
* underlying file-close code uses the mutex to guard lists of
@@ -91,7 +91,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
/* Free allocated memory. */
__wt_free(session, conn->home);
__wt_free(session, conn->sessions);
- __wt_free(session, conn->stats);
__wt_free(NULL, conn);
return (ret);
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index ae5d0d24172..2bafe877767 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -38,7 +38,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
WT_WRITE_BARRIER();
/* Start worker threads. */
- F_SET(conn, WT_CONN_SERVER_RUN);
+ F_SET(conn, WT_CONN_EVICTION_RUN | WT_CONN_SERVER_RUN);
/*
* Start the eviction thread.
@@ -51,6 +51,13 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
evict_session->name = "eviction-server";
WT_ERR(__wt_thread_create(session,
&conn->cache_evict_tid, __wt_cache_evict_server, evict_session));
+ conn->cache_evict_tid_set = 1;
+
+ /* Start the optional checkpoint thread. */
+ WT_ERR(__wt_checkpoint_create(conn, cfg));
+
+ /* Start the optional statistics thread. */
+ WT_ERR(__wt_statlog_create(conn, cfg));
return (0);
@@ -86,11 +93,12 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
fh = TAILQ_FIRST(&conn->fhqh);
}
- /* Shut down the server threads. */
- F_CLR(conn, WT_CONN_SERVER_RUN);
- if (conn->cache_evict_tid != 0) {
+ /* Shut down the eviction server thread. */
+ F_CLR(conn, WT_CONN_EVICTION_RUN);
+ if (conn->cache_evict_tid_set) {
WT_TRET(__wt_evict_server_wake(session));
WT_TRET(__wt_thread_join(session, conn->cache_evict_tid));
+ conn->cache_evict_tid_set = 0;
}
/* Disconnect from shared cache - must be before cache destroy. */
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index 6ca3be4659a..0660ab2435c 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -7,6 +7,15 @@
#include "wt_internal.h"
+#ifdef __GNUC__
+/*
+ * !!!
+ * GCC with -Wformat-nonliteral complains about calls to strftime in this file.
+ * There's nothing wrong, this makes the warning go away.
+ */
+#pragma GCC diagnostic ignored "-Wformat-nonliteral"
+#endif
+
/*
* __wt_conn_stat_init --
* Initialize the per-connection statistics.
@@ -14,9 +23,316 @@
void
__wt_conn_stat_init(WT_SESSION_IMPL *session, uint32_t flags)
{
+ /* The flags argument is not used by this version of the function. */
+ WT_UNUSED(flags);
+
+ /* All of the work is delegated to __wt_cache_stats_update. */
+ __wt_cache_stats_update(session);
+}
+
+/*
+ * __statlog_config --
+ * Parse and set up the statistics server options.
+ *
+ * Reads the "statistics_log.*" entries from cfg. If the wait value is
+ * zero, *runp is set to 0 and nothing else is configured. Otherwise
+ * *runp and conn->statistics are set to 1 and the connection's
+ * stat_usecs, stat_clear, stat_sources, stat_path and stat_stamp fields
+ * are filled in. Returns 0 on success; helper failures are presumably
+ * propagated to the caller by WT_RET.
+ */
+static int
+__statlog_config(WT_SESSION_IMPL *session, const char **cfg, int *runp)
+{
+ WT_CONFIG objectconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ int cnt;
+
+ conn = S2C(session);
+
+ /*
+ * The statistics logging configuration requires a wait time -- if it's
+ * not set, we're not running at all.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval));
+ if (cval.val == 0) {
+ *runp = 0;
+ return (0);
+ }
+ /*
+ * NOTE(review): the 1000000 multiplier suggests the wait is configured
+ * in seconds and stored in microseconds -- confirm.
+ */
+ conn->stat_usecs = (long)cval.val * 1000000;
+
+ /* Statistics logging implies statistics. */
+ conn->statistics = *runp = 1;
+
+ WT_RET(__wt_config_gets(session, cfg, "statistics_log.clear", &cval));
+ conn->stat_clear = cval.val != 0;
+
+ /* First pass over the sources list: count the entries. */
+ WT_RET(__wt_config_gets(session, cfg, "statistics_log.sources", &cval));
+ WT_RET(__wt_config_subinit(session, &objectconf, &cval));
+ for (cnt = 0; (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt)
+ ;
+ WT_RET_NOTFOUND_OK(ret);
+ if (cnt != 0) {
+ /*
+ * Two strings per source plus one extra slot; the extra slot
+ * is never written (presumably left NULL by __wt_calloc_def)
+ * and terminates the array for the loops that walk it.
+ */
+ WT_RET(
+ __wt_calloc_def(session, cnt * 2 + 1, &conn->stat_sources));
+ /* Second pass: restart the scan and fill in the pairs. */
+ WT_RET(__wt_config_subinit(session, &objectconf, &cval));
+ for (cnt = 0;
+ (ret = __wt_config_next(&objectconf, &k, &v)) == 0;) {
+ /*
+ * We close and re-open each statistics cursor each time
+ * we dump statistics (the object may or may not exist
+ * underneath at any point, and I don't want this code
+ * to break if/when the lifetime of an underlying object
+ * changes). Create pairs of strings: the first is the
+ * object uri, written into the output, the second is
+ * the enhanced uri used to open the statistics cursor.
+ */
+ WT_RET(__wt_strndup(session,
+ k.str, k.len, &conn->stat_sources[cnt]));
+ ++cnt;
+
+ /* Room for the prefix, the key and the trailing nul. */
+ WT_RET(__wt_calloc_def(session,
+ strlen("statistics:") + k.len + 1,
+ &conn->stat_sources[cnt]));
+ strcpy(conn->stat_sources[cnt], "statistics:");
+ strncat(conn->stat_sources[cnt], k.str, k.len);
+ ++cnt;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+ }
+
+ /* The path and timestamp are later used as strftime formats. */
+ WT_RET(__wt_config_gets(session, cfg, "statistics_log.path", &cval));
+ WT_RET(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path));
+
+ WT_RET(__wt_config_gets(
+ session, cfg, "statistics_log.timestamp", &cval));
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->stat_stamp));
+
+ return (0);
+}
+
+/*
+ * __stat_server_dump --
+ * Dump a single set of statistics.
+ *
+ * Opens a cursor on cursor_uri and writes one line per cursor entry to
+ * fp in the form "<stamp> <value> <name> <description>". The cursor is
+ * opened with "statistics_fast", plus "statistics_clear" when the
+ * connection's stat_clear field is set. Returns 0 without writing
+ * anything when the open fails with EBUSY, ENOENT or WT_NOTFOUND.
+ */
+static int
+__stat_server_dump(WT_SESSION_IMPL *session,
+ const char *name, const char *cursor_uri, const char *stamp, FILE *fp)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ uint64_t value;
+ const char *config, *desc, *pdesc;
+
+ wt_session = (WT_SESSION *)session;
+ config = S2C(session)->stat_clear ?
+ "statistics_clear,statistics_fast" : "statistics_fast";
+
+ /*
+ * If we don't find an underlying object, silently ignore it, the object
+ * may exist only intermittently. User-level APIs return ENOENT instead
+ * of WT_NOTFOUND for missing files, check both, as well as for EBUSY if
+ * the handle is exclusively locked at the moment.
+ */
+ ret = wt_session->open_cursor(
+ wt_session, cursor_uri, NULL, config, &cursor);
+ if (ret == EBUSY || ret == ENOENT || ret == WT_NOTFOUND)
+ return (0);
+ WT_RET(ret);
+
+ /*
+ * Walk the cursor until next or get_value fails; pdesc is fetched but
+ * not written to the output. A negative fprintf return is turned into
+ * the current errno value.
+ */
+ while ((ret = cursor->next(cursor)) == 0 &&
+ (ret = cursor->get_value(cursor, &desc, &pdesc, &value)) == 0)
+ WT_ERR_TEST((fprintf(fp,
+ "%s %" PRIu64 " %s %s\n",
+ stamp, value, name, desc) < 0), __wt_errno());
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* Reached on success and on error: the cursor is always closed. */
+err: WT_TRET(cursor->close(cursor));
+
+ return (ret);
+}
+
+/*
+ * __stat_server --
+ * The statistics server thread.
+ *
+ * The argument is the server's WT_SESSION_IMPL. While WT_CONN_SERVER_RUN
+ * is set on the connection, each pass expands conn->stat_path and
+ * conn->stat_stamp with strftime for the current local time, opens the
+ * log file in append mode if the expanded path differs from the one in
+ * use, dumps the connection statistics followed by each entry in
+ * conn->stat_sources, flushes the file and waits on conn->stat_cond.
+ * Always returns NULL; a failure is reported with __wt_err.
+ */
+static void *
+__stat_server(void *arg)
+{
+ struct timespec ts;
+ struct tm *tm, _tm;
+ FILE *fp;
WT_CONNECTION_IMPL *conn;
+ WT_ITEM path, tmp;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ char **p;
+ session = arg;
conn = S2C(session);
- __wt_cache_stats_update(conn, flags);
+ WT_CLEAR(path);
+ WT_CLEAR(tmp);
+ fp = NULL;
+
+ /*
+ * We need a temporary place to build a path and an entry prefix.
+ * The length of the path plus 128 should be more than enough.
+ *
+ * We also need a place to store the current path, because that's
+ * how we know when to close/re-open the file.
+ */
+ WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128));
+ WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128));
+
+ /*
+ * The statistics log server may be running before the database is
+ * created (it should run fine because we're looking at statistics
+ * structures that have already been allocated, but it doesn't make
+ * sense and we have the information we need to wait). Wait for
+ * the wiredtiger_open call.
+ */
+ while (!conn->connection_initialized)
+ __wt_sleep(1, 0);
+
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+ /*
+ * If statistics are turned off, wait until it's time to output
+ * statistics and check again.
+ */
+ if (conn->statistics == 0) {
+ WT_ERR(__wt_cond_wait(
+ session, conn->stat_cond, conn->stat_usecs));
+ continue;
+ }
+
+ /* Get the current local time of day. */
+ WT_ERR(__wt_epoch(session, &ts));
+ tm = localtime_r(&ts.tv_sec, &_tm);
+
+ /*
+ * Create the logging path name for this time of day.
+ *
+ * A zero return from strftime is treated as the buffer being
+ * too small. NOTE(review): strftime also returns zero when the
+ * format expands to an empty string.
+ */
+ if (strftime(tmp.mem, tmp.memsize, conn->stat_path, tm) == 0)
+ WT_ERR_MSG(
+ session, ENOMEM, "strftime path conversion");
+
+ /* If the path has changed, close/open the new log file. */
+ if (fp == NULL || strcmp(tmp.mem, path.mem) != 0) {
+ if (fp != NULL) {
+ (void)fclose(fp);
+ fp = NULL;
+ }
+
+ /* Both buffers were initialized with the same size. */
+ (void)strcpy(path.mem, tmp.mem);
+ WT_ERR_TEST(
+ (fp = fopen(path.mem, "a")) == NULL, __wt_errno());
+ }
+
+ /*
+ * Create the entry prefix for this time of day; tmp is reused
+ * now that the path has been copied into path.
+ */
+ if (strftime(tmp.mem, tmp.memsize, conn->stat_stamp, tm) == 0)
+ WT_ERR_MSG(
+ session, ENOMEM, "strftime timestamp conversion");
+
+ /* Dump the connection statistics. */
+ WT_ERR(__stat_server_dump(
+ session, conn->home, "statistics:", tmp.mem, fp));
+
+ /*
+ * Dump the object list statistics: entries are pairs, the name
+ * written to the output followed by the cursor uri.
+ */
+ if ((p = conn->stat_sources) != NULL)
+ for (; *p != NULL; p += 2)
+ WT_ERR(__stat_server_dump(
+ session, p[0], p[1], tmp.mem, fp));
+
+ /* Flush. */
+ WT_ERR(fflush(fp) == 0 ? 0 : __wt_errno());
+
+ /* Wait until the next event. */
+ WT_ERR(
+ __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs));
+ }
+
+ /* The if (0) body is only entered through the err label. */
+ if (0) {
+err: __wt_err(session, ret, "statistics log server error");
+ }
+ /* Common exit path: close the log file and free both buffers. */
+ if (fp != NULL)
+ WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
+ __wt_buf_free(session, &path);
+ __wt_buf_free(session, &tmp);
+ return (NULL);
+}
+
+/*
+ * __wt_statlog_create --
+ * Start the statistics server thread.
+ *
+ * Parses the statistics logging configuration from cfg and, if the
+ * server is configured to run, opens a session for it, allocates its
+ * condition variable and creates the thread. conn->stat_tid_set records
+ * that the thread was created; __wt_statlog_destroy tests it before
+ * joining. Returns 0 without starting anything when the server is not
+ * configured.
+ */
+int
+__wt_statlog_create(WT_CONNECTION_IMPL *conn, const char *cfg[])
+{
+ WT_SESSION_IMPL *session;
+ int run;
+
+ session = conn->default_session;
+
+ /* Handle configuration. */
+ WT_RET(__statlog_config(session, cfg, &run));
+
+ /* If not configured, we're done. */
+ if (!run)
+ return (0);
+
+ /*
+ * The statistics log server gets its own session.
+ *
+ * NOTE(review): nothing allocated below is released here if a later
+ * step fails; __wt_statlog_destroy releases the session and the
+ * condition variable when they are non-NULL.
+ */
+ WT_RET(__wt_open_session(conn, 1, NULL, NULL, &conn->stat_session));
+ conn->stat_session->name = "statlog-server";
+
+ WT_RET(__wt_cond_alloc(
+ session, "statistics log server", 0, &conn->stat_cond));
+
+ /*
+ * Start the thread.
+ *
+ * Statistics logging creates a thread per database, rather than using
+ * a single thread to do logging for all of the databases. If we ever
+ * see lots of databases at a time, doing statistics logging, and we
+ * want to reduce the number of threads, there's no reason we have to
+ * have more than one thread, I just didn't feel like writing the code
+ * to figure out the scheduling.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->stat_tid, __stat_server, conn->stat_session));
+ conn->stat_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_statlog_destroy --
+ * Destroy the statistics server thread.
+ *
+ * The thread, condition variable, sources array and session steps are
+ * each guarded, so steps for resources that were never created are
+ * skipped. Every step is attempted and the function returns ret at the
+ * end; WT_TRET presumably keeps the first error seen.
+ */
+int
+__wt_statlog_destroy(WT_CONNECTION_IMPL *conn)
+{
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+ char **p;
+
+ session = conn->default_session;
+
+ /*
+ * Signal the server's condition variable and join the thread.
+ *
+ * NOTE(review): this function does not clear WT_CONN_SERVER_RUN, the
+ * flag the server loop tests; presumably the caller clears it first.
+ */
+ if (conn->stat_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->stat_cond));
+ WT_TRET(__wt_thread_join(session, conn->stat_tid));
+ conn->stat_tid_set = 0;
+ }
+ if (conn->stat_cond != NULL)
+ WT_TRET(__wt_cond_destroy(session, conn->stat_cond));
+
+ /* Free every string in the NULL-terminated array, then the array. */
+ if ((p = conn->stat_sources) != NULL) {
+ for (; *p != NULL; ++p)
+ __wt_free(session, *p);
+ __wt_free(session, conn->stat_sources);
+ }
+ __wt_free(session, conn->stat_path);
+ __wt_free(session, conn->stat_stamp);
+
+ /* Close the server thread's session, free its hazard array. */
+ if (conn->stat_session != NULL) {
+ wt_session = &conn->stat_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ __wt_free(session, conn->stat_session->hazard);
+ }
+
+ return (ret);
}
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index ba49cb6927c..3557dc13026 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -359,7 +359,8 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
flags = 0;
WT_RET(__wt_config_gets_defno(session, cfg, "bulk", &cval));
- if (cval.type == ITEM_NUM && (cval.val == 0 || cval.val == 1)) {
+ if (cval.type == ITEM_BOOL ||
+ (cval.type == ITEM_NUM && (cval.val == 0 || cval.val == 1))) {
bitmap = 0;
bulk = (cval.val != 0);
} else if (WT_STRING_MATCH("bitmap", cval.str, cval.len))
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index 91ae5e2c8f2..e40bfa509f6 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -292,6 +292,9 @@ __curindex_close(WT_CURSOR *cursor)
if (cindex->child != NULL)
WT_TRET(cindex->child->close(cindex->child));
+
+ if (cindex->table != NULL)
+ __wt_schema_release_table(session, cindex->table);
/* The URI is owned by the index. */
cursor->uri = NULL;
WT_TRET(__wt_cursor_close(cursor));
@@ -408,7 +411,10 @@ __wt_curindex_open(WT_SESSION_IMPL *session,
* using only the primary's recno as the index key. Disallow that for
* now.
*/
- WT_ASSERT(session, !WT_CURSOR_RECNO(cursor));
+ if (WT_CURSOR_RECNO(cursor))
+ WT_ERR_MSG(session, WT_ERROR,
+ "Column store indexes based on a record number primary "
+ "key are not supported.");
/* Handle projections. */
if (columns != NULL) {
diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c
index 8e93be354bb..4c344340c6c 100644
--- a/src/cursor/cur_stat.c
+++ b/src/cursor/cur_stat.c
@@ -307,8 +307,8 @@ __curstat_conn_init(
cst->btree = NULL;
cst->notpositioned = 1;
- cst->stats_first = (WT_STATS *)S2C(session)->stats;
- cst->stats_count = sizeof(*S2C(session)->stats) / sizeof(WT_STATS);
+ cst->stats_first = (WT_STATS *)&S2C(session)->stats;
+ cst->stats_count = sizeof(S2C(session)->stats) / sizeof(WT_STATS);
cst->clear_func = LF_ISSET(WT_STATISTICS_CLEAR) ?
__wt_stat_clear_connection_stats : NULL;
}
@@ -329,7 +329,7 @@ __curstat_file_init(WT_SESSION_IMPL *session,
cst->btree = btree;
cst->notpositioned = 1;
- cst->stats_first = (WT_STATS *)btree->dhandle->stats;
+ cst->stats_first = (WT_STATS *)&btree->dhandle->stats;
cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
cst->clear_func = LF_ISSET(WT_STATISTICS_CLEAR) ?
__wt_stat_clear_dsrc_stats : NULL;
@@ -441,7 +441,8 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
if (0) {
-err: __wt_free(session, cst);
+err: __wt_free(session, cst->stats);
+ __wt_free(session, cst);
}
return (ret);
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index 63b0a8c29ee..ab4f48cbf83 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -623,6 +623,7 @@ __curtable_close(WT_CURSOR *cursor)
__wt_free(session, cursor->value_format);
__wt_free(session, ctable->cg_cursors);
__wt_free(session, ctable->idx_cursors);
+ __wt_schema_release_table(session, ctable->table);
/* The URI is owned by the table. */
cursor->uri = NULL;
WT_TRET(__wt_cursor_close(cursor));
@@ -742,10 +743,14 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
size = WT_PTRDIFF(columns, tablename);
WT_RET(__wt_schema_get_table(session, tablename, size, 0, &table));
- if (table->is_simple)
+ if (table->is_simple) {
/* Just return a cursor on the underlying data source. */
- return (__wt_open_cursor(session,
- table->cgroups[0]->source, NULL, cfg, cursorp));
+ ret = __wt_open_cursor(session,
+ table->cgroups[0]->source, NULL, cfg, cursorp);
+
+ __wt_schema_release_table(session, table);
+ return (ret);
+ }
WT_RET(__wt_calloc_def(session, 1, &ctable));
diff --git a/src/docs/bulk-load.dox b/src/docs/bulk-load.dox
new file mode 100644
index 00000000000..f65e1f81ca0
--- /dev/null
+++ b/src/docs/bulk-load.dox
@@ -0,0 +1,23 @@
+/*! @page bulk_load Bulk-load
+
+WiredTiger cursors can be configured for bulk-load using the \c bulk
+configuration keyword to WT_SESSION::open_cursor. Bulk-load is a "fast
+path" for quickly loading a large number of rows. Bulk-load may only
+be used on newly created objects, and an object being bulk-loaded is not
+accessible from other cursors.
+
+Cursors configured for bulk-load only support the WT_CURSOR::insert and
+WT_CURSOR::close methods.
+
+When bulk-loading row-store objects, keys must be loaded in sorted
+order.
+
+When bulk-loading fixed-length column store objects, the \c bulk
+configuration string value \c bitmap allows chunks of a memory resident
+bitmap to be loaded directly into an object by passing a WT_ITEM to
+WT_CURSOR::set_value, where the size field indicates the number of
+records in the bitmap (as specified by the object's \c value_format
+configuration). Bulk-loaded bitmap values must end on a byte boundary
+relative to the bit count (except for the last set of values loaded).
+
+ */
diff --git a/src/docs/cache-configuration.dox b/src/docs/cache-configuration.dox
index 0faf0eaf144..3eda7a4fbdc 100644
--- a/src/docs/cache-configuration.dox
+++ b/src/docs/cache-configuration.dox
@@ -1,6 +1,6 @@
/*! @page cache_configuration Cache configuration
-@section cache_basic Overview of WiredTiger cache configuration.
+@section cache_basic Cache configuration
The WiredTiger cache implements an approximation of a least recently used
algorithm. Ideally the cache should be configured to be large enough to
@@ -10,7 +10,7 @@ The WiredTiger cache size can be configured when first opening a database via
@ref wiredtiger_open or changed after open using the
WT_CONNECTION::reconfigure method.
-@section shared_cache Overview of WiredTiger shared cache configuration.
+@section shared_cache Shared cache configuration
WiredTiger supports sharing a single cache among multiple databases within
a process.
@@ -50,7 +50,7 @@ WiredTiger shared cache tuning options can be configured when first opening a
database via @ref wiredtiger_open or changed after open using the
WT_CONNECTION::reconfigure method.
-@section cache_eviction Overview of WiredTiger eviction configuration.
+@section cache_eviction Eviction configuration
WiredTiger provides several configuration options for tuning how aggressively
pages are evicted from the cache. Different values will result in better
diff --git a/src/docs/community.dox b/src/docs/community.dox
new file mode 100644
index 00000000000..ee7674850dc
--- /dev/null
+++ b/src/docs/community.dox
@@ -0,0 +1,23 @@
+/*! @page community WiredTiger community and contact information
+
+WiredTiger, Inc. and the WiredTiger community maintain and develop
+<a href="https://github.com/wiredtiger">WiredTiger on GitHub</a>,
+and all contributors are welcome!
+
+All source code and revision histories for this software are available
+in the <a href="https://github.com/wiredtiger/wiredtiger"> WiredTiger
+source tree repository</a>.
+
+Please submit any feature suggestions and bug reports at
+<a href="https://github.com/wiredtiger/wiredtiger/issues">
+WiredTiger's Issues page</a>.
+
+Discussion of WiredTiger issues and development can also be found in
+the
+<a href="http://groups.google.com/group/wiredtiger-users">WiredTiger
+Google Group</a>.
+
+To contact WiredTiger, Inc. please send email to
+<a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a>.
+
+*/
diff --git a/src/docs/cursors.dox b/src/docs/cursors.dox
index fa25e0148a0..1fd3646603e 100644
--- a/src/docs/cursors.dox
+++ b/src/docs/cursors.dox
@@ -57,21 +57,21 @@ The following are some of the common builtin cursor types:
@hrow{URI, Type, Notes}
@row{<tt>backup:</tt>,
hot backup cursor, See also: @ref hot_backup}
- @row{<tt>colgroup:\<tablename\>.\<columnset\></tt>,
+ @row{<tt>colgroup:\<table name\>:\<column group name\></tt>,
column group cursor,}
@row{<tt>config:[\<uri\>]</tt>,
object configuration cursor, (key=config string\,
value=config value)}
- @row{<tt>file:\<filename\></tt>,
+ @row{<tt>file:\<file name\></tt>,
file cursor (key=file key\, value=file value),}
- @row{<tt>index:\<tablename\>.\<indexname\></tt>,
+ @row{<tt>index:\<table name\>:\<index name\></tt>,
index cursor (key=index key\, value=table value),}
@row{<tt>lsm:\<name\></tt>,
LSM cursor (key=LSM key\, value=LSM value), See also: @ref lsm}
- @row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>,
+ @row{<tt>statistics:[\<data source URI\>]</tt>,
database or file statistics (key=(int)\,
value=(string)description\, (string)value\, (uint64_t)value),}
- @row{<tt>table:\<tablename\></tt>,
+ @row{<tt>table:\<table name\></tt>,
table cursor (key=table key\, value=table value),}
</table>
diff --git a/src/docs/data_sources.dox b/src/docs/data_sources.dox
index 458786a7617..2b9a33caddc 100644
--- a/src/docs/data_sources.dox
+++ b/src/docs/data_sources.dox
@@ -19,23 +19,23 @@ The following are the builtin cursor types:
@hrow{URI, Type, Notes}
@row{<tt>backup:</tt>,
hot backup cursor, See also: @ref hot_backup}
- @row{<tt>colgroup:\<tablename\>.\<columnset\></tt>,
+ @row{<tt>colgroup:\<table name\>:\<column group name\></tt>,
column group cursor,}
@row{<tt>config:[\<uri\>]</tt>,
object configuration cursor (key=config string\,
value=config value),}
- @row{<tt>file:\<filename\></tt>,
+ @row{<tt>file:\<file name\></tt>,
file cursor (key=file key\, value=file value),}
- @row{<tt>index:\<tablename\>.\<indexname\></tt>,
+ @row{<tt>index:\<table name\>:\<index name\></tt>,
index cursor (key=index key\, value=table value),}
@row{<tt>join:\<cursor1\>\&\<cursor2\>[&\<cursor3\>...]</tt>,
join cursor, @notyet{join cursors}}
@row{<tt>lsm:\<name\></tt>,
LSM cursor (key=LSM key\, value=LSM value), See also: @ref lsm}
- @row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>,
-database or file statistics (key=(int)\,
+ @row{<tt>statistics:[\<data source URI\>]</tt>,
+database or data source statistics (key=(int)\,
value=(string)description\, (string)value\, (uint64_t)value),}
- @row{<tt>table:\<tablename\></tt>,
+ @row{<tt>table:\<table name\></tt>,
table cursor (key=table key\, value=table value),}
</table>
diff --git a/src/docs/install.dox b/src/docs/install.dox
index dbf4712ba1d..263ebc541b7 100644
--- a/src/docs/install.dox
+++ b/src/docs/install.dox
@@ -45,7 +45,7 @@ To rebuild from scratch, discard any previous configuration by cleaning
out the build area:
@code
-make realclean
+make distclean
@endcode
To see additional configuration options, run:
diff --git a/src/docs/license.dox b/src/docs/license.dox
index 2644daa22e2..2c1f9f8eeb4 100644
--- a/src/docs/license.dox
+++ b/src/docs/license.dox
@@ -1,7 +1,8 @@
/*! @page license WiredTiger license
-The WiredTiger software is Open Source software: you may redistribute
-it and modify it under the terms of version 3 of the
+The complete WiredTiger software package is Open Source software: you
+are welcome to modify and redistribute it under the terms of version 3
+of the
<a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html">
<b>GNU General Public License</b></a>
as published by the Free Software Foundation. This program is
@@ -11,6 +12,13 @@ FITNESS FOR A PARTICULAR PURPOSE. See the
<a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html">
<b>GNU General Public License</b></a> for details.
+For a license to use the WiredTiger software under conditions other than
+those described above, or for technical support for this software, please
+contact WiredTiger, Inc. at
+<a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a>.
+
+@section library 3rd party software included in the WiredTiger library binary
+
The WiredTiger library binary includes software copyrighted under the
terms of the
<a href="http://www.opensource.org/licenses/BSD-3-Clause">
@@ -20,13 +28,6 @@ and the
<b>MIT License</b></a>. Any redistribution should comply with these
copyrights.
-For a license to use the WiredTiger software under conditions other than
-those described above, or for technical support for this software, please
-contact WiredTiger, Inc. at
-<a mailto="info@wiredtiger.com">info@wiredtiger.com</a>.
-
-@section library 3rd party software included in the WiredTiger library binary
-
The WiredTiger library binary includes the following 3rd party software,
distributed under the following licenses:
@@ -51,10 +52,9 @@ sources, please review the copyright notices and LICENSE files included
in the WiredTiger distribution for the terms and conditions of such
redistribution.
-@section pd Public domain software
+@section public_domain Public domain software
-Portions of this program are public domain software. Public domain
-files have copyright notices releasing the software into the public
-domain and may be freely used.
+Portions of this program are public domain software. Public domain files have
+notices releasing the software into the public domain and may be freely used.
*/
diff --git a/src/docs/namespace.dox b/src/docs/namespace.dox
index 67702c1f1bb..a307185c93e 100644
--- a/src/docs/namespace.dox
+++ b/src/docs/namespace.dox
@@ -1,4 +1,4 @@
-/*! @page name_space Programmatic name spaces
+/*! @page name_space Name spaces
@section env Process' environment name space
diff --git a/src/docs/programming.dox b/src/docs/programming.dox
index 90034b203b0..2b84792f075 100644
--- a/src/docs/programming.dox
+++ b/src/docs/programming.dox
@@ -5,27 +5,27 @@ WiredTiger applications:
- @subpage basic_api
- @subpage config_strings
-- @subpage error_handling
-
- @subpage cursors
-- @subpage threads
+- @subpage transactions
+- @subpage error_handling
- @subpage schema
-- @subpage file_formats
- @subpage lsm
+- @subpage file_formats
-- @subpage transactions
+- @subpage bulk_load
- @subpage cache_configuration
- @subpage checkpoints
-- @subpage hot_backup
-
- @subpage compression
+- @subpage hot_backup
+- @subpage statistics
+- @subpage threads
+- @subpage tuning
- @subpage home
- @subpage database_config
- @subpage name_space
- @subpage security
- @subpage signals
-- @subpage tuning
*/
diff --git a/src/docs/spell.ok b/src/docs/spell.ok
index 7e1b8fa2d04..62551104925 100644
--- a/src/docs/spell.ok
+++ b/src/docs/spell.ok
@@ -41,6 +41,7 @@ WiredTigerCheckpoint
aR
ack'ed
alloc
+allocator
allocsize
ao
api
@@ -97,6 +98,7 @@ del
desc
destructor
destructors
+distclean
dl
dlp
dontlock
@@ -116,6 +118,7 @@ env
eof
erlang
errno
+fadvise
failchk
fd's
fieldname
@@ -135,6 +138,7 @@ getopt
getter
gid
github
+gnuplot
hb
hotbackup
href
@@ -201,6 +205,7 @@ mutexes
mutexing
mvcc
mygcc
+mytable
namespace
ndary
ndbm
@@ -223,6 +228,7 @@ objectsin
ol
oltp
oob
+os
ovfl
pcoll
pdf
@@ -238,6 +244,7 @@ printvalue
priv
pthread
pthreads
+py
qnx
rdbms
rdlock
@@ -270,8 +277,10 @@ spinlocks
sql
src
startsync
+statlog
str
strerror
+strftime
struct
structs
subdatabases
diff --git a/src/docs/statistics.dox b/src/docs/statistics.dox
new file mode 100644
index 00000000000..0258ae2d59d
--- /dev/null
+++ b/src/docs/statistics.dox
@@ -0,0 +1,74 @@
+/*! @page statistics Statistics
+
+WiredTiger can be configured to maintain a variety of run-time
+statistics. The \c statistics configuration boolean must be set for
+statistics to be maintained; see @ref data_statistics for information
+about accessing the statistics. The following example configures
+WiredTiger to maintain statistics:
+
+@snippet ex_all.c Statistics configuration
+
+Note that maintaining statistics involves updating shared-memory data
+structures and may decrease application performance.
+
+@section statistics_log Statistics logging
+WiredTiger will optionally log the current database statistics into a
+file when the \c statistics_log.log configuration string of
+the ::wiredtiger_open function is set.
+
+The following example logs statistics every 30 seconds:
+
+@snippet ex_all.c Statistics logging
+
+Each record is formatted as a space-separated timestamp, unsigned 64-bit
+value and a variable length string which describes the statistic.
+
+The timestamp format may be changed with the \c statistics_log.timestamp
+configuration string. The \c timestamp value may contain ISO C90 standard
+strftime conversion specifications.
+
+By default, only the system's connection statistics are logged, but
+statistics may be optionally reported for underlying objects by adding
+a list of URIs to the \c statistics_log configuration string:
+
+@snippet ex_all.c Statistics logging with objects
+
+When database statistics are logged, the database home will be the first
+space-separated entry for each record in the log file. For example:
+
+@code
+Mar 08 11:38:23 463 /database/home pthread mutex condition wait calls
+Mar 08 11:38:23 0 /database/home files currently open
+Mar 08 11:38:23 1855437 /database/home total heap memory allocations
+Mar 08 11:38:23 1856622 /database/home total heap memory frees
+Mar 08 11:38:23 1 /database/home total heap memory re-allocations
+Mar 08 11:38:23 472 /database/home total read I/Os
+@endcode
+
+When object statistics are logged, the object URI will be the first
+space-separated entry for each record in the log file. For example:
+
+@code
+Mar 20 10:42:36 21 table:mytable compressed pages written
+Mar 20 10:42:36 0 table:mytable page written failed to compress
+Mar 20 10:42:36 5 table:mytable page written was too small to compress
+Mar 20 10:42:36 586 table:mytable cursor insert calls
+Mar 20 10:42:36 0 table:mytable bulk-loaded cursor-insert calls
+@endcode
+
+The location of the log files may be changed with the \c statistics_log.path
+configuration string. The \c path value may contain ISO C90 standard
+strftime conversion specifications. WiredTiger will not create non-existent
+directories in the path; they must exist before ::wiredtiger_open is called.
+
+The following example logs statistics into files named with the month,
+day and year:
+
+@snippet ex_all.c Statistics logging with path
+
+A Python script that parses the default logging output and uses the
+<a href="http://www.gnuplot.info/">gnuplot</a> utility to generate
+Portable Network Graphics (PNG) format graphs is included in the
+WiredTiger distribution in the file \c tools/statlog.py.
+
+*/
diff --git a/src/docs/style/DoxygenLayout.xml b/src/docs/style/DoxygenLayout.xml
index 1d7363f48db..fba7c189509 100644
--- a/src/docs/style/DoxygenLayout.xml
+++ b/src/docs/style/DoxygenLayout.xml
@@ -19,6 +19,7 @@
<tab type="globals" visible="yes" title="" intro=""/>
</tab>
<tab type="examples" visible="yes" title="" intro=""/>
+ <tab type="user" url="community.html" visible="yes" title="Community"/>
<tab type="user" url="license.html" visible="yes" title="License"/>
</navindex>
diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox
index 49e6b2f75ff..9293c21fc67 100644
--- a/src/docs/top/main.dox
+++ b/src/docs/top/main.dox
@@ -6,7 +6,10 @@ WiredTiger is an high performance, scalable, production quality, NoSQL,
@section releases Releases
<table>
-@row{<b>WiredTiger 1.4.2</b> (current),
+@row{<b>WiredTiger 1.5.0</b> (current),
+ <a href="releases/wiredtiger-1.5.0.tar.bz2"><b>[Release package]</b></a>,
+ <a href="1.5.0/index.html"><b>[Documentation]</b></a>}
+@row{<b>WiredTiger 1.4.2</b>,
<a href="releases/wiredtiger-1.4.2.tar.bz2"><b>[Release package]</b></a>,
<a href="1.4.2/index.html"><b>[Documentation]</b></a>}
@row{<b>Development branch</b>,
diff --git a/src/docs/tuning.dox b/src/docs/tuning.dox
index 202256b6ce5..ebfbd4d57a1 100644
--- a/src/docs/tuning.dox
+++ b/src/docs/tuning.dox
@@ -1,6 +1,8 @@
/*! @page tuning Performance Tuning
-@section tuning_cache_size Cache size
+@section tuning_cache WiredTiger's cache
+
+@subsection tuning_cache_size Cache size
The cache size for the database is configurable by setting the \c
cache_size configuration string when calling the ::wiredtiger_open
@@ -13,18 +15,7 @@ An example of setting a cache size to 500MB:
@snippet ex_config.c configure cache size
-@section tuning_memory_allocation Memory allocation
-
-The performance of heavily-threaded WiredTiger applications can be
-dominated by memory allocation because the WiredTiger engine has to free
-and re-allocate memory as part of many queries. Replacing the system's
-malloc implementation with one that has better threaded performance (for
-example, Google's
-<a href="http://goog-perftools.sourceforge.net/doc/tcmalloc.html">tcmalloc</a>,
-or <a href="http://www.canonware.com/jemalloc">jemalloc</a>),
-can dramatically improve throughput.
-
-@section tuning_read_only_objects Read-only objects
+@subsection tuning_read_only_objects Read-only objects
Cursors opened on checkpoints (either named, or using the special "last
checkpoint" name "WiredTigerCheckpoint") are read-only objects. Unless
@@ -45,7 +36,7 @@ string "checkpoint" with the name "WiredTigerCheckpoint" to the
WT_SESSION::open_cursor method:
@snippet ex_all.c open the default checkpoint
-@section tuning_cache_resident Cache resident objects
+@subsection tuning_cache_resident Cache resident objects
Cache resident objects (objects never considered for the purposes of
cache eviction), can be configured with the WT_SESSION::create
@@ -63,6 +54,36 @@ An example of configuring a cache-resident object:
@snippet ex_all.c Create a cache-resident object
+@section tuning_memory_allocator Memory allocator
+
+The performance of heavily-threaded WiredTiger applications can be
+dominated by memory allocation because the WiredTiger engine has to free
+and re-allocate memory as part of many queries. Replacing the system's
+malloc implementation with one that has better threaded performance (for
+example, Google's
+<a href="http://goog-perftools.sourceforge.net/doc/tcmalloc.html">tcmalloc</a>,
+or <a href="http://www.canonware.com/jemalloc">jemalloc</a>),
+can dramatically improve throughput.
+
+@section tuning_cursor_persistence Cursor persistence
+
+Opening a new cursor is a relatively expensive operation in WiredTiger
+(especially in table objects and Log-Structured Merge (LSM) trees,
+where a logical cursor may require multiple, underlying object cursors),
+and caching cursors can improve performance. On the other hand, cursors
+hold positions in objects, and therefore long-lived cursor positions can
+decrease performance. The best combination is to cache cursors, but use
+the WT_CURSOR::reset method to discard the cursor's position in the
+object when the position is no longer needed.
+
+Additionally, cursors are automatically reset whenever a transaction
+boundary is crossed: when a transaction is started with
+WT_SESSION::begin_transaction or ended with either
+WT_SESSION::commit_transaction or WT_SESSION::rollback_transaction, all
+open cursors are automatically reset. There is no need to call the
+WT_CURSOR::reset method explicitly, and the cursor can be immediately
+reused.
+
@section tuning_page_size Page and overflow sizes
There are four page and item size configuration values: \c internal_page_max,
@@ -140,6 +161,82 @@ An example of configuring page sizes:
@snippet ex_file.c file create
+@section tuning_system_buffer_cache System buffer cache
+
+@subsection tuning_system_buffer_cache_direct_io Direct I/O
+
+WiredTiger optionally supports direct I/O. Configuring direct I/O may
+be useful for applications wanting to:
+- minimize the operating system cache effects of I/O to and from
+WiredTiger's buffer cache,
+- avoid double-buffering of blocks in WiredTiger's cache and the
+operating system buffer cache, and
+- avoid stalling underlying solid-state drives by writing a large number
+of dirty blocks.
+
+Direct I/O is configured using the "direct_io" configuration string to
+the ::wiredtiger_open function. An example of configuring direct I/O
+for WiredTiger's data files:
+
+@snippet ex_all.c Configure direct_io for data files
+
+Direct I/O implies a writing thread waits for the write to complete
+(which is a slower operation than writing into the system buffer cache),
+and configuring direct I/O is likely to decrease overall application
+performance.
+
+Direct I/O is based on the non-standard \c O_DIRECT flag to the POSIX
+1003.1 open system call and may not be available on all platforms.
+
+@subsection tuning_system_buffer_cache_os_cache_dirty_max os_cache_dirty_max
+
+As well as direct I/O, WiredTiger supports two additional configuration
+options related to the system buffer cache:
+
+The first is \c os_cache_dirty_max, the maximum dirty bytes an object
+is allowed to have in the system buffer cache. Once this many bytes
+from an object are written into the system buffer cache, WiredTiger will
+attempt to schedule writes for all of the dirty blocks the object has
+in the system buffer cache. This configuration option allows
+applications to flush dirty blocks from the object, avoiding stalling
+any underlying drives when the object is subsequently flushed to disk
+as part of a durability operation.
+
+An example of configuring \c os_cache_dirty_max:
+
+@snippet ex_all.c os_cache_dirty_max configuration
+
+The \c os_cache_dirty_max configuration may not be used in combination
+with direct I/O.
+
+The \c os_cache_dirty_max configuration is based on the non-standard
+Linux \c sync_file_range system call and may not be available on all
+platforms.
+
+@subsection tuning_system_buffer_cache_os_cache_max os_cache_max
+
+The second configuration option related to the system buffer cache is
+\c os_cache_max, the maximum bytes an object is allowed to have in the
+system buffer cache. Once this many bytes from an object are either
+read into or written from the system buffer cache, WiredTiger will
+attempt to evict all of the object's blocks from the buffer cache. This
+configuration option allows applications to evict blocks from the system
+buffer cache to limit double-buffering and system buffer cache overhead.
+
+An example of configuring \c os_cache_max:
+
+@snippet ex_all.c os_cache_max configuration
+
+The \c os_cache_max configuration may not be used in combination with
+direct I/O.
+
+The \c os_cache_max configuration is based on the POSIX 1003.1 standard
+\c posix_fadvise system call and may not be available on all platforms.
+
+Configuring direct I/O, \c os_cache_dirty_max or \c os_cache_max all
+have the side effect of turning off memory-mapping of objects in
+WiredTiger.
+
@section tuning_checksums Checksums
WiredTiger checksums file reads and writes, by default. In read-only
@@ -158,45 +255,41 @@ blocks which are not compressed:
@snippet ex_all.c Configure checksums to uncompressed
-@section tuning_direct_io Direct I/O
-
-WiredTiger optionally supports direct I/O, based on the non-standard \c
-O_DIRECT flag to the POSIX 1003.1 open system call. Configuring direct
-I/O may be useful for applications wanting to minimize the operating
-system cache effects of I/O to and from WiredTiger's buffer cache.
-
-Direct I/O is configured using the "direct_io" configuration string to
-the ::wiredtiger_open function. An example of configuring direct I/O
-for WiredTiger's data files:
-
-@snippet ex_all.c Configure direct_io for data files
-
@section tuning_compression Compression
-WiredTiger configures key prefix compression for row-store objects, and
-column-store compression for both row-store and column-store objects,
-by default.
-These forms of compression minimize in-memory and on-disk space, but at
-some CPU cost when rows are read and written. Turning these forms of
-compression off may increase application throughput.
+WiredTiger configures key prefix compression for row-store objects by
+default. Additional forms of compression for both row- and column-store
+objects, including dictionary and block compression, and Huffman
+encoding, are optional. Compression minimizes in-memory and on-disk
+resource requirements and decreases the amount of I/O, at some CPU cost
+when rows are read and written.
+
+Configuring compression on or off may change application throughput.
+For example, in applications using solid-state drives (where I/O is less
+expensive), turning off compression may increase application performance
+by reducing CPU costs; in applications where I/O costs are more
+expensive, turning on compression may increase application performance
+by reducing the overall number of I/O operations.
For example, turning off row-store key prefix compression:
@snippet ex_all.c Configure key prefix compression off
-For example, turning off row-store or column-store dictionary compression:
+For example, turning on row-store or column-store dictionary compression:
-@snippet ex_all.c Configure dictionary compression off
+@snippet ex_all.c Configure dictionary compression on
-WiredTiger does not configure Huffman encoding or block compression by
-default, but these forms of compression can also impact overall
-throughput. See @ref file_formats_compression for more information.
+See @ref file_formats_compression for more information.
@section tuning_statistics Performance monitoring with statistics
-WiredTiger maintains a variety of statistics that can be read with a
-cursor. See @ref data_statistics for general information about accessing
-statistics.
+WiredTiger optionally maintains a variety of statistics, when the
+\c statistics configuration string is specified to ::wiredtiger_open;
+see @ref statistics for general information about statistics, and
+@ref data_statistics for information about accessing the statistics.
+
+Note that maintaining run-time statistics involves updating
+shared-memory data structures and may decrease application performance.
The statistics gathered by WiredTiger can be combined to derive information
about the system's behavior. For example, a cursor can be opened on the
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index 5aab6284a76..c9ae4b62099 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -1,5 +1,16 @@
/*! @page upgrading Upgrading WiredTiger applications
+@section version_143 Upgrading to Version 1.4.3
+<dl>
+<dt>Statistics</dt>
+<dd>
+WiredTiger statistics are no longer maintained by default; to configure
+statistics, use the \c statistics configuration string to the
+::wiredtiger_open function.
+</dd>
+
+</dl>
+<hr>
@section version_139 Upgrading to Version 1.3.9
<dl>
diff --git a/src/include/api.h b/src/include/api.h
index 6e7534af908..e0943fe4d94 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -74,7 +74,7 @@
(ret) != WT_DUPLICATE_KEY) \
F_SET(&(s)->txn, TXN_ERROR); \
break; \
-} while (1)
+} while (ret == 0)
/*
* If a session or connection method is about to return WT_NOTFOUND (some
diff --git a/src/include/block.h b/src/include/block.h
index bb86d799349..0d4f3275a4c 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -159,7 +159,8 @@ struct __wt_bm {
(WT_BM *, WT_SESSION_IMPL *, uint8_t *, uint32_t *, int *);
int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
int (*salvage_valid)(WT_BM *, WT_SESSION_IMPL *, uint8_t *, uint32_t);
- int (*stat)(WT_BM *, WT_SESSION_IMPL *);
+ int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
+ int (*sync)(WT_BM *, WT_SESSION_IMPL *);
int (*verify_addr)
(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, uint32_t);
int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
@@ -194,7 +195,12 @@ struct __wt_block {
/* Configuration information, set when the file is opened. */
uint32_t allocsize; /* Allocation size */
- u_int block_header; /* Header length */
+ u_int block_header; /* Header length */
+
+ int64_t os_cache; /* System buffer cache flush max */
+ int64_t os_cache_max;
+ int64_t os_cache_dirty; /* System buffer cache write max */
+ int64_t os_cache_dirty_max;
/*
* There is only a single checkpoint in a file that can be written. The
@@ -206,6 +212,13 @@ struct __wt_block {
WT_SPINLOCK live_lock; /* Live checkpoint lock */
WT_BLOCK_CKPT live; /* Live checkpoint */
+ /*
+ * Array of free WT_EXTLIST structures, if we're doing lots of I/O,
+ * a cache avoids an allocation/free while holding the spin lock.
+ */
+ WT_EXT *free_ext; /* List of free entries */
+ u_int free_ext_cnt; /* Limit the number we cache */
+
/* Salvage support */
off_t slvg_off; /* Salvage file offset */
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 8f2ec8e8fa9..d6b2deb6255 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -133,6 +133,14 @@ struct __wt_page_modify {
*/
uint32_t disk_gen;
+ /*
+ * Track the highest transaction ID at which the page was written to
+ * disk. This can be used to avoid trying to write the page multiple
+ * times if a snapshot is keeping old versions pinned (e.g., in a
+ * checkpoint).
+ */
+ wt_txnid_t disk_txn;
+
union {
WT_PAGE *split; /* Resulting split */
WT_ADDR replace; /* Resulting replacement */
@@ -224,7 +232,7 @@ struct __wt_page {
*/
struct {
uint64_t recno; /* Starting recno */
- WT_REF *t; /* Subtree */
+ WT_REF *t; /* Subtree */
} intl;
/* Row-store leaf page. */
@@ -237,10 +245,9 @@ struct __wt_page {
* WT_PAGE structure as small as possible for read-only
* pages. For consistency, we could move the row-store
* modification structures into WT_PAGE_MODIFY too, but
- * it doesn't shrink WT_PAGE any further, and avoiding
- * ugly naming in WT_PAGE_MODIFY to avoid growing it
- * won't be pretty. So far, avoiding ugly naming has
- * overridden consistency.
+ * that doesn't shrink WT_PAGE any further and it would
+ * require really ugly naming inside of WT_PAGE_MODIFY
+ * to avoid growing that structure.
*/
WT_INSERT_HEAD **ins; /* Inserts */
WT_UPDATE **upd; /* Updates */
@@ -274,17 +281,29 @@ struct __wt_page {
WT_PAGE_MODIFY *modify;
/*
- * The read generation is incremented each time the page is searched,
- * and acts as an LRU value for each page in the tree; it is read by
- * the eviction server thread to select pages to be discarded from the
- * in-memory tree.
+ * The page's read generation acts as an LRU value for each page in the
+ * tree; it is used by the eviction server thread to select pages to be
+ * discarded from the in-memory tree.
*
- * The read generation is a 64-bit value; incremented every time the
- * page is searched, a 32-bit value could overflow.
+ * The read generation is a 64-bit value, if incremented frequently, a
+ * 32-bit value could overflow.
*
- * The read-generation is not declared volatile: read-generation is set
- * a lot (on every access), and we don't want to write it that much.
+ * The read generation is a piece of shared memory potentially accessed
+ * by many threads. We don't want to update page read generations for
+ * in-cache workloads and suffer the cache misses, so we don't simply
+ * increment the read generation value on every access. Instead, the
+ * read generation is initialized to 0, then set to a real value if the
+ * page is ever considered for eviction. Once set to a real value, the
+ * read generation is potentially incremented every time the page is
+ * accessed. To try and avoid incrementing the page at a fast rate in
+ * this case, the read generation is incremented to a future point.
+ *
+ * The read generation is not declared volatile or published: the read
+ * generation is set a lot, and we don't want to write it that much.
*/
+#define WT_READ_GEN_NOTSET 0
+#define WT_READ_GEN_OLDEST 1
+#define WT_READ_GEN_STEP 1000
uint64_t read_gen;
/*
@@ -420,6 +439,28 @@ struct __wt_ref {
(ref) = (page)->u.intl.t; (i) > 0; ++(ref), --(i))
/*
+ * WT_LINK_PAGE --
+ * Link a child page into a reference in its parent.
+ */
+#define WT_LINK_PAGE(ppage, pref, cpage) do { \
+ (pref)->page = (cpage); \
+ (cpage)->parent = (ppage); \
+ (cpage)->ref = (pref); \
+} while (0)
+
+/*
+ * WT_MERGE_STACK_MIN --
+ * When stacks of in-memory pages become this deep, they are considered for
+ * merging.
+ *
+ * WT_MERGE_FULL_PAGE --
+ * When the result of a merge contains more than this number of keys, it is
+ * considered "done" and will not be merged again.
+ */
+#define WT_MERGE_STACK_MIN 3
+#define WT_MERGE_FULL_PAGE 100
+
+/*
* WT_ROW --
* Each in-memory page row-store leaf page has an array of WT_ROW structures:
* this is created from on-page data when a page is read from the file. It's
diff --git a/src/include/btree.h b/src/include/btree.h
index 4eff4851348..49b814266ba 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -9,12 +9,9 @@
#define WT_BTREE_MINOR_VERSION 0
/*
- * The minimum btree leaf and internal page sizes are 512B, the maximum 512MB.
- * (The maximum of 512MB is enforced by the software, it could be set as high
- * as 4GB.)
+ * The maximum btree leaf and internal page size is 512MB. (The maximum of
+ * 512MB is enforced by the software, it could be set as high as 4GB.)
*/
-#define WT_BTREE_ALLOCATION_SIZE_MIN 512
-#define WT_BTREE_ALLOCATION_SIZE_MAX (128 * WT_MEGABYTE)
#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE)
/*
@@ -40,13 +37,6 @@
#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */
/*
- * Split page size calculation -- we don't want to repeatedly split every time
- * a new entry is added, so we split to a smaller-than-maximum page size.
- */
-#define WT_SPLIT_PAGE_SIZE(pagesize, allocsize, pct) \
- WT_ALIGN32(((uintmax_t)(pagesize) * (pct)) / 100, allocsize)
-
-/*
* XXX
* The server threads use their own WT_SESSION_IMPL handles because they may
* want to block (for example, the eviction server calls reconciliation, and
@@ -70,12 +60,12 @@ struct __wt_data_handle {
const char *name; /* Object name as a URI */
const char *checkpoint; /* Checkpoint name (or NULL) */
- const char *config; /* Configuration string */
+ const char **cfg; /* Configuration information */
WT_DATA_SOURCE *dsrc; /* Data source for this handle */
void *handle; /* Generic handle */
- WT_DSRC_STATS *stats; /* Data source statistics */
+ WT_DSRC_STATS stats; /* Data-source statistics */
/* Flags values over 0xff are reserved for WT_BTREE_* */
#define WT_DHANDLE_DISCARD 0x01 /* Discard on release */
diff --git a/src/include/btree.i b/src/include/btree.i
index a04777169ee..c3c1c720fcb 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -20,23 +20,35 @@ __wt_page_is_modified(WT_PAGE *page)
* __wt_eviction_page_force --
* Add a page for forced eviction if it matches the criteria.
*/
-static inline int
+static inline void
__wt_eviction_page_force(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
btree = S2BT(session);
- if (btree != NULL && !F_ISSET(btree, WT_BTREE_NO_EVICTION) &&
+ /*
+ * Ignore internal pages (check read-only information first to the
+ * extent possible, this is shared data).
+ */
+ if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT)
+ return;
+
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) &&
__wt_page_is_modified(page) &&
- page->type != WT_PAGE_ROW_INT && page->type != WT_PAGE_COL_INT &&
page->memory_footprint > btree->maxmempage)
- return (__wt_evict_forced_page(session, page));
-
- return (0);
+ __wt_evict_forced_page(session, page);
}
/*
+ * Estimate the per-allocation overhead. All implementations of malloc / free
+ * have some kind of header and pad for alignment. We can't know for sure what
+ * that adds up to, but this is an estimate based on some measurements of heap
+ * size versus bytes in use.
+ */
+#define WT_ALLOC_OVERHEAD 32
+
+/*
* __wt_cache_page_inmem_incr --
* Increment a page's memory footprint in the cache.
*/
@@ -45,6 +57,8 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
{
WT_CACHE *cache;
+ size += WT_ALLOC_OVERHEAD;
+
cache = S2C(session)->cache;
(void)WT_ATOMIC_ADD(cache->bytes_inmem, size);
(void)WT_ATOMIC_ADD(page->memory_footprint, WT_STORE_SIZE(size));
@@ -61,6 +75,8 @@ __wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
{
WT_CACHE *cache;
+ size += WT_ALLOC_OVERHEAD;
+
cache = S2C(session)->cache;
(void)WT_ATOMIC_SUB(cache->bytes_inmem, size);
(void)WT_ATOMIC_SUB(page->memory_footprint, WT_STORE_SIZE(size));
@@ -96,31 +112,6 @@ __wt_cache_dirty_decr(WT_SESSION_IMPL *session, size_t size)
}
/*
- * __wt_cache_page_read --
- * Read pages into the cache.
- */
-static inline void
-__wt_cache_page_read(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
-{
- WT_CACHE *cache;
-
- cache = S2C(session)->cache;
- WT_ASSERT(session, size != 0);
- (void)WT_ATOMIC_ADD(cache->pages_read, 1);
- (void)WT_ATOMIC_ADD(cache->bytes_read, size);
- (void)WT_ATOMIC_ADD(page->memory_footprint, WT_STORE_SIZE(size));
-
- /*
- * It's unusual, but possible, that the page is already dirty.
- * For example, when reading an in-memory page with references to
- * deleted leaf pages, the internal page may be marked dirty. If so,
- * update the total bytes dirty here.
- */
- if (__wt_page_is_modified(page))
- (void)WT_ATOMIC_ADD(cache->bytes_dirty, size);
-}
-
-/*
* __wt_cache_page_evict --
* Evict pages from the cache.
*/
@@ -130,7 +121,9 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_CACHE *cache;
cache = S2C(session)->cache;
+
WT_ASSERT(session, page->memory_footprint != 0);
+
(void)WT_ATOMIC_ADD(cache->pages_evict, 1);
(void)WT_ATOMIC_ADD(cache->bytes_evict, page->memory_footprint);
@@ -140,7 +133,22 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
static inline uint64_t
__wt_cache_read_gen(WT_SESSION_IMPL *session)
{
- return (++S2C(session)->cache->read_gen);
+ return (S2C(session)->cache->read_gen);
+}
+
+static inline uint64_t
+__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
+{
+ /*
+ * We return read-generations from the future (where "the future" is
+ * measured by increments of the global read generation). The reason
+ * is because when acquiring a new hazard reference on a page, we can
+ * check its read generation, and if the read generation isn't less
+ * than the current global generation, we don't bother updating the
+ * page. In other words, the goal is to avoid some number of updates
+ * immediately after each update we have to make.
+ */
+ return (++S2C(session)->cache->read_gen + WT_READ_GEN_STEP);
}
/*
@@ -158,7 +166,7 @@ __wt_cache_pages_inuse(WT_CACHE *cache)
* (although "interesting" corruption is vanishingly unlikely, these
* values just increment over time).
*/
- pages_in = cache->pages_read;
+ pages_in = cache->pages_inmem;
pages_out = cache->pages_evict;
return (pages_in > pages_out ? pages_in - pages_out : 0);
}
@@ -178,7 +186,7 @@ __wt_cache_bytes_inuse(WT_CACHE *cache)
* (although "interesting" corruption is vanishingly unlikely, these
* values just increment over time).
*/
- bytes_in = cache->bytes_read + cache->bytes_inmem;
+ bytes_in = cache->bytes_inmem;
bytes_out = cache->bytes_evict;
return (bytes_in > bytes_out ? bytes_in - bytes_out : 0);
}
@@ -219,9 +227,13 @@ __wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* Multiple threads of control may be searching and deciding to modify
- * a page, if we don't do the update, discard the memory.
+ * a page. If our modify structure is used, update the page's memory
+ * footprint, else discard the modify structure, another thread did the
+ * work.
*/
- if (!WT_ATOMIC_CAS(page->modify, NULL, modify))
+ if (WT_ATOMIC_CAS(page->modify, NULL, modify))
+ __wt_cache_page_inmem_incr(session, page, sizeof(*modify));
+ else
__wt_free(session, modify);
return (0);
}
@@ -237,7 +249,15 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
(void)WT_ATOMIC_ADD(S2C(session)->cache->pages_dirty, 1);
(void)WT_ATOMIC_ADD(
S2C(session)->cache->bytes_dirty, page->memory_footprint);
+
+ /*
+ * The page can never end up with changes older than the oldest
+ * running transaction.
+ */
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ page->modify->disk_txn = session->txn.snap_min - 1;
}
+
/*
* Publish: there must be a barrier to ensure all changes to the page
* are flushed before we update the page's write generation, otherwise
@@ -475,7 +495,7 @@ __wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* No lock is required because the session array is fixed size, but it
- * it may contain inactive entries. We must review any active session
+ * may contain inactive entries. We must review any active session
* that might contain a hazard pointer, so insert a barrier before
* reading the active session count. That way, no matter what sessions
* come or go, we'll check the slots for all of the sessions that could
@@ -509,6 +529,36 @@ __wt_skip_choose_depth(void)
}
/*
+ * __wt_btree_size_overflow --
+ * Check if the size of an in-memory tree with a single leaf page is
+ * over a specified maximum. If called on anything other than a simple
+ * tree with a single leaf page, returns true so the calling code will
+ * switch to a new tree.
+ */
+static inline int
+__wt_btree_size_overflow(WT_SESSION_IMPL *session, uint32_t maxsize)
+{
+ WT_BTREE *btree;
+ WT_PAGE *child, *root;
+
+ /*
+ * S2BT may evaluate to NULL (no data handle on the session), so the
+ * handle has to be checked before the root page is read from it.
+ * Without a tree, a root page or an in-memory child there is nothing
+ * to measure: report "not over the limit".
+ */
+ btree = S2BT(session);
+ if (btree == NULL || (root = btree->root_page) == NULL ||
+ (child = root->u.intl.t->page) == NULL)
+ return (0);
+
+ /* Make sure this is a simple tree, or LSM should switch. */
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
+ root->entries != 1 ||
+ root->u.intl.t->state != WT_REF_MEM ||
+ child->type != WT_PAGE_ROW_LEAF)
+ return (1);
+
+ /* Simple tree: compare the single leaf's footprint to the limit. */
+ return (child->memory_footprint > maxsize);
+}
+
+/*
* __wt_btree_lex_compare --
* Lexicographic comparison routine.
*
@@ -545,3 +595,18 @@ __wt_btree_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item)
(((cmp) = __wt_btree_lex_compare((k1), (k2))), 0) : \
(bt)->collator->compare((bt)->collator, &(s)->iface, \
(k1), (k2), &(cmp)))
+
+/*
+ * __wt_btree_mergeable --
+ * Determines whether the given page is a candidate for merging.
+ */
+static inline int
+__wt_btree_mergeable(WT_PAGE *page)
+{
+ /* The root page itself is never a merge candidate. */
+ if (WT_PAGE_IS_ROOT(page))
+ return (0);
+
+ /* The page's modify structure must carry the split-merge flag. */
+ if (page->modify == NULL)
+ return (0);
+ if (!F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE))
+ return (0);
+
+ /* Pages whose parent is the root are excluded as well. */
+ return (WT_PAGE_IS_ROOT(page->parent) ? 0 : 1);
+}
diff --git a/src/include/cache.h b/src/include/cache.h
index ce88994b733..a9d2af5dc61 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -25,9 +25,8 @@ struct __wt_cache {
* be exact, they can't be garbage, we track what comes in and what goes
* out and calculate the difference as needed.
*/
- uint64_t bytes_read; /* Bytes/pages read by read server */
- uint64_t pages_read;
- uint64_t bytes_inmem; /* Bytes/pages created in memory */
+ uint64_t bytes_inmem; /* Bytes/pages in memory */
+ uint64_t pages_inmem;
uint64_t bytes_evict; /* Bytes/pages discarded by eviction */
uint64_t pages_evict;
uint64_t bytes_dirty; /* Bytes/pages currently dirty */
@@ -54,7 +53,6 @@ struct __wt_cache {
*/
WT_EVICT_ENTRY *evict; /* LRU pages being tracked */
WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */
- size_t evict_allocated; /* LRU list bytes allocated */
uint32_t evict_entries; /* LRU list eviction slots */
uint32_t evict_candidates; /* LRU list pages to evict */
u_int evict_file_next; /* LRU: next file to search */
@@ -77,6 +75,8 @@ struct __wt_cache {
* Flags.
*/
#define WT_EVICT_FORCE_PASS 0x01 /* Ignore the eviction trigger */
+#define WT_EVICT_NO_PROGRESS 0x02 /* Check if pages are being evicted */
+#define WT_EVICT_STUCK 0x04 /* Eviction server is stuck */
uint32_t flags;
};
diff --git a/src/include/cache.i b/src/include/cache.i
index 1c370655fc4..bf29f728181 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -63,7 +63,8 @@ __wt_cache_full_check(WT_SESSION_IMPL *session)
if (!lockout || F_ISSET(session,
WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED))
return (0);
- if (F_ISSET(btree, WT_BTREE_BULK | WT_BTREE_NO_EVICTION))
+ if (btree != NULL &&
+ F_ISSET(btree, WT_BTREE_BULK | WT_BTREE_NO_EVICTION))
return (0);
if ((ret = __wt_evict_lru_page(session, 1)) == EBUSY)
__wt_yield();
diff --git a/src/include/cell.i b/src/include/cell.i
index 1ef5059536a..637ed50de4d 100644
--- a/src/include/cell.i
+++ b/src/include/cell.i
@@ -448,11 +448,13 @@ __wt_cell_type_raw(WT_CELL *cell)
static inline int
__wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
{
- WT_DECL_RET;
uint64_t v;
const uint8_t *p;
uint32_t saved_len;
uint64_t saved_v;
+ int copied;
+
+ copied = 0;
/*
* The verification code specifies an end argument, a pointer to 1 past
@@ -467,7 +469,8 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
return (WT_ERROR); \
} while (0)
- memset(unpack, 0, sizeof(*unpack));
+restart:
+ WT_CLEAR(*unpack);
unpack->cell = cell;
/*
@@ -560,11 +563,8 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
saved_len = WT_PTRDIFF32(p, cell);
saved_v = unpack->v;
cell = (WT_CELL *)((uint8_t *)cell - v);
- ret = __wt_cell_unpack_safe(cell, unpack, end);
- unpack->raw = WT_CELL_VALUE_COPY;
- unpack->__len = saved_len;
- unpack->v = saved_v;
- return (ret);
+ copied = 1;
+ goto restart;
case WT_CELL_KEY_OVFL:
case WT_CELL_VALUE_OVFL:
@@ -600,6 +600,11 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
* we need the right length).
*/
done: CHK(cell, unpack->__len);
+ if (copied) {
+ unpack->raw = WT_CELL_VALUE_COPY;
+ unpack->__len = saved_len;
+ unpack->v = saved_v;
+ }
return (0);
}
diff --git a/src/include/config.h b/src/include/config.h
index 7d5e64e715a..cfbebcb239f 100644
--- a/src/include/config.h
+++ b/src/include/config.h
@@ -19,7 +19,7 @@ struct __wt_config_item {
const char *str;
size_t len;
int64_t val;
- enum { ITEM_STRING, ITEM_ID, ITEM_NUM, ITEM_STRUCT } type;
+ enum { ITEM_STRING, ITEM_BOOL, ITEM_ID, ITEM_NUM, ITEM_STRUCT } type;
};
struct __wt_config_check {
diff --git a/src/include/connection.h b/src/include/connection.h
index 21a40238ff0..8fb137802a0 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -76,8 +76,6 @@ struct __wt_connection_impl {
WT_SPINLOCK schema_lock; /* Schema operation spinlock */
WT_SPINLOCK serial_lock; /* Serial function call spinlock */
- int ckpt_backup; /* Backup: don't delete checkpoints */
-
/* Connection queue */
TAILQ_ENTRY(__wt_connection_impl) q;
/* Cache pool queue */
@@ -86,9 +84,12 @@ struct __wt_connection_impl {
const char *home; /* Database home */
int is_new; /* Connection created database */
+ int connection_initialized; /* Connection is initialized */
+
WT_FH *lock_fh; /* Lock file handle */
- pthread_t cache_evict_tid; /* Cache eviction server thread ID */
+ pthread_t cache_evict_tid; /* Eviction server thread ID */
+ int cache_evict_tid_set; /* Eviction server thread ID set */
/* Locked: data handle list */
TAILQ_HEAD(__wt_dhandle_qh, __wt_data_handle) dhqh;
@@ -129,9 +130,28 @@ struct __wt_connection_impl {
WT_CACHE *cache; /* Page cache */
uint64_t cache_size;
- WT_TXN_GLOBAL txn_global; /* Global transaction state. */
+ WT_TXN_GLOBAL txn_global; /* Global transaction state */
- WT_CONNECTION_STATS *stats; /* Connection statistics */
+ int ckpt_backup; /* Backup: don't delete checkpoints */
+
+ WT_SESSION_IMPL *ckpt_session; /* Checkpoint thread session */
+ pthread_t ckpt_tid; /* Checkpoint thread */
+ int ckpt_tid_set; /* Checkpoint thread set */
+ WT_CONDVAR *ckpt_cond; /* Checkpoint wait mutex */
+ const char *ckpt_config; /* Checkpoint configuration */
+ long ckpt_usecs; /* Checkpoint period */
+
+ WT_CONNECTION_STATS stats; /* Connection statistics */
+ int statistics; /* Global statistics configuration */
+ WT_SESSION_IMPL *stat_session; /* Statistics log session */
+ pthread_t stat_tid; /* Statistics log thread */
+ int stat_tid_set; /* Statistics log thread set */
+ WT_CONDVAR *stat_cond; /* Statistics log wait mutex */
+ int stat_clear; /* Statistics log clear */
+ const char *stat_path; /* Statistics log path format */
+ char **stat_sources; /* Statistics log list of objects */
+ const char *stat_stamp; /* Statistics log timestamp format */
+ long stat_usecs; /* Statistics log period */
WT_FH *log_fh; /* Logging file handle */
@@ -150,6 +170,8 @@ struct __wt_connection_impl {
/* If non-zero, all buffers used for I/O will be aligned to this. */
size_t buffer_alignment;
+ uint32_t schema_gen; /* Schema generation number */
+
uint32_t direct_io; /* O_DIRECT configuration */
int mmap; /* mmap configuration */
uint32_t verbose;
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 6d1cf1f4d8e..e144c19de32 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -226,7 +226,7 @@ struct __wt_cursor_stat {
uint64_t v; /* Current stats value */
WT_ITEM pv; /* Current stats value (string) */
- void (*clear_func)(WT_STATS *); /* Function to clear stats. */
+ void (*clear_func)(void *); /* Function to clear stats. */
WT_BTREE *btree; /* Pinned btree handle. */
};
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 6e289058376..2a8ed9c2dcf 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -75,18 +75,18 @@ __cursor_leave(WT_CURSOR_BTREE *cbt)
if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
WT_ASSERT(session, session->ncursors > 0);
- if (--session->ncursors == 0) {
+ if (--session->ncursors == 0)
__wt_txn_read_last(session);
-
- /*
- * We no longer have any active cursors, check if our
- * operation overflowed the cache. We don't care if we
- * fail to evict pages: our operation is done
- * regardless.
- */
- (void)__wt_cache_full_check(session);
- }
F_CLR(cbt, WT_CBT_ACTIVE);
+
+ /*
+ * If this is an autocommit operation that is just getting
+ * started, check that the cache isn't full. We may have other
+ * cursors open, but the one we just closed might help eviction
+ * make progress.
+ */
+ if (F_ISSET(&session->txn, TXN_AUTOCOMMIT))
+ WT_RET(__wt_cache_full_check(session));
}
return (0);
}
diff --git a/src/include/error.h b/src/include/error.h
index 842e7abe2ba..08efcfa1690 100644
--- a/src/include/error.h
+++ b/src/include/error.h
@@ -97,25 +97,17 @@
} while (0)
/*
- * WT_ASSERT, WT_ASSERT_ERR, WT_ASSERT_RET --
- * Assert an expression, abort in diagnostic mode, otherwise, optionally
- * return an error.
+ * WT_ASSERT
+ * Assert an expression, aborting in diagnostic mode. Otherwise,
+ * "use" the session to keep the compiler quiet and don't evaluate the
+ * expression.
*/
+#ifdef HAVE_DIAGNOSTIC
#define WT_ASSERT(session, exp) do { \
if (!(exp)) \
__wt_assert(session, 0, __FILE__, __LINE__, "%s", #exp);\
} while (0)
-#define WT_ASSERT_ERR(session, exp) do { \
- if (!(exp)) { \
- __wt_assert( \
- session, WT_ERROR, __FILE__, __LINE__, "%s", #exp); \
- WT_ERR(WT_ERROR); \
- } \
-} while (0)
-#define WT_ASSERT_RET(session, exp) do { \
- if (!(exp)) { \
- __wt_assert( \
- session, WT_ERROR, __FILE__, __LINE__, "%s", #exp); \
- return (WT_ERROR); \
- } \
-} while (0)
+#else
+#define WT_ASSERT(session, exp) \
+ WT_UNUSED(session)
+#endif
diff --git a/src/include/extern.h b/src/include/extern.h
index 902e21395a7..79057e17dae 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -58,12 +58,14 @@ extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session,
const uint8_t *addr,
uint32_t addr_size,
int *skipp);
+extern void __wt_block_ext_cleanup(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_block_misplaced(WT_SESSION_IMPL *session,
WT_BLOCK *block,
const char *tag,
off_t offset,
uint32_t size);
-extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session,
+extern int __wt_block_off_remove_overlap(WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
WT_EXTLIST *el,
off_t off,
off_t size);
@@ -85,10 +87,12 @@ extern int __wt_block_extlist_check( WT_SESSION_IMPL *session,
extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session,
WT_BLOCK *block,
WT_BLOCK_CKPT *ci);
-extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session,
+extern int __wt_block_extlist_merge( WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
WT_EXTLIST *a,
WT_EXTLIST *b);
-extern int __wt_block_insert_ext( WT_SESSION_IMPL *session,
+extern int __wt_block_insert_ext(WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
WT_EXTLIST *el,
off_t off,
off_t size);
@@ -112,9 +116,16 @@ extern int __wt_block_extlist_init(WT_SESSION_IMPL *session,
const char *name,
const char *extname);
extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
+extern int __wt_block_map( WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ void *mapp,
+ size_t *maplenp);
+extern int __wt_block_unmap( WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ void *map,
+ size_t maplen);
extern int __wt_block_manager_open(WT_SESSION_IMPL *session,
const char *filename,
- const char *config,
const char *cfg[],
int forced_salvage,
WT_BM **bmp);
@@ -124,13 +135,14 @@ extern int __wt_block_manager_create(WT_SESSION_IMPL *session,
const char *filename);
extern int __wt_block_open(WT_SESSION_IMPL *session,
const char *filename,
- const char *config,
const char *cfg[],
int forced_salvage,
WT_BLOCK **blockp);
extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh);
-extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern void __wt_block_stat(WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ WT_DSRC_STATS *stats);
extern int __wt_bm_read(WT_BM *bm,
WT_SESSION_IMPL *session,
WT_ITEM *buf,
@@ -250,16 +262,14 @@ extern int __wt_debug_page(WT_SESSION_IMPL *session,
const char *ofile);
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
extern void __wt_evict_list_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page);
-extern int __wt_evict_forced_page(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern void __wt_evict_forced_page(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_evict_server_wake(WT_SESSION_IMPL *session);
extern int __wt_sync_file_serial_func(WT_SESSION_IMPL *session, void *args);
extern void *__wt_cache_evict_server(void *arg);
extern void __wt_evict_clear_tree_walk(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_sync_file(WT_SESSION_IMPL *session, int syncop);
extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app);
-extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename);
-extern int __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename);
-extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);
extern int __wt_btree_close(WT_SESSION_IMPL *session);
extern int __wt_btree_tree_open( WT_SESSION_IMPL *session,
const uint8_t *addr,
@@ -268,13 +278,9 @@ extern int __wt_btree_leaf_create( WT_SESSION_IMPL *session,
WT_PAGE *parent,
WT_REF *ref,
WT_PAGE **pagep);
-extern int __wt_btree_get_memsize( WT_SESSION_IMPL *session,
- WT_BTREE *btree,
- uint32_t **memsizep);
-extern int __wt_btree_release_memsize(WT_SESSION_IMPL *session,
- WT_BTREE *btree);
-extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session,
- const char *config);
+extern void __wt_btree_evictable(WT_SESSION_IMPL *session, int on);
+extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize);
+extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session);
extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
extern int __wt_bt_read(WT_SESSION_IMPL *session,
WT_ITEM *buf,
@@ -313,6 +319,10 @@ __wt_page_in_func(
, const char *file, int line
#endif
);
+extern int __wt_page_alloc(WT_SESSION_IMPL *session,
+ uint8_t type,
+ uint32_t alloc_entries,
+ WT_PAGE **pagep);
extern int __wt_page_inmem( WT_SESSION_IMPL *session,
WT_PAGE *parent,
WT_REF *parent_ref,
@@ -351,6 +361,7 @@ extern int __wt_col_search(WT_SESSION_IMPL *session,
extern int __wt_rec_evict(WT_SESSION_IMPL *session,
WT_PAGE *page,
int exclusive);
+extern int __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top);
extern int __wt_rec_track(WT_SESSION_IMPL *session,
WT_PAGE *page,
const uint8_t *addr,
@@ -369,10 +380,6 @@ extern int __wt_rec_track_onpage_addr(WT_SESSION_IMPL *session,
WT_PAGE *page,
const uint8_t *addr,
uint32_t addr_size);
-extern int __wt_rec_track_onpage_ref( WT_SESSION_IMPL *session,
- WT_PAGE *page,
- WT_PAGE *refpage,
- WT_REF *ref);
extern int __wt_rec_track_ovfl_reuse( WT_SESSION_IMPL *session,
WT_PAGE *page,
const void *data,
@@ -401,7 +408,13 @@ extern int __wt_row_key_copy( WT_SESSION_IMPL *session,
WT_ROW *rip_arg,
WT_ITEM *retb);
extern WT_CELL *__wt_row_value(WT_PAGE *page, WT_ROW *rip);
-extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session,
+extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session,
+ WT_PAGE *page,
+ uint32_t cell_offset,
+ const void *key,
+ uint32_t size,
+ void *ikeyp);
+extern int __wt_row_ikey(WT_SESSION_IMPL *session,
uint32_t cell_offset,
const void *key,
uint32_t size,
@@ -450,9 +463,6 @@ extern int __wt_config_subinit( WT_SESSION_IMPL *session,
extern int __wt_config_next(WT_CONFIG *conf,
WT_CONFIG_ITEM *key,
WT_CONFIG_ITEM *value);
-extern int __wt_config_getraw( WT_CONFIG *cparser,
- WT_CONFIG_ITEM *key,
- WT_CONFIG_ITEM *value);
extern int __wt_config_get(WT_SESSION_IMPL *session,
const char **cfg,
WT_CONFIG_ITEM *key,
@@ -551,12 +561,14 @@ extern WT_CONFIG_CHECK __wt_confchk_session_verify[];
extern const char *__wt_confdfl_table_meta;
extern WT_CONFIG_CHECK __wt_confchk_table_meta[];
extern const char *__wt_confdfl_wiredtiger_open;
+extern WT_CONFIG_CHECK __wt_confchk_checkpoint_subconfigs[];
+extern WT_CONFIG_CHECK __wt_confchk_statistics_log_subconfigs[];
extern WT_CONFIG_CHECK __wt_confchk_wiredtiger_open[];
extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session);
extern int __wt_conn_btree_get(WT_SESSION_IMPL *session,
const char *name,
const char *ckpt,
- const char *cfg[],
+ const char *op_cfg[],
uint32_t flags);
extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session,
int (*func)(WT_SESSION_IMPL *,
@@ -575,18 +587,22 @@ extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session,
extern int __wt_conn_dhandle_discard(WT_CONNECTION_IMPL *conn);
extern int __wt_cache_config(WT_CONNECTION_IMPL *conn, const char *cfg[]);
extern int __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]);
-extern void __wt_cache_stats_update(WT_CONNECTION_IMPL *conn, uint32_t flags);
+extern void __wt_cache_stats_update(WT_SESSION_IMPL *session);
extern int __wt_cache_destroy(WT_CONNECTION_IMPL *conn);
extern int __wt_conn_cache_pool_config(WT_SESSION_IMPL *session,
const char **cfg);
extern int __wt_conn_cache_pool_open(WT_SESSION_IMPL *session);
extern int __wt_conn_cache_pool_destroy(WT_CONNECTION_IMPL *conn);
extern void *__wt_cache_pool_server(void *arg);
+extern int __wt_checkpoint_create(WT_CONNECTION_IMPL *conn, const char *cfg[]);
+extern int __wt_checkpoint_destroy(WT_CONNECTION_IMPL *conn);
extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]);
extern int __wt_connection_close(WT_CONNECTION_IMPL *conn);
extern void __wt_conn_stat_init(WT_SESSION_IMPL *session, uint32_t flags);
+extern int __wt_statlog_create(WT_CONNECTION_IMPL *conn, const char *cfg[]);
+extern int __wt_statlog_destroy(WT_CONNECTION_IMPL *conn);
extern int __wt_curbackup_open(WT_SESSION_IMPL *session,
const char *uri,
const char *cfg[],
@@ -950,13 +966,16 @@ extern int __wt_schema_get_table(WT_SESSION_IMPL *session,
size_t namelen,
int ok_incomplete,
WT_TABLE **tablep);
+extern void __wt_schema_release_table(WT_SESSION_IMPL *session,
+ WT_TABLE *table);
extern void __wt_schema_destroy_colgroup(WT_SESSION_IMPL *session,
WT_COLGROUP *colgroup);
extern void __wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx);
extern void __wt_schema_destroy_table(WT_SESSION_IMPL *session,
WT_TABLE *table);
-extern int __wt_schema_remove_table( WT_SESSION_IMPL *session, WT_TABLE *table);
-extern int __wt_schema_close_tables(WT_SESSION_IMPL *session);
+extern void __wt_schema_remove_table( WT_SESSION_IMPL *session,
+ WT_TABLE *table);
+extern void __wt_schema_close_tables(WT_SESSION_IMPL *session);
extern int __wt_schema_colgroup_name(WT_SESSION_IMPL *session,
WT_TABLE *table,
const char *cgname,
@@ -1117,9 +1136,14 @@ extern void __wt_assert(WT_SESSION_IMPL *session,
extern int __wt_panic(WT_SESSION_IMPL *session);
extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name);
extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri);
+extern int __wt_absolute_path(const char *path);
extern int __wt_filename(WT_SESSION_IMPL *session,
const char *name,
const char **path);
+extern int __wt_nfilename(WT_SESSION_IMPL *session,
+ const char *name,
+ size_t namelen,
+ const char **path);
extern int __wt_library_init(void);
extern int __wt_breakpoint(void);
extern void __wt_attach(WT_SESSION_IMPL *session);
@@ -1214,18 +1238,17 @@ extern void *__wt_scr_alloc_ext(WT_SESSION *wt_session, size_t size);
extern void __wt_scr_free_ext(WT_SESSION *wt_session, void *p);
extern void __wt_session_dump_all(WT_SESSION_IMPL *session);
extern void __wt_session_dump(WT_SESSION_IMPL *session);
-extern int __wt_stat_alloc_dsrc_stats(WT_SESSION_IMPL *session,
- WT_DSRC_STATS **statsp);
-extern void __wt_stat_clear_dsrc_stats(WT_STATS *stats_arg);
-extern int __wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session,
- WT_CONNECTION_STATS **statsp);
-extern void __wt_stat_clear_connection_stats(WT_STATS *stats_arg);
+extern void __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats);
+extern void __wt_stat_clear_dsrc_stats(void *stats_arg);
+extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats);
+extern void __wt_stat_clear_connection_stats(void *stats_arg);
extern int __wt_txnid_cmp(const void *v1, const void *v2);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_get_oldest(WT_SESSION_IMPL *session);
extern void __wt_txn_get_snapshot( WT_SESSION_IMPL *session,
wt_txnid_t my_id,
- wt_txnid_t max_id);
+ wt_txnid_t max_id,
+ int force);
extern void __wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session);
extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]);
extern void __wt_txn_release(WT_SESSION_IMPL *session);
diff --git a/src/include/flags.h b/src/include/flags.h
index e08724d1bfe..340bd924985 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -3,7 +3,8 @@
* flags section: BEGIN
*/
#define WT_CACHE_POOL_RUN 0x00000001
-#define WT_CONN_CACHE_POOL 0x00000020
+#define WT_CONN_CACHE_POOL 0x00000040
+#define WT_CONN_EVICTION_RUN 0x00000020
#define WT_CONN_LSM_MERGE 0x00000010
#define WT_CONN_PANIC 0x00000008
#define WT_CONN_SERVER_RUN 0x00000004
@@ -18,10 +19,11 @@
#define WT_SESSION_SCHEMA_LOCKED 0x00000001
#define WT_SKIP_UPDATE_ERR 0x00000002
#define WT_SKIP_UPDATE_QUIT 0x00000001
-#define WT_SYNC_CHECKPOINT 0x00000008
-#define WT_SYNC_COMPACT 0x00000004
-#define WT_SYNC_DISCARD 0x00000002
-#define WT_SYNC_DISCARD_NOWRITE 0x00000001
+#define WT_SYNC_CHECKPOINT 0x00000010
+#define WT_SYNC_COMPACT 0x00000008
+#define WT_SYNC_DISCARD 0x00000004
+#define WT_SYNC_DISCARD_NOWRITE 0x00000002
+#define WT_SYNC_WRITE_LEAVES 0x00000001
#define WT_TREE_CACHE 0x00000080
#define WT_TREE_COMPACT 0x00000040
#define WT_TREE_DISCARD 0x00000020
diff --git a/src/include/lsm.h b/src/include/lsm.h
index 396363e9e44..3f288449de5 100644
--- a/src/include/lsm.h
+++ b/src/include/lsm.h
@@ -65,10 +65,9 @@ struct __wt_lsm_tree {
WT_RWLOCK *rwlock;
TAILQ_ENTRY(__wt_lsm_tree) q;
- WT_DSRC_STATS *stats; /* LSM statistics */
+ WT_DSRC_STATS stats; /* LSM statistics */
uint64_t dsk_gen;
- uint32_t *memsizep;
/* Configuration parameters */
uint32_t bloom_bit_count;
diff --git a/src/include/misc.h b/src/include/misc.h
index e42620b1d5f..53dbdc6544b 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -35,14 +35,9 @@
* Align an unsigned value of any type to a specified power-of-2, including the
* offset result of a pointer subtraction; do the calculation using the largest
* unsigned integer type available.
- *
- * Optionally cast the result to a uint32_t because that's the size of a piece
- * of data in the WiredTiger engine.
*/
#define WT_ALIGN(n, v) \
((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1))
-#define WT_ALIGN32(n, v) \
- ((uint32_t)WT_ALIGN(n, v))
/* Min, max. */
#define WT_MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -165,6 +160,13 @@
#define WT_DECL_RET int ret = 0
/*
+ * Skip the default configuration string in a list of configurations. The
+ * default config is always the first entry in the array, and the array always
+ * has an explicit NULL terminator, so this is safe.
+ *
+ * The expansion is fully parenthesized so that an operator applied to the
+ * macro's result (a subscript, for example) applies to the resulting pointer
+ * rather than binding to the operand inside the macro.
+ */
+#define WT_SKIP_DEFAULT_CONFIG(c) (&(c)[1])
+
+/*
* In diagnostic mode we track the locations from which hazard pointers and
* scratch buffers were acquired.
*/
diff --git a/src/include/mutex.h b/src/include/mutex.h
index b33c3b5b8a2..ca2b3346c5c 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -72,7 +72,7 @@
#if defined(_lint)
#define WT_ATOMIC_ADD(v, val) ((v) += (val), (v))
+/*
+ * Lint-only stand-in for compare-and-swap (not atomic): when v equals oldv,
+ * store newv in v and yield 1, otherwise leave v untouched and yield 0. The
+ * store is wrapped in its own parentheses because "&&" binds tighter than
+ * "=", so an unparenthesized "a == b && v = n" does not assign to v.
+ */
#define WT_ATOMIC_CAS(v, oldv, newv) \
- ((v) == (oldv) || (v) == (newv) ? 1 : 0)
+ ((v) == (oldv) ? ((v) = (newv), 1) : 0)
#define WT_ATOMIC_SUB(v, val) ((v) -= (val), (v))
#define WT_FULL_BARRIER()
#define WT_READ_BARRIER()
diff --git a/src/include/os.h b/src/include/os.h
index f56fb939c43..ad2932cb403 100644
--- a/src/include/os.h
+++ b/src/include/os.h
@@ -30,12 +30,12 @@
} while (0)
struct __wt_fh {
+ u_int refcnt; /* Reference count */
TAILQ_ENTRY(__wt_fh) q; /* List of open handles */
- off_t file_size; /* File size */
-
char *name; /* File name */
+ off_t file_size; /* File size */
int fd; /* POSIX file handle */
- u_int refcnt; /* Reference count */
+ int direct_io; /* O_DIRECT configured */
};
diff --git a/src/include/packing.i b/src/include/packing.i
index b5f50ebf56e..a47df42d56b 100644
--- a/src/include/packing.i
+++ b/src/include/packing.i
@@ -12,13 +12,6 @@
* gory details. The short version is that we have less cases to deal with
* because the compiler promotes shorter types to int or unsigned int.
*/
-
-typedef struct {
- WT_SESSION_IMPL *session;
- const char *cur, *end, *orig;
- unsigned long repeats;
-} WT_PACK;
-
typedef struct {
union {
int64_t i;
@@ -31,6 +24,13 @@ typedef struct {
char type;
} WT_PACK_VALUE;
+typedef struct {
+ WT_SESSION_IMPL *session;
+ const char *cur, *end, *orig;
+ unsigned long repeats;
+ WT_PACK_VALUE lastv;
+} WT_PACK;
+
static inline int
__pack_initn(
WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt, size_t len)
@@ -59,6 +59,7 @@ __pack_next(WT_PACK *pack, WT_PACK_VALUE *pv)
char *endsize;
if (pack->repeats > 0) {
+ *pv = pack->lastv;
--pack->repeats;
return (0);
}
@@ -66,13 +67,17 @@ __pack_next(WT_PACK *pack, WT_PACK_VALUE *pv)
next: if (pack->cur == pack->end)
return (WT_NOTFOUND);
- pv->size = WT_STORE_SIZE(strtoul(pack->cur, &endsize, 10));
- pv->havesize = (endsize > pack->cur);
- if (!pv->havesize)
+ if (isdigit(*pack->cur)) {
+ pv->havesize = 1;
+ pv->size = WT_STORE_SIZE(strtoul(pack->cur, &endsize, 10));
+ pack->cur = endsize;
+ } else {
+ pv->havesize = 0;
pv->size = 1;
- pack->cur = endsize;
- pack->repeats = 0;
+ }
+
pv->type = *pack->cur++;
+ pack->repeats = 0;
switch (pv->type) {
case 'S':
@@ -107,6 +112,7 @@ next: if (pack->cur == pack->end)
if (pv->size == 0)
goto next;
pack->repeats = pv->size - 1;
+ pack->lastv = *pv;
return (0);
default:
WT_RET_MSG(pack->session, EINVAL,
@@ -156,9 +162,8 @@ next: if (pack->cur == pack->end)
case 'R': \
pv.u.u = va_arg(ap, uint64_t); \
break; \
- default: \
- WT_ASSERT(session, pv.type != pv.type); \
- break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
} \
} while (0)
@@ -436,8 +441,7 @@ __unpack_read(WT_SESSION_IMPL *session,
case 'R': \
*va_arg(ap, uint64_t *) = pv.u.u; \
break; \
- default: \
- WT_ASSERT(session, pv.type != pv.type); \
- break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
} \
} while (0)
diff --git a/src/include/schema.h b/src/include/schema.h
index 8312ec72836..68eb046e815 100644
--- a/src/include/schema.h
+++ b/src/include/schema.h
@@ -5,12 +5,12 @@
* See the file LICENSE for redistribution information.
*/
-/* Character constants for projection plans. */
-#define WT_PROJ_KEY 'k' /* Go to key in cursor <arg>. */
-#define WT_PROJ_NEXT 'n' /* Process the next item (<arg> repeats). */
-#define WT_PROJ_REUSE 'r' /* Reuse the previous item (<arg> repeats). */
-#define WT_PROJ_SKIP 's' /* Skip a column in the cursor (<arg> repeats). */
-#define WT_PROJ_VALUE 'v' /* Go to the value in cursor <arg>. */
+/* Character constants for projection plans */
+#define WT_PROJ_KEY 'k' /* Go to key in cursor <arg> */
+#define WT_PROJ_NEXT 'n' /* Process the next item (<arg> repeats) */
+#define WT_PROJ_REUSE 'r' /* Reuse the previous item (<arg> repeats) */
+#define WT_PROJ_SKIP 's' /* Skip a column in the cursor (<arg> repeats) */
+#define WT_PROJ_VALUE 'v' /* Go to the value in cursor <arg> */
struct __wt_colgroup {
const char *name; /* Logical name */
@@ -32,7 +32,7 @@ struct __wt_index {
const char *key_plan; /* Key projection plan */
const char *value_plan; /* Value projection plan */
- int need_value; /* Index must have a non-empty value. */
+ int need_value; /* Index must have a non-empty value */
};
/*
@@ -56,6 +56,9 @@ struct __wt_table {
int cg_complete, idx_complete, is_simple;
u_int ncolgroups, nindices, nkey_columns;
+
+ uint32_t refcnt; /* Number of open cursors */
+ uint32_t schema_gen; /* Cached schema generation number */
};
/*
diff --git a/src/include/session.h b/src/include/session.h
index 24f0c1b3860..293129b7f9e 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -38,7 +38,8 @@ typedef enum {
#define S2C(session) ((WT_CONNECTION_IMPL *)(session)->iface.connection)
/* Get the btree for a session */
-#define S2BT(session) ((WT_BTREE *)(session)->dhandle->handle)
+#define S2BT(session) ((session)->dhandle == NULL ? \
+ NULL : (WT_BTREE *)(session)->dhandle->handle)
/*
* WT_SESSION_IMPL --
diff --git a/src/include/stat.h b/src/include/stat.h
index 36a69068622..d4f276b50bb 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -28,46 +28,40 @@ struct __wt_stats {
(stats)->fld.v = (uint64_t)(value); \
} while (0)
-#define WT_STAT_CHECK_SESSION(session) \
- ((session) != NULL && (session) != S2C(session)->default_session)
-
/* Connection statistics. */
#define WT_CSTAT_DECR(session, fld) do { \
- if (WT_STAT_CHECK_SESSION(session)) { \
- WT_STAT_DECR(S2C(session)->stats, fld); \
- } \
+ if (S2C(session)->statistics) \
+ WT_STAT_DECR(&S2C(session)->stats, fld); \
} while (0)
#define WT_CSTAT_INCR(session, fld) do { \
- if (WT_STAT_CHECK_SESSION(session)) { \
- WT_STAT_INCR(S2C(session)->stats, fld); \
- } \
+ if (S2C(session)->statistics) \
+ WT_STAT_INCR(&S2C(session)->stats, fld); \
} while (0)
#define WT_CSTAT_INCRV(session, fld, v) do { \
- if (WT_STAT_CHECK_SESSION(session)) { \
- WT_STAT_INCRV(S2C(session)->stats, fld, v); \
- } \
+ if (S2C(session)->statistics) \
+ WT_STAT_INCRV(&S2C(session)->stats, fld, v); \
+} while (0)
+#define WT_CSTAT_SET(session, fld, v) do { \
+ if (S2C(session)->statistics) \
+ WT_STAT_SET(&S2C(session)->stats, fld, v); \
} while (0)
/* Data-source statistics. */
+#define WT_DSTAT_DECR(session, fld) do { \
+ if (S2C(session)->statistics) \
+ WT_STAT_DECR(&(session)->dhandle->stats, fld); \
+} while (0)
#define WT_DSTAT_INCR(session, fld) do { \
- if (WT_STAT_CHECK_SESSION(session)) { \
- WT_STAT_INCR(session->dhandle->stats, fld); \
- } \
+ if (S2C(session)->statistics) \
+ WT_STAT_INCR(&session->dhandle->stats, fld); \
} while (0)
#define WT_DSTAT_INCRV(session, fld, v) do { \
- if (WT_STAT_CHECK_SESSION(session)) { \
- WT_STAT_INCRV(session->dhandle->stats, fld, v); \
- } \
-} while (0)
-#define WT_DSTAT_DECR(session, fld) do { \
- if (WT_STAT_CHECK_SESSION(session)) { \
- WT_STAT_DECR(session->dhandle->stats, fld); \
- } \
+ if (S2C(session)->statistics) \
+ WT_STAT_INCRV(&session->dhandle->stats, fld, v); \
} while (0)
#define WT_DSTAT_SET(session, fld, v) do { \
- if (WT_STAT_CHECK_SESSION(session)) { \
- WT_STAT_SET(session->dhandle->stats, fld, v); \
- } \
+ if (S2C(session)->statistics) \
+ WT_STAT_SET(&session->dhandle->stats, fld, v); \
} while (0)
/* Flags used by statistics initialization. */
@@ -116,11 +110,16 @@ struct __wt_dsrc_stats {
WT_STATS btree_row_leaf;
WT_STATS cache_bytes_read;
WT_STATS cache_bytes_write;
+ WT_STATS cache_eviction_checkpoint;
WT_STATS cache_eviction_clean;
WT_STATS cache_eviction_dirty;
WT_STATS cache_eviction_fail;
+ WT_STATS cache_eviction_force;
WT_STATS cache_eviction_hazard;
WT_STATS cache_eviction_internal;
+ WT_STATS cache_eviction_merge;
+ WT_STATS cache_eviction_merge_fail;
+ WT_STATS cache_eviction_merge_levels;
WT_STATS cache_overflow_value;
WT_STATS cache_read;
WT_STATS cache_read_overflow;
@@ -157,6 +156,7 @@ struct __wt_dsrc_stats {
WT_STATS rec_skipped_update;
WT_STATS rec_split_intl;
WT_STATS rec_split_leaf;
+ WT_STATS rec_split_max;
WT_STATS session_compact;
WT_STATS txn_update_conflict;
WT_STATS txn_write_conflict;
@@ -177,12 +177,18 @@ struct __wt_connection_stats {
WT_STATS cache_bytes_max;
WT_STATS cache_bytes_read;
WT_STATS cache_bytes_write;
+ WT_STATS cache_eviction_checkpoint;
WT_STATS cache_eviction_clean;
WT_STATS cache_eviction_dirty;
WT_STATS cache_eviction_fail;
+ WT_STATS cache_eviction_force;
WT_STATS cache_eviction_hazard;
WT_STATS cache_eviction_internal;
+ WT_STATS cache_eviction_merge;
+ WT_STATS cache_eviction_merge_fail;
+ WT_STATS cache_eviction_merge_levels;
WT_STATS cache_eviction_slow;
+ WT_STATS cache_eviction_walk;
WT_STATS cache_pages_dirty;
WT_STATS cache_pages_inuse;
WT_STATS cache_read;
@@ -191,7 +197,11 @@ struct __wt_connection_stats {
WT_STATS file_open;
WT_STATS memory_allocation;
WT_STATS memory_free;
+ WT_STATS memory_grow;
WT_STATS read_io;
+ WT_STATS rec_pages;
+ WT_STATS rec_pages_eviction;
+ WT_STATS rec_skipped_update;
WT_STATS rwlock_read;
WT_STATS rwlock_write;
WT_STATS txn_ancient;
diff --git a/src/include/txn.h b/src/include/txn.h
index 483bc1de289..e87df4bae5d 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -49,6 +49,7 @@ struct __wt_txn_state {
struct __wt_txn_global {
volatile wt_txnid_t current; /* Current transaction ID. */
+ volatile uint32_t gen; /* Completed transaction generation */
WT_TXN_STATE *states; /* Per-session transaction states */
};
@@ -79,6 +80,10 @@ struct __wt_txn {
*/
wt_txnid_t oldest_snap_min;
+ /* Saved global state, to avoid repeating scans. */
+ wt_txnid_t last_id, last_oldest_id;
+ uint32_t last_gen, last_oldest_gen;
+
/*
* Arrays of txn IDs in WT_UPDATE or WT_REF structures created or
* modified by this transaction.
@@ -91,12 +96,6 @@ struct __wt_txn {
size_t modref_alloc;
u_int modref_count;
- /*
- * Count of unsuccessful eviction attempts, used to abort if the cache
- * is full and no progress can be made.
- */
- u_int eviction_fails;
-
#define TXN_AUTOCOMMIT 0x01
#define TXN_ERROR 0x02
#define TXN_OLDEST 0x04
diff --git a/src/include/txn.i b/src/include/txn.i
index 4e470a50eb3..45c89fe278b 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -276,7 +276,7 @@ __wt_txn_read_first(WT_SESSION_IMPL *session)
if (txn->isolation == TXN_ISO_READ_COMMITTED ||
(!F_ISSET(txn, TXN_RUNNING) &&
txn->isolation == TXN_ISO_SNAPSHOT))
- __wt_txn_get_snapshot(session, WT_TXN_NONE, WT_TXN_NONE);
+ __wt_txn_get_snapshot(session, WT_TXN_NONE, WT_TXN_NONE, 0);
else if (!F_ISSET(txn, TXN_RUNNING))
txn_state->snap_min = txn_global->current;
}
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 3c3ad74d1df..d61871af60a 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -509,26 +509,26 @@ struct __wt_session {
* @row{<tt>backup:</tt>,
* hot backup cursor,
* key=<code>string</code>\, see @ref hot_backup for details}
- * @row{<tt>colgroup:\<tablename\>.\<columnset\></tt>,
+ * @row{<tt>colgroup:\<table name\>:\<column group name\></tt>,
* column group cursor,
* table key\, column group value(s)}
* @row{<tt>config:[\<uri\>]</tt>,
* object configuration cursor, (key=config string\,
* value=config value)}
- * @row{<tt>file:\<filename\></tt>,
+ * @row{<tt>file:\<file name\></tt>,
* file cursor,
* file key\, file value(s)}
- * @row{<tt>index:\<tablename\>.\<indexname\></tt>,
+ * @row{<tt>index:\<table name\>:\<index name\></tt>,
* index cursor,
* key=index key\, value=table value(s)}
* @row{<tt>lsm:\<name\></tt>,
* LSM cursor (key=LSM key\, value=LSM value), See also: @ref lsm}
- * @row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>,
- * database or file statistics cursor,
+ * @row{<tt>statistics:[\<data source URI\>]</tt>,
+ * database or data source statistics cursor,
* key=<code>int id</code>\, value=(<code>string description\,
* string value\, uint64_t value</code>)\,
* see @ref data_statistics for details}
- * @row{<tt>table:\<tablename\></tt>,
+ * @row{<tt>table:\<table name\></tt>,
* table cursor,
* table key\, table value(s)}
* </table>
@@ -537,17 +537,18 @@ struct __wt_session {
* @config{append, append the value as a new record\, creating a new
* record number key; valid only for cursors with record number keys.,a
* boolean flag; default \c false.}
- * @config{bulk, configure the cursor for bulk loads\, a fast\, initial
- * load path. Bulk load may only be used for newly created objects\,
- * and in the case of row-store objects\, key/value items must be loaded
- * in sorted order. Cursors configured for bulk load only support the
- * WT_CURSOR::insert and WT_CURSOR::close methods. The value is usually
- * a true/false flag\, but the the special value \c "bitmap" is for use
- * with fixed-length column stores\, and allows chunks of a memory
- * resident bitmap to be loaded directly into a file by passing a \c
- * WT_ITEM to WT_CURSOR::set_value where the \c size field indicates the
- * number of records in the bitmap (as specified by the file's \c
- * value_format). Bulk load bitmap values must end on a byte boundary
+ * @config{bulk, configure the cursor for bulk-loading\, a fast\,
+ * initial load path (see @ref bulk_load for more information).
+ * Bulk-load may only be used for newly created objects and cursors
+ * configured for bulk-load only support the WT_CURSOR::insert and
+ * WT_CURSOR::close methods. When bulk-loading row-store objects\, keys
+ * must be loaded in sorted order. The value is usually a true/false
+ * flag; when bulk-loading fixed-length column store objects\, the
+ * special value \c bitmap allows chunks of a memory resident bitmap to
+ * be loaded directly into a file by passing a \c WT_ITEM to
+ * WT_CURSOR::set_value where the \c size field indicates the number of
+ * records in the bitmap (as specified by the object's \c value_format
+ * configuration). Bulk-loaded bitmap values must end on a byte boundary
* relative to the bit count (except for the last set of values
* loaded).,a string; default \c false.}
* @config{checkpoint, the name of a checkpoint to open (the reserved
@@ -580,7 +581,7 @@ struct __wt_session {
* @config{target, if non-empty\, backup the list of objects; valid only
* for a backup data source.,a list of strings; default empty.}
* @configend
- * @param cursorp a pointer to the newly opened cursor
+ * @param[out] cursorp a pointer to the newly opened cursor
* @errors
*/
int __F(open_cursor)(WT_SESSION *session,
@@ -596,7 +597,9 @@ struct __wt_session {
* @snippet ex_all.c Create a table
*
* @param session the session handle
- * @param name the URI of the object to create, such as \c "table:stock"
+ * @param name the URI of the object to create, such as
+ * \c "table:stock". For a description of URI formats
+ * see @ref data_sources.
* @configstart{session.create, see dist/api_data.py}
* @config{allocation_size, the file unit allocation size\, in bytes\,
* must be a power-of-two; smaller values decrease the file space required
@@ -718,19 +721,34 @@ struct __wt_session {
* adjusted to a lower bound of <code>50 * leaf_page_max</code>. This
* limit is soft - it is possible for pages to be temporarily larger
* than this value.,an integer between 512B and 10TB; default \c 5MB.}
+ * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\,
+ * in bytes. If non-zero\, schedule writes for dirty blocks belonging
+ * to this object in the system buffer cache after that many bytes from
+ * this object are written into the buffer cache.,an integer greater
+ * than or equal to 0; default \c 0.}
+ * @config{os_cache_max, maximum system buffer cache usage\, in bytes.
+ * If non-zero\, evict object blocks from the system buffer cache after
+ * that many bytes from this object are read or written into the buffer
+ * cache.,an integer greater than or equal to 0; default \c 0.}
* @config{prefix_compression, configure row-store format key prefix
* compression.,a boolean flag; default \c true.}
- * @config{source, override the default data source URI derived from the
- * object name.,a string; default empty.}
+ * @config{source, set a custom data source URI for a column group\,
+ * index or simple table. By default\, the data source URI is derived
+ * from the \c type and the column group or index name. Applications
+ * can create tables from existing data sources by supplying a \c source
+ * configuration.,a string; default empty.}
* @config{split_pct, the Btree page split size as a percentage of the
* maximum Btree page size\, that is\, when a Btree page is split\, it
* will be split into smaller pages\, where each page is the specified
* percentage of the maximum Btree page size.,an integer between 25 and
* 100; default \c 75.}
- * @config{type, set the data source type. This setting overrides the
- * URI prefix for the data source\, if no \c source configuration
- * setting is provided.,a string\, chosen from the following options: \c
- * "file"\, \c "lsm"; default \c file.}
+ * @config{type, set the type of data source used to store a column
+ * group\, index or simple table. By default\, a \c "file:" URI is
+ * derived from the object name. The \c type configuration can be used
+ * to switch to a different storage format\, such as LSM. Ignored if an
+ * explicit URI is supplied with a \c source configuration.,a string\,
+ * chosen from the following options: \c "file"\, \c "lsm"; default \c
+ * file.}
* @config{value_format, the format of the data packed into value items.
* See @ref schema_format_types for details. By default\, the
* value_format is \c 'u' and applications use a WT_ITEM structure to
@@ -1052,13 +1070,15 @@ struct __wt_connection {
* shared cache is redistributed.,an integer between 1MB and 10TB;
* default \c 10MB.}@config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of
* cache this database is guaranteed to have available from the shared
- * cache. This setting is per database. Defaults to the chunk size.,a
- * string; default \c 0.}@config{&nbsp;&nbsp;&nbsp;&nbsp;name, name of a
- * cache that is shared between databases.,a string; default
- * empty.}@config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to
+ * cache. This setting is per database. Defaults to the chunk size.,an
+ * integer; default \c 0.}@config{&nbsp;&nbsp;&nbsp;&nbsp;name, name of
+ * a cache that is shared between databases.,a string; default \c
+ * pool.}@config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to
* allocate for the shared cache. Setting this will update the value if
* one is already set.,an integer between 1MB and 10TB; default \c
* 500MB.}@config{ ),,}
+ * @config{statistics, Maintain database statistics that may impact
+ * performance.,a boolean flag; default \c false.}
* @config{verbose, enable messages for various events. Options are
* given as a list\, such as
* <code>"verbose=[evictserver\,read]"</code>.,a list\, with values
@@ -1108,7 +1128,7 @@ struct __wt_connection {
* "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c
* read-committed.}
* @configend
- * @param sessionp the new session handle
+ * @param[out] sessionp the new session handle
* @errors
*/
int __F(open_session)(WT_CONNECTION *connection,
@@ -1224,6 +1244,13 @@ struct __wt_connection {
* @config{cache_size, maximum heap memory to allocate for the cache. A database
* should configure either a cache_size or a shared_cache not both.,an integer
* between 1MB and 10TB; default \c 100MB.}
+ * @config{checkpoint = (, periodically checkpoint the database.,a set of
+ * related configuration options defined
+ * below.}@config{&nbsp;&nbsp;&nbsp;&nbsp;name, the checkpoint name.,a string;
+ * default \c "WiredTigerCheckpoint".}@config{&nbsp;&nbsp;&nbsp;&nbsp;wait,
+ * seconds to wait between each checkpoint; setting this value configures
+ * periodic checkpoints.,an integer between 1 and 100000; default \c 0.}@config{
+ * ),,}
* @config{create, create the database if it does not exist.,a boolean flag;
* default \c false.}
* @config{direct_io, Use \c O_DIRECT to access files. Options are given as a
@@ -1265,12 +1292,33 @@ struct __wt_connection {
* cache is redistributed.,an integer between 1MB and 10TB; default \c
* 10MB.}@config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this database
* is guaranteed to have available from the shared cache. This setting is per
- * database. Defaults to the chunk size.,a string; default \c
+ * database. Defaults to the chunk size.,an integer; default \c
* 0.}@config{&nbsp;&nbsp;&nbsp;&nbsp;name, name of a cache that is shared
- * between databases.,a string; default
- * empty.}@config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to allocate for
+ * between databases.,a string; default \c
+ * pool.}@config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to allocate for
* the shared cache. Setting this will update the value if one is already
* set.,an integer between 1MB and 10TB; default \c 500MB.}@config{ ),,}
+ * @config{statistics, Maintain database statistics that may impact
+ * performance.,a boolean flag; default \c false.}
+ * @config{statistics_log = (, log database connection statistics into a file
+ * when the \c statistics configuration value is set to true. See @ref
+ * statistics_log for more information.,a set of related configuration options
+ * defined below.}@config{&nbsp;&nbsp;&nbsp;&nbsp;clear, reset statistics
+ * counters after each set of log records are written.,a boolean flag; default
+ * \c true.}@config{&nbsp;&nbsp;&nbsp;&nbsp;path, the pathname to a file into
+ * which the log records are written\, may contain strftime conversion
+ * specifications. If the value is not an absolute path name\, the file is
+ * created relative to the database home.,a string; default \c
+ * "WiredTigerStat.%H".}@config{&nbsp;&nbsp;&nbsp;&nbsp;sources, if non-empty\,
+ * include statistics for the list of data source URIs. No statistics that
+ * require traversing a tree are reported\, as if the \c statistics_fast
+ * configuration string were set.,a list of strings; default
+ * empty.}@config{&nbsp;&nbsp;&nbsp;&nbsp;timestamp, a timestamp prepended to
+ * each log record\, may contain strftime conversion specifications.,a string;
+ * default \c "%b %d %H:%M:%S".}@config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to
+ * wait between each write of the log records; setting this value configures \c
+ * statistics and statistics logging.,an integer between 5 and 100000; default
+ * \c 0.}@config{ ),,}
* @config{sync, flush files to stable storage when closing or writing
* checkpoints.,a boolean flag; default \c true.}
* @config{transactional, support transactional semantics.,a boolean flag;
@@ -1291,7 +1339,7 @@ struct __wt_connection {
* for details). Configuration values specified in the \c config argument to
* the ::wiredtiger_open function override configuration values specified in
* the \c WiredTiger.config file.
- * @param connectionp A pointer to the newly opened connection handle
+ * @param[out] connectionp A pointer to the newly opened connection handle
* @errors
*/
int wiredtiger_open(const char *home,
@@ -1356,6 +1404,9 @@ struct __wt_event_handler {
const char *operation, uint64_t progress);
};
+/*! @name Data packing and unpacking
+ * @{
+ */
/*! Pack a structure into a buffer.
*
* See @ref packing for a description of the permitted format strings.
@@ -1391,7 +1442,7 @@ int wiredtiger_struct_pack(
* @snippet ex_all.c Get the packed size
*
* @param session the session handle
- * @param sizep a location where the the number of bytes needed for the
+ * @param sizep a location where the number of bytes needed for the
* matching call to ::wiredtiger_struct_pack is returned
* @param format the data format, see @ref packing
* @errors
@@ -1414,6 +1465,136 @@ int wiredtiger_struct_size(
int wiredtiger_struct_unpack(WT_SESSION *session,
const void *buffer, size_t size, const char *format, ...);
+#if !defined(SWIG)
+
+/*!
+ * Streaming interface to packing.
+ *
+ * This allows applications to pack or unpack records one field at a time.
+ * This is an opaque handle returned by ::wiredtiger_pack_start or
+ * ::wiredtiger_unpack_start. It must be closed with ::wiredtiger_pack_close.
+ */
+typedef struct __wt_pack_stream WT_PACK_STREAM;
+
+/*!
+ * Start a packing operation into a buffer with the given format string. This
+ * should be followed by a series of calls to ::wiredtiger_pack_item,
+ * ::wiredtiger_pack_int, ::wiredtiger_pack_str or ::wiredtiger_pack_uint
+ * to fill in the values.
+ *
+ * @param session the session handle
+ * @param format the data format, see @ref packing
+ * @param buffer a pointer to memory to hold the packed data
+ * @param size the size of the buffer
+ * @param[out] psp the new packing stream handle
+ * @errors
+ */
+int wiredtiger_pack_start(WT_SESSION *session,
+ const char *format, void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+/*!
+ * Start an unpacking operation from a buffer with the given format string.
+ * This should be followed by a series of calls to ::wiredtiger_unpack_item,
+ * ::wiredtiger_unpack_int, ::wiredtiger_unpack_str or ::wiredtiger_unpack_uint
+ * to retrieve the packed values.
+ *
+ * @param session the session handle
+ * @param format the data format, see @ref packing
+ * @param buffer a pointer to memory holding the packed data
+ * @param size the size of the buffer
+ * @param[out] psp the new packing stream handle
+ * @errors
+ */
+int wiredtiger_unpack_start(WT_SESSION *session,
+ const char *format, const void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+/*!
+ * Close a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] usedp the number of bytes in the buffer used by the stream
+ * @errors
+ */
+int wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp);
+
+/*!
+ * Pack an item into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param item an item to pack
+ * @errors
+ */
+int wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item);
+
+/*!
+ * Pack a signed integer into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param i a signed integer to pack
+ * @errors
+ */
+int wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i);
+
+/*!
+ * Pack a string into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param s a string to pack
+ * @errors
+ */
+int wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s);
+
+/*!
+ * Pack an unsigned integer into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param u an unsigned integer to pack
+ * @errors
+ */
+int wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u);
+
+/*!
+ * Unpack an item from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] item the unpacked item
+ * @errors
+ */
+int wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item);
+
+/*!
+ * Unpack a signed integer from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] ip the unpacked signed integer
+ * @errors
+ */
+int wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip);
+
+/*!
+ * Unpack a string from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] sp the unpacked string
+ * @errors
+ */
+int wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp);
+
+/*!
+ * Unpack an unsigned integer from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] up the unpacked unsigned integer
+ * @errors
+ */
+int wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up);
+
+#endif /* !defined(SWIG) */
+
+/*!
+ * @}
+ */
+
/*! Get version information.
*
* @snippet ex_all.c Get the WiredTiger library version #1
@@ -1596,6 +1777,11 @@ struct __wt_compressor {
* either the \c internal_page_max or \c leaf_page_max value specified
* to WT_SESSION::create when the object was created.)
*
+ * On entry, \c split_pct is the configured Btree page split size for
+ * this object. (This value is provided for convenience, and will be
+ * the \c split_pct value specified to WT_SESSION::create when the
+ * object was created.)
+ *
* On entry, \c extra is a count of additional bytes that will be added
* to the encoded representation before it is written. In other words,
* if the target write size is 8KB, the returned encoded representation
@@ -1661,6 +1847,7 @@ struct __wt_compressor {
* applicable, the WT_COMPRESSOR::compress callback is used instead.
*
* @param[in] page_max the configured maximum page size for this object
+ * @param[in] split_pct the configured page split size for this object
* @param[in] extra the count of the additional bytes
* @param[in] src the data to compress
* @param[in] offsets the byte offsets of the byte strings in src
@@ -1673,7 +1860,7 @@ struct __wt_compressor {
* @returns zero for success, non-zero to indicate an error.
*/
int (*compress_raw)(WT_COMPRESSOR *compressor, WT_SESSION *session,
- size_t page_max, size_t extra,
+ size_t page_max, u_int split_pct, size_t extra,
uint8_t *src, uint32_t *offsets, uint32_t slots,
uint8_t *dst, size_t dst_len,
int final,
@@ -1886,54 +2073,74 @@ extern int wiredtiger_extension_init(WT_SESSION *session,
#define WT_STAT_CONN_CACHE_BYTES_READ 9
/*! cache: bytes written from cache */
#define WT_STAT_CONN_CACHE_BYTES_WRITE 10
+/*! cache: checkpoint blocked page eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 11
/*! cache: unmodified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 11
+#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 12
/*! cache: modified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 12
+#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 13
/*! cache: pages selected for eviction unable to be evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_FAIL 13
-/*! cache: eviction unable to acquire hazard pointer */
-#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 14
+#define WT_STAT_CONN_CACHE_EVICTION_FAIL 14
+/*! cache: pages queued for forced eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE 15
+/*! cache: hazard pointer blocked page eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 16
/*! cache: internal pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 15
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 17
+/*! cache: internal page merge operations completed */
+#define WT_STAT_CONN_CACHE_EVICTION_MERGE 18
+/*! cache: internal page merge attempts that could not complete */
+#define WT_STAT_CONN_CACHE_EVICTION_MERGE_FAIL 19
+/*! cache: internal levels merged */
+#define WT_STAT_CONN_CACHE_EVICTION_MERGE_LEVELS 20
/*! cache: eviction server unable to reach eviction goal */
-#define WT_STAT_CONN_CACHE_EVICTION_SLOW 16
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 21
+/*! cache: pages walked for eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 22
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 17
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 23
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 18
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 24
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 19
+#define WT_STAT_CONN_CACHE_READ 25
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 20
+#define WT_STAT_CONN_CACHE_WRITE 26
/*! pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 21
+#define WT_STAT_CONN_COND_WAIT 27
/*! files currently open */
-#define WT_STAT_CONN_FILE_OPEN 22
+#define WT_STAT_CONN_FILE_OPEN 28
/*! total heap memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 23
+#define WT_STAT_CONN_MEMORY_ALLOCATION 29
/*! total heap memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 24
+#define WT_STAT_CONN_MEMORY_FREE 30
+/*! total heap memory re-allocations */
+#define WT_STAT_CONN_MEMORY_GROW 31
/*! total read I/Os */
-#define WT_STAT_CONN_READ_IO 25
+#define WT_STAT_CONN_READ_IO 32
+/*! page reconciliation calls */
+#define WT_STAT_CONN_REC_PAGES 33
+/*! page reconciliation calls for eviction */
+#define WT_STAT_CONN_REC_PAGES_EVICTION 34
+/*! reconciliation failed because an update could not be included */
+#define WT_STAT_CONN_REC_SKIPPED_UPDATE 35
/*! pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 26
+#define WT_STAT_CONN_RWLOCK_READ 36
/*! pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 27
+#define WT_STAT_CONN_RWLOCK_WRITE 37
/*! ancient transactions */
-#define WT_STAT_CONN_TXN_ANCIENT 28
+#define WT_STAT_CONN_TXN_ANCIENT 38
/*! transactions */
-#define WT_STAT_CONN_TXN_BEGIN 29
+#define WT_STAT_CONN_TXN_BEGIN 39
/*! transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 30
+#define WT_STAT_CONN_TXN_CHECKPOINT 40
/*! transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 31
+#define WT_STAT_CONN_TXN_COMMIT 41
/*! transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 32
+#define WT_STAT_CONN_TXN_FAIL_CACHE 42
/*! transactions rolled-back */
-#define WT_STAT_CONN_TXN_ROLLBACK 33
+#define WT_STAT_CONN_TXN_ROLLBACK 43
/*! total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 34
+#define WT_STAT_CONN_WRITE_IO 44
/*!
* @}
@@ -2007,95 +2214,107 @@ extern int wiredtiger_extension_init(WT_SESSION *session,
#define WT_STAT_DSRC_CACHE_BYTES_READ 31
/*! bytes written from cache */
#define WT_STAT_DSRC_CACHE_BYTES_WRITE 32
+/*! cache: checkpoint blocked page eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 33
/*! unmodified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 33
+#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 34
/*! modified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 34
+#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 35
/*! data source pages selected for eviction unable to be evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 35
-/*! eviction unable to acquire hazard pointer */
-#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 36
+#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 36
+/*! cache: pages queued for forced eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_FORCE 37
+/*! cache: hazard pointer blocked page eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 38
/*! internal pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 37
+#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 39
+/*! cache: internal page merge operations completed */
+#define WT_STAT_DSRC_CACHE_EVICTION_MERGE 40
+/*! cache: internal page merge attempts that could not complete */
+#define WT_STAT_DSRC_CACHE_EVICTION_MERGE_FAIL 41
+/*! cache: internal levels merged */
+#define WT_STAT_DSRC_CACHE_EVICTION_MERGE_LEVELS 42
/*! overflow values cached in memory */
-#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 38
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 43
/*! pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ 39
+#define WT_STAT_DSRC_CACHE_READ 44
/*! overflow pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 40
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 45
/*! pages written from cache */
-#define WT_STAT_DSRC_CACHE_WRITE 41
+#define WT_STAT_DSRC_CACHE_WRITE 46
/*! raw compression call failed (no additional data available) */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 42
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 47
/*! raw compression call failed (additional data available) */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 43
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 48
/*! raw compression call succeeded */
-#define WT_STAT_DSRC_COMPRESS_RAW_OK 44
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 49
/*! compressed pages read */
-#define WT_STAT_DSRC_COMPRESS_READ 45
+#define WT_STAT_DSRC_COMPRESS_READ 50
/*! compressed pages written */
-#define WT_STAT_DSRC_COMPRESS_WRITE 46
+#define WT_STAT_DSRC_COMPRESS_WRITE 51
/*! page written failed to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 47
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 52
/*! page written was too small to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 48
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 53
/*! cursor insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT 49
+#define WT_STAT_DSRC_CURSOR_INSERT 54
/*! bulk-loaded cursor-insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT_BULK 50
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 55
/*! cursor-insert key and value bytes inserted */
-#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 51
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 56
/*! cursor next calls */
-#define WT_STAT_DSRC_CURSOR_NEXT 52
+#define WT_STAT_DSRC_CURSOR_NEXT 57
/*! cursor prev calls */
-#define WT_STAT_DSRC_CURSOR_PREV 53
+#define WT_STAT_DSRC_CURSOR_PREV 58
/*! cursor remove calls */
-#define WT_STAT_DSRC_CURSOR_REMOVE 54
+#define WT_STAT_DSRC_CURSOR_REMOVE 59
/*! cursor-remove key bytes removed */
-#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 55
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 60
/*! cursor reset calls */
-#define WT_STAT_DSRC_CURSOR_RESET 56
+#define WT_STAT_DSRC_CURSOR_RESET 61
/*! cursor search calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH 57
+#define WT_STAT_DSRC_CURSOR_SEARCH 62
/*! cursor search near calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 58
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 63
/*! cursor update calls */
-#define WT_STAT_DSRC_CURSOR_UPDATE 59
+#define WT_STAT_DSRC_CURSOR_UPDATE 64
/*! cursor-update value bytes updated */
-#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 60
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 65
/*! chunks in the LSM tree */
-#define WT_STAT_DSRC_LSM_CHUNK_COUNT 61
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 66
/*! highest merge generation in the LSM tree */
-#define WT_STAT_DSRC_LSM_GENERATION_MAX 62
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 67
/*! queries that could have benefited from a Bloom filter that did not
* exist */
-#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 63
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 68
/*! reconciliation dictionary matches */
-#define WT_STAT_DSRC_REC_DICTIONARY 64
+#define WT_STAT_DSRC_REC_DICTIONARY 69
/*! reconciliation overflow keys written */
-#define WT_STAT_DSRC_REC_OVFL_KEY 65
+#define WT_STAT_DSRC_REC_OVFL_KEY 70
/*! reconciliation overflow values written */
-#define WT_STAT_DSRC_REC_OVFL_VALUE 66
+#define WT_STAT_DSRC_REC_OVFL_VALUE 71
/*! reconciliation pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 67
+#define WT_STAT_DSRC_REC_PAGE_DELETE 72
/*! reconciliation pages merged */
-#define WT_STAT_DSRC_REC_PAGE_MERGE 68
+#define WT_STAT_DSRC_REC_PAGE_MERGE 73
/*! page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 69
+#define WT_STAT_DSRC_REC_PAGES 74
/*! page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 70
-/*! page reconciliation failed when an update could not be included */
-#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 71
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 75
+/*! reconciliation failed because an update could not be included */
+#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 76
/*! reconciliation internal pages split */
-#define WT_STAT_DSRC_REC_SPLIT_INTL 72
+#define WT_STAT_DSRC_REC_SPLIT_INTL 77
/*! reconciliation leaf pages split */
-#define WT_STAT_DSRC_REC_SPLIT_LEAF 73
+#define WT_STAT_DSRC_REC_SPLIT_LEAF 78
+/*! reconciliation maximum number of splits created for a page */
+#define WT_STAT_DSRC_REC_SPLIT_MAX 79
/*! object compaction */
-#define WT_STAT_DSRC_SESSION_COMPACT 74
+#define WT_STAT_DSRC_SESSION_COMPACT 80
/*! update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 75
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 81
/*! write generation conflicts */
-#define WT_STAT_DSRC_TXN_WRITE_CONFLICT 76
+#define WT_STAT_DSRC_TXN_WRITE_CONFLICT 82
/*! @} */
/*
* Statistics section: END
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index b3ca7fabcce..952f332d4e4 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -206,6 +206,7 @@ struct __wt_update;
#include "posix.h"
#include "txn.h" /* typedef for wt_txnid_t */
+#include "stat.h" /* WT_DSRC_STATS for data sources */
#include "api.h"
#include "block.h"
@@ -223,7 +224,6 @@ struct __wt_update;
#include "meta.h"
#include "os.h"
#include "schema.h"
-#include "stat.h"
#include "session.h" /* required by connection.h */
#include "connection.h"
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 880243f85a6..87d2e891b71 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -214,14 +214,7 @@ __clsm_open_cursors(
clsm->primary_chunk = chunk;
(void)WT_ATOMIC_ADD(clsm->primary_chunk->ncursor, 1);
- /*
- * Peek into the btree layer to track the in-memory size.
- * Ignore error returns since it is OK for the btree to be
- * empty in this code path (and that is an error condition).
- */
- if (lsm_tree->memsizep == NULL)
- (void)__wt_btree_get_memsize(
- session, S2BT(session), &lsm_tree->memsizep);
+ __wt_btree_evictable(session, 0);
}
clsm->dsk_gen = lsm_tree->dsk_gen;
@@ -571,10 +564,10 @@ __clsm_search(WT_CURSOR *cursor)
ret = __wt_bloom_hash_get(bloom, &bhash);
if (ret == WT_NOTFOUND) {
WT_STAT_INCR(
- clsm->lsm_tree->stats, bloom_miss);
+ &clsm->lsm_tree->stats, bloom_miss);
continue;
} else if (ret == 0)
- WT_STAT_INCR(clsm->lsm_tree->stats, bloom_hit);
+ WT_STAT_INCR(&clsm->lsm_tree->stats, bloom_hit);
WT_ERR(ret);
}
c->set_key(c, &cursor->key);
@@ -589,11 +582,11 @@ __clsm_search(WT_CURSOR *cursor)
goto err;
else if (bloom != NULL)
WT_STAT_INCR(
- clsm->lsm_tree->stats, bloom_false_positive);
+ &clsm->lsm_tree->stats, bloom_false_positive);
/* The active chunk can't have a bloom filter. */
else if (clsm->primary_chunk == NULL || i != clsm->nchunks)
WT_STAT_INCR(
- clsm->lsm_tree->stats, lsm_lookup_no_bloom);
+ &clsm->lsm_tree->stats, lsm_lookup_no_bloom);
}
ret = WT_NOTFOUND;
@@ -781,11 +774,10 @@ static inline int
__clsm_put(
WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, WT_ITEM *key, WT_ITEM *value)
{
- WT_BTREE *btree;
+ WT_DATA_HANDLE *saved_dhandle;
WT_CURSOR *primary;
WT_DECL_RET;
WT_LSM_TREE *lsm_tree;
- uint32_t *memsizep;
lsm_tree = clsm->lsm_tree;
@@ -826,13 +818,19 @@ __clsm_put(
F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
clsm->current = primary;
- if ((memsizep = lsm_tree->memsizep) != NULL &&
- *memsizep > lsm_tree->chunk_size) {
+ /*
+ * In LSM there are multiple btrees active at one time. The tree
+ * switch code needs to use btree API methods, and it wants to
+ * operate on the btree for the primary chunk. Set that up now.
+ */
+ saved_dhandle = session->dhandle;
+ WT_SET_BTREE_IN_SESSION(session, ((WT_CURSOR_BTREE *)primary)->btree);
+ if (__wt_btree_size_overflow(session, lsm_tree->chunk_size)) {
/*
* Take the LSM lock first: we can't acquire it while
* holding the schema lock, or we will deadlock.
*/
- WT_RET(__wt_writelock(session, lsm_tree->rwlock));
+ WT_ERR(__wt_writelock(session, lsm_tree->rwlock));
/* Make sure we don't race. */
if (clsm->dsk_gen == lsm_tree->dsk_gen)
WT_WITH_SCHEMA_LOCK(session,
@@ -844,12 +842,15 @@ __clsm_put(
* in switching: if something went wrong, we should keep
* trying to switch.
*/
- btree = ((WT_CURSOR_BTREE *)primary)->btree;
- if (ret == 0)
- ret = __wt_btree_release_memsize(session, btree);
+ if (ret == 0) {
+ WT_SET_BTREE_IN_SESSION(session,
+ ((WT_CURSOR_BTREE *)primary)->btree);
+ __wt_btree_evictable(session, 1);
+ }
WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
}
+err: session->dhandle = saved_dhandle;
return (ret);
}
diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c
index b3335089ae8..a738462ac69 100644
--- a/src/lsm/lsm_meta.c
+++ b/src/lsm/lsm_meta.c
@@ -19,15 +19,15 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_DECL_RET;
WT_ITEM buf;
WT_LSM_CHUNK *chunk;
- const char *config;
+ const char *lsmconfig;
size_t chunk_sz, alloc;
u_int nchunks;
WT_CLEAR(buf);
chunk_sz = sizeof(WT_LSM_CHUNK);
- WT_RET(__wt_metadata_read(session, lsm_tree->name, &config));
- WT_ERR(__wt_config_init(session, &cparser, config));
+ WT_RET(__wt_metadata_read(session, lsm_tree->name, &lsmconfig));
+ WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) {
__wt_free(session, lsm_tree->bloom_config);
@@ -146,7 +146,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
}
WT_ERR_NOTFOUND_OK(ret);
-err: __wt_free(session, config);
+err: __wt_free(session, lsmconfig);
return (ret);
}
diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c
index e4e36311862..fa0babfb4be 100644
--- a/src/lsm/lsm_stat.c
+++ b/src/lsm/lsm_stat.c
@@ -38,14 +38,15 @@ __wt_lsm_stat_init(WT_SESSION_IMPL *session,
if (cst->stats != NULL)
stats = (WT_DSRC_STATS *)cst->stats;
else {
- WT_ERR(__wt_stat_alloc_dsrc_stats(session, &stats));
+ WT_ERR(__wt_calloc_def(session, 1, &stats));
+ __wt_stat_init_dsrc_stats(stats);
cst->stats_first = cst->stats = (WT_STATS *)stats;
cst->stats_count = sizeof(*stats) / sizeof(WT_STATS);
}
- *stats = *lsm_tree->stats;
+ *stats = lsm_tree->stats;
if (LF_ISSET(WT_STATISTICS_CLEAR))
- __wt_stat_clear_dsrc_stats((WT_STATS *)lsm_tree->stats);
+ __wt_stat_clear_dsrc_stats(&lsm_tree->stats);
/* Hold the LSM lock so that we can safely walk through the chunks. */
WT_ERR(__wt_readlock(session, lsm_tree->rwlock));
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 0746c9f4443..82493354e1e 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -37,8 +37,6 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
if (lsm_tree->rwlock != NULL)
WT_TRET(__wt_rwlock_destroy(session, &lsm_tree->rwlock));
- __wt_free(session, lsm_tree->stats);
-
for (i = 0; i < lsm_tree->nchunks; i++) {
if ((chunk = lsm_tree->chunk[i]) == NULL)
continue;
@@ -296,6 +294,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_LSM_TREE *lsm_tree;
const char *cfg[] = API_CONF_DEFAULTS(session, create, config);
+ const char *tmpconfig;
/* If the tree is open, it already exists. */
if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
@@ -304,9 +303,15 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
}
WT_RET_NOTFOUND_OK(ret);
- /* If the tree has metadata, it already exists. */
- if (__wt_metadata_read(session, uri, &config) == 0) {
- __wt_free(session, config);
+ /*
+ * If the tree has metadata, it already exists.
+ *
+ * !!!
+ * Use a local variable: we don't care what the existing configuration
+ * is, but we don't want to overwrite the real config.
+ */
+ if (__wt_metadata_read(session, uri, &tmpconfig) == 0) {
+ __wt_free(session, tmpconfig);
return (exclusive ? EEXIST : 0);
}
WT_RET_NOTFOUND_OK(ret);
@@ -447,7 +452,7 @@ __lsm_tree_open(
WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
WT_ERR(__wt_rwlock_alloc(session, "lsm tree", &lsm_tree->rwlock));
WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
- WT_ERR(__wt_stat_alloc_dsrc_stats(session, &lsm_tree->stats));
+ __wt_stat_init_dsrc_stats(&lsm_tree->stats);
WT_ERR(__wt_lsm_meta_read(session, lsm_tree));
@@ -528,11 +533,7 @@ __wt_lsm_tree_switch(
uint32_t new_id;
new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);
-
- WT_VERBOSE_RET(session, lsm,
- "Tree switch to: %d because %d > %d", new_id,
- (lsm_tree->memsizep == NULL ? 0 : (int)*lsm_tree->memsizep),
- (int)lsm_tree->chunk_size);
+ WT_VERBOSE_RET(session, lsm, "Tree switch to: %d", new_id);
if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
lsm_tree->chunk_alloc)
@@ -550,8 +551,6 @@ __wt_lsm_tree_switch(
++lsm_tree->dsk_gen;
WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
- lsm_tree->memsizep = NULL;
-
err: /* TODO: mark lsm_tree bad on error(?) */
return (ret);
}
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index ca66cceffcb..56511546f9b 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -341,9 +341,9 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
if ((chunk = lsm_tree->old_chunks[i]) == NULL)
continue;
if (!locked) {
- locked = 1;
/* TODO: Do we need the lsm_tree lock for all drops? */
WT_ERR(__wt_writelock(session, lsm_tree->rwlock));
+ locked = 1;
}
if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_drop(
diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c
index 1f82bd3fb7a..7d790800148 100644
--- a/src/meta/meta_table.c
+++ b/src/meta/meta_table.c
@@ -210,6 +210,8 @@ __wt_metadata_read(
WT_DECL_RET;
const char *value;
+ *valuep = NULL;
+
if (__metadata_turtle(key))
return (__wt_meta_turtle_read(session, key, valuep));
diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c
index d81f562aa4e..6e9cc1e6352 100644
--- a/src/meta/meta_turtle.c
+++ b/src/meta/meta_turtle.c
@@ -64,6 +64,8 @@ __wt_meta_turtle_read(
int match;
const char *path;
+ *valuep = NULL;
+
fp = NULL;
path = NULL;
diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c
index 8a3fb45eec1..bc4ef463e2a 100644
--- a/src/os_posix/os_alloc.c
+++ b/src/os_posix/os_alloc.c
@@ -32,7 +32,7 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
*/
WT_ASSERT(session, number != 0 && size != 0);
- if (session != NULL && S2C(session)->stats != NULL)
+ if (session != NULL)
WT_CSTAT_INCR(session, memory_allocation);
if ((p = calloc(number, size)) == NULL)
@@ -56,21 +56,26 @@ __wt_realloc(WT_SESSION_IMPL *session,
/*
* !!!
* This function MUST handle a NULL WT_SESSION_IMPL handle.
- */
- WT_ASSERT(session, bytes_to_allocate != 0);
-
- /*
+ *
* Sometimes we're allocating memory and we don't care about the
* final length -- bytes_allocated_ret may be NULL.
*/
- bytes_allocated = (bytes_allocated_ret == NULL) ?
- 0 : *bytes_allocated_ret;
- WT_ASSERT(session, bytes_allocated < bytes_to_allocate);
-
p = *(void **)retp;
+ bytes_allocated =
+ (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
+ WT_ASSERT(session,
+ (p == NULL && bytes_allocated == 0) ||
+ (p != NULL &&
+ (bytes_allocated_ret == NULL || bytes_allocated != 0)));
+ WT_ASSERT(session, bytes_to_allocate != 0);
+ WT_ASSERT(session, bytes_allocated < bytes_to_allocate);
- if (p == NULL && session != NULL && S2C(session)->stats != NULL)
- WT_CSTAT_INCR(session, memory_allocation);
+ if (session != NULL) {
+ if (p == NULL)
+ WT_CSTAT_INCR(session, memory_allocation);
+ else
+ WT_CSTAT_INCR(session, memory_grow);
+ }
if ((p = realloc(p, bytes_to_allocate)) == NULL)
WT_RET_MSG(session, __wt_errno(), "memory allocation");
@@ -114,21 +119,21 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session,
void *p, *newp;
size_t bytes_allocated;
- WT_ASSERT(session, bytes_to_allocate != 0);
-
/*
* Sometimes we're allocating memory and we don't care about the
* final length -- bytes_allocated_ret may be NULL.
*/
- bytes_allocated = (bytes_allocated_ret == NULL) ?
- 0 : *bytes_allocated_ret;
- WT_ASSERT(session, bytes_allocated < bytes_to_allocate);
-
p = *(void **)retp;
+ bytes_allocated =
+ (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
+ WT_ASSERT(session,
+ (p == NULL && bytes_allocated == 0) ||
+ (p != NULL &&
+ (bytes_allocated_ret == NULL || bytes_allocated != 0)));
+ WT_ASSERT(session, bytes_to_allocate != 0);
+ WT_ASSERT(session, bytes_allocated < bytes_to_allocate);
- WT_ASSERT(session, p == NULL || bytes_allocated != 0);
-
- if (p == NULL && session != NULL && S2C(session)->stats != NULL)
+ if (session != NULL)
WT_CSTAT_INCR(session, memory_allocation);
if ((ret = posix_memalign(&newp,
@@ -207,22 +212,24 @@ __wt_free_int(WT_SESSION_IMPL *session, void *p_arg)
{
void *p;
- /*
- * !!!
- * This function MUST handle a NULL WT_SESSION_IMPL handle.
- */
- if (session != NULL && S2C(session)->stats != NULL)
- WT_CSTAT_INCR(session, memory_free);
+ p = *(void **)p_arg;
+ if (p == NULL) /* ANSI C free semantics */
+ return;
/*
* If there's a serialization bug we might race with another thread.
* We can't avoid the race (and we aren't willing to flush memory),
- * but we minimize the window by clearing the free address atomically,
- * hoping a racing thread will see, and won't free, a NULL pointer.
+ * but we minimize the window by clearing the free address, hoping a
+ * racing thread will see, and won't free, a NULL pointer.
*/
- p = *(void **)p_arg;
*(void **)p_arg = NULL;
- if (p != NULL) /* ANSI C free semantics */
- free(p);
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ */
+ if (session != NULL)
+ WT_CSTAT_INCR(session, memory_free);
+
+ free(p);
}
diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c
index abc71a73324..2871aa9a21f 100644
--- a/src/os_posix/os_fsync.c
+++ b/src/os_posix/os_fsync.c
@@ -19,8 +19,8 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
WT_VERBOSE_RET(session, fileops, "%s: fsync", fh->name);
WT_SYSCALL_RETRY(fsync(fh->fd), ret);
- if (ret == 0)
- return (0);
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "%s fsync error", fh->name);
- WT_RET_MSG(session, ret, "%s fsync error", fh->name);
+ return (0);
}
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index fd321b7759c..f35d976aab5 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -59,12 +59,13 @@ __wt_open(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_FH *fh;
mode_t mode;
- int f, fd, matched;
+ int direct_io, f, fd, matched;
const char *path;
conn = S2C(session);
fh = NULL;
fd = -1;
+ direct_io = 0;
WT_VERBOSE_RET(session, fileops, "%s: open", name);
@@ -113,13 +114,18 @@ __wt_open(WT_SESSION_IMPL *session,
mode = 0;
#ifdef O_DIRECT
- if (is_tree && FLD_ISSET(conn->direct_io, WT_DIRECTIO_DATA))
+ if (is_tree && FLD_ISSET(conn->direct_io, WT_DIRECTIO_DATA)) {
f |= O_DIRECT;
+ direct_io = 1;
+ }
#endif
WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret);
if (ret != 0)
- WT_ERR_MSG(session, ret, "%s", name);
+ WT_ERR_MSG(session, ret,
+ direct_io ?
+ "%s: open failed with direct I/O configured, some "
+ "filesystem types do not support direct I/O" : "%s", name);
#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
/*
@@ -147,6 +153,11 @@ __wt_open(WT_SESSION_IMPL *session,
fh->fd = fd;
fh->refcnt = 1;
+#ifdef O_DIRECT
+ if (f & O_DIRECT)
+ fh->direct_io = 1;
+#endif
+
/* Set the file's size. */
WT_ERR(__wt_filesize(session, fh, &fh->file_size));
diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c
index 026a1e9740b..ccd163b1ac6 100644
--- a/src/os_posix/os_remove.c
+++ b/src/os_posix/os_remove.c
@@ -8,23 +8,23 @@
#include "wt_internal.h"
/*
- * __wt_remove --
- * Remove a file.
+ * __remove_file_check --
+ * Check if the file is currently open before removing it.
*/
-int
-__wt_remove(WT_SESSION_IMPL *session, const char *name)
+static inline void
+__remove_file_check(WT_SESSION_IMPL *session, const char *name)
{
+#ifdef HAVE_DIAGNOSTIC
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
WT_FH *fh;
- const char *path;
conn = S2C(session);
fh = NULL;
- WT_VERBOSE_RET(session, fileops, "%s: remove", name);
-
- /* If the file is open, close/free it. */
+ /*
+ * Check if the file is open: it's an error if it is, since a higher
+ * level should have closed it before removing.
+ */
__wt_spin_lock(session, &conn->fh_lock);
TAILQ_FOREACH(fh, &conn->fhqh, q) {
if (strcmp(name, fh->name) == 0)
@@ -32,8 +32,26 @@ __wt_remove(WT_SESSION_IMPL *session, const char *name)
}
__wt_spin_unlock(session, &conn->fh_lock);
- /* This should be caught at a higher level. */
WT_ASSERT(session, fh == NULL);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(name);
+#endif
+}
+
+/*
+ * __wt_remove --
+ * Remove a file.
+ */
+int
+__wt_remove(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_DECL_RET;
+ const char *path;
+
+ WT_VERBOSE_RET(session, fileops, "%s: remove", name);
+
+ __remove_file_check(session, name);
WT_RET(__wt_filename(session, name, &path));
diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c
index c355b054cb6..2b83d961592 100644
--- a/src/os_posix/os_rw.c
+++ b/src/os_posix/os_rw.c
@@ -21,13 +21,13 @@ __wt_read(WT_SESSION_IMPL *session,
"%s: read %" PRIu32 " bytes at offset %" PRIuMAX,
fh->name, bytes, (uintmax_t)offset);
- if (pread(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes)
- return (0);
+ if (pread(fh->fd, buf, (size_t)bytes, offset) != (ssize_t)bytes)
+ WT_RET_MSG(session, __wt_errno(),
+ "%s read error: failed to read %" PRIu32
+ " bytes at offset %" PRIuMAX,
+ fh->name, bytes, (uintmax_t)offset);
- WT_RET_MSG(session, __wt_errno(),
- "%s read error: failed to read %" PRIu32 " bytes at offset %"
- PRIuMAX,
- fh->name, bytes, (uintmax_t)offset);
+ return (0);
}
/*
@@ -44,11 +44,11 @@ __wt_write(WT_SESSION_IMPL *session,
"%s: write %" PRIu32 " bytes at offset %" PRIuMAX,
fh->name, bytes, (uintmax_t)offset);
- if (pwrite(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes)
- return (0);
+ if (pwrite(fh->fd, buf, (size_t)bytes, offset) != (ssize_t)bytes)
+ WT_RET_MSG(session, __wt_errno(),
+ "%s write error: failed to write %" PRIu32
+ " bytes at offset %" PRIuMAX,
+ fh->name, bytes, (uintmax_t)offset);
- WT_RET_MSG(session, __wt_errno(),
- "%s write error: failed to write %" PRIu32 " bytes at offset %"
- PRIuMAX,
- fh->name, bytes, (uintmax_t)offset);
+ return (0);
}
diff --git a/src/packing/packing_api.c b/src/packing/pack_api.c
index 143eee445eb..143eee445eb 100644
--- a/src/packing/packing_api.c
+++ b/src/packing/pack_api.c
diff --git a/src/packing/packing.c b/src/packing/pack_impl.c
index d0db6bf4128..a5a4a75ce67 100644
--- a/src/packing/packing.c
+++ b/src/packing/pack_impl.c
@@ -21,8 +21,6 @@ __wt_struct_check(WT_SESSION_IMPL *session,
WT_PACK_VALUE pv;
int fields;
- WT_CLEAR(pv); /* -Wuninitialized. */
-
WT_RET(__pack_initn(session, &pack, fmt, len));
for (fields = 0; (ret = __pack_next(&pack, &pv)) == 0; fields++)
@@ -57,8 +55,6 @@ __wt_struct_sizev(
WT_PACK_VALUE pv;
size_t total;
- WT_CLEAR(pv); /* -Wuninitialized */
-
WT_RET(__pack_init(session, &pack, fmt));
for (total = 0; __pack_next(&pack, &pv) == 0;) {
@@ -99,8 +95,6 @@ __wt_struct_packv(WT_SESSION_IMPL *session,
WT_PACK_VALUE pv;
uint8_t *p, *end;
- WT_CLEAR(pv); /* -Wuninitialized */
-
WT_RET(__pack_init(session, &pack, fmt));
p = buffer;
@@ -111,6 +105,7 @@ __wt_struct_packv(WT_SESSION_IMPL *session,
WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p)));
}
+ /* Be paranoid - __pack_write should never overflow. */
WT_ASSERT(session, p <= end);
if (ret != WT_NOTFOUND)
@@ -154,13 +149,13 @@ __wt_struct_unpackv(WT_SESSION_IMPL *session,
p = buffer;
end = p + size;
- WT_CLEAR(pv.u.item); /* GCC 4.6 lint */
while ((ret = __pack_next(&pack, &pv)) == 0) {
WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
WT_UNPACK_PUT(session, pv, ap);
}
+ /* Be paranoid - __unpack_read should never overflow. */
WT_ASSERT(session, p <= end);
if (ret != WT_NOTFOUND)
diff --git a/src/packing/pack_stream.c b/src/packing/pack_stream.c
new file mode 100644
index 00000000000..2e8c4a22040
--- /dev/null
+++ b/src/packing/pack_stream.c
@@ -0,0 +1,288 @@
+/*-
+ * Copyright (c) 2008-2013 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Streaming interface to packing.
+ *
+ * This allows applications to pack or unpack records one field at a time.
+ */
+struct __wt_pack_stream {
+ WT_PACK pack; /* Format state: set by __pack_init, walked by __pack_next */
+ uint8_t *end, *p, *start; /* Buffer end, current position, buffer start */
+};
+
+/*
+ * wiredtiger_pack_start --
+ * Open a stream for packing into a caller-supplied buffer of len bytes.
+ */
+int
+wiredtiger_pack_start(WT_SESSION *wt_session,
+ const char *format, void *buffer, size_t len, WT_PACK_STREAM **psp)
+{
+ WT_DECL_RET;
+ WT_PACK_STREAM *ps;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ WT_RET(__wt_calloc_def(session, 1, &ps));
+ WT_ERR(__pack_init(session, &ps->pack, format));
+ ps->p = ps->start = buffer;
+ ps->end = ps->p + len;
+ *psp = ps;
+
+ if (0) { /* Body runs only when jumped to via the err label. */
+err: (void)wiredtiger_pack_close(ps, NULL);
+ }
+ return (ret);
+}
+
+/*
+ * wiredtiger_unpack_start --
+ * Open a stream for unpacking; delegates to wiredtiger_pack_start.
+ */
+int
+wiredtiger_unpack_start(WT_SESSION *wt_session, const char *format,
+ const void *buffer, size_t size, WT_PACK_STREAM **psp)
+{
+ return (wiredtiger_pack_start(
+ wt_session, format, (void *)buffer, size, psp));
+}
+
+/*
+ * wiredtiger_pack_close --
+ *	Close a packing stream, optionally returning the bytes used so far.
+ */
+int
+wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp)
+{
+	/* A NULL stream is tolerated below, so don't dereference it here. */
+	if (usedp != NULL)
+		*usedp = ps == NULL ? 0 : WT_PTRDIFF(ps->p, ps->start);
+	if (ps != NULL)
+		__wt_free(ps->pack.session, ps);
+
+	return (0);
+}
+
+/*
+ * wiredtiger_pack_item --
+ * Pack an item for a 'U' or 'u' format field at the stream position.
+ */
+int
+wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+ WT_PACK_VALUE pv;
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'U':
+ case 'u':
+ pv.u.item.data = item->data;
+ pv.u.item.size = item->size;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_int --
+ * Pack a signed integer for a 'b', 'h', 'i', 'l' or 'q' format field.
+ */
+int
+wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i)
+{
+ WT_PACK_VALUE pv;
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ pv.u.i = i;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_str --
+ * Pack a string for an 'S' or 's' format field at the stream position.
+ */
+int
+wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s)
+{
+ WT_PACK_VALUE pv;
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'S':
+ case 's':
+ pv.u.s = s;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_uint --
+ * Pack an unsigned integer for a B, H, I, L, Q, R, r or t format field.
+ */
+int
+wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u)
+{
+ WT_PACK_VALUE pv;
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'R':
+ case 'r':
+ case 't':
+ pv.u.u = u;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_item --
+ * Unpack an item for a 'U' or 'u' format field at the stream position.
+ */
+int
+wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+ WT_PACK_VALUE pv;
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'U':
+ case 'u':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ item->data = pv.u.item.data;
+ item->size = pv.u.item.size;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_int --
+ * Unpack a signed integer for a 'b', 'h', 'i', 'l' or 'q' format field.
+ */
+int
+wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip)
+{
+ WT_PACK_VALUE pv;
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *ip = pv.u.i;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_str --
+ * Unpack a string for an 'S' or 's' format field at the stream position.
+ */
+int
+wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp)
+{
+ WT_PACK_VALUE pv;
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'S':
+ case 's':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *sp = pv.u.s;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_uint --
+ * Unpack an unsigned integer for a B, H, I, L, Q, R, r or t format field.
+ */
+int
+wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up)
+{
+ WT_PACK_VALUE pv;
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'R':
+ case 'r':
+ case 't':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *up = pv.u.u;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c
index e401f5cf484..4a6b75d5dd3 100644
--- a/src/schema/schema_create.c
+++ b/src/schema/schema_create.c
@@ -14,11 +14,10 @@ __create_file(WT_SESSION_IMPL *session,
WT_DECL_ITEM(val);
WT_DECL_RET;
int is_metadata;
- const char *cfg[] = API_CONF_DEFAULTS(session, create, config);
const char *filecfg[4] = API_CONF_DEFAULTS(file, meta, config);
- const char *filename, *treeconf;
+ const char *fileconf, *filename;
- treeconf = NULL;
+ fileconf = NULL;
is_metadata = strcmp(uri, WT_METADATA_URI) == 0;
@@ -28,14 +27,14 @@ __create_file(WT_SESSION_IMPL *session,
/* Check if the file already exists. */
if (!is_metadata && (ret =
- __wt_metadata_read(session, uri, &treeconf)) != WT_NOTFOUND) {
+ __wt_metadata_read(session, uri, &fileconf)) != WT_NOTFOUND) {
if (exclusive)
WT_TRET(EEXIST);
goto err;
}
/* Create the file. */
- WT_ERR(__wt_btree_create(session, filename));
+ WT_ERR(__wt_block_manager_create(session, filename));
if (WT_META_TRACKING(session))
WT_ERR(__wt_meta_track_fileop(session, NULL, uri));
@@ -50,8 +49,8 @@ __create_file(WT_SESSION_IMPL *session,
WT_BTREE_MAJOR_VERSION, WT_BTREE_MINOR_VERSION));
filecfg[2] = val->data;
filecfg[3] = NULL;
- WT_ERR(__wt_config_collapse(session, filecfg, &treeconf));
- if ((ret = __wt_metadata_insert(session, uri, treeconf)) != 0) {
+ WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
+ if ((ret = __wt_metadata_insert(session, uri, fileconf)) != 0) {
if (ret == WT_DUPLICATE_KEY)
ret = EEXIST;
goto err;
@@ -59,20 +58,23 @@ __create_file(WT_SESSION_IMPL *session,
}
/*
- * Open the file to check that it was setup correctly.
+ * Open the file to check that it was setup correctly. We don't need
+ * to pass the configuration, we just wrote the collapsed configuration
+ * into the metadata file, and it's going to be read/used by underlying
+ * functions.
*
* Keep the handle exclusive until it is released at the end of the
* call, otherwise we could race with a drop.
*/
WT_ERR(__wt_conn_btree_get(
- session, uri, NULL, cfg, WT_DHANDLE_EXCLUSIVE));
+ session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
if (WT_META_TRACKING(session))
WT_ERR(__wt_meta_track_handle_lock(session, 1));
else
WT_ERR(__wt_session_release_btree(session));
err: __wt_scr_free(&val);
- __wt_free(session, treeconf);
+ __wt_free(session, fileconf);
return (ret);
}
@@ -143,7 +145,7 @@ __create_colgroup(WT_SESSION_IMPL *session,
/* Make sure the column group is referenced from the table. */
if (cgname != NULL && (ret =
__wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0)
- WT_RET_MSG(session, EINVAL,
+ WT_ERR_MSG(session, EINVAL,
"Column group '%s' not found in table '%.*s'",
cgname, (int)tlen, tablename);
@@ -203,6 +205,8 @@ err: __wt_free(session, cgconf);
__wt_buf_free(session, &confbuf);
__wt_buf_free(session, &fmt);
__wt_buf_free(session, &namebuf);
+
+ __wt_schema_release_table(session, table);
return (ret);
}
@@ -355,6 +359,8 @@ err: __wt_free(session, idxconf);
__wt_buf_free(session, &extra_cols);
__wt_buf_free(session, &fmt);
__wt_buf_free(session, &namebuf);
+
+ __wt_schema_release_table(session, table);
return (ret);
}
@@ -381,8 +387,10 @@ __create_table(WT_SESSION_IMPL *session,
return (EINVAL);
if ((ret = __wt_schema_get_table(session,
- tablename, strlen(tablename), 0, &table)) == 0)
+ tablename, strlen(tablename), 0, &table)) == 0) {
+ __wt_schema_release_table(session, table);
return (exclusive ? EEXIST : 0);
+ }
WT_RET_NOTFOUND_OK(ret);
WT_RET(__wt_config_gets(session, cfg, "colgroups", &cval));
@@ -416,9 +424,13 @@ __create_table(WT_SESSION_IMPL *session,
}
if (0) {
-err: if (table != NULL)
- WT_TRET(__wt_schema_remove_table(session, table));
+err: if (table != NULL) {
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+ }
}
+ if (table != NULL)
+ __wt_schema_release_table(session, table);
__wt_free(session, cgname);
__wt_free(session, tableconf);
return (ret);
diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c
index c6109a6f020..db0fcbb3f5d 100644
--- a/src/schema/schema_drop.c
+++ b/src/schema/schema_drop.c
@@ -117,6 +117,7 @@ __drop_table(
name = uri;
(void)WT_PREFIX_SKIP(name, "table:");
+ table = NULL;
WT_ERR(__wt_schema_get_table(session, name, strlen(name), 1, &table));
/* Drop the column groups. */
@@ -136,13 +137,16 @@ __drop_table(
WT_ERR(__wt_schema_drop(session, idx->source, cfg));
}
- WT_ERR(__wt_schema_remove_table(session, table));
+ __wt_schema_remove_table(session, table);
+ table = NULL;
/* Remove the metadata entry (ignore missing items). */
WT_ERR(__wt_metadata_remove(session, uri));
err: if (force && ret == WT_NOTFOUND)
ret = 0;
+ if (table != NULL)
+ __wt_schema_release_table(session, table);
return (ret);
}
@@ -185,6 +189,9 @@ __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
if (ret == WT_NOTFOUND)
ret = force ? 0 : ENOENT;
+ /* Bump the schema generation so that stale data is ignored. */
+ ++S2C(session)->schema_gen;
+
WT_TRET(__wt_meta_track_off(session, ret != 0));
return (ret);
diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c
index 61dc88288eb..6508d9696d4 100644
--- a/src/schema/schema_list.c
+++ b/src/schema/schema_list.c
@@ -19,6 +19,9 @@ __schema_add_table(WT_SESSION_IMPL *session,
WT_RET(__wt_schema_open_table(session, name, namelen, &table));
+ /* Copy the schema generation into the new table. */
+ table->schema_gen = S2C(session)->schema_gen;
+
TAILQ_INSERT_HEAD(&session->tables, table, q);
*tablep = table;
@@ -36,10 +39,28 @@ __schema_find_table(WT_SESSION_IMPL *session,
WT_TABLE *table;
const char *tablename;
+restart:
TAILQ_FOREACH(table, &session->tables, q) {
tablename = table->name;
(void)WT_PREFIX_SKIP(tablename, "table:");
if (WT_STRING_MATCH(tablename, name, namelen)) {
+ /*
+ * Ignore stale tables.
+ *
+ * XXX: should be managed the same as btree handles,
+ * with a local cache in each session and a shared list
+ * in the connection. There is still a race here
+ * between checking the generation and opening the
+ * first column group.
+ */
+ if (table->schema_gen != S2C(session)->schema_gen) {
+ if (table->refcnt == 0) {
+ __wt_schema_remove_table(
+ session, table);
+ goto restart;
+ }
+ continue;
+ }
*tablep = table;
return (0);
}
@@ -59,7 +80,6 @@ __wt_schema_get_table(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_TABLE *table;
- table = NULL;
ret = __schema_find_table(session, name, namelen, &table);
if (ret == WT_NOTFOUND)
@@ -72,6 +92,7 @@ __wt_schema_get_table(WT_SESSION_IMPL *session,
"until all column groups are created",
table->name);
+ ++table->refcnt;
*tablep = table;
}
@@ -79,6 +100,17 @@ __wt_schema_get_table(WT_SESSION_IMPL *session,
}
 /*
+ * __wt_schema_release_table --
+ * Release a table handle, dropping one reference (refcnt must be > 0).
+ */
+void
+__wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_ASSERT(session, table->refcnt > 0);
+ --table->refcnt;
+}
+
+/*
* __wt_schema_destroy_colgroup --
* Free a column group handle.
*/
@@ -147,28 +179,25 @@ __wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table)
* __wt_schema_remove_table --
* Remove the table handle from the session, closing if necessary.
*/
-int
+void
__wt_schema_remove_table(
WT_SESSION_IMPL *session, WT_TABLE *table)
{
+ WT_ASSERT(session, table->refcnt <= 1);
+
TAILQ_REMOVE(&session->tables, table, q);
__wt_schema_destroy_table(session, table);
-
- return (0);
}
/*
* __wt_schema_close_tables --
* Close all of the tables in a session.
*/
-int
+void
__wt_schema_close_tables(WT_SESSION_IMPL *session)
{
- WT_DECL_RET;
WT_TABLE *table;
while ((table = TAILQ_FIRST(&session->tables)) != NULL)
- WT_TRET(__wt_schema_remove_table(session, table));
-
- return (ret);
+ __wt_schema_remove_table(session, table);
}
diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c
index c624ccd0797..c95ae0b97d8 100644
--- a/src/schema/schema_open.c
+++ b/src/schema/schema_open.c
@@ -431,17 +431,19 @@ __wt_schema_get_colgroup(WT_SESSION_IMPL *session,
WT_RET(__wt_schema_get_table(session,
tablename, WT_PTRDIFF(tend, tablename), 0, &table));
- if (tablep != NULL)
- *tablep = table;
-
for (i = 0; i < WT_COLGROUPS(table); i++) {
colgroup = table->cgroups[i];
if (strcmp(colgroup->name, uri) == 0) {
*colgroupp = colgroup;
+ if (tablep != NULL)
+ *tablep = table;
+ else
+ __wt_schema_release_table(session, table);
return (0);
}
}
+ __wt_schema_release_table(session, table);
WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
}
@@ -453,6 +455,7 @@ int
__wt_schema_get_index(WT_SESSION_IMPL *session,
const char *uri, WT_TABLE **tablep, WT_INDEX **indexp)
{
+ WT_DECL_RET;
WT_INDEX *idx;
WT_TABLE *table;
const char *tablename, *tend;
@@ -468,22 +471,26 @@ __wt_schema_get_index(WT_SESSION_IMPL *session,
WT_RET(__wt_schema_get_table(session,
tablename, WT_PTRDIFF(tend, tablename), 0, &table));
- if (tablep != NULL)
- *tablep = table;
-
/* Try to find the index in the table. */
for (i = 0; i < table->nindices; i++) {
idx = table->indices[i];
if (strcmp(idx->name, uri) == 0) {
+ if (tablep != NULL)
+ *tablep = table;
+ else
+ __wt_schema_release_table(session, table);
*indexp = idx;
return (0);
}
}
/* Otherwise, open it. */
- WT_RET(__wt_schema_open_index(
+ WT_ERR(__wt_schema_open_index(
session, table, tend + 1, strlen(tend + 1), indexp));
+err: __wt_schema_release_table(session, table);
+ WT_RET(ret);
+
if (*indexp != NULL)
return (0);
diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c
index c336ff95dae..00dddbb4058 100644
--- a/src/schema/schema_rename.c
+++ b/src/schema/schema_rename.c
@@ -102,6 +102,7 @@ __rename_tree(WT_SESSION_IMPL *session,
"expected a 'colgroup:' or 'index:' source: '%s'", name);
suffix = strchr(name, ':');
+ /* An existing table should have a well formed name. */
WT_ASSERT(session, suffix != NULL);
suffix = strchr(suffix + 1, ':');
@@ -187,16 +188,17 @@ __rename_table(WT_SESSION_IMPL *session,
/* Rename the column groups. */
for (i = 0; i < WT_COLGROUPS(table); i++)
- WT_RET(__rename_tree(session, table, newuri,
+ WT_ERR(__rename_tree(session, table, newuri,
table->cgroups[i]->name, cfg));
/* Rename the indices. */
- WT_RET(__wt_schema_open_indices(session, table));
+ WT_ERR(__wt_schema_open_indices(session, table));
for (i = 0; i < table->nindices; i++)
- WT_RET(__rename_tree(session, table, newuri,
+ WT_ERR(__rename_tree(session, table, newuri,
table->indices[i]->name, cfg));
- WT_RET(__wt_schema_remove_table(session, table));
+ __wt_schema_remove_table(session, table);
+ table = NULL;
/* Rename the table. */
WT_ERR(__wt_scr_alloc(session, 0, &buf));
@@ -205,6 +207,8 @@ __rename_table(WT_SESSION_IMPL *session,
WT_ERR(__wt_metadata_insert(session, newuri, value));
err: __wt_scr_free(&buf);
+ if (table != NULL)
+ __wt_schema_release_table(session, table);
return (ret);
}
@@ -247,6 +251,9 @@ __wt_schema_rename(WT_SESSION_IMPL *session,
} else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0)
ret = dsrc->rename(dsrc, &session->iface, uri, newuri, cfg);
+ /* Bump the schema generation so that stale data is ignored. */
+ ++S2C(session)->schema_gen;
+
WT_TRET(__wt_meta_track_off(session, ret != 0));
/* If we didn't find a metadata entry, map that error to ENOENT. */
diff --git a/src/schema/schema_stat.c b/src/schema/schema_stat.c
index 2631b97a1c1..4bec00165c0 100644
--- a/src/schema/schema_stat.c
+++ b/src/schema/schema_stat.c
@@ -74,15 +74,16 @@ __curstat_table_init(WT_SESSION_IMPL *session,
WT_UNUSED(flags);
name = uri + strlen("table:");
- WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));
WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_schema_get_table(session, name, strlen(name), 0, &table));
/* Clear the statistics we are about to recalculate. */
if (cst->stats != NULL) {
__wt_stat_clear_dsrc_stats(cst->stats);
stats = (WT_DSRC_STATS *)cst->stats;
} else {
- WT_ERR(__wt_stat_alloc_dsrc_stats(session, &stats));
+ WT_ERR(__wt_calloc_def(session, 1, &stats));
+ __wt_stat_init_dsrc_stats(stats);
cst->stats_first = cst->stats = (WT_STATS *)stats;
cst->stats_count = sizeof(*stats) / sizeof(WT_STATS);
}
@@ -127,6 +128,7 @@ __curstat_table_init(WT_SESSION_IMPL *session,
}
err: __wt_scr_free(&buf);
+ __wt_schema_release_table(session, table);
return (ret);
}
diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c
index 4caa4604a22..ece2ecbf049 100644
--- a/src/schema/schema_truncate.c
+++ b/src/schema/schema_truncate.c
@@ -25,7 +25,7 @@ __truncate_file(WT_SESSION_IMPL *session, const char *name)
/* Delete the root address and truncate the file. */
WT_RET(__wt_meta_checkpoint_clear(session, name));
- WT_RET(__wt_btree_truncate(session, filename));
+ WT_RET(__wt_block_manager_truncate(session, filename));
return (0);
}
@@ -43,8 +43,8 @@ __truncate_table(WT_SESSION_IMPL *session, const char *name)
const char *hname;
u_int i;
- WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));
WT_RET(__wt_scr_alloc(session, 0, &namebuf));
+ WT_ERR(__wt_schema_get_table(session, name, strlen(name), 0, &table));
/* Truncate the column groups. */
for (i = 0; i < WT_COLGROUPS(table); i++) {
@@ -78,6 +78,7 @@ __truncate_table(WT_SESSION_IMPL *session, const char *name)
}
err: __wt_scr_free(&namebuf);
+ __wt_schema_release_table(session, table);
return (ret);
}
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index ad9480bc0fe..619432ea9ac 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -25,6 +25,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
const char *tablename;
u_int i;
+ table = NULL;
tablename = uri;
/* Get the btree handle(s) and call the underlying function. */
@@ -52,18 +53,20 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
for (i = 0; i < WT_COLGROUPS(table); i++) {
colgroup = table->cgroups[i];
- WT_RET(__wt_schema_worker(
+ WT_ERR(__wt_schema_worker(
session, colgroup->source, func, cfg, open_flags));
}
- WT_RET(__wt_schema_open_indices(session, table));
+ WT_ERR(__wt_schema_open_indices(session, table));
for (i = 0; i < table->nindices; i++) {
idx = table->indices[i];
- WT_RET(__wt_schema_worker(
+ WT_ERR(__wt_schema_worker(
session, idx->source, func, cfg, open_flags));
}
} else
return (__wt_bad_object_type(session, uri));
+err: if (table != NULL)
+ __wt_schema_release_table(session, table);
return (ret);
}
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 5339dcf0800..8141a28e545 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -38,7 +38,7 @@ __session_close_cache(WT_SESSION_IMPL *session)
while ((dhandle_cache = TAILQ_FIRST(&session->dhandles)) != NULL)
WT_TRET(__wt_session_discard_btree(session, dhandle_cache));
- WT_TRET(__wt_schema_close_tables(session));
+ __wt_schema_close_tables(session);
return (ret);
}
@@ -595,6 +595,13 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config)
WT_ERR(__session_reset_cursors(session));
+ /*
+ * Now there are no cursors open and no transaction active in this
+ * thread. Check if the cache is full: if we have to block for
+ * eviction, this is the best time to do it.
+ */
+ WT_ERR(__wt_cache_full_check(session));
+
ret = __wt_txn_begin(session, cfg);
err: API_END(session);
diff --git a/src/support/filename.c b/src/support/filename.c
index 1b2223a50ef..6b3d7ff17d2 100644
--- a/src/support/filename.c
+++ b/src/support/filename.c
@@ -8,12 +8,36 @@
#include "wt_internal.h"
/*
+ * __wt_absolute_path --
+ * Return non-zero if a file name is an absolute path.
+ */
+int
+__wt_absolute_path(const char *path)
+{
+ return (path[0] == '/' ? 1 : 0);
+}
+
+/*
* __wt_filename --
- * Build a filename in a scratch buffer.
+ * Build a file name in a scratch buffer, automatically calculating the
+ * length of the file name.
*/
int
__wt_filename(WT_SESSION_IMPL *session, const char *name, const char **path)
{
+ return __wt_nfilename(session, name, strlen(name), path);
+}
+
+/*
+ * __wt_nfilename --
+ * Build a file name in a scratch buffer. If the name is already an
+ * absolute path duplicate it, otherwise generate a path relative to the
+ * connection home directory.
+ */
+int
+__wt_nfilename(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, const char **path)
+{
WT_CONNECTION_IMPL *conn;
size_t len;
char *buf;
@@ -21,10 +45,14 @@ __wt_filename(WT_SESSION_IMPL *session, const char *name, const char **path)
conn = S2C(session);
*path = NULL;
- len = strlen(conn->home) + 1 + strlen(name) + 1;
- WT_RET(__wt_calloc(session, 1, len, &buf));
- snprintf(buf, len, "%s/%s", conn->home, name);
+ if (__wt_absolute_path(name))
+ WT_RET(__wt_strndup(session, name, namelen, path));
+ else {
+ len = strlen(conn->home) + 1 + namelen + 1;
+ WT_RET(__wt_calloc(session, 1, len, &buf));
+ snprintf(buf, len, "%s/%.*s", conn->home, (int)namelen, name);
+ *path = buf;
+ }
- *path = buf;
return (0);
}
diff --git a/src/support/hazard.c b/src/support/hazard.c
index d5f49f89eef..9db7a5d6c95 100644
--- a/src/support/hazard.c
+++ b/src/support/hazard.c
@@ -88,7 +88,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
* page to be evicted and a different page read into the same
* memory, so the pointer hasn't changed but the contents have.
* That's OK, we found this page using the tree's key space,
- * whatever page we find here is the page page for us to use.)
+ * whatever page we find here is the page for us to use.)
*/
if (ref->page == hp->page &&
(ref->state == WT_REF_MEM ||
@@ -153,7 +153,7 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
* Perform the check here since we want to do it when
* we are about to release the hazard reference.
*/
- (void)__wt_eviction_page_force(session, page);
+ __wt_eviction_page_force(session, page);
/*
* We don't publish the hazard pointer clear in the
diff --git a/src/support/scratch.c b/src/support/scratch.c
index 9cda695ec89..e0e76348876 100644
--- a/src/support/scratch.c
+++ b/src/support/scratch.c
@@ -302,7 +302,7 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session,
/*
* If we find a buffer that's not in-use, check its size: we
- * want the the smallest buffer larger than the requested size,
+ * want the smallest buffer larger than the requested size,
* or the largest buffer if none are large enough.
*/
if (best == NULL ||
diff --git a/src/support/stat.c b/src/support/stat.c
index 08f7f2a9f30..7df21d4719e 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -2,13 +2,9 @@
#include "wt_internal.h"
-int
-__wt_stat_alloc_dsrc_stats(WT_SESSION_IMPL *session, WT_DSRC_STATS **statsp)
+void
+__wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
{
- WT_DSRC_STATS *stats;
-
- WT_RET(__wt_calloc_def(session, 1, &stats));
-
stats->block_alloc.desc = "blocks allocated";
stats->block_allocsize.desc =
"block manager file allocation unit size";
@@ -47,13 +43,23 @@ __wt_stat_alloc_dsrc_stats(WT_SESSION_IMPL *session, WT_DSRC_STATS **statsp)
stats->btree_row_leaf.desc = "row-store leaf pages";
stats->cache_bytes_read.desc = "bytes read into cache";
stats->cache_bytes_write.desc = "bytes written from cache";
+ stats->cache_eviction_checkpoint.desc =
+ "cache: checkpoint blocked page eviction";
stats->cache_eviction_clean.desc = "unmodified pages evicted";
stats->cache_eviction_dirty.desc = "modified pages evicted";
stats->cache_eviction_fail.desc =
"data source pages selected for eviction unable to be evicted";
+ stats->cache_eviction_force.desc =
+ "cache: pages queued for forced eviction";
stats->cache_eviction_hazard.desc =
- "eviction unable to acquire hazard pointer";
+ "cache: hazard pointer blocked page eviction";
stats->cache_eviction_internal.desc = "internal pages evicted";
+ stats->cache_eviction_merge.desc =
+ "cache: internal page merge operations completed";
+ stats->cache_eviction_merge_fail.desc =
+ "cache: internal page merge attempts that could not complete";
+ stats->cache_eviction_merge_levels.desc =
+ "cache: internal levels merged";
stats->cache_overflow_value.desc = "overflow values cached in memory";
stats->cache_read.desc = "pages read into cache";
stats->cache_read_overflow.desc = "overflow pages read into cache";
@@ -95,19 +101,18 @@ __wt_stat_alloc_dsrc_stats(WT_SESSION_IMPL *session, WT_DSRC_STATS **statsp)
stats->rec_pages_eviction.desc =
"page reconciliation calls for eviction";
stats->rec_skipped_update.desc =
- "page reconciliation failed when an update could not be included";
+ "reconciliation failed because an update could not be included";
stats->rec_split_intl.desc = "reconciliation internal pages split";
stats->rec_split_leaf.desc = "reconciliation leaf pages split";
+ stats->rec_split_max.desc =
+ "reconciliation maximum number of splits created by for a page";
stats->session_compact.desc = "object compaction";
stats->txn_update_conflict.desc = "update conflicts";
stats->txn_write_conflict.desc = "write generation conflicts";
-
- *statsp = stats;
- return (0);
}
void
-__wt_stat_clear_dsrc_stats(WT_STATS *stats_arg)
+__wt_stat_clear_dsrc_stats(void *stats_arg)
{
WT_DSRC_STATS *stats;
@@ -145,11 +150,16 @@ __wt_stat_clear_dsrc_stats(WT_STATS *stats_arg)
stats->btree_row_leaf.v = 0;
stats->cache_bytes_read.v = 0;
stats->cache_bytes_write.v = 0;
+ stats->cache_eviction_checkpoint.v = 0;
stats->cache_eviction_clean.v = 0;
stats->cache_eviction_dirty.v = 0;
stats->cache_eviction_fail.v = 0;
+ stats->cache_eviction_force.v = 0;
stats->cache_eviction_hazard.v = 0;
stats->cache_eviction_internal.v = 0;
+ stats->cache_eviction_merge.v = 0;
+ stats->cache_eviction_merge_fail.v = 0;
+ stats->cache_eviction_merge_levels.v = 0;
stats->cache_overflow_value.v = 0;
stats->cache_read.v = 0;
stats->cache_read_overflow.v = 0;
@@ -186,18 +196,15 @@ __wt_stat_clear_dsrc_stats(WT_STATS *stats_arg)
stats->rec_skipped_update.v = 0;
stats->rec_split_intl.v = 0;
stats->rec_split_leaf.v = 0;
+ stats->rec_split_max.v = 0;
stats->session_compact.v = 0;
stats->txn_update_conflict.v = 0;
stats->txn_write_conflict.v = 0;
}
-int
-__wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session, WT_CONNECTION_STATS **statsp)
+void
+__wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
{
- WT_CONNECTION_STATS *stats;
-
- WT_RET(__wt_calloc_def(session, 1, &stats));
-
stats->block_byte_map_read.desc =
"mapped bytes read by the block manager";
stats->block_byte_read.desc = "bytes read by the block manager";
@@ -211,15 +218,26 @@ __wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session, WT_CONNECTION_STATS *
stats->cache_bytes_max.desc = "cache: maximum bytes configured";
stats->cache_bytes_read.desc = "cache: bytes read into cache";
stats->cache_bytes_write.desc = "cache: bytes written from cache";
+ stats->cache_eviction_checkpoint.desc =
+ "cache: checkpoint blocked page eviction";
stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
stats->cache_eviction_fail.desc =
"cache: pages selected for eviction unable to be evicted";
+ stats->cache_eviction_force.desc =
+ "cache: pages queued for forced eviction";
stats->cache_eviction_hazard.desc =
- "cache: eviction unable to acquire hazard pointer";
+ "cache: hazard pointer blocked page eviction";
stats->cache_eviction_internal.desc = "cache: internal pages evicted";
+ stats->cache_eviction_merge.desc =
+ "cache: internal page merge operations completed";
+ stats->cache_eviction_merge_fail.desc =
+ "cache: internal page merge attempts that could not complete";
+ stats->cache_eviction_merge_levels.desc =
+ "cache: internal levels merged";
stats->cache_eviction_slow.desc =
"cache: eviction server unable to reach eviction goal";
+ stats->cache_eviction_walk.desc = "cache: pages walked for eviction";
stats->cache_pages_dirty.desc =
"cache: tracked dirty pages in the cache";
stats->cache_pages_inuse.desc =
@@ -230,7 +248,13 @@ __wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session, WT_CONNECTION_STATS *
stats->file_open.desc = "files currently open";
stats->memory_allocation.desc = "total heap memory allocations";
stats->memory_free.desc = "total heap memory frees";
+ stats->memory_grow.desc = "total heap memory re-allocations";
stats->read_io.desc = "total read I/Os";
+ stats->rec_pages.desc = "page reconciliation calls";
+ stats->rec_pages_eviction.desc =
+ "page reconciliation calls for eviction";
+ stats->rec_skipped_update.desc =
+ "reconciliation failed because an update could not be included";
stats->rwlock_read.desc = "pthread mutex shared lock read-lock calls";
stats->rwlock_write.desc =
"pthread mutex shared lock write-lock calls";
@@ -242,13 +266,10 @@ __wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session, WT_CONNECTION_STATS *
"transaction failures due to cache overflow";
stats->txn_rollback.desc = "transactions rolled-back";
stats->write_io.desc = "total write I/Os";
-
- *statsp = stats;
- return (0);
}
void
-__wt_stat_clear_connection_stats(WT_STATS *stats_arg)
+__wt_stat_clear_connection_stats(void *stats_arg)
{
WT_CONNECTION_STATS *stats;
@@ -262,12 +283,18 @@ __wt_stat_clear_connection_stats(WT_STATS *stats_arg)
stats->cache_bytes_dirty.v = 0;
stats->cache_bytes_read.v = 0;
stats->cache_bytes_write.v = 0;
+ stats->cache_eviction_checkpoint.v = 0;
stats->cache_eviction_clean.v = 0;
stats->cache_eviction_dirty.v = 0;
stats->cache_eviction_fail.v = 0;
+ stats->cache_eviction_force.v = 0;
stats->cache_eviction_hazard.v = 0;
stats->cache_eviction_internal.v = 0;
+ stats->cache_eviction_merge.v = 0;
+ stats->cache_eviction_merge_fail.v = 0;
+ stats->cache_eviction_merge_levels.v = 0;
stats->cache_eviction_slow.v = 0;
+ stats->cache_eviction_walk.v = 0;
stats->cache_pages_dirty.v = 0;
stats->cache_read.v = 0;
stats->cache_write.v = 0;
@@ -275,7 +302,11 @@ __wt_stat_clear_connection_stats(WT_STATS *stats_arg)
stats->file_open.v = 0;
stats->memory_allocation.v = 0;
stats->memory_free.v = 0;
+ stats->memory_grow.v = 0;
stats->read_io.v = 0;
+ stats->rec_pages.v = 0;
+ stats->rec_pages_eviction.v = 0;
+ stats->rec_skipped_update.v = 0;
stats->rwlock_read.v = 0;
stats->rwlock_write.v = 0;
stats->txn_ancient.v = 0;
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 918ac67d9da..6e2ba2d5e33 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -39,7 +39,7 @@ __txn_sort_snapshot(WT_SESSION_IMPL *session,
txn->snapshot_count = n;
txn->snap_min = (n == 0) ? id : txn->snapshot[0];
txn->snap_max = id;
- WT_ASSERT(session, txn->snap_min != WT_TXN_NONE);
+ WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE);
txn->oldest_snap_min = TXNID_LT(oldest_snap_min, txn->snap_min) ?
oldest_snap_min : txn->snap_min;
}
@@ -51,13 +51,9 @@ __txn_sort_snapshot(WT_SESSION_IMPL *session,
void
__wt_txn_release_snapshot(WT_SESSION_IMPL *session)
{
- WT_TXN *txn;
WT_TXN_STATE *txn_state;
- txn = &session->txn;
txn_state = &S2C(session)->txn_global.states[session->id];
-
- txn->snapshot_count = 0;
txn_state->snap_min = WT_TXN_NONE;
}
@@ -79,9 +75,17 @@ __wt_txn_get_oldest(WT_SESSION_IMPL *session)
conn = S2C(session);
txn = &session->txn;
txn_global = &conn->txn_global;
+
oldest_snap_min =
(txn->id != WT_TXN_NONE) ? txn->id : txn_global->current;
+ /* If nothing has changed since last time, we're done. */
+ if (txn->last_oldest_gen == txn_global->gen &&
+ txn->last_oldest_id == oldest_snap_min)
+ return;
+ txn->last_oldest_gen = txn_global->gen;
+ txn->last_oldest_id = oldest_snap_min;
+
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = 0, s = txn_global->states;
i < session_cnt;
@@ -100,7 +104,7 @@ __wt_txn_get_oldest(WT_SESSION_IMPL *session)
*/
void
__wt_txn_get_snapshot(
- WT_SESSION_IMPL *session, wt_txnid_t my_id, wt_txnid_t max_id)
+ WT_SESSION_IMPL *session, wt_txnid_t my_id, wt_txnid_t max_id, int force)
{
WT_CONNECTION_IMPL *conn;
WT_TXN *txn;
@@ -114,9 +118,18 @@ __wt_txn_get_snapshot(
txn_global = &conn->txn_global;
txn_state = &txn_global->states[session->id];
+ /* If nothing has changed since last time, we're done. */
+ if (!force && txn->last_id == txn_global->current &&
+ txn->last_gen == txn_global->gen) {
+ txn_state->snap_min = txn->snap_min;
+ return;
+ }
+
do {
/* Take a copy of the current session ID. */
- current_id = oldest_snap_min = txn_global->current;
+ txn->last_gen = txn->last_oldest_gen = txn_global->gen;
+ txn->last_id = oldest_snap_min = current_id =
+ txn_global->current;
/* Copy the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -153,39 +166,19 @@ __wt_txn_get_snapshot(
/*
* __wt_txn_get_evict_snapshot --
* Set up a snapshot in the current transaction for eviction.
- * No changes that are invisible to any active transaction can be evicted.
+ * Only changes that are visible to all active transactions can be evicted.
*/
void
__wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s;
- wt_txnid_t current_id, id, oldest_snap_min;
- uint32_t i, session_cnt;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
-
- do {
- /* Take a copy of the current session ID. */
- current_id = oldest_snap_min = txn_global->current;
+ WT_TXN *txn;
- /* Walk the array of concurrent transactions. */
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++)
- if ((id = s->snap_min) != WT_TXN_NONE &&
- TXNID_LT(id, oldest_snap_min))
- oldest_snap_min = id;
+ txn = &session->txn;
- /*
- * Ensure the snapshot reads are scheduled before re-checking
- * the global current ID.
- */
- WT_READ_BARRIER();
- } while (current_id != txn_global->current);
+ __wt_txn_get_oldest(session);
+ __txn_sort_snapshot(
+ session, 0, txn->oldest_snap_min, txn->oldest_snap_min);
- __txn_sort_snapshot(session, 0, oldest_snap_min, oldest_snap_min);
/*
* Note that we carefully don't update the global table with this
* snap_min value: there is already a running transaction in this
@@ -237,7 +230,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
* If two threads race to allocate an ID, only the latest ID
* will proceed. The winning thread can be sure its snapshot
* contains all of the earlier active IDs. Threads that race
- * race and get an earlier ID may not appear in the snapshot,
+ * and get an earlier ID may not appear in the snapshot,
* but they will loop and allocate a new ID before proceeding
* to make any updates.
*
@@ -249,7 +242,6 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
txn->id = WT_ATOMIC_ADD(txn_global->current, 1);
} while (txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
WT_PUBLISH(txn_state->id, txn->id);
- oldest_snap_min = txn->id;
/*
* If we are starting a snapshot isolation transaction, get
@@ -260,6 +252,9 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
* visible.
*/
if (txn->isolation == TXN_ISO_SNAPSHOT) {
+ txn->last_gen = txn->last_oldest_gen = txn_global->gen;
+ oldest_snap_min = txn->id;
+
/* Copy the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = n = 0, s = txn_global->states;
@@ -297,11 +292,13 @@ void
__wt_txn_release(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *txn_state;
txn = &session->txn;
txn->mod_count = txn->modref_count = 0;
- txn_state = &S2C(session)->txn_global.states[session->id];
+ txn_global = &S2C(session)->txn_global;
+ txn_state = &txn_global->states[session->id];
/* Clear the transaction's ID from the global table. */
WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
@@ -319,6 +316,9 @@ __wt_txn_release(WT_SESSION_IMPL *session)
__wt_txn_release_snapshot(session);
txn->isolation = session->isolation;
F_CLR(txn, TXN_ERROR | TXN_OLDEST | TXN_RUNNING);
+
+ /* Update the global generation number. */
+ ++txn_global->gen;
}
/*
@@ -347,7 +347,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
* commit.
*/
if (session->ncursors > 0)
- __wt_txn_get_snapshot(session, txn->id, WT_TXN_NONE);
+ __wt_txn_get_snapshot(session, txn->id, WT_TXN_NONE, 1);
__wt_txn_release(session);
return (0);
}
@@ -463,5 +463,6 @@ __wt_txn_global_destroy(WT_CONNECTION_IMPL *conn)
session = conn->default_session;
txn_global = &conn->txn_global;
- __wt_free(session, txn_global->states);
+ if (txn_global != NULL)
+ __wt_free(session, txn_global->states);
}
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 39194819b90..ce957ceffd8 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -7,46 +7,24 @@
#include "wt_internal.h"
+static int __checkpoint_sync(WT_SESSION_IMPL *, const char *[]);
+static int __checkpoint_write_leaves(WT_SESSION_IMPL *, const char *[]);
+
/*
- * __wt_txn_checkpoint --
- * Checkpoint a database or a list of objects in the database.
+ * __checkpoint_apply --
+ * Apply an operation to all files involved in a checkpoint.
*/
-int
-__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+static int
+__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
+ int (*op)(WT_SESSION_IMPL *, const char *[]))
{
- WT_CONNECTION_IMPL *conn;
WT_CONFIG targetconf;
WT_CONFIG_ITEM cval, k, v;
- WT_DATA_HANDLE *dhandle, *saved_dhandle;
WT_DECL_ITEM(tmp);
WT_DECL_RET;
- WT_SESSION *wt_session;
- WT_TXN *txn;
- void *saved_meta_next;
- int ckpt_closed, target_list, tracking;
+ int ckpt_closed, target_list;
- conn = S2C(session);
- target_list = tracking = 0;
- txn = &session->txn;
-
- /*
- * Only one checkpoint can be active at a time, and checkpoints must
- * run in the same order as they update the metadata; we are using the
- * schema lock to determine that ordering, so we can't move this to
- * __session_checkpoint.
- *
- * Begin a transaction for the checkpoint.
- */
- WT_ASSERT(session,
- F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
- !F_ISSET(txn, TXN_RUNNING));
- __wt_spin_lock(session, &conn->metadata_lock);
-
- wt_session = &session->iface;
- WT_ERR(wt_session->begin_transaction(wt_session, "isolation=snapshot"));
-
- WT_ERR(__wt_meta_track_on(session));
- tracking = 1;
+ target_list = 0;
/* Step through the list of targets and checkpoint each one. */
WT_ERR(__wt_config_gets(session, cfg, "target", &cval));
@@ -65,7 +43,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
if ((ret = __wt_schema_worker(
- session, tmp->data, __wt_checkpoint, cfg, 0)) != 0)
+ session, tmp->data, op, cfg, 0)) != 0)
WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data);
}
WT_ERR_NOTFOUND_OK(ret);
@@ -91,10 +69,70 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
if (cval.len != 0)
ckpt_closed = 1;
WT_ERR(ckpt_closed ?
- __wt_meta_btree_apply(session, __wt_checkpoint, cfg) :
- __wt_conn_btree_apply(session, __wt_checkpoint, cfg));
+ __wt_meta_btree_apply(session, op, cfg) :
+ __wt_conn_btree_apply(session, op, cfg));
}
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_txn_checkpoint --
+ * Checkpoint a database or a list of objects in the database.
+ */
+int
+__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_TXN *txn;
+ void *saved_meta_next;
+ int tracking;
+
+ conn = S2C(session);
+ tracking = 0;
+ txn = &session->txn;
+
+ /*
+ * Only one checkpoint can be active at a time, and checkpoints must
+ * run in the same order as they update the metadata; we are using the
+ * schema lock to determine that ordering, so we can't move this to
+ * __session_checkpoint.
+ *
+ * Begin a transaction for the checkpoint.
+ */
+ WT_ASSERT(session,
+ F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
+ !F_ISSET(txn, TXN_RUNNING));
+ __wt_spin_lock(session, &conn->metadata_lock);
+
+ /* Flush dirty leaf pages before we start the checkpoint. */
+ txn->isolation = TXN_ISO_READ_COMMITTED;
+ WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_write_leaves));
+
+ WT_ERR(__wt_meta_track_on(session));
+ tracking = 1;
+
+ /* Start a snapshot transaction for the checkpoint. */
+ wt_session = &session->iface;
+ WT_ERR(wt_session->begin_transaction(wt_session, "isolation=snapshot"));
+
+ WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint));
+
+ /* Release the snapshot transaction, before syncing the file(s). */
+ __wt_txn_release(session);
+
+ /*
+ * Checkpoints have to hit disk (it would be reasonable to configure for
+ * lazy checkpoints, but we don't support them yet).
+ */
+ if (F_ISSET(conn, WT_CONN_SYNC))
+ WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_sync));
+
/* Checkpoint the metadata file. */
TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (WT_IS_METADATA(dhandle) ||
@@ -139,7 +177,8 @@ err: /*
if (tracking)
WT_TRET(__wt_meta_track_off(session, ret != 0));
- __wt_txn_release(session);
+ if (F_ISSET(txn, TXN_RUNNING))
+ __wt_txn_release(session);
__wt_spin_unlock(session, &conn->metadata_lock);
__wt_scr_free(&tmp);
@@ -628,11 +667,48 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
}
/*
+ * __checkpoint_write_leaves --
+ * Write dirty leaf pages before a checkpoint.
+ */
+static int
+__checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_UNUSED(cfg);
+
+ if (S2BT(session)->modified)
+ WT_RET(__wt_bt_cache_op(
+ session, NULL, WT_SYNC_WRITE_LEAVES));
+
+ return (0);
+}
+
+/*
+ * __checkpoint_sync --
+ * Sync a file that has been checkpointed.
+ */
+static int
+__checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BTREE *btree;
+
+ WT_UNUSED(cfg);
+ btree = S2BT(session);
+
+ /* Only sync ordinary handles: checkpoint handles are read-only. */
+ if (btree->dhandle->checkpoint == NULL && btree->bm != NULL)
+ return (btree->bm->sync(btree->bm, session));
+ return (0);
+}
+
+/*
* __wt_checkpoint_close --
* Checkpoint a file as part of a close.
*/
int
__wt_checkpoint_close(WT_SESSION_IMPL *session, const char *cfg[])
{
- return (__checkpoint_worker(session, cfg, 0));
+ WT_RET(__checkpoint_worker(session, cfg, 0));
+ if (F_ISSET(S2C(session), WT_CONN_SYNC))
+ WT_RET(__checkpoint_sync(session, cfg));
+ return (0);
}
diff --git a/src/utilities/util.h b/src/utilities/util.h
index cdf51946fc6..d231b0ac9c0 100644
--- a/src/utilities/util.h
+++ b/src/utilities/util.h
@@ -10,7 +10,7 @@
#define UTIL_COLGROUP_OK 0x01 /* colgroup: prefix OK */
#define UTIL_FILE_OK 0x02 /* file: prefix OK */
#define UTIL_INDEX_OK 0x04 /* index: prefix OK */
-#define UTIL_LSM_OK 0x04 /* lsm: prefix OK */
+#define UTIL_LSM_OK 0x08 /* lsm: prefix OK */
#define UTIL_TABLE_OK 0x10 /* table: prefix OK */
/* all known prefixes OK */
diff --git a/test/format/wts.c b/test/format/wts.c
index 5e489a62b39..8c1f02a3ebe 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -75,9 +75,13 @@ wts_open(void)
* override the standard configuration.
*/
snprintf(config, sizeof(config),
- "create,error_prefix=\"%s\",cache_size=%" PRIu32 "MB,sync=false,"
- "extensions=[\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],%s,%s",
- g.progname, g.c_cache,
+ "create,cache_size=%" PRIu32 "MB,"
+ "error_prefix=\"%s\","
+ "extensions=[\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],%s,%s,"
+ "statistics=true,statistics_log=(sources=(\"%s\"),wait=5),"
+ "sync=false,",
+ g.c_cache,
+ g.progname,
REVERSE_PATH,
access(BZIP_PATH, R_OK) == 0 ? BZIP_PATH : "",
access(LZO_PATH, R_OK) == 0 ? LZO_PATH : "",
@@ -85,7 +89,8 @@ wts_open(void)
access(BZIP_PATH, R_OK) == 0) ? RAW_PATH : "",
access(SNAPPY_PATH, R_OK) == 0 ? SNAPPY_PATH : "",
g.c_config_open == NULL ? "" : g.c_config_open,
- g.config_open == NULL ? "" : g.config_open);
+ g.config_open == NULL ? "" : g.config_open,
+ g.uri);
if ((ret =
wiredtiger_open("RUNDIR", &event_handler, config, &conn)) != 0)
diff --git a/test/java/com/wiredtiger/test/CursorTest.java b/test/java/com/wiredtiger/test/CursorTest.java
new file mode 100644
index 00000000000..f5608e96554
--- /dev/null
+++ b/test/java/com/wiredtiger/test/CursorTest.java
@@ -0,0 +1,117 @@
+/*-
+ * Public Domain 2008-2013 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.test;
+
+import com.wiredtiger.db.Connection;
+import com.wiredtiger.db.Cursor;
+import com.wiredtiger.db.Session;
+import com.wiredtiger.db.WiredTigerPackingException;
+import com.wiredtiger.db.wiredtiger;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.Assert;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+public class CursorTest {
+ Connection conn;
+ Session s;
+
+    // Smoke test: insert one record (string key, byte-array value).
+    @Test
+    public void cursor01()
+    throws WiredTigerPackingException {
+        setup("S", "u");
+
+        Cursor cursor = s.open_cursor("table:t", null, null);
+        cursor.putKeyString("bar");
+        byte[] value = "foo".getBytes();
+        cursor.putValueByteArray(value);
+        cursor.insert();
+        cursor.close();
+        teardown();
+    }
+
+    // Insert one record, search for it, and verify the key/value read back.
+    @Test
+    public void cursor02()
+    throws WiredTigerPackingException {
+        setup("S", "u");
+
+        Cursor c = s.open_cursor("table:t", null, null);
+        c.putKeyString("bar");
+        c.putValueByteArray("foo".getBytes());
+        c.insert();
+        c.putKeyString("bar");
+        c.search();
+        // JUnit's contract is assertEquals(expected, actual).
+        Assert.assertEquals("bar", c.getKeyString());
+        Assert.assertEquals("foo", new String(c.getValueByteArray()));
+        c.close();
+        teardown();
+    }
+
+    // Round-trip a multi-column value ("uiSu"), reading columns back in order.
+    @Test
+    public void cursor03()
+    throws WiredTigerPackingException {
+        setup("S", "uiSu");
+
+        Cursor c = s.open_cursor("table:t", null, null);
+        c.putKeyString("bar");
+        c.putValueByteArray("aaaaa".getBytes()).putValueInt(123);
+        c.putValueString("eeeee").putValueByteArray("iiiii".getBytes());
+
+        c.insert();
+        c.putKeyString("bar");
+        c.search();
+        // JUnit's contract is assertEquals(expected, actual).
+        Assert.assertEquals("bar", c.getKeyString());
+        Assert.assertEquals("aaaaa", new String(c.getValueByteArray()));
+        Assert.assertEquals(123, c.getValueInt());
+        Assert.assertEquals("eeeee", c.getValueString());
+        Assert.assertEquals("iiiii", new String(c.getValueByteArray()));
+        c.close();
+        teardown();
+    }
+
+    // Fixture: open the database, start a session, then create table:t
+    // with the requested key and value formats.
+    private void setup(String keyFormat, String valueFormat) {
+        conn = wiredtiger.open("WT_HOME", "create");
+        s = conn.open_session(null);
+        s.create("table:t", String.format(
+            "key_format=%s,value_format=%s", keyFormat, valueFormat));
+    }
+    private void teardown() {    // undoes setup(), in reverse order
+        s.drop("table:t", "");
+        s.close("");
+        conn.close("");
+    }
+}
diff --git a/test/java/com/wiredtiger/test/PackTest.java b/test/java/com/wiredtiger/test/PackTest.java
new file mode 100644
index 00000000000..a16d03a1fb0
--- /dev/null
+++ b/test/java/com/wiredtiger/test/PackTest.java
@@ -0,0 +1,246 @@
+/*-
+ * Public Domain 2008-2013 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.test;
+
+import com.wiredtiger.db.PackOutputStream;
+import com.wiredtiger.db.PackInputStream;
+import com.wiredtiger.db.WiredTigerPackingException;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.Assert;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+public class PackTest {
+
+ // Some random numbers for testing.
+ static long[] testers = {
+ -12, -145, -14135, -1352308572, -1, 0, 1, 12, 145, 12314,
+ 873593485, -30194371, -4578928, 75452136, -28619244, 93580892,
+ 83350219, 27407091, -82413912, -727169, -3748613, 54046160,
+ -49539872, -4517158, 20397230, -68522195, 61663315, -6009306,
+ -57778143, -97631892, -62388819, 23581637, 2417807, -17761744,
+ -4174142, 92685293, 84597598, -83143925, 95302021, 90888796,
+ 88697909, -89601258, 93585507, 63705051, 51191732, 60900034,
+ -93016118, -68693051, -49366599, -90203871, 58404039, -79195628,
+ -98043222, 35325799, 47942822, 11582824, 93322027, 71799760,
+ 65114434, 42851643, 69146495, -86855643, 40073283, 1956899,
+ 28090147, 71274080, -95192279, -30641467, -1142067, -32599515,
+ 92478069, -90277478, -39370258, -77673053, 82435569, 88167012,
+ -39048877, 96895962, -8587864, -70095341, 49508886, 69912387,
+ 24311011, -58758419, 63228782, -52050021, 24687766, 34342885,
+ 97830395, 74658034, -9715954, -76120311, -63117710, -19312535,
+ 42829521, 32389638, -51273506, 16329653, -39061706, -9931233,
+ 42174615, 75412082, -26236331, 57741055, -17577762, 3605997,
+ -73993355, -54545904, -86580638, 84432898, -83573465, -1278,
+ 636, -9935, 9847, 8300, -5170, -2501, 6031, -6658, -9780, -5351,
+ 6573, -5582, -1994, -7498, -5190, 7710, -8125, -6478, 3670, 4293,
+ 1903, 2367, 3501, 841, -1718, -2303, -670, 9668, 8391, 3719, 1453,
+ 7203, -9693, 1294, -3549, -8941, -5455, 30, 2773, 8354, 7272,
+ -9794, -4806, -7091, -8404, 8297, -4093, -9890, -4948, -38, -66,
+ -12, 9, 50, -26, 4, -25, 62, 2, 47, -40, -22, -87, 75, -43, -51,
+ 65, 7, -17, -90, -27, 56, -60, 27, -2, 2, -3, 4, 7, 8, -8
+ };
+
+    // Round-trip a single byte through the "b" format.
+    @Test
+    public void pack01()
+    throws WiredTigerPackingException {
+        String format = "b";
+        PackOutputStream packer = new PackOutputStream(format);
+        packer.addByte((byte)8);
+        Assert.assertEquals(format, packer.getFormat());
+
+        PackInputStream unpacker =
+            new PackInputStream(format, packer.getValue());
+        Assert.assertEquals((byte)8, unpacker.getByte());
+    }
+
+    // Round-trip one value of each type named by the "biqrhS" format.
+    @Test
+    public void pack02()
+    throws WiredTigerPackingException {
+        String format = "biqrhS";
+        PackOutputStream packer = new PackOutputStream(format);
+        packer.addByte((byte)8);
+        packer.addInt(124);
+        packer.addLong(1240978);
+        packer.addRecord(5680234);
+        packer.addShort((short)8576);
+        packer.addString("Hello string");
+
+        Assert.assertEquals(format, packer.getFormat());
+        byte[] packed = packer.getValue();
+        PackInputStream unpacker = new PackInputStream(format, packed);
+        Assert.assertEquals((byte)8, unpacker.getByte());
+        Assert.assertEquals(124, unpacker.getInt());
+        Assert.assertEquals(1240978, unpacker.getLong());
+        Assert.assertEquals(5680234, unpacker.getRecord());
+        Assert.assertEquals(8576, unpacker.getShort());
+        Assert.assertEquals("Hello string", unpacker.getString());
+    }
+
+    // Round-trip two consecutive strings through the "SS" format.
+    @Test
+    public void pack03()
+    throws WiredTigerPackingException {
+        String format = "SS";
+        PackOutputStream packer = new PackOutputStream(format);
+        packer.addString("Hello 1");
+        packer.addString("Hello 2");
+        byte[] packed = packer.getValue();
+        PackInputStream unpacker = new PackInputStream(format, packed);
+        Assert.assertEquals("Hello 1", unpacker.getString());
+        Assert.assertEquals("Hello 2", unpacker.getString());
+    }
+
+    // Round-trip a single byte array through the "U" format.
+    @Test
+    public void pack04()
+    throws WiredTigerPackingException {
+        String format = "U";
+        PackOutputStream packer = new PackOutputStream(format);
+        packer.addByteArray("Hello 1".getBytes());
+        PackInputStream unpacker =
+            new PackInputStream(format, packer.getValue());
+        Assert.assertTrue(java.util.Arrays.equals(
+            unpacker.getByteArray(), "Hello 1".getBytes()));
+    }
+
+    // Round-trip three consecutive byte arrays through the "uuu" format.
+    @Test
+    public void pack05()
+    throws WiredTigerPackingException {
+        String format = "uuu";
+        String[] inputs = { "Hello 1", "Hello 2", "Hello 3" };
+
+        PackOutputStream packer = new PackOutputStream(format);
+        for (String input : inputs)
+            packer.addByteArray(input.getBytes());
+
+        PackInputStream unpacker =
+            new PackInputStream(format, packer.getValue());
+
+        // The arrays must come back in the order they were added.
+        for (String input : inputs)
+            Assert.assertTrue(java.util.Arrays.equals(
+                unpacker.getByteArray(), input.getBytes()));
+    }
+
+    // Round-trip a mixed "uiS" record: byte array, int, then string.
+    @Test
+    public void pack06()
+    throws WiredTigerPackingException {
+        String format = "uiS";
+        PackOutputStream packer = new PackOutputStream(format);
+        packer.addByteArray("Hello 1".getBytes());
+        packer.addInt(12);
+        packer.addString("Hello 3");
+        byte[] packed = packer.getValue();
+        PackInputStream unpacker = new PackInputStream(format, packed);
+        Assert.assertTrue(java.util.Arrays.equals(
+            unpacker.getByteArray(), "Hello 1".getBytes()));
+        Assert.assertEquals(12, unpacker.getInt());
+        Assert.assertEquals("Hello 3", unpacker.getString());
+    }
+
+    // A "4s" format keeps only the first four characters of the string.
+    @Test
+    public void pack07()
+    throws WiredTigerPackingException {
+        String format = "4s";
+        PackOutputStream packer = new PackOutputStream(format);
+        packer.addString("Hello 1");
+        byte[] packed = packer.getValue();
+        PackInputStream unpacker = new PackInputStream(format, packed);
+        Assert.assertEquals("Hell", unpacker.getString());
+    }
+
+    @Test
+    public void packUnpackNumber01()
+    throws WiredTigerPackingException {
+        // Verify that we can pack and unpack single signed longs; a
+        // mismatch fails the test instead of only being printed.
+        for (long i:testers) {
+            PackOutputStream packer = new PackOutputStream("Q");
+            packer.addLong(i);
+            PackInputStream unpacker =
+                new PackInputStream("Q", packer.getValue());
+            long unpacked = unpacker.getLong();
+            Assert.assertEquals(
+                i + " did not match " + unpacked, i, unpacked);
+        }
+    }
+
+    @Test
+    public void packUnpackNumber02()
+    throws WiredTigerPackingException {
+        // Verify that we can pack and unpack pairs of signed longs reliably.
+        // This is interesting because it ensures that we are tracking the
+        // number of bytes used by number packing correctly.
+        for (int i = 0; i + 1 < testers.length; i += 2) {
+            long val1 = testers[i];
+            long val2 = testers[i+1];
+
+            PackOutputStream packer = new PackOutputStream("QQ");
+            packer.addLong(val1);
+            packer.addLong(val2);
+            PackInputStream unpacker =
+                new PackInputStream("QQ", packer.getValue());
+            // A mismatch fails the test, and the message names the value
+            // that was packed rather than the loop index.
+            long unpacked = unpacker.getLong();
+            Assert.assertEquals(
+                val1 + " did not match " + unpacked, val1, unpacked);
+            unpacked = unpacker.getLong();
+            Assert.assertEquals(
+                val2 + " did not match " + unpacked, val2, unpacked);
+        }
+    }
+
+    // Debug helper: print the first len bytes as zero-padded 8-bit binary.
+    private void printByteArray(byte[] bytes, int len) {
+        for (int idx = 0; idx < len; idx++) {
+            String bits = Integer.toBinaryString(bytes[idx] & 0xff);
+            String padded = String.format("\t%8s", bits);
+            System.out.println(padded.replace(' ', '0'));
+        }
+    }
+
+    public static void main(String[] args) {
+        PackTest tester = new PackTest();  // run a few tests without JUnit
+        try {
+            tester.pack01();
+            tester.pack02();
+            tester.packUnpackNumber01();
+        } catch (WiredTigerPackingException e) {
+            System.err.println("Packing exception: " + e);
+        }
+    }
+}
diff --git a/test/java/com/wiredtiger/test/WiredTigerSuite.java b/test/java/com/wiredtiger/test/WiredTigerSuite.java
new file mode 100644
index 00000000000..0bdb4308871
--- /dev/null
+++ b/test/java/com/wiredtiger/test/WiredTigerSuite.java
@@ -0,0 +1,41 @@
+/*-
+ * Public Domain 2008-2013 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.wiredtiger.test;
+
+import org.junit.runner.RunWith;
+import org.junit.runners.Suite;
+
+// JUnit suite that runs CursorTest and PackTest together.
+@RunWith(Suite.class)
+@Suite.SuiteClasses({
+    CursorTest.class,
+    PackTest.class
+})
+public class WiredTigerSuite {
+    // Intentionally empty: the class exists only to carry the
+    // annotations above.
+}
diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py
index 929355042ba..95f8540f156 100644
--- a/test/suite/test_cursor_random.py
+++ b/test/suite/test_cursor_random.py
@@ -72,7 +72,7 @@ class test_cursor_random(wttest.WiredTigerTestCase):
cursor.close
# Check that next_random works in the presence of a larger set of values.
- def test_cursor_random_single_record(self):
+ def test_cursor_random_multiple_records(self):
uri = self.type + 'random'
if self.type == 'file:':
simple_populate(
diff --git a/test/suite/test_drop_create.py b/test/suite/test_drop_create.py
index cf9971d6efd..12638cc00ae 100644
--- a/test/suite/test_drop_create.py
+++ b/test/suite/test_drop_create.py
@@ -45,5 +45,28 @@ class test_drop_create(wttest.WiredTigerTestCase):
self.assertEqual(s.create("table:test", config), 0)
self.assertEqual(s.close(), 0)
+    def test_drop_create2(self):
+        harness_session, self.session = self.session, None
+        self.assertEqual(harness_session.close(), 0)
+        # Test creating the same table with multiple sessions, to ensure
+        # that session table cache is working as expected.
+        first = self.conn.open_session()
+        second = self.conn.open_session()
+        self.assertEqual(first.drop("table:test", "force"), 0)
+        self.assertEqual(first.create(
+            "table:test", 'key_format=S,value_format=S,columns=(k,v)'), 0)
+        # Ensure the table cache for the second session knows about this table
+        cursor = second.open_cursor("table:test", None, None)
+        cursor.close()
+        self.assertEqual(first.drop("table:test"), 0)
+        # Create a table with the same name, but a different schema
+        self.assertEqual(first.create(
+            "table:test", 'key_format=S,value_format=l,columns=(k,v)'), 0)
+        cursor = second.open_cursor("table:test", None, None)
+        cursor.set_key("Hi")
+        cursor.set_value(1)
+        cursor.insert()
+        self.assertEqual(first.close(), 0)
+        self.assertEqual(second.close(), 0)
+
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_reconfig.py b/test/suite/test_reconfig.py
new file mode 100644
index 00000000000..a86ebecb115
--- /dev/null
+++ b/test/suite/test_reconfig.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2013 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+
+# test_reconfig.py
+#    Smoke-test the connection reconfiguration operations.
+class test_reconfig(wttest.WiredTigerTestCase):
+    # Each test issues one reconfigure() call and makes no further checks.
+    def test_reconfig_shared_cache(self):
+        self.conn.reconfigure("shared_cache=(size=300M)")
+
+    def test_reconfig_statistics(self):
+        self.conn.reconfigure("statistics=true")
+
+    def test_reconfig_verbose(self):
+        self.conn.reconfigure("verbose=[mutex]")
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_shared_cache.py b/test/suite/test_shared_cache.py
index cb1edc46ded..7bf54432e3a 100644
--- a/test/suite/test_shared_cache.py
+++ b/test/suite/test_shared_cache.py
@@ -209,5 +209,15 @@ class test_shared_cache(wttest.WiredTigerTestCase):
connection.reconfigure("shared_cache=(size=300M)")
self.closeConnections()
+    # Test default config values: open two connections with an empty
+    # shared_cache=() setting, then add records through each session.
+    def test_shared_cache11(self):
+        homes = ['WT_TEST1', 'WT_TEST2']
+        self.openConnections(homes, pool_opts=',shared_cache=()')
+        for session in self.sessions:
+            session.create(self.uri, "key_format=S,value_format=S")
+            self.add_records(session, 0, 1000)
+        self.closeConnections()
+
if __name__ == '__main__':
wttest.run()
diff --git a/tools/statlog.py b/tools/statlog.py
new file mode 100644
index 00000000000..ef74b2812aa
--- /dev/null
+++ b/tools/statlog.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2013 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+import fileinput, os, shutil, sys
+from collections import defaultdict
+from subprocess import call
+
+# Plot a set of entries for a title.
+def plot(title, entries, num):
+    """Graph one statistic with gnuplot.
+
+    title: statistic description, used as the graph title.
+    entries: non-empty list of [timestamp, value] string pairs.
+    num: report number string used in the reports/report.<num>.* names.
+    """
+    # Ignore entries where the value never changes.
+    first = entries[0][1]
+    if all(entry[1] == first for entry in entries):
+        print('\tskipping ' + title)
+        return
+
+    print('building ' + title)
+
+    # Write the raw data into a file for processing; the with-statement
+    # closes the file even if a write fails.
+    with open("reports/report." + num + ".raw", "w") as of:
+        for entry in sorted(entries):
+            of.write(" ".join(entry) + "\n")
+
+    # Write a command file for gnuplot.
+    with open("gnuplot.cmd", "w") as of:
+        of.write("set terminal png nocrop\n")
+        of.write("set autoscale\n")
+        of.write("set grid\n")
+        of.write("set style data linespoints\n")
+        of.write("set title \"" + title + "\"\n")
+        of.write("set xlabel \"Time\"\n")
+        of.write("set xtics rotate by -45\n")
+        of.write("set xdata time\n")
+        of.write("set timefmt \"%b %d %H:%M:%S\"\n")
+        of.write("set format x \"%b %d %H:%M:%S\"\n")
+        of.write("set ylabel \"Value\"\n")
+        of.write("set yrange [0:]\n")
+        of.write("set output 'reports/report." + num + ".png'\n")
+        of.write("plot \"reports/report." + num + ".raw\" using 1:4 notitle\n")
+
+    # Run gnuplot.
+    call(["gnuplot", "gnuplot.cmd"])
+
+    # Remove the command file.
+    os.remove("gnuplot.cmd")
+
+# Validate the command line before touching the reports folder.
+if not sys.argv[1:]:
+    print("usage: " + sys.argv[0] + " file ...")
+    sys.exit(1)
+
+# Remove and re-create the reports folder.
+shutil.rmtree("reports", True)
+os.mkdir("reports")
+
+# Read the input into a dictionary of lists.
+d = defaultdict(list)
+for line in fileinput.input(sys.argv[1:]):
+    s = line.strip('\n').split(" ", 4)
+    d[s[4]].append([" ".join([s[0], s[1], s[2]]), s[3]])
+
+# Plot each entry in the dictionary.
+for rno, entry in enumerate(sorted(d.items()), 1):
+    plot(entry[0], entry[1], "%03d" % rno)