diff options
181 files changed, 8291 insertions, 1797 deletions
@@ -18,3 +18,4 @@ df87effe7cd3239e3666a76312bae77b92090d98 1.3.4 8b91f84675fd67259b1f513e3f84786501cbc16c 1.3.6 27cec73582030254a2752cc3213bb89825dc5183 1.3.7 edc4643f811d706cbbb6400d048bf56602aed963 1.4.2 +aff8aabe571be6db68e8bf44bf7670df5d55d1ff 1.5.0 @@ -1,3 +1,63 @@ +WiredTiger release 1.5.0, 2013-03-14 +------------------------------------ + +This release contains some major new features along with numerous bug fixes +and performance improvements. The significant changes are highlighted +below: + +* Add a Java API. + +* Create a thread to do automatic checkpoints, configured by passing + "checkpoint=(wait=X)" to wiredtiger_open. + +* Add support for periodically logging statistics to a file and a tool to + generate graphs based on those logs. Configured by passing + "statistics_log=(wait=X)" to wiredtiger_open. + +* Several changes to minimize the impact of checkpoints on other threads. + +* When reading from checkpoints, use mmap by default. + +* Enhance eviction so that internal pages take up less space. + +* Add maximum filesystem buffer cache settings to wiredtiger_open called + "os_cache_max" and "os_cache_dirty_max". After doing the specified + amount of reads or writes, WiredTiger will call fadvise and/or + sync_file_range to drop pages from the filesystem cache. This is an + alternative to direct I/O with less impact on performance. + +* Make run-time statistics optional, defaulted to "off". + +* Change how we detect if shared cache is used. It used to rely on a name, + now it will be used if the shared_cache configuration option is included. + +* Add the ability to specify a per-connection reserved size for cache + pools. Ensure cache pool reconfiguration is honoured quickly. + +* Rework hazard pointer coupling during cursor walks to be more efficient. + +* Add a cache_eviction_walk statistic to track the pages we walk and a + cache_eviction_force statistic to track the count of pages queued for + forced eviction. 
+ +* Fixes to reduce the number of operations on shared data that were causing + bottlenecks in read-only workloads. + +* Add streaming pack / unpack to the API. + +* Add some basic reconciliation stats to the connection stats. + +* In LSM, keep trying to switch if there is an error: it may be transient. + +* Minor clean up and enhancement for the reconciliation statistics, add a + set of compression statistics, both to the data-source statistics. + +* Compaction cannot run at the same time as a checkpoint: the problem is + that compaction reviews page reconciliation information and checkpoints + update page reconciliation information. Lock out checkpoints while + compaction is running. + + WiredTiger release 1.4.2, 2013-01-14 ------------------------------------ @@ -1,6 +1,6 @@ -WiredTiger 1.4.2: (January 14, 2013) +WiredTiger 1.5.0: (March 14, 2013) -This is version 1.4.2 of WiredTiger. +This is version 1.5.0 of WiredTiger. WiredTiger documentation can be found at: @@ -1,6 +1,6 @@ WIREDTIGER_VERSION_MAJOR=1 -WIREDTIGER_VERSION_MINOR=4 -WIREDTIGER_VERSION_PATCH=2 +WIREDTIGER_VERSION_MINOR=5 +WIREDTIGER_VERSION_PATCH=0 WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH" WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"` diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 97fb4875c9d..8fff6000a26 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -41,6 +41,15 @@ #define ATOMIC_ADD(v, val) \ __sync_add_and_fetch(&(v), val) +#ifndef F_CLR +#define F_CLR(p, mask) ((p)->flags &= ~((uint32_t)(mask))) +#endif +#ifndef F_ISSET +#define F_ISSET(p, mask) ((p)->flags & ((uint32_t)(mask))) +#endif +#ifndef F_SET +#define F_SET(p, mask) ((p)->flags |= ((uint32_t)(mask))) +#endif typedef struct { const char *home; @@ -68,6 +77,8 @@ typedef struct { #define WT_PERF_POP 0x01 #define WT_PERF_READ 0x02 uint32_t phase; +#define WT_INSERT_RMW 0x01 + uint32_t flags; struct timeval phase_start_time; } CONFIG; @@ -99,9 
+110,10 @@ void worker(CONFIG *, uint32_t); "leaf_page_max=4kb,internal_page_max=64kb,allocation_size=4kb," /* Worker thread types. */ -#define WORKER_READ 0x01 -#define WORKER_INSERT 0x02 -#define WORKER_UPDATE 0x03 +#define WORKER_READ 0x01 +#define WORKER_INSERT 0x02 +#define WORKER_INSERT_RMW 0x03 +#define WORKER_UPDATE 0x04 /* Default values - these are tiny, we want the basic run to be fast. */ CONFIG default_cfg = { @@ -127,6 +139,7 @@ CONFIG default_cfg = { NULL, /* conn */ NULL, /* logf */ WT_PERF_INIT, /* phase */ + 0, /* flags */ {0, 0} /* phase_start_time */ }; /* Small config values - these are small. */ @@ -154,6 +167,7 @@ CONFIG small_cfg = { NULL, /* conn */ NULL, /* logf */ WT_PERF_INIT, /* phase */ + 0, /* flags */ {0, 0} /* phase_start_time */ }; /* Default values - these are small, we want the basic run to be fast. */ @@ -181,6 +195,7 @@ CONFIG med_cfg = { NULL, /* conn */ NULL, /* logf */ WT_PERF_INIT, /* phase */ + 0, /* flags */ {0, 0} /* phase_start_time */ }; /* Default values - these are small, we want the basic run to be fast. */ @@ -208,6 +223,7 @@ CONFIG large_cfg = { NULL, /* conn */ NULL, /* logf */ WT_PERF_INIT, /* phase */ + 0, /* flags */ {0, 0} /* phase_start_time */ }; @@ -240,7 +256,11 @@ read_thread(void *arg) void * insert_thread(void *arg) { - worker((CONFIG *)arg, WORKER_INSERT); + CONFIG *config; + + config = (CONFIG *)arg; + worker(config, F_ISSET(config, WT_INSERT_RMW) ? 
+ WORKER_INSERT_RMW : WORKER_INSERT); return (NULL); } @@ -314,6 +334,12 @@ worker(CONFIG *cfg, uint32_t worker_type) if (op_ret == 0) ++g_nread_ops; break; + case WORKER_INSERT_RMW: + op_name="insert_rmw"; + op_ret = cursor->search(cursor); + if (op_ret != WT_NOTFOUND) + break; + /* Fall through */ case WORKER_INSERT: op_name = "insert"; cursor->set_value(cursor, data_buf); @@ -758,7 +784,7 @@ int main(int argc, char **argv) CONFIG cfg; WT_CONNECTION *conn; const char *user_cconfig, *user_tconfig; - const char *opts = "C:I:P:R:U:T:c:d:eh:i:k:l:r:s:t:u:v:SML"; + const char *opts = "C:I:P:R:U:T:c:d:eh:i:jk:l:r:s:t:u:v:SML"; char *cc_buf, *tc_buf; int ch, checkpoint_created, ret, stat_created; pthread_t checkpoint, stat; @@ -810,6 +836,9 @@ int main(int argc, char **argv) case 'i': cfg.icount = (uint32_t)atoi(optarg); break; + case 'j': + F_SET(&cfg, WT_INSERT_RMW); + break; case 'k': cfg.key_sz = (uint32_t)atoi(optarg); break; @@ -1122,6 +1151,8 @@ void print_config(CONFIG *cfg) printf("\t Workload period: %d\n", cfg->run_time); printf("\t Number read threads: %d\n", cfg->read_threads); printf("\t Number insert threads: %d\n", cfg->insert_threads); + if (F_ISSET(cfg, WT_INSERT_RMW)) + printf("\t Insert operations are RMW.\n"); printf("\t Number update threads: %d\n", cfg->update_threads); printf("\t Verbosity: %d\n", cfg->verbose); } @@ -1144,6 +1175,7 @@ void usage(void) printf("\t-e use existing database (skip population phase)\n"); printf("\t-h <string> Wired Tiger home must exist, default WT_TEST \n"); printf("\t-i <int> number of records to insert\n"); + printf("\t-j Execute a read prior to each insert in populate\n"); printf("\t-k <int> key item size\n"); printf("\t-l <int> log statistics every <int> report intervals." 
"Default disabled.\n"); diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs index 07bb8269a72..aeb2316d634 100644 --- a/build_posix/Make.subdirs +++ b/build_posix/Make.subdirs @@ -12,6 +12,7 @@ ext/collators/reverse ext/compressors/bzip2 BZIP2 ext/compressors/nop ext/compressors/snappy SNAPPY +lang/java JAVA lang/python PYTHON test/bloom test/fops diff --git a/build_posix/aclocal/ax_check_class.m4 b/build_posix/aclocal/ax_check_class.m4 new file mode 100644 index 00000000000..098aa77290b --- /dev/null +++ b/build_posix/aclocal/ax_check_class.m4 @@ -0,0 +1,144 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_check_class.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CHECK_CLASS +# +# DESCRIPTION +# +# AX_CHECK_CLASS tests the existence of a given Java class, either in a +# jar or in a '.class' file. +# +# *Warning*: its success or failure can depend on a proper setting of the +# CLASSPATH env. variable. +# +# Note: This is part of the set of autoconf M4 macros for Java programs. +# It is VERY IMPORTANT that you download the whole set, some macros depend +# on other. Unfortunately, the autoconf archive does not support the +# concept of set of macros, so I had to break it for submission. The +# general documentation, as well as the sample configure.in, is included +# in the AX_PROG_JAVA macro. +# +# LICENSE +# +# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr> +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. 
+# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 7 + +AU_ALIAS([AC_CHECK_CLASS], [AX_CHECK_CLASS]) +AC_DEFUN([AX_CHECK_CLASS],[ +AC_REQUIRE([AX_PROG_JAVA]) +ac_var_name=`echo $1 | sed 's/\./_/g'` +dnl Normally I'd use an AC_CACHE_CHECK here but since the variable name is +dnl dynamic I need an extra level of extraction +AC_MSG_CHECKING([for $1 class]) +AC_CACHE_VAL(ax_cv_class_$ac_var_name, [ +if test x$ac_cv_prog_uudecode_base64 = xyes; then +dnl /** +dnl * Test.java: used to test dynamically if a class exists. 
+dnl */ +dnl public class Test +dnl { +dnl +dnl public static void +dnl main( String[] argv ) +dnl { +dnl Class lib; +dnl if (argv.length < 1) +dnl { +dnl System.err.println ("Missing argument"); +dnl System.exit (77); +dnl } +dnl try +dnl { +dnl lib = Class.forName (argv[0]); +dnl } +dnl catch (ClassNotFoundException e) +dnl { +dnl System.exit (1); +dnl } +dnl lib = null; +dnl System.exit (0); +dnl } +dnl +dnl } +cat << \EOF > Test.uue +begin-base64 644 Test.class +yv66vgADAC0AKQcAAgEABFRlc3QHAAQBABBqYXZhL2xhbmcvT2JqZWN0AQAE +bWFpbgEAFihbTGphdmEvbGFuZy9TdHJpbmc7KVYBAARDb2RlAQAPTGluZU51 +bWJlclRhYmxlDAAKAAsBAANlcnIBABVMamF2YS9pby9QcmludFN0cmVhbTsJ +AA0ACQcADgEAEGphdmEvbGFuZy9TeXN0ZW0IABABABBNaXNzaW5nIGFyZ3Vt +ZW50DAASABMBAAdwcmludGxuAQAVKExqYXZhL2xhbmcvU3RyaW5nOylWCgAV +ABEHABYBABNqYXZhL2lvL1ByaW50U3RyZWFtDAAYABkBAARleGl0AQAEKEkp +VgoADQAXDAAcAB0BAAdmb3JOYW1lAQAlKExqYXZhL2xhbmcvU3RyaW5nOylM +amF2YS9sYW5nL0NsYXNzOwoAHwAbBwAgAQAPamF2YS9sYW5nL0NsYXNzBwAi +AQAgamF2YS9sYW5nL0NsYXNzTm90Rm91bmRFeGNlcHRpb24BAAY8aW5pdD4B +AAMoKVYMACMAJAoAAwAlAQAKU291cmNlRmlsZQEACVRlc3QuamF2YQAhAAEA +AwAAAAAAAgAJAAUABgABAAcAAABtAAMAAwAAACkqvgSiABCyAAwSD7YAFBBN +uAAaKgMyuAAeTKcACE0EuAAaAUwDuAAasQABABMAGgAdACEAAQAIAAAAKgAK +AAAACgAAAAsABgANAA4ADgATABAAEwASAB4AFgAiABgAJAAZACgAGgABACMA +JAABAAcAAAAhAAEAAQAAAAUqtwAmsQAAAAEACAAAAAoAAgAAAAQABAAEAAEA +JwAAAAIAKA== +==== +EOF + if $UUDECODE Test.uue; then + : + else + echo "configure: __oline__: uudecode had trouble decoding base 64 file 'Test.uue'" >&AS_MESSAGE_LOG_FD + echo "configure: failed file was:" >&AS_MESSAGE_LOG_FD + cat Test.uue >&AS_MESSAGE_LOG_FD + ac_cv_prog_uudecode_base64=no + fi + rm -f Test.uue + if AC_TRY_COMMAND($JAVA $JAVAFLAGS Test $1) >/dev/null 2>&1; then + eval "ac_cv_class_$ac_var_name=yes" + else + eval "ac_cv_class_$ac_var_name=no" + fi + rm -f Test.class +else + AX_TRY_COMPILE_JAVA([$1], , [eval "ac_cv_class_$ac_var_name=yes"], + [eval "ac_cv_class_$ac_var_name=no"]) +fi +eval "ac_var_val=$`eval echo 
ac_cv_class_$ac_var_name`" +eval "HAVE_$ac_var_name=$`echo ac_cv_class_$ac_var_val`" +HAVE_LAST_CLASS=$ac_var_val +if test x$ac_var_val = xyes; then + ifelse([$2], , :, [$2]) +else + ifelse([$3], , :, [$3]) +fi +]) +dnl for some reason the above statement didn't fall through here? +dnl do scripts have variable scoping? +eval "ac_var_val=$`eval echo ac_cv_class_$ac_var_name`" +AC_MSG_RESULT($ac_var_val) +]) diff --git a/build_posix/aclocal/ax_check_junit.m4 b/build_posix/aclocal/ax_check_junit.m4 new file mode 100644 index 00000000000..724e0e0814f --- /dev/null +++ b/build_posix/aclocal/ax_check_junit.m4 @@ -0,0 +1,72 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_check_junit.html +# =========================================================================== +# +# WiredTiger: Updated to use JUnit 4 call semantics. +# +# SYNOPSIS +# +# AX_CHECK_JUNIT +# +# DESCRIPTION +# +# AX_CHECK_JUNIT tests the availability of the Junit testing framework, +# and sets some variables for conditional compilation of the test suite by +# automake. +# +# If available, JUNIT is set to a command launching the text based user +# interface of Junit, @JAVA_JUNIT@ is set to $JAVA_JUNIT and @TESTS_JUNIT@ +# is set to $TESTS_JUNIT, otherwise they are set to empty values. +# +# You can use these variables in your Makefile.am file like this: +# +# # Some of the following classes are built only if junit is available +# JAVA_JUNIT = Class1Test.java Class2Test.java AllJunitTests.java +# +# noinst_JAVA = Example1.java Example2.java @JAVA_JUNIT@ +# +# EXTRA_JAVA = $(JAVA_JUNIT) +# +# TESTS_JUNIT = AllJunitTests +# +# TESTS = StandaloneTest1 StandaloneTest2 @TESTS_JUNIT@ +# +# EXTRA_TESTS = $(TESTS_JUNIT) +# +# AllJunitTests : +# echo "#! 
/bin/sh" > $@ +# echo "exec @JUNIT@ my.package.name.AllJunitTests" >> $@ +# chmod +x $@ +# +# LICENSE +# +# Copyright (c) 2008 Luc Maisonobe <luc@spaceroots.org> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 5 + +AU_ALIAS([AC_CHECK_JUNIT], [AX_CHECK_JUNIT]) +AC_DEFUN([AX_CHECK_JUNIT],[ +AC_CACHE_VAL(ac_cv_prog_JUNIT,[ +AX_CHECK_CLASS(org.junit.runner.JUnitCore) +if test x"`eval 'echo $ac_cv_class_org_junit_runner_JUnitCore'`" != xno ; then + ac_cv_prog_JUNIT='$(CLASSPATH_ENV) $(JAVA) $(JAVAFLAGS) org.junit.runner.JUnitCore' +fi]) +AC_MSG_CHECKING([for junit]) +if test x"`eval 'echo $ac_cv_prog_JUNIT'`" != x ; then + JUNIT="$ac_cv_prog_JUNIT" + JAVA_JUNIT='$(JAVA_JUNIT)' + TESTS_JUNIT='$(TESTS_JUNIT)' +else + JUNIT= + JAVA_JUNIT= + TESTS_JUNIT= +fi +AC_MSG_RESULT($JAVA_JUNIT) +AC_SUBST(JUNIT) +AC_SUBST(JAVA_JUNIT) +AC_SUBST(TESTS_JUNIT)]) diff --git a/build_posix/aclocal/ax_java_options.m4 b/build_posix/aclocal/ax_java_options.m4 new file mode 100644 index 00000000000..36c10d922bd --- /dev/null +++ b/build_posix/aclocal/ax_java_options.m4 @@ -0,0 +1,48 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_java_options.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_JAVA_OPTIONS +# +# DESCRIPTION +# +# AX_JAVA_OPTIONS adds configure command line options used for Java m4 +# macros. This Macro is optional. +# +# Note: This is part of the set of autoconf M4 macros for Java programs. +# It is VERY IMPORTANT that you download the whole set, some macros depend +# on other. Unfortunately, the autoconf archive does not support the +# concept of set of macros, so I had to break it for submission. 
The +# general documentation, as well as the sample configure.in, is included +# in the AX_PROG_JAVA macro. +# +# LICENSE +# +# Copyright (c) 2008 Devin Weaver <ktohg@tritarget.com> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 6 + +AU_ALIAS([AC_JAVA_OPTIONS], [AX_JAVA_OPTIONS]) +AC_DEFUN([AX_JAVA_OPTIONS],[ +AC_ARG_WITH(java-prefix, + [ --with-java-prefix=PFX prefix where Java runtime is installed (optional)]) +AC_ARG_WITH(javac-flags, + [ --with-javac-flags=FLAGS flags to pass to the Java compiler (optional)]) +AC_ARG_WITH(java-flags, + [ --with-java-flags=FLAGS flags to pass to the Java VM (optional)]) +JAVAPREFIX=$with_java_prefix +JAVACFLAGS=$with_javac_flags +JAVAFLAGS=$with_java_flags +AC_SUBST(JAVAPREFIX)dnl +AC_SUBST(JAVACFLAGS)dnl +AC_SUBST(JAVAFLAGS)dnl +AC_SUBST(JAVA)dnl +AC_SUBST(JAVAC)dnl +]) diff --git a/build_posix/aclocal/ax_jni_include_dir.m4 b/build_posix/aclocal/ax_jni_include_dir.m4 new file mode 100644 index 00000000000..7ce12e10c82 --- /dev/null +++ b/build_posix/aclocal/ax_jni_include_dir.m4 @@ -0,0 +1,120 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_jni_include_dir.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_JNI_INCLUDE_DIR +# +# DESCRIPTION +# +# AX_JNI_INCLUDE_DIR finds include directories needed for compiling +# programs using the JNI interface. +# +# JNI include directories are usually in the java distribution This is +# deduced from the value of JAVAC. When this macro completes, a list of +# directories is left in the variable JNI_INCLUDE_DIRS. 
+# +# Example usage follows: +# +# AX_JNI_INCLUDE_DIR +# +# for JNI_INCLUDE_DIR in $JNI_INCLUDE_DIRS +# do +# CPPFLAGS="$CPPFLAGS -I$JNI_INCLUDE_DIR" +# done +# +# If you want to force a specific compiler: +# +# - at the configure.in level, set JAVAC=yourcompiler before calling +# AX_JNI_INCLUDE_DIR +# +# - at the configure level, setenv JAVAC +# +# Note: This macro can work with the autoconf M4 macros for Java programs. +# This particular macro is not part of the original set of macros. +# +# LICENSE +# +# Copyright (c) 2008 Don Anderson <dda@sleepycat.com> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 7 + +AU_ALIAS([AC_JNI_INCLUDE_DIR], [AX_JNI_INCLUDE_DIR]) +AC_DEFUN([AX_JNI_INCLUDE_DIR],[ + +JNI_INCLUDE_DIRS="" + +test "x$JAVAC" = x && AC_MSG_ERROR(['\$JAVAC' undefined]) +AC_PATH_PROG([_ACJNI_JAVAC], [$JAVAC], [no]) +test "x$_ACJNI_JAVAC" = xno && AC_MSG_ERROR([$JAVAC could not be found in path]) + +_ACJNI_FOLLOW_SYMLINKS("$_ACJNI_JAVAC") +_JTOPDIR=`echo "$_ACJNI_FOLLOWED" | sed -e 's://*:/:g' -e 's:/[[^/]]*$::'` +case "$host_os" in + darwin*) _JTOPDIR=`echo "$_JTOPDIR" | sed -e 's:/[[^/]]*$::'` + _JINC="$_JTOPDIR/Headers";; + *) _JINC="$_JTOPDIR/include";; +esac +_AS_ECHO_LOG([_JTOPDIR=$_JTOPDIR]) +_AS_ECHO_LOG([_JINC=$_JINC]) + +# On Mac OS X 10.6.4, jni.h is a symlink: +# /System/Library/Frameworks/JavaVM.framework/Versions/Current/Headers/jni.h +# -> ../../CurrentJDK/Headers/jni.h. 
+if test -f "$_JINC/jni.h" || test -L "$_JINC/jni.h"; then + JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JINC" +else + _JTOPDIR=`echo "$_JTOPDIR" | sed -e 's:/[[^/]]*$::'` + if test -f "$_JTOPDIR/include/jni.h"; then + JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JTOPDIR/include" + else + AC_MSG_ERROR([cannot find java include files]) + fi +fi + +# get the likely subdirectories for system specific java includes +case "$host_os" in +bsdi*) _JNI_INC_SUBDIRS="bsdos";; +freebsd*) _JNI_INC_SUBDIRS="freebsd";; +linux*) _JNI_INC_SUBDIRS="linux genunix";; +osf*) _JNI_INC_SUBDIRS="alpha";; +solaris*) _JNI_INC_SUBDIRS="solaris";; +mingw*) _JNI_INC_SUBDIRS="win32";; +cygwin*) _JNI_INC_SUBDIRS="win32";; +*) _JNI_INC_SUBDIRS="genunix";; +esac + +# add any subdirectories that are present +for JINCSUBDIR in $_JNI_INC_SUBDIRS +do + if test -d "$_JTOPDIR/include/$JINCSUBDIR"; then + JNI_INCLUDE_DIRS="$JNI_INCLUDE_DIRS $_JTOPDIR/include/$JINCSUBDIR" + fi +done +]) + +# _ACJNI_FOLLOW_SYMLINKS <path> +# Follows symbolic links on <path>, +# finally setting variable _ACJNI_FOLLOWED +# ---------------------------------------- +AC_DEFUN([_ACJNI_FOLLOW_SYMLINKS],[ +# find the include directory relative to the javac executable +_cur="$1" +while ls -ld "$_cur" 2>/dev/null | grep " -> " >/dev/null; do + AC_MSG_CHECKING([symlink for $_cur]) + _slink=`ls -ld "$_cur" | sed 's/.* -> //'` + case "$_slink" in + /*) _cur="$_slink";; + # 'X' avoids triggering unwanted echo options. 
+ *) _cur=`echo "X$_cur" | sed -e 's/^X//' -e 's:[[^/]]*$::'`"$_slink";; + esac + AC_MSG_RESULT([$_cur]) +done +_ACJNI_FOLLOWED="$_cur" +])# _ACJNI diff --git a/build_posix/aclocal/ax_prog_jar.m4 b/build_posix/aclocal/ax_prog_jar.m4 new file mode 100644 index 00000000000..776e804ad9f --- /dev/null +++ b/build_posix/aclocal/ax_prog_jar.m4 @@ -0,0 +1,52 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_prog_jar.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PROG_JAR +# +# DESCRIPTION +# +# AX_PROG_JAR tests for an existing jar program. It uses the environment +# variable JAR then tests in sequence various common jar programs. +# +# If you want to force a specific compiler: +# +# - at the configure.in level, set JAR=yourcompiler before calling +# AX_PROG_JAR +# +# - at the configure level, setenv JAR +# +# You can use the JAR variable in your Makefile.in, with @JAR@. +# +# Note: This macro depends on the autoconf M4 macros for Java programs. It +# is VERY IMPORTANT that you download that whole set, some macros depend +# on other. Unfortunately, the autoconf archive does not support the +# concept of set of macros, so I had to break it for submission. +# +# The general documentation of those macros, as well as the sample +# configure.in, is included in the AX_PROG_JAVA macro. +# +# LICENSE +# +# Copyright (c) 2008 Egon Willighagen <e.willighagen@science.ru.nl> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +#serial 6 + +AU_ALIAS([AC_PROG_JAR], [AX_PROG_JAR]) +AC_DEFUN([AX_PROG_JAR],[ +AC_REQUIRE([AC_EXEEXT])dnl +if test "x$JAVAPREFIX" = x; then + test "x$JAR" = x && AC_CHECK_PROGS(JAR, jar$EXEEXT) +else + test "x$JAR" = x && AC_CHECK_PROGS(JAR, jar, $JAVAPREFIX) +fi +test "x$JAR" = x && AC_MSG_ERROR([no acceptable jar program found in \$PATH]) +AC_PROVIDE([$0])dnl +]) diff --git a/build_posix/aclocal/ax_prog_java.m4 b/build_posix/aclocal/ax_prog_java.m4 new file mode 100644 index 00000000000..5471f322d25 --- /dev/null +++ b/build_posix/aclocal/ax_prog_java.m4 @@ -0,0 +1,115 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_prog_java.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PROG_JAVA +# +# DESCRIPTION +# +# Here is a summary of the main macros: +# +# AX_PROG_JAVAC: finds a Java compiler. +# +# AX_PROG_JAVA: finds a Java virtual machine. +# +# AX_CHECK_CLASS: finds if we have the given class (beware of CLASSPATH!). +# +# AX_CHECK_RQRD_CLASS: finds if we have the given class and stops +# otherwise. +# +# AX_TRY_COMPILE_JAVA: attempt to compile user given source. +# +# AX_TRY_RUN_JAVA: attempt to compile and run user given source. +# +# AX_JAVA_OPTIONS: adds Java configure options. +# +# AX_PROG_JAVA tests an existing Java virtual machine. It uses the +# environment variable JAVA then tests in sequence various common Java +# virtual machines. For political reasons, it starts with the free ones. +# You *must* call [AX_PROG_JAVAC] before. +# +# If you want to force a specific VM: +# +# - at the configure.in level, set JAVA=yourvm before calling AX_PROG_JAVA +# +# (but after AC_INIT) +# +# - at the configure level, setenv JAVA +# +# You can use the JAVA variable in your Makefile.in, with @JAVA@. +# +# *Warning*: its success or failure can depend on a proper setting of the +# CLASSPATH env. variable. 
+# +# TODO: allow to exclude virtual machines (rationale: most Java programs +# cannot run with some VM like kaffe). +# +# Note: This is part of the set of autoconf M4 macros for Java programs. +# It is VERY IMPORTANT that you download the whole set, some macros depend +# on other. Unfortunately, the autoconf archive does not support the +# concept of set of macros, so I had to break it for submission. +# +# A Web page, with a link to the latest CVS snapshot is at +# <http://www.internatif.org/bortzmeyer/autoconf-Java/>. +# +# This is a sample configure.in Process this file with autoconf to produce +# a configure script. +# +# AC_INIT(UnTag.java) +# +# dnl Checks for programs. +# AC_CHECK_CLASSPATH +# AX_PROG_JAVAC +# AX_PROG_JAVA +# +# dnl Checks for classes +# AX_CHECK_RQRD_CLASS(org.xml.sax.Parser) +# AX_CHECK_RQRD_CLASS(com.jclark.xml.sax.Driver) +# +# AC_OUTPUT(Makefile) +# +# LICENSE +# +# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr> +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. 
You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 8 + +AU_ALIAS([AC_PROG_JAVA], [AX_PROG_JAVA]) +AC_DEFUN([AX_PROG_JAVA],[ +if test x$JAVAPREFIX = x; then + test x$JAVA = x && AC_CHECK_PROGS(JAVA, kaffe java) +else + test x$JAVA = x && AC_CHECK_PROGS(JAVA, kaffe java, $JAVAPREFIX) +fi +test x$JAVA = x && AC_MSG_ERROR([no acceptable Java virtual machine found in \$PATH]) +AX_PROG_JAVA_WORKS +AC_PROVIDE([$0])dnl +]) diff --git a/build_posix/aclocal/ax_prog_java_works.m4 b/build_posix/aclocal/ax_prog_java_works.m4 new file mode 100644 index 00000000000..741bd561b62 --- /dev/null +++ b/build_posix/aclocal/ax_prog_java_works.m4 @@ -0,0 +1,134 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_prog_java_works.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PROG_JAVA_WORKS +# +# DESCRIPTION +# +# Internal use ONLY. +# +# Note: This is part of the set of autoconf M4 macros for Java programs. +# It is VERY IMPORTANT that you download the whole set, some macros depend +# on other. Unfortunately, the autoconf archive does not support the +# concept of set of macros, so I had to break it for submission. The +# general documentation, as well as the sample configure.in, is included +# in the AX_PROG_JAVA macro. 
+# +# LICENSE +# +# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr> +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 8 + +AU_ALIAS([AC_PROG_JAVA_WORKS], [AX_PROG_JAVA_WORKS]) +AC_DEFUN([AX_PROG_JAVA_WORKS], [ +AC_PATH_PROG(UUDECODE, uudecode, [no]) +if test x$UUDECODE != xno; then +AC_CACHE_CHECK([if uudecode can decode base 64 file], ac_cv_prog_uudecode_base64, [ +dnl /** +dnl * Test.java: used to test if java compiler works. 
+dnl */ +dnl public class Test +dnl { +dnl +dnl public static void +dnl main( String[] argv ) +dnl { +dnl System.exit (0); +dnl } +dnl +dnl } +cat << \EOF > Test.uue +begin-base64 644 Test.class +yv66vgADAC0AFQcAAgEABFRlc3QHAAQBABBqYXZhL2xhbmcvT2JqZWN0AQAE +bWFpbgEAFihbTGphdmEvbGFuZy9TdHJpbmc7KVYBAARDb2RlAQAPTGluZU51 +bWJlclRhYmxlDAAKAAsBAARleGl0AQAEKEkpVgoADQAJBwAOAQAQamF2YS9s +YW5nL1N5c3RlbQEABjxpbml0PgEAAygpVgwADwAQCgADABEBAApTb3VyY2VG +aWxlAQAJVGVzdC5qYXZhACEAAQADAAAAAAACAAkABQAGAAEABwAAACEAAQAB +AAAABQO4AAyxAAAAAQAIAAAACgACAAAACgAEAAsAAQAPABAAAQAHAAAAIQAB +AAEAAAAFKrcAErEAAAABAAgAAAAKAAIAAAAEAAQABAABABMAAAACABQ= +==== +EOF +if $UUDECODE Test.uue; then + ac_cv_prog_uudecode_base64=yes +else + echo "configure: __oline__: uudecode had trouble decoding base 64 file 'Test.uue'" >&AS_MESSAGE_LOG_FD + echo "configure: failed file was:" >&AS_MESSAGE_LOG_FD + cat Test.uue >&AS_MESSAGE_LOG_FD + ac_cv_prog_uudecode_base64=no +fi +rm -f Test.uue]) +fi +if test x$ac_cv_prog_uudecode_base64 != xyes; then + rm -f Test.class + AC_MSG_WARN([I have to compile Test.class from scratch]) + if test x$ac_cv_prog_javac_works = xno; then + AC_MSG_ERROR([Cannot compile java source. 
$JAVAC does not work properly]) + fi + if test x$ac_cv_prog_javac_works = x; then + AX_PROG_JAVAC + fi +fi +AC_CACHE_CHECK(if $JAVA works, ac_cv_prog_java_works, [ +JAVA_TEST=Test.java +CLASS_TEST=Test.class +TEST=Test +changequote(, )dnl +cat << \EOF > $JAVA_TEST +/* [#]line __oline__ "configure" */ +public class Test { +public static void main (String args[]) { + System.exit (0); +} } +EOF +changequote([, ])dnl +if test x$ac_cv_prog_uudecode_base64 != xyes; then + if AC_TRY_COMMAND($JAVAC $JAVACFLAGS $JAVA_TEST) && test -s $CLASS_TEST; then + : + else + echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD + cat $JAVA_TEST >&AS_MESSAGE_LOG_FD + AC_MSG_ERROR(The Java compiler $JAVAC failed (see config.log, check the CLASSPATH?)) + fi +fi +if AC_TRY_COMMAND($JAVA $JAVAFLAGS $TEST) >/dev/null 2>&1; then + ac_cv_prog_java_works=yes +else + echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD + cat $JAVA_TEST >&AS_MESSAGE_LOG_FD + AC_MSG_ERROR(The Java VM $JAVA failed (see config.log, check the CLASSPATH?)) +fi +rm -fr $JAVA_TEST $CLASS_TEST Test.uue +]) +AC_PROVIDE([$0])dnl +] +) diff --git a/build_posix/aclocal/ax_prog_javac.m4 b/build_posix/aclocal/ax_prog_javac.m4 new file mode 100644 index 00000000000..d9bcc2d7c34 --- /dev/null +++ b/build_posix/aclocal/ax_prog_javac.m4 @@ -0,0 +1,79 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_prog_javac.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PROG_JAVAC +# +# DESCRIPTION +# +# AX_PROG_JAVAC tests an existing Java compiler. It uses the environment +# variable JAVAC then tests in sequence various common Java compilers. For +# political reasons, it starts with the free ones. 
+# +# If you want to force a specific compiler: +# +# - at the configure.in level, set JAVAC=yourcompiler before calling +# AX_PROG_JAVAC +# +# - at the configure level, setenv JAVAC +# +# You can use the JAVAC variable in your Makefile.in, with @JAVAC@. +# +# *Warning*: its success or failure can depend on a proper setting of the +# CLASSPATH env. variable. +# +# TODO: allow to exclude compilers (rationale: most Java programs cannot +# compile with some compilers like guavac). +# +# Note: This is part of the set of autoconf M4 macros for Java programs. +# It is VERY IMPORTANT that you download the whole set, some macros depend +# on other. Unfortunately, the autoconf archive does not support the +# concept of set of macros, so I had to break it for submission. The +# general documentation, as well as the sample configure.in, is included +# in the AX_PROG_JAVA macro. +# +# LICENSE +# +# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr> +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. 
You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 6 + +AU_ALIAS([AC_PROG_JAVAC], [AX_PROG_JAVAC]) +AC_DEFUN([AX_PROG_JAVAC],[ +if test "x$JAVAPREFIX" = x; then + test "x$JAVAC" = x && AC_CHECK_PROGS(JAVAC, "gcj -C" guavac jikes javac) +else + test "x$JAVAC" = x && AC_CHECK_PROGS(JAVAC, "gcj -C" guavac jikes javac, $JAVAPREFIX) +fi +test "x$JAVAC" = x && AC_MSG_ERROR([no acceptable Java compiler found in \$PATH]) +AX_PROG_JAVAC_WORKS +AC_PROVIDE([$0])dnl +]) diff --git a/build_posix/aclocal/ax_prog_javac_works.m4 b/build_posix/aclocal/ax_prog_javac_works.m4 new file mode 100644 index 00000000000..7dfa1e37d89 --- /dev/null +++ b/build_posix/aclocal/ax_prog_javac_works.m4 @@ -0,0 +1,72 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_prog_javac_works.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PROG_JAVAC_WORKS +# +# DESCRIPTION +# +# Internal use ONLY. +# +# Note: This is part of the set of autoconf M4 macros for Java programs. +# It is VERY IMPORTANT that you download the whole set, some macros depend +# on other. Unfortunately, the autoconf archive does not support the +# concept of set of macros, so I had to break it for submission. The +# general documentation, as well as the sample configure.in, is included +# in the AX_PROG_JAVA macro. 
+# +# LICENSE +# +# Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr> +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. 
+ +#serial 6 + +AU_ALIAS([AC_PROG_JAVAC_WORKS], [AX_PROG_JAVAC_WORKS]) +AC_DEFUN([AX_PROG_JAVAC_WORKS],[ +AC_CACHE_CHECK([if $JAVAC works], ac_cv_prog_javac_works, [ +JAVA_TEST=Test.java +CLASS_TEST=Test.class +cat << \EOF > $JAVA_TEST +/* [#]line __oline__ "configure" */ +public class Test { +} +EOF +if AC_TRY_COMMAND($JAVAC $JAVACFLAGS $JAVA_TEST) >/dev/null 2>&1; then + ac_cv_prog_javac_works=yes +else + AC_MSG_ERROR([The Java compiler $JAVAC failed (see config.log, check the CLASSPATH?)]) + echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD + cat $JAVA_TEST >&AS_MESSAGE_LOG_FD +fi +rm -f $JAVA_TEST $CLASS_TEST +]) +AC_PROVIDE([$0])dnl +]) diff --git a/build_posix/aclocal/ax_try_compile_java.m4 b/build_posix/aclocal/ax_try_compile_java.m4 new file mode 100644 index 00000000000..8efd091c43b --- /dev/null +++ b/build_posix/aclocal/ax_try_compile_java.m4 @@ -0,0 +1,55 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_try_compile_java.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_TRY_COMPILE_JAVA +# +# DESCRIPTION +# +# AX_TRY_COMPILE_JAVA attempt to compile user given source. +# +# *Warning*: its success or failure can depend on a proper setting of the +# CLASSPATH env. variable. +# +# Note: This is part of the set of autoconf M4 macros for Java programs. +# It is VERY IMPORTANT that you download the whole set, some macros depend +# on other. Unfortunately, the autoconf archive does not support the +# concept of set of macros, so I had to break it for submission. The +# general documentation, as well as the sample configure.in, is included +# in the AX_PROG_JAVA macro. 
+# +# LICENSE +# +# Copyright (c) 2008 Devin Weaver <ktohg@tritarget.com> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 7 + +AU_ALIAS([AC_TRY_COMPILE_JAVA], [AX_TRY_COMPILE_JAVA]) +AC_DEFUN([AX_TRY_COMPILE_JAVA],[ +AC_REQUIRE([AX_PROG_JAVAC])dnl +cat << \EOF > Test.java +/* [#]line __oline__ "configure" */ +ifelse([$1], , , [import $1;]) +public class Test { +[$2] +} +EOF +if AC_TRY_COMMAND($JAVAC $JAVACFLAGS Test.java) && test -s Test.class +then +dnl Don't remove the temporary files here, so they can be examined. + ifelse([$3], , :, [$3]) +else + echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD + cat Test.java >&AS_MESSAGE_LOG_FD +ifelse([$4], , , [ rm -fr Test* + $4 +])dnl +fi +rm -fr Test*]) diff --git a/build_posix/aclocal/options.m4 b/build_posix/aclocal/options.m4 index a65724c5e66..6856cdd0769 100644 --- a/build_posix/aclocal/options.m4 +++ b/build_posix/aclocal/options.m4 @@ -46,10 +46,24 @@ no) wt_cv_enable_diagnostic=no;; esac AC_MSG_RESULT($wt_cv_enable_diagnostic) +AC_MSG_CHECKING(if --enable-java option specified) +AC_ARG_ENABLE(java, + [AS_HELP_STRING([--enable-java], + [Configure the Java API.])], r=$enableval, r=no) +case "$r" in +no) wt_cv_enable_java=no;; +*) if test "$enable_shared" = "no"; then + AC_MSG_ERROR([--enable-java requires shared libraries]) + fi + wt_cv_enable_java=yes;; +esac +AC_MSG_RESULT($wt_cv_enable_java) +AM_CONDITIONAL([JAVA], [test x$wt_cv_enable_java = xyes]) + AC_MSG_CHECKING(if --enable-python option specified) AC_ARG_ENABLE(python, [AS_HELP_STRING([--enable-python], - [Configure for python symbols.])], r=$enableval, r=no) + [Configure the python API.])], r=$enableval, r=no) case "$r" in no) wt_cv_enable_python=no;; *) if test "$enable_shared" = "no"; then diff --git a/build_posix/aclocal/version-set.m4 
b/build_posix/aclocal/version-set.m4 index 138d5a7cc94..762c30c464d 100644 --- a/build_posix/aclocal/version-set.m4 +++ b/build_posix/aclocal/version-set.m4 @@ -1,14 +1,14 @@ dnl build by dist/s_version VERSION_MAJOR=1 -VERSION_MINOR=4 -VERSION_PATCH=2 -VERSION_STRING='"WiredTiger 1.4.2: (January 14, 2013)"' +VERSION_MINOR=5 +VERSION_PATCH=0 +VERSION_STRING='"WiredTiger 1.5.0: (March 14, 2013)"' AC_SUBST(VERSION_MAJOR) AC_SUBST(VERSION_MINOR) AC_SUBST(VERSION_PATCH) AC_SUBST(VERSION_STRING) -VERSION_NOPATCH=1.4 +VERSION_NOPATCH=1.5 AC_SUBST(VERSION_NOPATCH) diff --git a/build_posix/aclocal/version.m4 b/build_posix/aclocal/version.m4 index 75cb73db9bd..55081c7412c 100644 --- a/build_posix/aclocal/version.m4 +++ b/build_posix/aclocal/version.m4 @@ -1,2 +1,2 @@ dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version -1.4.2 +1.5.0 diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index 8fd08fd0877..eb689726494 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -50,17 +50,28 @@ fi AM_CONDITIONAL([DEBUG], [test "$wt_cv_enable_debug" = "yes"]) -# Python API +# Java and Python APIs +if test "$wt_cv_enable_java" = "yes" -o "$wt_cv_enable_python" = "yes"; then + AX_PKG_SWIG(2.0.4, [], + [AC_MSG_WARN([SWIG is required to rebuild Java or Python APIs.])]) +fi + +if test "$wt_cv_enable_java" = "yes"; then + JAVAC=${JAVAC-javac} + AX_PROG_JAVAC + AX_PROG_JAR + AX_JNI_INCLUDE_DIR + if test "$wt_cv_enable_debug" = "yes"; then + AX_CHECK_JUNIT + fi + for JNI_INCLUDE_DIR in $JNI_INCLUDE_DIRS ; do + JNI_CPPFLAGS="$JNI_CPPFLAGS -I$JNI_INCLUDE_DIR" + done + AC_SUBST(JNI_CPPFLAGS) +fi + if test "$wt_cv_enable_python" = "yes"; then AM_PATH_PYTHON([2.6]) - AX_PKG_SWIG(2.0.4, [], - [ AC_MSG_ERROR([SWIG is required to build Python support.]) ]) - - # Check that SWIG supports Python. 
- touch swigtest.i - $SWIG -python -module swigtest swigtest.i > /dev/null 2>&1 || \ - AC_MSG_ERROR([$SWIG does not include Python support.]) - rm -f swigtest* fi AM_TYPES @@ -73,7 +84,7 @@ AC_CHECK_LIB(dl, dlopen) AC_CHECK_LIB(rt, sched_yield) AC_CHECK_FUNCS([\ clock_gettime fcntl gettimeofday pthread_timedjoin_np posix_fadvise\ - posix_memalign strtouq]) + posix_memalign strtouq sync_file_range]) AC_SYS_LARGEFILE AC_C_BIGENDIAN diff --git a/dist/api_config.py b/dist/api_config.py index 37c2f022a77..ce36b082964 100644 --- a/dist/api_config.py +++ b/dist/api_config.py @@ -191,7 +191,7 @@ def get_default(c): return '(%s)' % (','.join('%s=%s' % (subc.name, get_default(subc)) for subc in sorted(c.subconfig))) elif (c.default or t == 'int') and c.default != 'true': - return str(c.default) + return str(c.default).replace('"', '\\"') else: return '' diff --git a/dist/api_data.py b/dist/api_data.py index baea9560db4..163e2629c93 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -60,12 +60,17 @@ column_meta = [ source_meta = [ Config('source', '', r''' - override the default data source URI derived from the object - name'''), + set a custom data source URI for a column group, index or simple + table. By default, the data source URI is derived from the \c + type and the column group or index name. Applications can + create tables from existing data sources by supplying a \c + source configuration'''), Config('type', 'file', r''' - set the data source type. This setting overrides the URI - prefix for the data source, if no \c source configuration - setting is provided''', + set the type of data source used to store a column group, index + or simple table. By default, a \c "file:" URI is derived from + the object name. The \c type configuration can be used to + switch to a different storage format, such as LSM. 
Ignored if + an explicit URI is supplied with a \c source configuration''', choices=['file', 'lsm']), ] @@ -122,7 +127,7 @@ lsm_config = [ ] # Per-file configuration -file_config = format_meta + lsm_config + [ +file_config = format_meta + [ Config('allocation_size', '512B', r''' the file unit allocation size, in bytes, must a power-of-two; smaller values decrease the file space required by overflow @@ -214,6 +219,17 @@ file_config = format_meta + lsm_config + [ soft - it is possible for pages to be temporarily larger than this value''', min='512B', max='10TB'), + Config('os_cache_max', '0', r''' + maximum system buffer cache usage, in bytes. If non-zero, evict + object blocks from the system buffer cache after that many bytes + from this object are read or written into the buffer cache''', + min=0), + Config('os_cache_dirty_max', '0', r''' + maximum dirty system buffer cache usage, in bytes. If non-zero, + schedule writes for dirty blocks belonging to this object in the + system buffer cache after that many bytes from this object are + written into the buffer cache''', + min=0), Config('prefix_compression', 'true', r''' configure row-store format key prefix compression''', type='boolean'), @@ -262,8 +278,8 @@ connection_runtime_config = [ Config('reserve', '0', r''' amount of cache this database is guaranteed to have available from the shared cache. This setting is per database. Defaults - to the chunk size'''), - Config('name', '', r''' + to the chunk size''', type='int'), + Config('name', 'pool', r''' name of a cache that is shared between databases'''), Config('size', '500MB', r''' maximum memory to allocate for the shared cache. Setting this @@ -289,6 +305,9 @@ connection_runtime_config = [ trigger eviction when the cache becomes this full (as a percentage)''', min=10, max=99), + Config('statistics', 'false', r''' + Maintain database statistics that may impact performance''', + type='boolean'), Config('verbose', '', r''' enable messages for various events. 
Options are given as a list, such as <code>"verbose=[evictserver,read]"</code>''', @@ -337,7 +356,7 @@ methods = { min='10', max='50'), ]), -'session.create' : Method(table_only_meta + file_config + source_meta + [ +'session.create' : Method(table_only_meta + file_config + lsm_config + source_meta + [ Config('exclusive', 'false', r''' fail if the object exists. When false (the default), if the object exists, check that its settings match the specified @@ -359,18 +378,19 @@ methods = { number key; valid only for cursors with record number keys''', type='boolean'), Config('bulk', 'false', r''' - configure the cursor for bulk loads, a fast, initial load - path. Bulk load may only be used for newly created objects, - and in the case of row-store objects, key/value items must - be loaded in sorted order. Cursors configured for bulk load - only support the WT_CURSOR::insert and WT_CURSOR::close - methods. The value is usually a true/false flag, but the the - special value \c "bitmap" is for use with fixed-length column - stores, and allows chunks of a memory resident bitmap to be - loaded directly into a file by passing a \c WT_ITEM to - WT_CURSOR::set_value where the \c size field indicates the - number of records in the bitmap (as specified by the file's - \c value_format). Bulk load bitmap values must end on a byte + configure the cursor for bulk-loading, a fast, initial load + path (see @ref bulk_load for more information). Bulk-load + may only be used for newly created objects and cursors + configured for bulk-load only support the WT_CURSOR::insert + and WT_CURSOR::close methods. When bulk-loading row-store + objects, keys must be loaded in sorted order. 
The value is + usually a true/false flag; when bulk-loading fixed-length + column store objects, the special value \c bitmap allows + chunks of a memory resident bitmap to be loaded directly into + a file by passing a \c WT_ITEM to WT_CURSOR::set_value where + the \c size field indicates the number of records in the + bitmap (as specified by the object's \c value_format + configuration). Bulk-loaded bitmap values must end on a byte boundary relative to the bit count (except for the last set of values loaded)'''), Config('checkpoint', '', r''' @@ -499,10 +519,21 @@ methods = { 'wiredtiger_open' : Method(connection_runtime_config + [ Config('buffer_alignment', '-1', r''' - in-memory alignment (in bytes) for buffers used for I/O. The default - value of -1 indicates that a platform-specific alignment value should - be used (512 bytes on Linux systems, zero elsewhere)''', + in-memory alignment (in bytes) for buffers used for I/O. The + default value of -1 indicates that a platform-specific + alignment value should be used (512 bytes on Linux systems, + zero elsewhere)''', min='-1', max='1MB'), + Config('checkpoint', '', r''' + periodically checkpoint the database''', + type='category', subconfig=[ + Config('name', '"WiredTigerCheckpoint"', r''' + the checkpoint name'''), + Config('wait', '0', r''' + seconds to wait between each checkpoint; setting this value + configures periodic checkpoints''', + min='1', max='100000'), + ]), Config('create', 'false', r''' create the database if it does not exist''', type='boolean'), @@ -539,6 +570,33 @@ methods = { maximum expected number of sessions (including server threads)''', min='1'), + Config('statistics_log', '', r''' + log database connection statistics into a file when the + \c statistics configuration value is set to true. 
See + @ref statistics_log for more information''', + type='category', subconfig=[ + Config('clear', 'true', r''' + reset statistics counters after each set of log records are + written''', type='boolean'), + Config('path', '"WiredTigerStat.%H"', r''' + the pathname to a file into which the log records are written, + may contain strftime conversion specifications. If the value + is not an absolute path name, the file is created relative to + the database home'''), + Config('sources', '', r''' + if non-empty, include statistics for the list of data source + URIs. No statistics that require traversing a tree are + reported, as if the \c statistics_fast configuration string + were set''', + type='list'), + Config('timestamp', '"%b %d %H:%M:%S"', r''' + a timestamp prepended to each log record, may contain strftime + conversion specifications'''), + Config('wait', '0', r''' + seconds to wait between each write of the log records; setting + this value configures \c statistics and statistics logging''', + min='5', max='100000'), + ]), Config('sync', 'true', r''' flush files to stable storage when closing or writing checkpoints''', diff --git a/dist/filelist b/dist/filelist index 2c3b707e51c..12534da6d36 100644 --- a/dist/filelist +++ b/dist/filelist @@ -7,6 +7,7 @@ src/block/block_addr.c src/block/block_ckpt.c src/block/block_compact.c src/block/block_ext.c +src/block/block_map.c src/block/block_mgr.c src/block/block_open.c src/block/block_read.c @@ -40,6 +41,7 @@ src/btree/bt_walk.c src/btree/col_modify.c src/btree/col_srch.c src/btree/rec_evict.c +src/btree/rec_merge.c src/btree/rec_track.c src/btree/rec_write.c src/btree/row_key.c @@ -54,6 +56,7 @@ src/conn/conn_api.c src/conn/conn_dhandle.c src/conn/conn_cache.c src/conn/conn_cache_pool.c +src/conn/conn_ckpt.c src/conn/conn_handle.c src/conn/conn_open.c src/conn/conn_stat.c @@ -103,8 +106,9 @@ src/os_posix/os_strtouq.c src/os_posix/os_thread.c src/os_posix/os_time.c src/os_posix/os_yield.c -src/packing/packing.c 
-src/packing/packing_api.c +src/packing/pack_api.c +src/packing/pack_impl.c +src/packing/pack_stream.c src/schema/schema_create.c src/schema/schema_drop.c src/schema/schema_list.c diff --git a/dist/flags.py b/dist/flags.py index 9f87fd1be17..32c1ecac1fe 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -13,6 +13,7 @@ flags = { 'SYNC_COMPACT', 'SYNC_DISCARD', 'SYNC_DISCARD_NOWRITE', + 'SYNC_WRITE_LEAVES', ], 'direct_io' : [ 'DIRECTIO_DATA', @@ -58,6 +59,7 @@ flags = { ################################################### 'conn' : [ 'CONN_CACHE_POOL', + 'CONN_EVICTION_RUN', 'CONN_LSM_MERGE', 'CONN_PANIC', 'CONN_SERVER_RUN', diff --git a/dist/java_doc.py b/dist/java_doc.py new file mode 100644 index 00000000000..ce42a53c118 --- /dev/null +++ b/dist/java_doc.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +# This program pulls the function names from wiredtiger.in and generates +# an input file for Java SWIG that adds doxygen copydoc comments to functions. + +import os, re, sys +import api_data +from dist import compare_srcfile + +# Temporary file. +tmp_file = '__tmp' + +##################################################################### +# Update wiredtiger.in with doxygen comments +##################################################################### +f='../src/include/wiredtiger.in' +o='../lang/java/java_doc.i' +tfile = open(tmp_file, 'w') + +tfile.write('''/* DO NOT EDIT: automatically built by dist/java_doc.py. */ + +''') + +cclass_re = re.compile('^struct __([a-z_]*) {') +cfunc_re = re.compile('\t.*? 
__F\(([a-z_]*)\)') + +curr_class = "" +for line in open(f, 'r'): + + m = cclass_re.match(line) + if m: + curr_class = m.group(1) + + if curr_class == "": + continue + + m = cfunc_re.match(line) + if m: + tfile.write('COPYDOC(__' + curr_class.lower() + ', ' + curr_class.upper() + ', ' + m.group(1) + ')\n') + +tfile.close() +compare_srcfile(tmp_file, o) + diff --git a/dist/s_all b/dist/s_all index abb39c87903..d84711e1e62 100644 --- a/dist/s_all +++ b/dist/s_all @@ -54,6 +54,7 @@ run "python flags.py" "building flags" run "python log.py" "building logging layer" run "python serial.py" "building serial function support" run "python stat.py" "building statistics support" +run "python java_doc.py" "building Java documentation index" run "sh ./s_typedef -b" "building standard typedefs" run "sh ./s_prototypes" "building function prototypes" diff --git a/dist/s_copyright.list b/dist/s_copyright.list index 359e97f1aa8..d6ac12c588b 100644 --- a/dist/s_copyright.list +++ b/dist/s_copyright.list @@ -1,4 +1,5 @@ skip bench/tcbench/wttest.c +skip build_posix/Test.java skip build_posix/wiredtiger_config.h skip dist/api_config.py skip dist/api_data.py @@ -6,12 +7,21 @@ skip dist/api_err.py skip dist/db.py skip dist/dist.py skip dist/flags.py +skip dist/java_doc.py skip dist/log.py skip dist/log_data.py skip dist/serial.py skip dist/stat.py skip dist/stat_data.py -skip docs/tools/doxypy.py +skip lang/java/java_doc.i +skip lang/java/src/com/wiredtiger/db/Connection.java +skip lang/java/src/com/wiredtiger/db/Cursor.java +skip lang/java/src/com/wiredtiger/db/SearchStatus.java +skip lang/java/src/com/wiredtiger/db/Session.java +skip lang/java/src/com/wiredtiger/db/wiredtiger.java +skip lang/java/src/com/wiredtiger/db/wiredtigerConstants.java +skip lang/java/src/com/wiredtiger/db/wiredtigerJNI.java +skip lang/java/wiredtiger_wrap.c skip lang/python/setup.py skip lang/python/src/wiredtiger/service/WiredTiger.py skip lang/python/src/wiredtiger/service/__init__.py @@ -29,7 +39,6 @@ skip 
src/include/queue.h skip src/include/serial_funcs.i skip src/log/log_desc.c skip src/support/stat.c -skip test/config.i skip test/packing/intpack-test.c skip test/packing/intpack-test2.c skip test/packing/packing-test.c diff --git a/dist/s_define.list b/dist/s_define.list index 13506d003f0..36308036fef 100644 --- a/dist/s_define.list +++ b/dist/s_define.list @@ -16,9 +16,10 @@ TXN_API_CALL_NOCONF TXN_API_END WT_BARRIER WT_BLOCK_DESC_SIZE +WT_CSTAT_SET WT_DEBUG_BYTE +WT_DSTAT_DECR WT_READ_BARRIER -WT_STAT_CHECK_SESSION WT_STAT_DECR __F __WIREDTIGER_EXT_H_ diff --git a/dist/s_docs b/dist/s_docs index d213f254a17..f2e673823d0 100755 --- a/dist/s_docs +++ b/dist/s_docs @@ -82,12 +82,19 @@ EOF e=1 } - # Run again to generate the full documentation set (with Python). - [ "$python" -eq 1 ] && [ -f ../lang/python/wiredtiger.py ] && ( + # Add optional extras + EXTRAS="../lang/java/src/com/wiredtiger/db ../lang/python/wiredtiger.py" + EXTRA_INPUT="" + for f in $EXTRAS ; do + [ -e "$f" ] && EXTRA_INPUT="$EXTRA_INPUT ../$f" + done + + # Run again to generate the full doc set with Python and Java. 
+ [ "$additional_languages" -eq 1 ] && [ "x$EXTRA_INPUT" != "x" ] && ( cd ../src/docs && (eval cat Doxyfile $filter ; cat <<EOF QUIET=YES -INPUT+=../../lang/python/wiredtiger.py +INPUT+=$EXTRA_INPUT EOF ) | doxygen -) @@ -98,21 +105,21 @@ EOF } clean=0 -python=1 +additional_languages=1 filter="|sed '/PROJECT_NUMBER/s,=.*,=\"Version $WIREDTIGER_VERSION\",'" while : do case "$1" in -a) # Build from scratch clean=1 shift;; - -l) # Generate the top-level landing page in ../docs/top + -l) # Generate the top-level landing page in ../docs/top filter="$filter; cat top/Doxyfile" - python=0 + additional_languages=0 shift;; - -p) # Generate PDFs + -p) # Generate PDFs filter="$filter| sed '/GENERATE_LATEX/s,=.*,=YES,'" shift;; - -t) # Include the TODO list + -t) # Include the TODO list filter="$filter| sed '/GENERATE_TODOLIST/s,=.*,=YES,'" shift;; *) diff --git a/dist/s_funcs.list b/dist/s_funcs.list index 5390a976764..2b71a693226 100644 --- a/dist/s_funcs.list +++ b/dist/s_funcs.list @@ -15,6 +15,15 @@ __wt_log_printf __wt_nlpo2 __wt_nlpo2_round __wt_print_huffman_code +wiredtiger_pack_int +wiredtiger_pack_item +wiredtiger_pack_str +wiredtiger_pack_uint +wiredtiger_unpack_int +wiredtiger_unpack_item +wiredtiger_unpack_start +wiredtiger_unpack_str +wiredtiger_unpack_uint wiredtiger_struct_pack wiredtiger_struct_size wiredtiger_struct_unpack diff --git a/dist/s_release b/dist/s_release index e7fd4633b4e..996bb3f151c 100755 --- a/dist/s_release +++ b/dist/s_release @@ -32,11 +32,15 @@ fi echo "Running 'dist/s_all' in the release tree" (cd "$DEST/dist" && env WT_RELEASE_BUILD=yes sh s_all -A > /dev/null) -echo "Running swig to generate the Python API" -(cd "$DEST/build_posix" && \ - ../configure --enable-python && \ - (cd lang/python && make ../../../lang/python/wiredtiger_wrap.c) && \ - make distclean) > /dev/null +echo "Running swig to generate the Java and Python API" +(cd "$DEST/build_posix" && + ../configure --enable-java --enable-python && + (cd lang/java && make 
../../../lang/java/wiredtiger_wrap.c) && + (cd lang/python && make ../../../lang/python/wiredtiger_wrap.c) && + make distclean && + find . -type d -a -empty | xargs rmdir && + find . -type d -a -empty | xargs rmdir && + find . -type d -a -empty | xargs rmdir) > /dev/null echo "Building documentation" (cd "$DEST/dist" && sh s_docs > /dev/null) @@ -45,7 +49,7 @@ echo "Packing release into $RELEASE_DIR/$PKG.tar.bz2" (cd "$RELEASE_DIR" && tar cf - $PKG | bzip2 -9 > $PKG.tar.bz2) echo "Packing documentation into $RELEASE_DIR/$PKG-docs.tar.bz2" -(cd "$RELEASE_DIR" && tar cf - $PKG/[A-Z][A-Z]* $PKG/docs | \ +(cd "$RELEASE_DIR" && tar cf - $PKG/LICENSE $PKG/NEWS $PKG/README $PKG/docs | \ bzip2 -9 > $PKG-docs.tar.bz2) rm -r $DEST diff --git a/dist/s_release.list b/dist/s_release.list index 749aee885e9..4f67e4cdb5b 100644 --- a/dist/s_release.list +++ b/dist/s_release.list @@ -1,6 +1,5 @@ # Exclusions from release packages. # Each non-comment line is passed as an "--exclude" argument to "hg archive". 
-lang/java lang/python/src src/server test/format diff --git a/dist/s_string.ok b/dist/s_string.ok index 91c9024f801..0dc672c9907 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -174,6 +174,7 @@ SLIST SLVG SML SQL +SSD SSq STAILQ SYS @@ -210,11 +211,13 @@ Vv VxWorks WIREDTIGER WeakHashLen +Wformat WinNT WiredTiger WiredTiger's WiredTigerCheckpoint WiredTigerHome +WiredTigerStat WithSeeds Wuninitialized XP @@ -239,6 +242,7 @@ argc args argv async +autockpt autocommit bdb bigram @@ -376,6 +380,7 @@ exactp extern extlist extlists +fadvise fblocks fclose fcntl @@ -458,6 +463,7 @@ intpack ints inuse io +ip ispo kb kcell @@ -504,6 +510,8 @@ memcpy memfree memmove memsize +mergeable +metaconf metadata metafile mfence @@ -530,10 +538,12 @@ negint newbar newname nextprev +nfilename nhex nl nlpo nocase +nonliteral noop nop notfound @@ -557,6 +567,7 @@ os ovfl packv patchp +pathname pathnames perf pfx @@ -569,6 +580,8 @@ presize printf printlog priv +ps +psp pthread putK putV @@ -625,6 +638,7 @@ sizep sizev skiplist skiplists +sl slotsp slvg snaplen @@ -637,6 +651,7 @@ srch srvr sset startup +statlog stbar stdarg stderr @@ -646,6 +661,7 @@ str strcmp strdup strerror +strftime stringin strncpy strndup @@ -738,6 +754,7 @@ untyped upd upg uri +usedp usr utf util diff --git a/dist/s_style b/dist/s_style index c91518678ab..1964a8bf79d 100644 --- a/dist/s_style +++ b/dist/s_style @@ -30,30 +30,59 @@ for f in `find examples ext src test -name '*.[chisy]' -o -name '*.in' | echo "$f: while (0) has trailing semi-colon" cat $t fi + if egrep '%l[diouxXn]|%[diouxXn]l' $f > $t; then echo "$f: incorrect or dangerous printf format: %l[diouxXn]" cat $t fi + if grep "(unsigned)" $f > $t; then echo "$f: (unsigned) cast is wrong" cat $t fi + egrep 'u_quad' $f | sed '/@u_quad_decl@/d' > $t test -s $t && { echo "$f: old-style type declaration: u_XXX_t or u_quad" cat $t } - # Direct calls to strtouq, not __wt_strtouq - if ! 
expr "$f" : '.*/os_strtouq.c' > /dev/null; then - egrep strtouq $f | egrep -v __wt_strtouq | \ - egrep -v '^[[:space:]].*\*' > $t - test -s $t && { - echo "$f: explicit call to strtouq" - cat $t - } + # Common typos (Wikipedia's list). + egrep -w 'a a|an an|and and|are are|be be|by by|for for|from from|if if|in in|is is|it it|of of|the the|this this|to to|was was|were were|when when|with with|a an|an a|a the|the a' $f > $t + test -s $t && { + echo "$f: paired typo" + cat $t + } + + # Direct calls to functions we're not supposed to use in the library. + # We don't check for all of them, just a few of the common ones. + if ! expr "$f" : 'examples/.*' > /dev/null && + ! expr "$f" : 'ext/.*' > /dev/null && + ! expr "$f" : 'test/.*' > /dev/null && + ! expr "$f" : '.*/utilities/.*' > /dev/null; then + if ! expr "$f" : '.*/os_alloc.c' > /dev/null && + egrep '[[:space:]]free[(]|[[:space:]]strdup[(]|[[:space:]]strndup[(]|[[:space:]]malloc[(]|[[:space:]]calloc[(]|[[:space:]]realloc[(]' $f > $t; then + test -s $t && { + echo "$f: call to illegal function" + cat $t + } + fi + if ! expr "$f" : '.*/os_strtouq.c' > /dev/null && + egrep '[[:space:]]strtouq[(]' $f > $t; then + test -s $t && { + echo "$f: call to illegal function" + cat $t + } + fi + if egrep '[[:space:]]exit[(]' $f > $t; then + test -s $t && { + echo "$f: call to illegal function" + cat $t + } + fi fi + # Declaration of an integer return variable. if ! expr "$f" : 'examples/.*' > /dev/null && ! expr "$f" : 'test/.*' > /dev/null && ! 
expr "$f" : 'ext/.*' > /dev/null; then @@ -71,13 +100,9 @@ for f in `find examples ext src test -name '*.[chisy]' -o -name '*.in' | egrep 'return|WT_RET' | \ sed -e "s,^,$f:," -e 's/$/ [return skips API_END call]/' - # Bad code we can't easily fix - grep -Hn 'bzero|exit[ ]*\(1\)|^[ ]+[|&=+-]' $f - tr -cd '[:alnum:][:space:][:punct:]' < $f | unexpand | sed -e 's/){/) {/' \ - -e 's/\([ ]\)exit (/\1exit(/g' \ -e 's/\([ ]\)for(/\1for (/' \ -e 's/\([ ]\)if(/\1if (/' \ -e 's/\([ ]\)index(/\1strchr(/' \ diff --git a/dist/s_symbols.list b/dist/s_symbols.list index 0bc487e25b3..565a82c6413 100644 --- a/dist/s_symbols.list +++ b/dist/s_symbols.list @@ -1,5 +1,16 @@ # List of OK external symbols. wiredtiger_open +wiredtiger_pack_start +wiredtiger_pack_close +wiredtiger_pack_int +wiredtiger_pack_item +wiredtiger_pack_str +wiredtiger_pack_uint +wiredtiger_unpack_start +wiredtiger_unpack_int +wiredtiger_unpack_item +wiredtiger_unpack_str +wiredtiger_unpack_uint wiredtiger_strerror wiredtiger_struct_pack wiredtiger_struct_size diff --git a/dist/s_typedef b/dist/s_typedef index 0969a3b1b89..78081be7160 100644 --- a/dist/s_typedef +++ b/dist/s_typedef @@ -54,6 +54,12 @@ check() { test -s $t && cat $t } +usage() +{ + echo 'usage: s_typedef [-bc]' >&2 + exit 1 +} +test "$#" -eq 1 || usage while : do case "$1" in -b) # -b builds the typedefs @@ -63,12 +69,9 @@ while : check shift;; *) + test "$#" -eq 0 || usage break;; esac done -test "$#" -eq 0 || { - echo 'usage: s_typedef [-bc]' >&2 - exit 1 -} exit 0 diff --git a/dist/stat.py b/dist/stat.py index 5d294f51fce..dfeaef82935 100644 --- a/dist/stat.py +++ b/dist/stat.py @@ -1,4 +1,5 @@ -# Read the source files and output the statistics #defines and allocation code. +# Read the source files and output the statistics #defines plus the +# initialize and clear code. 
import re, string, sys, textwrap from dist import compare_srcfile @@ -90,30 +91,21 @@ compare_srcfile(tmp_file, '../src/include/wiredtiger.in') def print_func(name, list): '''Print the functions for the stat.c file.''' f.write(''' -int -__wt_stat_alloc_''' + name + '''_stats(WT_SESSION_IMPL *session, WT_''' + - name.upper() + '''_STATS **statsp) +void +__wt_stat_init_''' + name + '''_stats(WT_''' + name.upper() + '''_STATS *stats) { -\tWT_''' + name.upper() + '''_STATS *stats; - -\tWT_RET(__wt_calloc_def(session, 1, &stats)); - ''') - for l in sorted(list): o = '\tstats->' + l.name + '.desc = "' + l.desc + '";\n' if len(o) + 7 > 80: o = o.replace('= ', '=\n\t ') f.write(o) - f.write(''' -\t*statsp = stats; -\treturn (0); -} + f.write('''} ''') f.write(''' void -__wt_stat_clear_''' + name + '''_stats(WT_STATS *stats_arg) +__wt_stat_clear_''' + name + '''_stats(void *stats_arg) { \tWT_''' + name.upper() + '''_STATS *stats; @@ -126,7 +118,7 @@ __wt_stat_clear_''' + name + '''_stats(WT_STATS *stats_arg) f.write('\tstats->' + l.name + '.v = 0;\n'); f.write('}\n') -# Write the stat allocation and clear routines to the stat.c file. +# Write the stat initialization and clear routines to the stat.c file. f = open(tmp_file, 'w') f.write('/* DO NOT EDIT: automatically built by dist/stat.py. 
*/\n\n') f.write('#include "wt_internal.h"\n') diff --git a/dist/stat_data.py b/dist/stat_data.py index e5eafa0bc29..682941b4132 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -27,6 +27,7 @@ connection_stats = [ Stat('file_open', 'files currently open'), Stat('memory_allocation', 'total heap memory allocations'), Stat('memory_free', 'total heap memory frees'), + Stat('memory_grow', 'total heap memory re-allocations'), Stat('read_io', 'total read I/Os'), Stat('rwlock_read', 'pthread mutex shared lock read-lock calls'), Stat('rwlock_write', 'pthread mutex shared lock write-lock calls'), @@ -53,13 +54,23 @@ connection_stats = [ Stat('cache_bytes_write', 'cache: bytes written from cache'), Stat('cache_eviction_clean', 'cache: unmodified pages evicted'), Stat('cache_eviction_dirty', 'cache: modified pages evicted'), + Stat('cache_eviction_checkpoint', + 'cache: checkpoint blocked page eviction'), Stat('cache_eviction_fail', 'cache: pages selected for eviction unable to be evicted'), + Stat('cache_eviction_force', 'cache: pages queued for forced eviction'), Stat('cache_eviction_hazard', - 'cache: eviction unable to acquire hazard pointer'), + 'cache: hazard pointer blocked page eviction'), Stat('cache_eviction_internal', 'cache: internal pages evicted'), + Stat('cache_eviction_merge', + 'cache: internal page merge operations completed'), + Stat('cache_eviction_merge_fail', + 'cache: internal page merge attempts that could not complete'), + Stat('cache_eviction_merge_levels', + 'cache: internal levels merged'), Stat('cache_eviction_slow', 'cache: eviction server unable to reach eviction goal'), + Stat('cache_eviction_walk', 'cache: pages walked for eviction'), Stat('cache_pages_dirty', 'cache: tracked dirty pages in the cache'), Stat('cache_pages_inuse', 'cache: pages currently held in the cache', perm=1), @@ -67,6 +78,14 @@ connection_stats = [ Stat('cache_write', 'cache: pages written from cache'), ########################################## + # Reconciliation 
statistics + ########################################## + Stat('rec_pages', 'page reconciliation calls'), + Stat('rec_pages_eviction', 'page reconciliation calls for eviction'), + Stat('rec_skipped_update', + 'reconciliation failed because an update could not be included'), + + ########################################## # Transaction statistics ########################################## Stat('txn_ancient', 'ancient transactions'), @@ -157,12 +176,21 @@ dsrc_stats = [ Stat('cache_bytes_read', 'bytes read into cache'), Stat('cache_bytes_write', 'bytes written from cache'), Stat('cache_eviction_clean', 'unmodified pages evicted'), + Stat('cache_eviction_checkpoint', + 'cache: checkpoint blocked page eviction'), Stat('cache_eviction_dirty', 'modified pages evicted'), Stat('cache_eviction_fail', 'data source pages selected for eviction unable to be evicted'), + Stat('cache_eviction_force', 'cache: pages queued for forced eviction'), Stat('cache_eviction_hazard', - 'eviction unable to acquire hazard pointer'), + 'cache: hazard pointer blocked page eviction'), Stat('cache_eviction_internal', 'internal pages evicted'), + Stat('cache_eviction_merge', + 'cache: internal page merge operations completed'), + Stat('cache_eviction_merge_fail', + 'cache: internal page merge attempts that could not complete'), + Stat('cache_eviction_merge_levels', + 'cache: internal levels merged'), Stat('cache_overflow_value', 'overflow values cached in memory'), Stat('cache_read', 'pages read into cache'), Stat('cache_read_overflow', 'overflow pages read into cache'), @@ -193,9 +221,11 @@ dsrc_stats = [ Stat('rec_pages', 'page reconciliation calls'), Stat('rec_pages_eviction', 'page reconciliation calls for eviction'), Stat('rec_skipped_update', - 'page reconciliation failed when an update could not be included'), + 'reconciliation failed because an update could not be included'), Stat('rec_split_intl', 'reconciliation internal pages split'), Stat('rec_split_leaf', 'reconciliation leaf pages 
split'), + Stat('rec_split_max', + 'reconciliation maximum number of splits created by for a page'), ########################################## # Transaction statistics diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c index 1386e7ea037..540fc973c04 100644 --- a/examples/c/ex_all.c +++ b/examples/c/ex_all.c @@ -447,11 +447,13 @@ session_ops(WT_SESSION *session) ret = session->create(session, "table:mytable", "key_format=S,value_format=S"); /*! [Create a table] */ + ret = session->drop(session, "table:mytable", NULL); /*! [Create a column-store table] */ ret = session->create(session, "table:mytable", "key_format=r,value_format=S"); /*! [Create a column-store table] */ + ret = session->drop(session, "table:mytable", NULL); /*! [Create a table with columns] */ /* @@ -459,9 +461,10 @@ session_ops(WT_SESSION *session) * (string, signed 32-bit integer, unsigned 16-bit integer). */ ret = session->create(session, "table:mytable", - "key_format=r,value_format=SiH" + "key_format=r,value_format=SiH," "columns=(id,department,salary,year-started)"); /*! [Create a table with columns] */ + ret = session->drop(session, "table:mytable", NULL); /* * This example code gets run, and the compression libraries might not @@ -474,42 +477,64 @@ session_ops(WT_SESSION *session) "table:mytable", "block_compressor=bzip2,key_format=S,value_format=S"); /*! [Create a bzip2 compressed table] */ + ret = session->drop(session, "table:mytable", NULL); /*! [Create a snappy compressed table] */ ret = session->create(session, "table:mytable", "block_compressor=snappy,key_format=S,value_format=S"); /*! [Create a snappy compressed table] */ + ret = session->drop(session, "table:mytable", NULL); #endif /*! [Configure checksums to uncompressed] */ ret = session->create(session, "table:mytable", "key_format=S,value_format=S,checksum=uncompressed"); /*! [Configure checksums to uncompressed] */ + ret = session->drop(session, "table:mytable", NULL); - /*! [Configure dictionary compression off] */ + /*! 
[Configure dictionary compression on] */ ret = session->create(session, "table:mytable", - "key_format=S,value_format=S,dictionary=false"); - /*! [Configure dictionary compression off] */ + "key_format=S,value_format=S,dictionary=1000"); + /*! [Configure dictionary compression on] */ + ret = session->drop(session, "table:mytable", NULL); /*! [Configure key prefix compression off] */ ret = session->create(session, "table:mytable", "key_format=S,value_format=S,prefix_compression=false"); /*! [Configure key prefix compression off] */ + ret = session->drop(session, "table:mytable", NULL); + +#ifdef MIGHT_NOT_RUN + /* Requires sync_file_range */ + /*! [os_cache_dirty_max configuration] */ + ret = session->create( + session, "table:mytable", "os_cache_dirty_max=500MB"); + /*! [os_cache_dirty_max configuration] */ + ret = session->drop(session, "table:mytable", NULL); + + /* Requires posix_fadvise */ + /*! [os_cache_max configuration] */ + ret = session->create(session, "table:mytable", "os_cache_max=1GB"); + /*! [os_cache_max configuration] */ + ret = session->drop(session, "table:mytable", NULL); +#endif /*! [Create a cache-resident object] */ ret = session->create(session, "table:mytable", "key_format=r,value_format=S,cache_resident=true"); /*! [Create a cache-resident object] */ + ret = session->drop(session, "table:mytable", NULL); + + { + /* Create a table for the session operations. */ + ret = session->create( + session, "table:mytable", "key_format=S,value_format=S"); /*! [Compact a table] */ ret = session->compact(session, "table:mytable", NULL); /*! [Compact a table] */ - /*! [Drop a table] */ - ret = session->drop(session, "table:mytable", NULL); - /*! [Drop a table] */ - /*! [Print to the message stream] */ ret = session->msg_printf( session, "process ID %" PRIuMAX, (uintmax_t)getpid()); @@ -553,6 +578,11 @@ session_ops(WT_SESSION *session) ret = session->verify(session, "table:mytable", NULL); /*! [Verify a table] */ + /*! 
[Drop a table] */ + ret = session->drop(session, "table:mytable", NULL); + /*! [Drop a table] */ + } + /*! [Close a session] */ ret = session->close(session, NULL); /*! [Close a session] */ @@ -826,6 +856,31 @@ my_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, } /*! [WT_COMPRESSOR presize] */ +static int +my_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, + size_t page_max, u_int split_pct, size_t extra, + uint8_t *src, uint32_t *offsets, uint32_t slots, + uint8_t *dst, size_t dst_len, int final, + size_t *result_lenp, uint32_t *result_slotsp) +{ + /* Unused parameters */ + (void)compressor; + (void)session; + (void)page_max; + (void)split_pct; + (void)extra; + (void)src; + (void)offsets; + (void)slots; + (void)dst; + (void)dst_len; + (void)final; + (void)result_lenp; + (void)result_slotsp; + + return (0); +} + int add_compressor(WT_CONNECTION *conn) { @@ -833,7 +888,11 @@ add_compressor(WT_CONNECTION *conn) /*! [WT_COMPRESSOR register] */ static WT_COMPRESSOR my_compressor = { - my_compress, NULL, my_decompress, my_pre_size }; + my_compress, + my_compress_raw, /* NULL, if no raw compression */ + my_decompress, + my_pre_size /* NULL, if pre-sizing not needed */ + }; ret = conn->add_compressor(conn, "my_compress", &my_compressor, NULL); /*! [WT_COMPRESSOR register] */ @@ -875,9 +934,11 @@ connection_ops(WT_CONNECTION *conn) { int ret; +#ifdef MIGHT_NOT_RUN /*! [Load an extension] */ ret = conn->load_extension(conn, "my_extension.dll", NULL); /*! [Load an extension] */ +#endif add_collator(conn); add_data_source(conn); @@ -991,14 +1052,18 @@ hot_backup(WT_SESSION *session) int main(void) { + WT_CONNECTION *conn; int ret; - { - WT_CONNECTION *conn; /*! [Open a connection] */ ret = wiredtiger_open(home, NULL, "create,cache_size=500M", &conn); /*! [Open a connection] */ - } + + if (ret == 0) + connection_ops(conn); + /* + * The connection has been closed. 
+ */ #ifdef MIGHT_NOT_RUN /* @@ -1006,34 +1071,70 @@ main(void) * be installed, causing the open to fail. The documentation requires * the code snippets, use #ifdef's to avoid running it. */ - { /*! [Configure bzip2 extension] */ - WT_CONNECTION *conn; - ret = wiredtiger_open(home, NULL, "create," "extensions=[\"/usr/local/lib/wiredtiger_bzip2.so\"]", &conn); /*! [Configure bzip2 extension] */ - } + if (ret == 0) + (void)conn->close(conn, NULL); - { /*! [Configure snappy extension] */ - WT_CONNECTION *conn; - ret = wiredtiger_open(home, NULL, "create," "extensions=[\"/usr/local/lib/wiredtiger_snappy.so\"]", &conn); /*! [Configure snappy extension] */ - } + if (ret == 0) + (void)conn->close(conn, NULL); /* - * We're not allowed to open multiple connections, don't run more than - * one wiredtiger_open call. + * This example code gets run, and direct I/O might not be available, + * causing the open to fail. The documentation requires code snippets, + * use #ifdef's to avoid running it. */ - { + /* Might Not Run: direct I/O may not be available. */ /*! [Configure direct_io for data files] */ ret = wiredtiger_open(home, NULL, "create,direct_io=[data]", &conn); /*! [Configure direct_io for data files] */ + if (ret == 0) + (void)conn->close(conn, NULL); +#endif + + /*! [Statistics configuration] */ + ret = wiredtiger_open(home, NULL, "create,statistics=true", &conn); + /*! [Statistics configuration] */ + if (ret == 0) + (void)conn->close(conn, NULL); + + /*! [Statistics logging] */ + ret = wiredtiger_open( + home, NULL, "create,statistics_log=(wait=30)", &conn); + /*! [Statistics logging] */ + if (ret == 0) + (void)conn->close(conn, NULL); + + /*! [Statistics logging with objects] */ + ret = wiredtiger_open(home, NULL, + "create," + "statistics_log=(sources=(\"table:table1\",\"table:table2\"))", + &conn); + /*! 
[Statistics logging with objects] */ + if (ret == 0) + (void)conn->close(conn, NULL); + +#ifdef MIGHT_NOT_RUN + /* + * This example code gets run, and a non-existent log file path might + * cause the open to fail. The documentation requires code snippets, + * use #ifdef's to avoid running it. + */ + /*! [Statistics logging with path] */ + ret = wiredtiger_open(home, NULL, + "create," + "statistics_log=(wait=120,path=\"/log/log.%m.%d.%y\")", &conn); + /*! [Statistics logging with path] */ + if (ret == 0) + (void)conn->close(conn, NULL); #endif /*! [Get the WiredTiger library version #1] */ diff --git a/examples/java/Makefile.am b/examples/java/Makefile.am new file mode 100644 index 00000000000..c7fbfffa48c --- /dev/null +++ b/examples/java/Makefile.am @@ -0,0 +1,21 @@ +AM_CPPFLAGS = -I$(abs_top_builddir) + +JAVAEXAMPLES = $(top_srcdir)/examples/java/com/wiredtiger/examples + +# TODO: How to add to existing Javadoc from main API? +# JDOCDIR = $(top_srcdir)/docs/java +# java_DATA = $(JDOCDIR)/index.html + +javadir = $(datadir)/java +dist_java_JAVA = \ + $(JAVAEXAMPLES)/ex_access.java + +all-local: wiredtiger.jar + +$(JDOCDIR)/index.html: $(dist_java_JAVA) + mkdir -p $(JDOCDIR) + javadoc -public -d $(JDOCDIR) -link http://docs.oracle.com/javase/6/docs/api $(JAVAEXAMPLES)/[A-Z]*.java + +wiredtiger.jar: $(dist_java_JAVA) + (cd $(top_builddir) && \ + $(JAR) -cf wiredtiger.jar com/) diff --git a/examples/java/com/wiredtiger/examples/ex_access.java b/examples/java/com/wiredtiger/examples/ex_access.java new file mode 100644 index 00000000000..9a681546c59 --- /dev/null +++ b/examples/java/com/wiredtiger/examples/ex_access.java @@ -0,0 +1,53 @@ +/*- + * Public Domain 2008-2013 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. 
+ * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * ex_access.java + * demonstrates how to create and access a simple table. 
+ */ +package com.wiredtiger.examples; +import com.wiredtiger.db.*; + +public class ex_access { + public static void main(String[] args) { + Connection conn = wiredtiger.open("WT_HOME", "create"); + Session s = conn.open_session(null); + s.create("table:t", "key_format=S,value_format=u"); + Cursor c = s.open_cursor("table:t", null, null); + System.out.println("Key format: " + c.getKeyFormat()); + System.out.println("Value format: " + c.getValueFormat()); + try { + c.putKeyString("foo"); + c.putValueByteArray("bar".getBytes()); + c.insert(); + c.reset(); + while (c.next() == 0) { + System.out.println("Got: " + c.getKeyString()); + } + } catch (WiredTigerPackingException wtpe) { + } + conn.close(null); + } +} diff --git a/ext/compressors/bzip2/bzip2_compress.c b/ext/compressors/bzip2/bzip2_compress.c index 16efaa7aa3e..95d0490262c 100644 --- a/ext/compressors/bzip2/bzip2_compress.c +++ b/ext/compressors/bzip2/bzip2_compress.c @@ -43,8 +43,8 @@ bzip2_decompress(WT_COMPRESSOR *, WT_SESSION *, uint8_t *, size_t, uint8_t *, size_t, size_t *); #ifdef WIREDTIGER_TEST_COMPRESS_RAW static int -bzip2_compress_raw(WT_COMPRESSOR *, WT_SESSION *, - size_t, size_t, uint8_t *, uint32_t *, uint32_t, uint8_t *, size_t, int, +bzip2_compress_raw(WT_COMPRESSOR *, WT_SESSION *, size_t, u_int, + size_t, uint8_t *, uint32_t *, uint32_t, uint8_t *, size_t, int, size_t *, uint32_t *); #endif @@ -209,7 +209,7 @@ __bzip2_compress_raw_random(void) */ static int bzip2_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, - size_t page_max, size_t extra, + size_t page_max, u_int split_pct, size_t extra, uint8_t *src, uint32_t *offsets, uint32_t slots, uint8_t *dst, size_t dst_len, int final, size_t *result_lenp, uint32_t *result_slotsp) @@ -218,6 +218,7 @@ bzip2_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, int compression_failed, ret; __UNUSED(page_max); + __UNUSED(split_pct); __UNUSED(extra); __UNUSED(final); @@ -264,14 +265,14 @@ bzip2_compress_raw(WT_COMPRESSOR 
*compressor, WT_SESSION *session, #if 0 fprintf(stderr, - "bzip2_compress_raw (%s): page_max %" PRIuMAX ", extra %" PRIuMAX + "bzip2_compress_raw (%s): page_max %" PRIuMAX + ", split_pct %u, extra %" PRIuMAX ", slots %" PRIu32 ", take %" PRIu32 ": %" PRIu32 " -> %" PRIuMAX "\n", final ? "final" : "not final", - (uintmax_t)page_max, (uintmax_t)extra, + (uintmax_t)page_max, split_pct, (uintmax_t)extra, slots, take, offsets[take], (uintmax_t)*result_lenp); #endif - return (0); } #endif diff --git a/lang/java/Makefile.am b/lang/java/Makefile.am new file mode 100644 index 00000000000..75bd3aaf0bc --- /dev/null +++ b/lang/java/Makefile.am @@ -0,0 +1,75 @@ +AM_CPPFLAGS = -I$(abs_top_builddir) + +JAVASRC = $(top_srcdir)/lang/java +JAVADEST = src/com/wiredtiger/db +JAVADESTFULL = $(JAVASRC)/$(JAVADEST) +JAVAEXAMPLES = $(top_srcdir)/examples/java/com/wiredtiger/examples +JAVATEST = $(top_srcdir)/test/java/com/wiredtiger/test +BUILT_SOURCES = $(JAVASRC)/wiredtiger_wrap.c +SWIG_SOURCES = $(JAVASRC)/wiredtiger.i + +JDOCDIR = $(top_srcdir)/docs/java +# The Java documentation is currently generated by Doxygen - disable javadoc +#java_DATA = $(JDOCDIR)/index.html + +javadir = $(datadir)/java/$(PACKAGE)-$(PACKAGE_VERSION) +JAVA_SRC = \ + $(JAVADESTFULL)/Connection.java \ + $(JAVADESTFULL)/Cursor.java \ + $(JAVADESTFULL)/SearchStatus.java \ + $(JAVADESTFULL)/PackFormatInputStream.java \ + $(JAVADESTFULL)/PackInputStream.java \ + $(JAVADESTFULL)/PackOutputStream.java \ + $(JAVADESTFULL)/PackUtil.java \ + $(JAVADESTFULL)/Session.java \ + $(JAVADESTFULL)/WiredTigerException.java \ + $(JAVADESTFULL)/WiredTigerPackingException.java \ + $(JAVADESTFULL)/wiredtiger.java \ + $(JAVADESTFULL)/wiredtigerConstants.java \ + $(JAVADESTFULL)/wiredtigerJNI.java \ + $(JAVAEXAMPLES)/ex_access.java + +JAVA_JUNIT = \ + $(JAVATEST)/CursorTest.java \ + $(JAVATEST)/PackTest.java \ + $(JAVATEST)/WiredTigerSuite.java + +dist_java_JAVA = $(JAVA_SRC) @JAVA_JUNIT@ +dist_java_DATA = wiredtiger.jar + +EXTRA_JAVA 
= $(JAVA_JUNIT) + +java_LTLIBRARIES = libwiredtiger_java.la + +TESTS_JUNIT = AllJunitTests + +TESTS = @TESTS_JUNIT@ + +AllJunitTests: + echo "#! /bin/sh" > $@ + echo 'SCRIPT_DIR=`dirname $$0`' >> $@ + echo 'env LD_LIBRARY_PATH=$$SCRIPT_DIR/../../.libs:$$SCRIPT_DIR/.libs DYLD_LIBRARY_PATH=$$SCRIPT_DIR/../../.libs JAVA_LIBRARY_PATH=$$SCRIPT_DIR/.libs @JUNIT@ com.wiredtiger.test.WiredTigerSuite' >> $@ + chmod +x $@ + mkdir -p WT_HOME + +CPPFLAGS += $(JNI_CPPFLAGS) +# Some warnings when compiling the generated code are unavoidable +CFLAGS += -w +libwiredtiger_java_la_SOURCES = $(BUILT_SOURCES) $(SWIG_SOURCES) +#libwiredtiger_java_la_LDFLAGS = -module +libwiredtiger_java_la_LIBADD = $(abs_top_builddir)/libwiredtiger.la + +all-local: wiredtiger.jar + +$(JAVASRC)/wiredtiger_wrap.c: $(top_srcdir)/src/include/wiredtiger.in $(SWIG_SOURCES) + (cd $(JAVASRC) && \ + $(SWIG) -Wall -v -java -nodefaultctor -nodefaultdtor -package com.wiredtiger.db -I$(abs_top_builddir) -outdir $(JAVADEST) -o wiredtiger_wrap.c wiredtiger.i) + +$(JDOCDIR)/index.html: $(dist_java_JAVA) + mkdir -p $(JDOCDIR) + javadoc -public -d $(JDOCDIR) -link http://docs.oracle.com/javase/6/docs/api $(JAVADESTFULL)/wiredtiger.java $(JAVADESTFULL)/wiredtigerConstants.java $(JAVADESTFULL)/[A-Z]*.java + +wiredtiger.jar: $(dist_java_JAVA) classjava.stamp + (cd $(top_builddir) && \ + $(JAR) -cf wiredtiger.jar com/) + cp $(top_builddir)/wiredtiger.jar . diff --git a/lang/java/java_doc.i b/lang/java/java_doc.i new file mode 100644 index 00000000000..da2e9fecb94 --- /dev/null +++ b/lang/java/java_doc.i @@ -0,0 +1,41 @@ +/* DO NOT EDIT: automatically built by dist/java_doc.py. 
*/ + +COPYDOC(__wt_cursor, WT_CURSOR, get_key) +COPYDOC(__wt_cursor, WT_CURSOR, get_value) +COPYDOC(__wt_cursor, WT_CURSOR, set_key) +COPYDOC(__wt_cursor, WT_CURSOR, set_value) +COPYDOC(__wt_cursor, WT_CURSOR, compare) +COPYDOC(__wt_cursor, WT_CURSOR, next) +COPYDOC(__wt_cursor, WT_CURSOR, prev) +COPYDOC(__wt_cursor, WT_CURSOR, reset) +COPYDOC(__wt_cursor, WT_CURSOR, search) +COPYDOC(__wt_cursor, WT_CURSOR, search_near) +COPYDOC(__wt_cursor, WT_CURSOR, insert) +COPYDOC(__wt_cursor, WT_CURSOR, update) +COPYDOC(__wt_cursor, WT_CURSOR, remove) +COPYDOC(__wt_cursor, WT_CURSOR, close) +COPYDOC(__wt_session, WT_SESSION, close) +COPYDOC(__wt_session, WT_SESSION, reconfigure) +COPYDOC(__wt_session, WT_SESSION, open_cursor) +COPYDOC(__wt_session, WT_SESSION, create) +COPYDOC(__wt_session, WT_SESSION, compact) +COPYDOC(__wt_session, WT_SESSION, drop) +COPYDOC(__wt_session, WT_SESSION, rename) +COPYDOC(__wt_session, WT_SESSION, salvage) +COPYDOC(__wt_session, WT_SESSION, truncate) +COPYDOC(__wt_session, WT_SESSION, upgrade) +COPYDOC(__wt_session, WT_SESSION, verify) +COPYDOC(__wt_session, WT_SESSION, begin_transaction) +COPYDOC(__wt_session, WT_SESSION, commit_transaction) +COPYDOC(__wt_session, WT_SESSION, rollback_transaction) +COPYDOC(__wt_session, WT_SESSION, checkpoint) +COPYDOC(__wt_session, WT_SESSION, msg_printf) +COPYDOC(__wt_connection, WT_CONNECTION, close) +COPYDOC(__wt_connection, WT_CONNECTION, reconfigure) +COPYDOC(__wt_connection, WT_CONNECTION, is_new) +COPYDOC(__wt_connection, WT_CONNECTION, open_session) +COPYDOC(__wt_connection, WT_CONNECTION, load_extension) +COPYDOC(__wt_connection, WT_CONNECTION, add_data_source) +COPYDOC(__wt_connection, WT_CONNECTION, add_collator) +COPYDOC(__wt_connection, WT_CONNECTION, add_compressor) +COPYDOC(__wt_connection, WT_CONNECTION, add_extractor) diff --git a/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java b/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java new file mode 100644 index 
00000000000..607717ee95a --- /dev/null +++ b/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java @@ -0,0 +1,165 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +package com.wiredtiger.db; + +import java.io.ByteArrayInputStream; +import java.lang.StringBuffer; +import com.wiredtiger.db.PackUtil; +import com.wiredtiger.db.WiredTigerPackingException; + +/** + * An internal helper class for consuming pack format strings. + * + * Applications should not need to use this class. + */ +public class PackFormatInputStream { + + protected String format; + protected int formatOff; + protected int formatRepeatCount; + + /** + * Constructor for a format stream. + * + * \param format the encoded format backing string. + */ + protected PackFormatInputStream(String format) { + this.format = format; + formatOff = 0; + formatRepeatCount = 0; + } + + /** + * Standard toString - returns the string used during construction. + */ + public String toString() { + return format; + } + + /** + * Returns the approximate count of elements left in the format. + * This method does not account for repeat counts or string length + * encodings - so should be used as a guide only. + */ + public int available() { + return format.length() - formatOff + formatRepeatCount; + } + + /** + * Reset the current stream position. + */ + public void reset() { + formatOff = 0; + formatRepeatCount = 0; + } + + /** + * Return the decoded type for the next entry in the format stream. Does + * not adjust the position of the stream. 
+ */ + protected char getType() + throws WiredTigerPackingException { + if (formatOff >= format.length()) { + System.err.println("Raw format is: " + format); + throw new WiredTigerPackingException( + "No more fields in format."); + } + + String fieldName; + boolean lenOK = false; + int countOff = 0; + + while (PackUtil.PackSpecialCharacters.indexOf( + format.charAt(formatOff + countOff)) != -1) { + countOff++; + } + // Skip repeat counts and sizes + while (Character.isDigit(format.charAt(formatOff + countOff))) { + countOff++; + } + return format.charAt(formatOff + countOff); + } + + /** + * Check to see if the next entry is compatible with the requested type. + * + * \param asking the format type to match. + * \param consume indicates whether to update the stream position. + */ + protected void checkType(char asking, boolean consume) + throws WiredTigerPackingException { + + char expected = getType(); + if (Character.toLowerCase(expected) != Character.toLowerCase(asking)) + throw new WiredTigerPackingException( + "Format mismatch. Wanted: " + asking + ", got: " + expected); + if (consume) { + consume(); + } + } + + /** + * Move the format stream position ahead one position. + */ + protected void consume() { + if (formatRepeatCount > 1) { + --formatRepeatCount; + } else if (formatRepeatCount == 1) { + formatRepeatCount = 0; + ++formatOff; + } else { + while (PackUtil.PackSpecialCharacters.indexOf( + format.charAt(formatOff)) != -1) { + ++formatOff; + } + + // Don't need to worry about String or byte array size counts + // since they have already been consumed. + formatRepeatCount = getIntFromFormat(true); + if (formatRepeatCount == 0) { + ++formatOff; + } + } + } + + /** + * Decode an integer from the format string, return zero if not starting + * on a digit. + * + * \param advance whether to move the stream position. 
+ */ + private int getIntFromFormat(boolean advance) { + int valueLen = 0; + int countOff; + for (countOff = 0; + Character.isDigit(format.charAt(formatOff + countOff)); + countOff++) { + valueLen *= 10; + valueLen += Character.digit(format.charAt(formatOff + countOff), 10); + } + if (advance) { + formatOff += countOff; + } + return valueLen; + } + + /** + * Retrieve a length from the format string. Either for a repeat count + * or a string length. Return one if no explicit repeat count. + * + * \param advance whether to move the stream position. + */ + protected int getLengthFromFormat(boolean advance) { + int valueLen = getIntFromFormat(advance); + if (valueLen == 0) { + valueLen = 1; + } + return valueLen; + } +} + diff --git a/lang/java/src/com/wiredtiger/db/PackInputStream.java b/lang/java/src/com/wiredtiger/db/PackInputStream.java new file mode 100644 index 00000000000..6082684a8bf --- /dev/null +++ b/lang/java/src/com/wiredtiger/db/PackInputStream.java @@ -0,0 +1,320 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ +package com.wiredtiger.db; + +import java.io.ByteArrayInputStream; +import java.lang.StringBuffer; +import com.wiredtiger.db.PackUtil; +import com.wiredtiger.db.WiredTigerPackingException; + +/** + * An internal helper class for decoding WiredTiger packed values. + * + * Applications should not need to use this class. + */ +public class PackInputStream { + + protected PackFormatInputStream format; + protected byte[] value; + protected int valueOff; + protected int valueLen; + + /** + * Constructor. + * + * \param format A String that contains the WiredTiger format that + * defines the layout of this packed value. + * \param value The raw bytes that back the stream. + */ + public PackInputStream(String format, byte[] value) { + this(format, value, 0, value.length); + } + + /** + * Constructor. 
+ * + * \param format A String that contains the WiredTiger format that + * defines the layout of this packed value. + * \param value The raw bytes that back the stream. + * \param off Offset into the value array at which the stream begins. + * \param len Length of the value array that forms the stream. + */ + public PackInputStream(String format, byte[] value, int off, int len) { + this.format = new PackFormatInputStream(format); + this.value = value; + this.valueOff = off; + this.valueLen = len; + } + + /** + * Returns the raw packing format string. + */ + public String getFormat() { + return format.toString(); + } + + /** + * Returns the raw value byte array. + */ + public byte[] getValue() { + return value; + } + + /** + * Retrieves a byte field from the stream. + */ + public byte getByte() + throws WiredTigerPackingException { + format.checkType('b', false); + format.consume(); + return (byte)(value[valueOff++] - 0x80); + } + + /** + * Retrieves a byte array field from the stream. + * + * \param dest The byte array where the returned value will be stored. The + * array should be large enough to store the entire data item, + * if it is not, a truncated value will be returned. + */ + public void getByteArray(byte[] dest) + throws WiredTigerPackingException { + this.getByteArray(dest, 0, dest.length); + } + + /** + * Retrieves a byte array field from the stream. + * + * \param dest The byte array where the returned value will be stored. + * \param off Offset into the destination buffer to start copying into. + * \param len The length should be large enough to store the entire data + * item, if it is not, a truncated value will be returned. + */ + public void getByteArray(byte[] dest, int off, int len) + throws WiredTigerPackingException { + format.checkType('U', false); + getByteArrayInternal(getByteArrayLength(), dest, off, len); + + } + + /** + * Retrieves a byte array field from the stream. 
Creates a new byte array
+     * that is the size of the object being retrieved.
+     */
+    public byte[] getByteArray()
+    throws WiredTigerPackingException {
+        // NOTE(review): unlike getByteArray(dest, off, len), this overload
+        // does not call format.checkType('U', ...) first -- confirm intent.
+        int itemLen = getByteArrayLength();
+        byte[] unpacked = new byte[itemLen];
+        getByteArrayInternal(itemLen, unpacked, 0, itemLen);
+        return unpacked;
+    }
+
+    /**
+     * Finds the length of a byte array. Either by decoding a packed length
+     * from the value or using the remaining size of the stream.
+     */
+    private int getByteArrayLength()
+    throws WiredTigerPackingException {
+        int itemLen = 0;
+        /* The rest of the buffer is a byte array. */
+        if (format.available() == 1) {
+            // NOTE(review): valueOff starts at the constructor's "off", so
+            // this is only the remaining size when off == 0 -- confirm.
+            itemLen = valueLen - valueOff;
+        } else {
+            // The length is never negative: decode it as an unsigned value.
+            itemLen = unpackInt(false);
+        }
+        return itemLen;
+    }
+
+    /**
+     * Do the work of retrieving a byte array.
+     *
+     * \param itemLen The number of bytes the item occupies in the stream.
+     * \param dest The byte array where the returned value will be stored.
+     * \param off Offset into the destination buffer to start copying into.
+     * \param destLen The space available in the destination; if the item is
+     *                longer, only destLen bytes are copied.
+     */
+    private void getByteArrayInternal(
+        int itemLen, byte[] dest, int off, int destLen)
+    throws WiredTigerPackingException {
+        /* TODO: padding. */
+        int copyLen = itemLen;
+        if (itemLen > destLen) {
+            copyLen = destLen;
+        }
+        format.consume();
+        System.arraycopy(value, valueOff, dest, off, copyLen);
+        // Skip the whole item, even if only part of it was copied.
+        valueOff += itemLen;
+    }
+
+    /**
+     * Retrieves an integer field from the stream.
+     */
+    public int getInt()
+    throws WiredTigerPackingException {
+        // Lower-case format types are signed; the upper-case 'I' and 'L'
+        // are the unsigned variants, which may not fit in a Java int.
+        boolean signed = true;
+        format.checkType('i', false);
+        if (format.getType() == 'I' ||
+                format.getType() == 'L') {
+            signed = false;
+        }
+        format.consume();
+        return unpackInt(signed);
+    }
+
+    /**
+     * Retrieves a long field from the stream.
+     */
+    public long getLong()
+    throws WiredTigerPackingException {
+        // 'q' is signed; 'Q' is unsigned and is decoded the same way as the
+        // (unsigned) record number in getRecord().
+        boolean signed = true;
+        format.checkType('q', false);
+        if (format.getType() == 'Q') {
+            signed = false;
+        }
+        format.consume();
+        return unpackLong(signed);
+    }
+
+    /**
+     * Retrieves a record field from the stream.
+     */
+    public long getRecord()
+    throws WiredTigerPackingException {
+        format.checkType('r', false);
+        format.consume();
+        return unpackLong(false);
+    }
+
+    /**
+     * Retrieves a short field from the stream.
+     */
+    public short getShort()
+    throws WiredTigerPackingException {
+        // 'h' is signed; 'H' is unsigned and may not fit in a Java short.
+        boolean signed = true;
+        format.checkType('h', false);
+        if (format.getType() == 'H') {
+            signed = false;
+        }
+        format.consume();
+        return unpackShort(signed);
+    }
+
+    /**
+     * Retrieves a string field from the stream.
+     */
+    public String getString()
+    throws WiredTigerPackingException {
+        int stringLength = 0;
+        format.checkType('S', false);
+        // Get the length for a fixed length string
+        if (format.getType() != 'S') {
+            stringLength = format.getLengthFromFormat(true);
+        } else {
+            // The string is null terminated, but we need to know how many
+            // bytes are consumed - which won't necessarily match up to the
+            // string length.
+            for (; valueOff + stringLength < value.length &&
+                    value[valueOff + stringLength] != 0; stringLength++) {}
+        }
+        format.consume();
+        String result = new String(value, valueOff, stringLength);
+        // NOTE(review): the extra byte skips the NUL terminator; for a
+        // fixed length field this presumably over-advances by one byte --
+        // confirm against the packing format.
+        valueOff += stringLength + 1;
+        return result;
+    }
+
+    /**
+     * Decodes an encoded short from the stream. This method does bounds
+     * checking, to ensure values fit, since some values may be encoded as
+     * unsigned values, and Java types are all signed.
+     *
+     * \param signed true if the field is a signed type: the value must lie
+     *               within [Short.MIN_VALUE, Short.MAX_VALUE]. false if it
+     *               is unsigned: the value is rejected if it turns negative
+     *               when narrowed to a short.
+     */
+    private short unpackShort(boolean signed)
+    throws WiredTigerPackingException {
+        long ret = unpackLong(true);
+        if ((signed && (ret > Short.MAX_VALUE || ret < Short.MIN_VALUE)) ||
+                (!signed && (short)ret < 0)) {
+            throw new WiredTigerPackingException("Overflow unpacking short.");
+        }
+        return (short)ret;
+    }
+
+    /**
+     * Decodes an encoded integer from the stream. This method does bounds
+     * checking, to ensure values fit, since some values may be encoded as
+     * unsigned values, and Java types are all signed.
+     *
+     * \param signed true if the field is a signed type: the value must lie
+     *               within [Integer.MIN_VALUE, Integer.MAX_VALUE]. false if
+     *               it is unsigned: the value is rejected if it turns
+     *               negative when narrowed to an int.
+     */
+    private int unpackInt(boolean signed)
+    throws WiredTigerPackingException {
+        long ret = unpackLong(true);
+        if ((signed && (ret > Integer.MAX_VALUE || ret < Integer.MIN_VALUE)) ||
+                (!signed && (int)ret < 0)) {
+            throw new WiredTigerPackingException("Overflow unpacking integer.");
+        }
+        return (int)ret;
+    }
+
+    /**
+     * Decodes an encoded long from the stream. This method does bounds
+     * checking, to ensure values fit, since some values may be encoded as
+     * unsigned values, and Java types are all signed.
+     * The packing format is defined in the WiredTiger C integer packing
+     * implementation, which is at src/include/intpack.i
+     */
+    private long unpackLong(boolean signed)
+    throws WiredTigerPackingException {
+        int len;
+        long unpacked = 0;
+        switch (value[valueOff] & 0xf0) {
+        case PackUtil.NEG_MULTI_MARKER & 0xff:
+            len = (int)PackUtil.SIZEOF_LONG - (value[valueOff++] & 0xf);
+
+            for (unpacked = 0xffffffff; len != 0; --len) {
+                unpacked = (unpacked << 8) | value[valueOff++] & 0xff;
+            }
+            break;
+        case PackUtil.NEG_2BYTE_MARKER & 0xff:
+        case (PackUtil.NEG_2BYTE_MARKER | 0x10) & 0xff:
+            unpacked = PackUtil.GET_BITS(value[valueOff++], 5, 0) << 8;
+            unpacked |= value[valueOff++] & 0xff;
+            unpacked += PackUtil.NEG_2BYTE_MIN;
+            break;
+        case PackUtil.NEG_1BYTE_MARKER & 0xff:
+        case (PackUtil.NEG_1BYTE_MARKER | 0x10) & 0xff:
+        case (PackUtil.NEG_1BYTE_MARKER | 0x20) & 0xff:
+        case (PackUtil.NEG_1BYTE_MARKER | 0x30) & 0xff:
+            unpacked = PackUtil.NEG_1BYTE_MIN +
+                PackUtil.GET_BITS(value[valueOff++], 6, 0);
+            break;
+        case PackUtil.POS_1BYTE_MARKER & 0xff:
+        case (PackUtil.POS_1BYTE_MARKER | 0x10) & 0xff:
+        case (PackUtil.POS_1BYTE_MARKER | 0x20) & 0xff:
+        case (PackUtil.POS_1BYTE_MARKER | 0x30) & 0xff:
+            unpacked = PackUtil.GET_BITS(value[valueOff++], 6, 0);
+            break;
+        case PackUtil.POS_2BYTE_MARKER & 0xff:
+        case (PackUtil.POS_2BYTE_MARKER | 0x10) & 0xff:
+            unpacked = PackUtil.GET_BITS(value[valueOff++], 5, 0) << 8;
+            unpacked |= value[valueOff++] & 0xff;
+
unpacked += PackUtil.POS_1BYTE_MAX + 1; + break; + case PackUtil.POS_MULTI_MARKER & 0xff: + // There are four length bits in the first byte. + len = (value[valueOff++] & 0xf); + + for (unpacked = 0; len != 0; --len) { + unpacked = (unpacked << 8) | value[valueOff++] & 0xff; + } + unpacked += PackUtil.POS_2BYTE_MAX + 1; + break; + default: + throw new WiredTigerPackingException( + "Error decoding packed value."); + } + // Check for overflow if decoding an unsigned value - since Java only + // supports signed values. + if (!signed && unpacked < 0) { + throw new WiredTigerPackingException("Overflow unpacking long."); + } + + return (unpacked); + } +} + diff --git a/lang/java/src/com/wiredtiger/db/PackOutputStream.java b/lang/java/src/com/wiredtiger/db/PackOutputStream.java new file mode 100644 index 00000000000..693f79c3ff2 --- /dev/null +++ b/lang/java/src/com/wiredtiger/db/PackOutputStream.java @@ -0,0 +1,244 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ +package com.wiredtiger.db; + +import java.io.ByteArrayOutputStream; +import java.lang.StringBuffer; +import com.wiredtiger.db.WiredTigerPackingException; + +/** + * An internal helper class for encoding WiredTiger packed values. + * + * Applications should not need to use this class. + */ +public class PackOutputStream { + + final static int MAX_INT_BYTES = 21; + protected PackFormatInputStream format; + protected ByteArrayOutputStream packed; + protected byte[] intBuf; + + /** + * Constructor. + * + * \param format A String that contains the WiredTiger format that + * defines the layout of this packed value. + */ + public PackOutputStream(String format) { + this.format = new PackFormatInputStream(format); + intBuf = new byte[MAX_INT_BYTES]; + packed = new ByteArrayOutputStream(100); + } + + /** + * Returns the raw packing format string. 
+ */ + public String getFormat() { + return format.toString(); + } + + /** + * Returns the current packed value. + */ + public byte[] getValue() { + return packed.toByteArray(); + } + + /** + * Reset the stream position. + */ + public void reset() { + format.reset(); + packed.reset(); + } + + /** + * Add a byte field to the stream. + * + * \param value The byte value to be added. + */ + public void addByte(byte value) + throws WiredTigerPackingException { + format.checkType('b', true); + /* Translate to maintain ordering with the sign bit. */ + byte input = (byte)(value + 0x80); + packed.write(input); + } + + /** + * Add a byte array field to the stream. + * + * \param value The byte array value to be added. + */ + public void addByteArray(byte[] value) + throws WiredTigerPackingException { + this.addByteArray(value, 0, value.length); + } + + /** + * Add a byte array field to the stream. + * + * \param value The byte array value to be added. + * \param off The offset from the start of value to begin using the array. + * \param len The length of the value to encode. + */ + public void addByteArray(byte[] value, int off, int len) + throws WiredTigerPackingException { + format.checkType('U', true); + // If this is not the last item, store the size. + if (format.available() > 0) { + packLong(len, false); + } + + packed.write(value, off, len); + /* TODO: padding. */ + } + + /** + * Add an integer field to the stream. + * + * \param value The integer value to be added. + */ + public void addInt(int value) + throws WiredTigerPackingException { + format.checkType('i', true); + packLong(value, true); + } + + /** + * Add a long field to the stream. + * + * \param value The long value to be added. + */ + public void addLong(long value) + throws WiredTigerPackingException { + format.checkType('q', true); + packLong(value, true); + } + + /** + * Add a record field to the stream. + * + * \param value The record value to be added. 
+ */ + public void addRecord(long value) + throws WiredTigerPackingException { + format.checkType('r', true); + packLong(value, true); + } + + /** + * Add a short field to the stream. + * + * \param value The short value to be added. + */ + public void addShort(short value) + throws WiredTigerPackingException { + format.checkType('h', true); + packLong(value, true); + } + + /** + * Add a string field to the stream. + * + * \param value The string value to be added. + */ + public void addString(String value) + throws WiredTigerPackingException { + format.checkType('s', false); + char fieldFormat = format.getType(); + int stringLen = 0; + int padBytes = 0; + // Strings have two possible encodings. A lower case 's' is not null + // terminated, and has a length define in the format (default 1). An + // upper case 'S' is variable length and has a null terminator. + if (fieldFormat == 's') { + stringLen = format.getLengthFromFormat(true); + if (stringLen > value.length()) { + padBytes = stringLen - value.length(); + } + } else { + stringLen = value.length(); + padBytes = 1; // Null terminator + } + // We're done pulling information from the field now. + format.consume(); + + // Use the default Charset. + packed.write(value.getBytes(), 0, stringLen); + while(padBytes-- > 0) { + packed.write(0); + } + } + + /** + * Add a long field to the stream. + * The packing format is defined in the WiredTiger C integer packing + * implementation, which is at src/include/intpack.i + * + * \param x The long value to be added. + * \param signed Whether the value is signed or unsigned. 
+ */ + private void packLong(long x, boolean signed) + throws WiredTigerPackingException { + int offset = 0; + + if (!signed && x < 0) { + throw new WiredTigerPackingException("Overflow packing long."); + } + + if (x < PackUtil.NEG_2BYTE_MIN) { + intBuf[offset] = PackUtil.NEG_MULTI_MARKER; + int lz = Long.numberOfLeadingZeros(~x) / 8; + int len = PackUtil.SIZEOF_LONG - lz; + + // + // There are four size bits we can use in the first + // byte. For negative numbers, we store the number of + // leading 0xff byes to maintain ordering (if this is + // not obvious, it may help to remember that -1 is the + // largest negative number). + intBuf[offset++] |= (lz & 0xf); + + for (int shift = (len - 1) << 3; + len != 0; shift -= 8, --len) { + intBuf[offset++] = (byte)(x >> shift); + } + } else if (x < PackUtil.NEG_1BYTE_MIN) { + x -= PackUtil.NEG_2BYTE_MIN; + intBuf[offset++] = + (byte)(PackUtil.NEG_2BYTE_MARKER | PackUtil.GET_BITS(x, 13, 8)); + intBuf[offset++] = PackUtil.GET_BITS(x, 8, 0); + } else if (x < 0) { + x -= PackUtil.NEG_1BYTE_MIN; + intBuf[offset++] = + (byte)(PackUtil.NEG_1BYTE_MARKER | PackUtil.GET_BITS(x, 6, 0)); + } else if (x <= PackUtil.POS_1BYTE_MAX) { + intBuf[offset++] = + (byte)(PackUtil.POS_1BYTE_MARKER | PackUtil.GET_BITS(x, 6, 0)); + } else if (x <= PackUtil.POS_2BYTE_MAX) { + x -= PackUtil.POS_1BYTE_MAX + 1; + intBuf[offset++] = + (byte)(PackUtil.POS_2BYTE_MARKER | PackUtil.GET_BITS(x, 13, 8)); + intBuf[offset++] = PackUtil.GET_BITS(x, 8, 0); + } else { + x -= PackUtil.POS_2BYTE_MAX + 1; + intBuf[offset] = PackUtil.POS_MULTI_MARKER; + int lz = Long.numberOfLeadingZeros(x) / 8; + int len = PackUtil.SIZEOF_LONG - lz; + + // There are four bits we can use in the first byte. 
+ intBuf[offset++] |= (len & 0xf); + + for (int shift = (len - 1) << 3; + len != 0; --len, shift -= 8) { + intBuf[offset++] = (byte)(x >> shift); + } + } + packed.write(intBuf, 0, offset); + } +} diff --git a/lang/java/src/com/wiredtiger/db/PackUtil.java b/lang/java/src/com/wiredtiger/db/PackUtil.java new file mode 100644 index 00000000000..b82e0294073 --- /dev/null +++ b/lang/java/src/com/wiredtiger/db/PackUtil.java @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ +package com.wiredtiger.db; + +import java.lang.String; + +/** + * An internal helper class with utilities for packing and unpacking values. + * + * Applications should not need to use this class. + */ +class PackUtil { + /* Contants. */ + final static byte NEG_MULTI_MARKER = (byte)0x10; + final static byte NEG_2BYTE_MARKER = (byte)0x20; + final static byte NEG_1BYTE_MARKER = (byte)0x40; + final static byte POS_1BYTE_MARKER = (byte)0x80; + final static byte POS_2BYTE_MARKER = (byte)0xc0; + final static byte POS_MULTI_MARKER = (byte)0xe0; + + final static int NEG_1BYTE_MIN = ((-1) << 6); + final static int NEG_2BYTE_MIN = (((-1) << 13) + NEG_1BYTE_MIN); + final static int POS_1BYTE_MAX = ((1 << 6) - 1); + final static int POS_2BYTE_MAX = ((1 << 13) + POS_1BYTE_MAX); + + // See: http://docs.python.org/2/library/struct.html for an explanation + // of what these special characters mean. + // TODO: Care about byte ordering and padding in packed formats. + final static String PackSpecialCharacters = "@=<>!x"; + + final static int SIZEOF_LONG = 8; + + /** + * Extract bits from a value, counting from LSB == 0. + * + * \param x The value to extract bits from. + * \param start The first bit to extract. + * \param end The last bit to extract. 
+ */ + public static byte GET_BITS(long x, int start, int end) { + return (byte)((x & ((1 << start) - 1)) >> end); + } + + +} diff --git a/lang/java/src/com/wiredtiger/db/WiredTigerException.java b/lang/java/src/com/wiredtiger/db/WiredTigerException.java new file mode 100644 index 00000000000..6424cb3a92e --- /dev/null +++ b/lang/java/src/com/wiredtiger/db/WiredTigerException.java @@ -0,0 +1,19 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ +package com.wiredtiger.db; + +/** + * An exception that is generated by the WiredTiger application. + */ +public class WiredTigerException extends Exception { + /** + * Constructor. + */ + public WiredTigerException(String msg) { + super(msg); + } +} diff --git a/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java b/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java new file mode 100644 index 00000000000..1c4ab079748 --- /dev/null +++ b/lang/java/src/com/wiredtiger/db/WiredTigerPackingException.java @@ -0,0 +1,21 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ +package com.wiredtiger.db; + +/** + * An exception that is generated by the WiredTiger application during + * encoding or decoding of packed values. + */ +public class WiredTigerPackingException extends WiredTigerException { + /** + * Constructor. + */ + public WiredTigerPackingException(String msg) { + super(msg); + } +} + diff --git a/lang/java/wiredtiger.i b/lang/java/wiredtiger.i new file mode 100644 index 00000000000..f93a45c2581 --- /dev/null +++ b/lang/java/wiredtiger.i @@ -0,0 +1,823 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + * + * wiredtiger.i + * The SWIG interface file defining the wiredtiger Java API. 
+ */ + +%module wiredtiger + +%include "enums.swg" +%include "typemaps.i" + +%pragma(java) jniclasscode=%{ + static { + try { + System.loadLibrary("wiredtiger_java"); + } catch (UnsatisfiedLinkError e) { + System.err.println("Native code library failed to load. \n" + e); + System.exit(1); + } + } +%} + +%{ +typedef int bool; + +static void throwWiredTigerException(JNIEnv *jenv, const char *msg) { + jclass excep = (*jenv)->FindClass(jenv, "com/wiredtiger/db/WiredTigerException"); + if (excep) + (*jenv)->ThrowNew(jenv, excep, msg); +} +%} + +/* No finalizers */ +%typemap(javafinalize) SWIGTYPE "" + +/* Event handlers are not supported in Java. */ +%typemap(in, numinputs=0) WT_EVENT_HANDLER * %{ $1 = NULL; %} + +/* Allow silently passing the Java object and JNIEnv into our code. */ +%typemap(in, numinputs=0) jobject *jthis %{ $1 = jarg1_; %} +%typemap(in, numinputs=0) JNIEnv * %{ $1 = jenv; %} + +/* 64 bit typemaps. */ +%typemap(jni) uint64_t "jlong" +%typemap(jtype) uint64_t "long" +%typemap(jstype) uint64_t "long" + +%typemap(javain) uint64_t "$javainput" +%typemap(javaout) uint64_t { + return $jnicall; +} + +/* Return byte[] from cursor.get_value */ +%typemap(jni) WT_ITEM * "jbyteArray" +%typemap(jtype) WT_ITEM * "byte[]" +%typemap(jstype) WT_ITEM * "byte[]" + +%typemap(javain) WT_ITEM * "$javainput" +%typemap(javaout) WT_ITEM * { + return $jnicall; +} + +%typemap(in) WT_ITEM * (WT_ITEM item) %{ + $1 = &item; + $1->data = (*jenv)->GetByteArrayElements(jenv, $input, 0); + $1->size = (*jenv)->GetArrayLength(jenv, $input); +%} + +%typemap(argout) WT_ITEM * %{ + (*jenv)->ReleaseByteArrayElements(jenv, $input, $1->data, 0); +%} + +%typemap(out) WT_ITEM * %{ + if ($1 == NULL) + $result = NULL; + else if (($result = (*jenv)->NewByteArray(jenv, $1->size)) != NULL) { + (*jenv)->SetByteArrayRegion(jenv, + $result, 0, $1->size, $1->data); + } +%} + +/* Don't require empty config strings. 
*/ +%typemap(default) const char *config %{ $1 = NULL; %} + +%typemap(out) int %{ + if ($1 != 0 && $1 != WT_NOTFOUND) { + throwWiredTigerException(jenv, wiredtiger_strerror($1)); + return $null; + } + $result = $1; +%} + +/* + * Extra 'self' elimination. + * The methods we're wrapping look like this: + * struct __wt_xxx { + * int method(WT_XXX *, ...otherargs...); + * }; + * To SWIG, that is equivalent to: + * int method(struct __wt_xxx *self, WT_XXX *, ...otherargs...); + * and we use consecutive argument matching of typemaps to convert two args to + * one. + */ +%define WT_CLASS(type, class, name) +%typemap(in, numinputs=0) type *name "$1 = *(type **)&jarg1;" +%typemap(javaimports) type " +/** + * @copydoc class + * @ingroup wt_java + */" +%enddef + +%pragma(java) moduleimports=%{ +/** + * @defgroup wt_java WiredTiger Java API + * + * Java wrappers around the WiredTiger C API. + */ + +/** + * @ingroup wt_java + */ +%} + +WT_CLASS(struct __wt_connection, WT_CONNECTION, connection) +WT_CLASS(struct __wt_session, WT_SESSION, session) +WT_CLASS(struct __wt_cursor, WT_CURSOR, cursor) + +%define COPYDOC(SIGNATURE_CLASS, CLASS, METHOD) +%javamethodmodifiers SIGNATURE_CLASS::METHOD " + /** + * @copydoc CLASS::METHOD + */ + public "; +%enddef + +%include "java_doc.i" + +/* WT_CURSOR customization. */ +/* First, replace the varargs get / set methods with Java equivalents. 
%extend __wt_cursor {

	/* Fetch the current key; raises a Java exception and returns NULL on error. */
	%javamethodmodifiers get_key_wrap "protected";
	WT_ITEM *get_key_wrap(JNIEnv *jenv) {
		WT_ITEM k;
		int ret;
		if ((ret = $self->get_key($self, &k)) != 0) {
			throwWiredTigerException(jenv, wiredtiger_strerror(ret));
			return NULL;
		}
		return &$self->key;
	}

	/* Fetch the current value; raises a Java exception and returns NULL on error. */
	%javamethodmodifiers get_value_wrap "protected";
	WT_ITEM *get_value_wrap(JNIEnv *jenv) {
		WT_ITEM v;
		int ret;
		if ((ret = $self->get_value($self, &v)) != 0) {
			throwWiredTigerException(jenv, wiredtiger_strerror(ret));
			return NULL;
		}
		return &$self->value;
	}

	/* Set key and value from the packed items, then insert. */
	%javamethodmodifiers insert_wrap "protected";
	int insert_wrap(WT_ITEM *k, WT_ITEM *v) {
		$self->set_key($self, k);
		$self->set_value($self, v);
		return $self->insert($self);
	}

	/* Set the key from the packed item, then remove. */
	%javamethodmodifiers remove_wrap "protected";
	int remove_wrap(WT_ITEM *k) {
		$self->set_key($self, k);
		return $self->remove($self);
	}

	/* Set the key from the packed item, then search for an exact match. */
	%javamethodmodifiers search_wrap "protected";
	int search_wrap(WT_ITEM *k) {
		$self->set_key($self, k);
		return $self->search($self);
	}

	/*
	 * Set the key, then search for the nearest record. The comparison
	 * result is folded into a SearchStatus; errors other than
	 * WT_NOTFOUND raise a Java exception and report NOTFOUND.
	 */
	%javamethodmodifiers search_near_wrap "protected";
	enum SearchStatus search_near_wrap(JNIEnv *jenv, WT_ITEM *k) {
		int cmp = 0, ret;

		$self->set_key($self, k);
		ret = $self->search_near($self, &cmp);
		if (ret != 0 && ret != WT_NOTFOUND)
			throwWiredTigerException(jenv, wiredtiger_strerror(ret));
		if (ret == 0)
			return (cmp == 0 ? FOUND : cmp < 0 ? SMALLER : LARGER);
		return (NOTFOUND);
	}

	/* Set key and value from the packed items, then update. */
	%javamethodmodifiers update_wrap "protected";
	int update_wrap(WT_ITEM *k, WT_ITEM *v) {
		$self->set_key($self, k);
		$self->set_value($self, v);
		return $self->update($self);
	}

	/*
	 * Compare this cursor's position with another cursor's. On failure a
	 * Java exception is raised; cmp is initialized so the value returned
	 * alongside it is defined rather than read from an unset local.
	 */
	int compare_wrap(JNIEnv *jenv, WT_CURSOR *other) {
		int cmp = 0;
		int ret = $self->compare($self, other, &cmp);
		if (ret != 0)
			throwWiredTigerException(jenv, wiredtiger_strerror(ret));
		return cmp;
	}
}
+ */ + public Cursor putKeyByte(byte value) + throws WiredTigerPackingException { + keyPacker.addByte(value); + return this; + } + + /** + * Append a byte array to the cursor's key. + * + * \param value The value to append. + * \return This cursor object, so put calls can be chained. + */ + public Cursor putKeyByteArray(byte[] value) + throws WiredTigerPackingException { + this.putKeyByteArray(value, 0, value.length); + return this; + } + + /** + * Append a byte array to the cursor's key. + * + * \param value The value to append. + * \param off The offset into value at which to start. + * \param len The length of the byte array. + * \return This cursor object, so put calls can be chained. + */ + public Cursor putKeyByteArray(byte[] value, int off, int len) + throws WiredTigerPackingException { + keyPacker.addByteArray(value, off, len); + return this; + } + + /** + * Append an integer to the cursor's key. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putKeyInt(int value) + throws WiredTigerPackingException { + keyPacker.addInt(value); + return this; + } + + /** + * Append a long to the cursor's key. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putKeyLong(long value) + throws WiredTigerPackingException { + keyPacker.addLong(value); + return this; + } + + /** + * Append a short integer to the cursor's key. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putKeyShort(short value) + throws WiredTigerPackingException { + keyPacker.addShort(value); + return this; + } + + /** + * Append a string to the cursor's key. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. 
+ */ + public Cursor putKeyString(String value) + throws WiredTigerPackingException { + keyPacker.addString(value); + return this; + } + + /** + * Append a byte to the cursor's value. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putValueByte(byte value) + throws WiredTigerPackingException { + valuePacker.addByte(value); + return this; + } + + /** + * Append a byte array to the cursor's value. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putValueByteArray(byte[] value) + throws WiredTigerPackingException { + this.putValueByteArray(value, 0, value.length); + return this; + } + + /** + * Append a byte array to the cursor's value. + * + * \param value The value to append + * \param off The offset into value at which to start. + * \param len The length of the byte array. + * \return This cursor object, so put calls can be chained. + */ + public Cursor putValueByteArray(byte[] value, int off, int len) + throws WiredTigerPackingException { + valuePacker.addByteArray(value, off, len); + return this; + } + + /** + * Append an integer to the cursor's value. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putValueInt(int value) + throws WiredTigerPackingException { + valuePacker.addInt(value); + return this; + } + + /** + * Append a long to the cursor's value. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putValueLong(long value) + throws WiredTigerPackingException { + valuePacker.addLong(value); + return this; + } + + /** + * Append a short integer to the cursor's value. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. 
+ */ + public Cursor putValueShort(short value) + throws WiredTigerPackingException { + valuePacker.addShort(value); + return this; + } + + /** + * Append a string to the cursor's value. + * + * \param value The value to append + * \return This cursor object, so put calls can be chained. + */ + public Cursor putValueString(String value) + throws WiredTigerPackingException { + valuePacker.addString(value); + return this; + } + + /** + * Retrieve a byte from the cursor's key. + * + * \return The requested value. + */ + public byte getKeyByte() + throws WiredTigerPackingException { + return keyUnpacker.getByte(); + } + + /** + * Retrieve a byte array from the cursor's key. + * + * \param output The byte array where the returned value will be stored. + * The array should be large enough to store the entire + * data item, if not a truncated value will be returned. + */ + public void getKeyByteArray(byte[] output) + throws WiredTigerPackingException { + this.getKeyByteArray(output, 0, output.length); + } + + /** + * Retrieve a byte array from the cursor's key. + * + * \param output The byte array where the returned value will be stored. + * \param off Offset into the destination buffer to start copying into. + * \param len The length should be large enough to store the entire + * data item, if not a truncated value will be returned. + */ + public void getKeyByteArray(byte[] output, int off, int len) + throws WiredTigerPackingException { + keyUnpacker.getByteArray(output, off, len); + } + + /** + * Retrieve a byte array from the cursor's key. + * + * \return The requested value. + */ + public byte[] getKeyByteArray() + throws WiredTigerPackingException { + return keyUnpacker.getByteArray(); + } + + /** + * Retrieve an integer from the cursor's key. + * + * \return The requested value. + */ + public int getKeyInt() + throws WiredTigerPackingException { + return keyUnpacker.getInt(); + } + + /** + * Retrieve a long from the cursor's key. + * + * \return The requested value. 
+ */ + public long getKeyLong() + throws WiredTigerPackingException { + return keyUnpacker.getLong(); + } + + /** + * Retrieve a short integer from the cursor's key. + * + * \return The requested value. + */ + public short getKeyShort() + throws WiredTigerPackingException { + return keyUnpacker.getShort(); + } + + /** + * Retrieve a string from the cursor's key. + * + * \return The requested value. + */ + public String getKeyString() + throws WiredTigerPackingException { + return keyUnpacker.getString(); + } + + /** + * Retrieve a byte from the cursor's value. + * + * \return The requested value. + */ + public byte getValueByte() + throws WiredTigerPackingException { + return valueUnpacker.getByte(); + } + + /** + * Retrieve a byte array from the cursor's value. + * + * \param output The byte array where the returned value will be stored. + * The array should be large enough to store the entire + * data item, if not a truncated value will be returned. + */ + public void getValueByteArray(byte[] output) + throws WiredTigerPackingException { + this.getValueByteArray(output, 0, output.length); + } + + /** + * Retrieve a byte array from the cursor's value. + * + * \param output The byte array where the returned value will be stored. + * \param off Offset into the destination buffer to start copying into. + * \param len The length should be large enough to store the entire + * data item, if not a truncated value will be returned. + */ + public void getValueByteArray(byte[] output, int off, int len) + throws WiredTigerPackingException { + valueUnpacker.getByteArray(output, off, len); + } + + /** + * Retrieve a byte array from the cursor's value. + * + * \return The requested value. + */ + public byte[] getValueByteArray() + throws WiredTigerPackingException { + return valueUnpacker.getByteArray(); + } + + /** + * Retrieve an integer from the cursor's value. + * + * \return The requested value. 
+ */ + public int getValueInt() + throws WiredTigerPackingException { + return valueUnpacker.getInt(); + } + + /** + * Retrieve a long from the cursor's value. + * + * \return The requested value. + */ + public long getValueLong() + throws WiredTigerPackingException { + return valueUnpacker.getLong(); + } + + /** + * Retrieve a short integer from the cursor's value. + * + * \return The requested value. + */ + public short getValueShort() + throws WiredTigerPackingException { + return valueUnpacker.getShort(); + } + + /** + * Retrieve a string from the cursor's value. + * + * \return The requested value. + */ + public String getValueString() + throws WiredTigerPackingException { + return valueUnpacker.getString(); + } + + /** + * Insert the cursor's current key/value into the table. + * + * \return The status of the operation. + */ + public int insert() { + byte[] key = keyPacker.getValue(); + byte[] value = valuePacker.getValue(); + keyPacker.reset(); + valuePacker.reset(); + return insert_wrap(key, value); + } + + /** + * Update the cursor's current key/value into the table. + * + * \return The status of the operation. + */ + public int update() { + byte[] key = keyPacker.getValue(); + byte[] value = valuePacker.getValue(); + keyPacker.reset(); + valuePacker.reset(); + return update_wrap(key, value); + } + + /** + * Remove the cursor's current key/value into the table. + * + * \return The status of the operation. + */ + public int remove() { + byte[] key = keyPacker.getValue(); + keyPacker.reset(); + return remove_wrap(key); + } + + /** + * Compare this cursor's position to another Cursor. + * + * \return The result of the comparison. + */ + public int compare(Cursor other) { + return compare_wrap(other); + } + + /** + * Retrieve the next item in the table. + * + * \return The result of the comparison. + */ + public int next() { + int ret = next_wrap(); + keyPacker.reset(); + valuePacker.reset(); + keyUnpacker = (ret == 0) ? 
+ new PackInputStream(keyFormat, get_key_wrap()) : null; + valueUnpacker = (ret == 0) ? + new PackInputStream(valueFormat, get_value_wrap()) : null; + return ret; + } + + /** + * Retrieve the previous item in the table. + * + * \return The result of the comparison. + */ + public int prev() { + int ret = prev_wrap(); + keyPacker.reset(); + valuePacker.reset(); + keyUnpacker = (ret == 0) ? + new PackInputStream(keyFormat, get_key_wrap()) : null; + valueUnpacker = (ret == 0) ? + new PackInputStream(valueFormat, get_value_wrap()) : null; + return ret; + } + + /** + * Search for an item in the table. + * + * \return The result of the comparison. + */ + public int search() { + int ret = search_wrap(keyPacker.getValue()); + keyPacker.reset(); + valuePacker.reset(); + keyUnpacker = (ret == 0) ? + new PackInputStream(keyFormat, get_key_wrap()) : null; + valueUnpacker = (ret == 0) ? + new PackInputStream(valueFormat, get_value_wrap()) : null; + return ret; + } + + /** + * Search for an item in the table. + * + * \return The result of the comparison. + */ + public SearchStatus search_near() { + SearchStatus ret = search_near_wrap(keyPacker.getValue()); + keyPacker.reset(); + valuePacker.reset(); + keyUnpacker = (ret != SearchStatus.NOTFOUND) ? + new PackInputStream(keyFormat, get_key_wrap()) : null; + valueUnpacker = (ret != SearchStatus.NOTFOUND) ? + new PackInputStream(valueFormat, get_value_wrap()) : null; + return ret; + } +%} + +/* Remove / rename parts of the C API that we don't want in Java. 
*/ +%immutable __wt_cursor::session; +%immutable __wt_cursor::uri; +%immutable __wt_cursor::key_format; +%immutable __wt_cursor::value_format; +%immutable __wt_session::connection; + +%ignore __wt_collator; +%ignore __wt_connection::add_collator; +%ignore __wt_compressor; +%ignore __wt_connection::add_compressor; +%ignore __wt_data_source; +%ignore __wt_connection::add_data_source; +%ignore __wt_event_handler; +%ignore __wt_extractor; +%ignore __wt_connection::add_extractor; +%ignore __wt_item; +%ignore __wt_session::msg_printf; + +%ignore wiredtiger_struct_pack; +%ignore wiredtiger_struct_packv; +%ignore wiredtiger_struct_size; +%ignore wiredtiger_struct_sizev; +%ignore wiredtiger_struct_unpack; +%ignore wiredtiger_struct_unpackv; + +%ignore wiredtiger_version; + +%ignore wiredtiger_extension_init; + +%ignore wiredtiger_open; +%javamethodmodifiers wiredtiger_open_wrap " + /** + * @copydoc ::wiredtiger_open + */ + public "; + +%rename(open) wiredtiger_open_wrap; +%ignore __wt_connection::open_session; +%rename(open_session) __wt_connection::open_session_wrap; +%ignore __wt_session::open_cursor; +%javamethodmodifiers __wt_session::open_cursor_wrap " + /** + * @copydoc WT_SESSION::open_cursor + */ + public "; +%rename(open_cursor) __wt_session::open_cursor_wrap; + +%rename(Cursor) __wt_cursor; +%rename(Session) __wt_session; +%rename(Connection) __wt_connection; + +%include "wiredtiger.h" + +/* Return new connections, sessions and cursors. 
*/ +%inline { +WT_CONNECTION *wiredtiger_open_wrap(JNIEnv *jenv, const char *home, const char *config) { + WT_CONNECTION *conn = NULL; + int ret; + if ((ret = wiredtiger_open(home, NULL, config, &conn)) != 0) + throwWiredTigerException(jenv, wiredtiger_strerror(ret)); + return conn; +} +} + +%extend __wt_connection { + WT_SESSION *open_session_wrap(JNIEnv *jenv, const char *config) { + WT_SESSION *session = NULL; + int ret; + if ((ret = $self->open_session($self, NULL, config, &session)) != 0) + throwWiredTigerException(jenv, wiredtiger_strerror(ret)); + return session; + } +} + +%extend __wt_session { + WT_CURSOR *open_cursor_wrap(JNIEnv *jenv, const char *uri, WT_CURSOR *to_dup, const char *config) { + WT_CURSOR *cursor = NULL; + int ret; + if ((ret = $self->open_cursor($self, uri, to_dup, config, &cursor)) != 0) + throwWiredTigerException(jenv, wiredtiger_strerror(ret)); + else + cursor->flags |= WT_CURSTD_RAW; + return cursor; + } +} diff --git a/lang/python/setup.py b/lang/python/setup.py index 995175d6ad2..5634fe7744f 100644 --- a/lang/python/setup.py +++ b/lang/python/setup.py @@ -15,16 +15,8 @@ if not 'ARCHFLAGS' in os.environ: # Suppress warnings building SWIG generated code extra_cflags = [ - '-Wno-unused-value', + '-Wno-error', ] -if sys.platform == 'darwin': - kernel_version = os.uname()[2] # e.g. 
12.0.0 is Mountain Lion - major_version = int(kernel_version.split('.')[0]) - if major_version >= 12: - extra_cflags += [ - '-Wno-self-assign', - '-Qunused-arguments', - ] dir = os.path.dirname(__file__) diff --git a/lang/python/wiredtiger.i b/lang/python/wiredtiger.i index a6a8ed532ae..8c381d5aae5 100644 --- a/lang/python/wiredtiger.i +++ b/lang/python/wiredtiger.i @@ -442,8 +442,6 @@ typedef int int_void; %include "wiredtiger.h" %pythoncode %{ -## @} - class stat: '''keys for statistics cursors''' @@ -455,6 +453,8 @@ class stat: '''keys for cursors on data source statistics''' pass +## @} + import sys # All names starting with 'WT_STAT_DSRC_' are renamed to # the wiredtiger.stat.dsrc class, those starting with 'WT_STAT_CONN' are diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index b2535110a36..61728cfad15 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -209,17 +209,7 @@ __wt_block_checkpoint(WT_SESSION_IMPL *session, data_cksum, 0)); /* Process the checkpoint list, deleting and updating as required. */ - WT_RET(__ckpt_process(session, block, ckptbase)); - - /* - * Checkpoints have to hit disk (it would be reasonable to configure for - * lazy checkpoints, but we don't support them yet). Regardless, we're - * not holding any locks, other writers can proceed while we wait. - */ - if (F_ISSET(S2C(session), WT_CONN_SYNC)) - WT_RET(__wt_fsync(session, block->fh)); - - return (0); + return (__ckpt_process(session, block, ckptbase)); } /* @@ -275,7 +265,7 @@ __ckpt_extlist_fblocks( * file that contains a previous checkpoint's extents. */ return (__wt_block_insert_ext( - session, &block->live.ckpt_avail, el->offset, el->size)); + session, block, &block->live.ckpt_avail, el->offset, el->size)); } /* @@ -419,7 +409,7 @@ __ckpt_process( * must be paired in the checkpoint. 
*/ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) - WT_ERR(__wt_block_insert_ext(session, + WT_ERR(__wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size)); /* @@ -436,10 +426,10 @@ __ckpt_process( */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( - session, &a->alloc, &b->alloc)); + session, block, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( - session, &a->discard, &b->discard)); + session, block, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with @@ -576,12 +566,12 @@ __ckpt_update( alloc = &block->live.alloc; WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL)); if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_block_off_remove_overlap( - session, alloc, ci->alloc.offset, ci->alloc.size)); + WT_RET(__wt_block_off_remove_overlap(session, + block, alloc, ci->alloc.offset, ci->alloc.size)); WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL)); if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_block_off_remove_overlap( - session, alloc, ci->discard.offset, ci->discard.size)); + WT_RET(__wt_block_off_remove_overlap(session, + block, alloc, ci->discard.offset, ci->discard.size)); /* * We only write an avail list for the live system, other checkpoint's @@ -599,8 +589,8 @@ __ckpt_update( WT_RET(__wt_block_extlist_write( session, block, &ci->avail, &ci->ckpt_avail)); if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_block_off_remove_overlap( - session, alloc, ci->avail.offset, ci->avail.size)); + WT_RET(__wt_block_off_remove_overlap(session, + block, alloc, ci->avail.offset, ci->avail.size)); } /* @@ -679,7 +669,8 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block) * available list. 
*/ __wt_spin_lock(session, &block->live_lock); - ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail); + ret = __wt_block_extlist_merge( + session, block, &ci->ckpt_avail, &ci->avail); __wt_spin_unlock(session, &block->live_lock); /* Discard the list. */ diff --git a/src/block/block_ext.c b/src/block/block_ext.c index 7758730f57a..204fd418c81 100644 --- a/src/block/block_ext.c +++ b/src/block/block_ext.c @@ -11,7 +11,71 @@ static int __block_ext_overlap(WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **); static int __block_extlist_dump( WT_SESSION_IMPL *, const char *, WT_EXTLIST *, int); -static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, off_t, off_t); +static int __block_merge( + WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, off_t, off_t); + +/* + * __block_ext_alloc -- + * Return a cached WT_EXT structure, or allocate one if none cached. + */ +static inline int +__block_ext_alloc(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXT **extp) +{ + u_int skipdepth; + + *extp = NULL; + + /* + * Select (and set) the WT_EXT structure's depth because we have to + * know how deep the skiplist goes at the entry to allocate it. + */ + if (block->free_ext == NULL) { + skipdepth = __wt_skip_choose_depth(); + WT_RET(__wt_calloc(session, 1, + sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), extp)); + (*extp)->depth = (uint8_t)skipdepth; + } else { + --block->free_ext_cnt; + + (*extp) = block->free_ext; + block->free_ext = block->free_ext->next[0]; + } + return (0); +} + +/* + * __block_ext_free -- + * Add an EXT structure to the cached list, or free it if enough cached. + */ +static inline void +__block_ext_free(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXT *ext) +{ + if (block->free_ext_cnt >= 100) + __wt_free(session, ext); + else { + ++block->free_ext_cnt; + + ext->next[0] = block->free_ext; + block->free_ext = ext; + } +} + +/* + * __wt_block_ext_cleanup -- + * Discard any cached structures from the list. 
+ */ +void +__wt_block_ext_cleanup(WT_SESSION_IMPL *session, WT_BLOCK *block) +{ + WT_EXT *ext, *next; + + for (ext = block->free_ext; ext != NULL; ext = next) { + --block->free_ext_cnt; + next = ext->next[0]; + __wt_free(session, ext); + } + WT_ASSERT(session, block->free_ext_cnt == 0); +} /* * __block_off_srch -- @@ -183,20 +247,16 @@ __block_ext_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext) * Insert a file range into an extent list. */ static int -__block_off_insert( - WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size) +__block_off_insert(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size) { WT_EXT *ext; - u_int skipdepth; - /* Allocate a new WT_EXT structure. */ - skipdepth = __wt_skip_choose_depth(); - WT_RET(__wt_calloc(session, 1, - sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext)); + WT_RET(__block_ext_alloc(session, block, &ext)); ext->off = off; ext->size = size; - ext->depth = (uint8_t)skipdepth; + return (__block_ext_insert(session, el, ext)); } @@ -247,11 +307,12 @@ __wt_block_misplaced(WT_SESSION_IMPL *session, else if (__block_off_match(&block->live.discard, offset, size)) name = "discard"; __wt_spin_unlock(session, &block->live_lock); - if (name != NULL) - WT_RET_MSG(session, WT_ERROR, + if (name != NULL) { + __wt_errx(session, "%s failed: %" PRIuMAX "/%" PRIu32 " is on the %s list", tag, (uintmax_t)offset, size, name); - + return (__wt_panic(session)); + } return (0); } #endif @@ -261,8 +322,8 @@ __wt_block_misplaced(WT_SESSION_IMPL *session, * Remove a record from an extent list. */ static int -__block_off_remove( - WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, WT_EXT **extp) +__block_off_remove(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *el, off_t off, WT_EXT **extp) { WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH]; WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH]; @@ -301,7 +362,7 @@ __block_off_remove( /* Return the record if our caller wants it, otherwise free it. 
*/ if (extp == NULL) - __wt_free(session, ext); + __block_ext_free(session, block, ext); else *extp = ext; @@ -318,8 +379,8 @@ corrupt: * overlapping entry. */ int -__wt_block_off_remove_overlap( - WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size) +__wt_block_off_remove_overlap(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size) { WT_EXT *before, *after, *ext; off_t a_off, a_size, b_off, b_size; @@ -331,7 +392,8 @@ __wt_block_off_remove_overlap( /* If "before" or "after" overlaps, retrieve the overlapping entry. */ if (before != NULL && before->off + before->size > off) { - WT_RET(__block_off_remove(session, el, before->off, &ext)); + WT_RET( + __block_off_remove(session, block, el, before->off, &ext)); /* Calculate overlapping extents. */ a_off = ext->off; @@ -339,7 +401,8 @@ __wt_block_off_remove_overlap( b_off = off + size; b_size = ext->size - (a_size + size); } else if (after != NULL && off + size > after->off) { - WT_RET(__block_off_remove(session, el, after->off, &ext)); + WT_RET( + __block_off_remove(session, block, el, after->off, &ext)); /* * Calculate overlapping extents. There's no initial overlap @@ -364,7 +427,8 @@ __wt_block_off_remove_overlap( } if (b_size != 0) { if (ext == NULL) - WT_RET(__block_off_insert(session, el, b_off, b_size)); + WT_RET(__block_off_insert( + session, block, el, b_off, b_size)); else { ext->off = b_off; ext->size = b_size; @@ -373,7 +437,7 @@ __wt_block_off_remove_overlap( } } if (ext != NULL) - __wt_free(session, ext); + __block_ext_free(session, block, ext); return (0); } @@ -455,7 +519,8 @@ __wt_block_alloc( /* Remove the first record, and set the returned offset. */ ext = szp->off[0]; - WT_RET(__block_off_remove(session, &block->live.avail, ext->off, &ext)); + WT_RET(__block_off_remove( + session, block, &block->live.avail, ext->off, &ext)); *offp = ext->off; /* If doing a partial allocation, adjust the record and put it back. 
*/ @@ -476,11 +541,12 @@ __wt_block_alloc( "allocate range %" PRIdMAX "-%" PRIdMAX, (intmax_t)ext->off, (intmax_t)(ext->off + ext->size)); - __wt_free(session, ext); + __block_ext_free(session, block, ext); } done: /* Add the newly allocated extent to the list of allocations. */ - WT_RET(__block_merge(session, &block->live.alloc, *offp, (off_t)size)); + WT_RET(__block_merge( + session, block, &block->live.alloc, *offp, (off_t)size)); return (0); } @@ -536,12 +602,12 @@ __wt_block_off_free( * list. */ if ((ret = __wt_block_off_remove_overlap( - session, &block->live.alloc, offset, size)) == 0) + session, block, &block->live.alloc, offset, size)) == 0) ret = __block_merge( - session, &block->live.avail, offset, (off_t)size); + session, block, &block->live.avail, offset, (off_t)size); else if (ret == WT_NOTFOUND) ret = __block_merge( - session, &block->live.discard, offset, (off_t)size); + session, block, &block->live.discard, offset, (off_t)size); return (ret); } @@ -637,7 +703,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * We can think of the overlap possibilities as 11 different cases: * * AAAAAAAAAAAAAAAAAA - * #1 BBBBBBBBBBBBBBBBBB ranges are are the same + * #1 BBBBBBBBBBBBBBBBBB ranges are the same * #2 BBBBBBBBBBBBB overlaps the beginning * #3 BBBBBBBBBBBBBBBB overlaps the end * #4 BBBBB B is a prefix of A @@ -658,7 +724,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * eliminate cases #2, #8, #10 and #11, and only handle 7 cases: * * AAAAAAAAAAAAAAAAAA - * #1 BBBBBBBBBBBBBBBBBB ranges are are the same + * #1 BBBBBBBBBBBBBBBBBB ranges are the same * #3 BBBBBBBBBBBBBBBB overlaps the end * #4 BBBBB B is a prefix of A * #5 BBBBBB B is middle of A @@ -688,9 +754,12 @@ __block_ext_overlap(WT_SESSION_IMPL *session, */ *ap = (*ap)->next[0]; *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, ael, a->off, NULL)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge( + 
session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove( + session, block, ael, a->off, NULL)); + WT_RET(__block_off_remove( + session, block, bel, b->off, NULL)); } else if (a->size > b->size) { /* Case #4 */ /* @@ -698,7 +767,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Increment/Decrement A's offset/size by the size of B * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove( + session, block, ael, a->off, &a)); a->off += b->size; a->size -= b->size; WT_RET(__block_ext_insert(session, ael, a)); @@ -709,15 +779,18 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete B */ *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge( + session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove( + session, block, bel, b->off, NULL)); } else { /* Case #9 */ /* * Remove B from its list * Increment/Decrement B's offset/size by the size of A * Insert B on its list */ - WT_RET(__block_off_remove(session, bel, b->off, &b)); + WT_RET(__block_off_remove( + session, block, bel, b->off, &b)); b->off += a->size; b->size -= a->size; WT_RET(__block_ext_insert(session, bel, b)); @@ -728,8 +801,10 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete A */ *ap = (*ap)->next[0]; - WT_RET(__block_merge(session, avail, a->off, a->size)); - WT_RET(__block_off_remove(session, ael, a->off, NULL)); + WT_RET(__block_merge( + session, block, avail, a->off, a->size)); + WT_RET(__block_off_remove( + session, block, ael, a->off, NULL)); } /* Case #6 */ } else if (a->off + a->size == b->off + b->size) { /* @@ -737,7 +812,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Decrement A's size by the size of B * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove(session, block, ael, a->off, &a)); a->size -= b->size; 
WT_RET(__block_ext_insert(session, ael, a)); @@ -747,8 +822,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete B */ *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge(session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove(session, block, bel, b->off, NULL)); } else if /* Case #3, #7 */ (a->off + a->size < b->off + b->size) { /* @@ -756,14 +831,14 @@ __block_ext_overlap(WT_SESSION_IMPL *session, */ off = b->off; size = (a->off + a->size) - b->off; - WT_RET(__block_merge(session, avail, off, size)); + WT_RET(__block_merge(session, block, avail, off, size)); /* * Remove A from its list * Decrement A's size by the overlap * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove(session, block, ael, a->off, &a)); a->size -= size; WT_RET(__block_ext_insert(session, ael, a)); @@ -772,7 +847,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Increment/Decrement B's offset/size by the overlap * Insert B on its list */ - WT_RET(__block_off_remove(session, bel, b->off, &b)); + WT_RET(__block_off_remove(session, block, bel, b->off, &b)); b->off += size; b->size -= size; WT_RET(__block_ext_insert(session, bel, b)); @@ -786,12 +861,12 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Decrement A's size by trailing part of A plus B's size * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove(session, block, ael, a->off, &a)); a->size = b->off - a->off; WT_RET(__block_ext_insert(session, ael, a)); /* Add trailing part of A to A's list as a new element. 
*/ - WT_RET(__block_merge(session, ael, off, size)); + WT_RET(__block_merge(session, block, ael, off, size)); /* * Move caller's B to the next element @@ -799,8 +874,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete B */ *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge(session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove(session, block, bel, b->off, NULL)); } return (0); @@ -811,14 +886,15 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Merge one extent list into another. */ int -__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b) +__wt_block_extlist_merge( + WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *a, WT_EXTLIST *b) { WT_EXT *ext; WT_VERBOSE_RET(session, block, "merging %s into %s", a->name, b->name); WT_EXT_FOREACH(ext, a->off) - WT_RET(__block_merge(session, b, ext->off, ext->size)); + WT_RET(__block_merge(session, block, b, ext->off, ext->size)); return (0); } @@ -828,8 +904,8 @@ __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b) * Insert an extent into an extent list, merging if possible. */ int -__wt_block_insert_ext( - WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size) +__wt_block_insert_ext(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size) { /* * There are currently two copies of this function (this code is a one- @@ -842,10 +918,11 @@ __wt_block_insert_ext( * Callers of this function are expected to have already acquired any * locks required to manipulate the extent list. 
*/ - return (__block_merge(session, el, off, size)); + return (__block_merge(session, block, el, off, size)); } static int -__block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size) +__block_merge(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size) { WT_EXT *ext, *after, *before; @@ -884,7 +961,7 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size) "%s: insert range %" PRIdMAX "-%" PRIdMAX, el->name, (intmax_t)off, (intmax_t)(off + size)); - return (__block_off_insert(session, el, off, size)); + return (__block_off_insert(session, block, el, off, size)); } /* @@ -895,7 +972,8 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size) * the record we're going to use, adjust it and re-insert it. */ if (before == NULL) { - WT_RET(__block_off_remove(session, el, after->off, &ext)); + WT_RET( + __block_off_remove(session, block, el, after->off, &ext)); WT_VERBOSE_RET(session, block, "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" @@ -909,10 +987,11 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size) } else { if (after != NULL) { size += after->size; - WT_RET( - __block_off_remove(session, el, after->off, NULL)); + WT_RET(__block_off_remove( + session, block, el, after->off, NULL)); } - WT_RET(__block_off_remove(session, el, before->off, &ext)); + WT_RET( + __block_off_remove(session, block, el, before->off, &ext)); WT_VERBOSE_RET(session, block, "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" @@ -946,7 +1025,8 @@ __wt_block_extlist_read_avail( * avail list, the extent blocks might be included, remove them. 
*/ WT_RET_NOTFOUND_OK( - __wt_block_off_remove_overlap(session, el, el->offset, el->size)); + __wt_block_off_remove_overlap( + session, block, el, el->offset, el->size)); return (0); } @@ -1013,7 +1093,7 @@ corrupted: WT_ERR_MSG(session, WT_ERROR, * list and crashed, and rolled back to a corrupted checkpoint, * this might save us?) */ - WT_ERR(__block_merge(session, el, off, size)); + WT_ERR(__block_merge(session, block, el, off, size)); } if (WT_VERBOSE_ISSET(session, block)) @@ -1147,7 +1227,7 @@ __wt_block_extlist_truncate( */ file_size = ext->off; WT_RET(__wt_ftruncate(session, fh, file_size)); - WT_RET(__block_off_remove(session, el, file_size, NULL)); + WT_RET(__block_off_remove(session, block, el, file_size, NULL)); fh->file_size = file_size; return (0); diff --git a/src/block/block_map.c b/src/block/block_map.c new file mode 100644 index 00000000000..93dcc4bec6c --- /dev/null +++ b/src/block/block_map.c @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_block_map -- + * Map a segment of the file in, if possible. + */ +int +__wt_block_map( + WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp) +{ + *(void **)mapp = NULL; + *maplenp = 0; + + /* + * Turn off mapping when verifying the file, because we can't perform + * checksum validation of mapped segments, and verify has to checksum + * pages. + */ + if (block->verify) + return (0); + + /* + * Turn off mapping when direct I/O is configured for the file, the + * Linux open(2) documentation says applications should avoid mixing + * mmap(2) of files with direct I/O to the same files. + */ + if (block->fh->direct_io) + return (0); + + /* + * Turn off mapping if the application configured a cache size maximum, + * we can't control how much of the cache size we use in that case. 
+ */ + if (block->os_cache_max != 0) + return (0); + + /* + * Map the file into memory. + * Ignore errors, we'll read the file through the cache if map fails. + */ + (void)__wt_mmap(session, block->fh, mapp, maplenp); + + return (0); +} + +/* + * __wt_block_unmap -- + * Unmap any mapped-in segment of the file. + */ +int +__wt_block_unmap( + WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen) +{ + /* Unmap the file from memory. */ + return (__wt_munmap(session, block->fh, map, maplen)); +} diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index 307263d5779..ddf23a97866 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -67,6 +67,16 @@ __bm_checkpoint(WT_BM *bm, } /* + * __bm_sync -- + * Flush a file to disk. + */ +static int +__bm_sync(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__wt_fsync(session, bm->block->fh)); +} + +/* * __bm_checkpoint_load -- * Load a checkpoint point. */ @@ -86,17 +96,12 @@ __bm_checkpoint_load(WT_BM *bm, WT_SESSION_IMPL *session, if (checkpoint) { /* - * Read-only objects are mapped into memory instead of being - * read into cache buffers. Ignore errors, with no mapping - * we'll read into the cache. - * - * Turn off mapping when verifying the file, because we can't - * perform checksum validation of mapped segments, and verify - * has to checksum pages. + * Read-only objects are optionally mapped into memory instead + * of being read into cache buffers. */ - if (conn->mmap && !bm->block->verify) - (void)__wt_mmap( - session, bm->block->fh, &bm->map, &bm->maplen); + if (conn->mmap) + WT_RET(__wt_block_map( + session, bm->block, &bm->map, &bm->maplen)); /* * If this handle is for a checkpoint, that is, read-only, there @@ -132,7 +137,7 @@ __bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session) /* Unmap any mapped segment. 
*/ if (bm->map != NULL) WT_TRET( - __wt_munmap(session, bm->block->fh, bm->map, bm->maplen)); + __wt_block_unmap(session, bm->block, bm->map, bm->maplen)); /* Unload the checkpoint. */ WT_TRET(__wt_block_checkpoint_unload(session, bm->block, !bm->is_live)); @@ -197,9 +202,9 @@ __bm_free(WT_BM *bm, * Block-manager statistics. */ static int -__bm_stat(WT_BM *bm, WT_SESSION_IMPL *session) +__bm_stat(WT_BM *bm, WT_SESSION_IMPL *session, WT_DSRC_STATS *stats) { - __wt_block_stat(session, bm->block); + __wt_block_stat(session, bm->block, stats); return (0); } @@ -333,6 +338,7 @@ __bm_method_set(WT_BM *bm, int readonly) bm->salvage_valid = (int (*)(WT_BM *, WT_SESSION_IMPL *, uint8_t *, uint32_t))__bm_readonly; bm->stat = __bm_stat; + bm->sync = (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; bm->verify_addr = __bm_verify_addr; bm->verify_end = __bm_verify_end; bm->verify_start = __bm_verify_start; @@ -358,6 +364,7 @@ __bm_method_set(WT_BM *bm, int readonly) bm->salvage_start = __bm_salvage_start; bm->salvage_valid = __bm_salvage_valid; bm->stat = __bm_stat; + bm->sync = __bm_sync; bm->verify_addr = __bm_verify_addr; bm->verify_end = __bm_verify_end; bm->verify_start = __bm_verify_start; @@ -371,8 +378,8 @@ __bm_method_set(WT_BM *bm, int readonly) * Open a file. 
*/ int -__wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, - const char *config, const char *cfg[], int forced_salvage, WT_BM **bmp) +__wt_block_manager_open(WT_SESSION_IMPL *session, + const char *filename, const char *cfg[], int forced_salvage, WT_BM **bmp) { WT_BM *bm; WT_DECL_RET; @@ -383,7 +390,7 @@ __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, __bm_method_set(bm, 0); WT_ERR(__wt_block_open( - session, filename, config, cfg, forced_salvage, &bm->block)); + session, filename, cfg, forced_salvage, &bm->block)); *bmp = bm; return (0); diff --git a/src/block/block_open.c b/src/block/block_open.c index 6a542a29e7e..973df7e2250 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -71,6 +71,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_DECL_RET; conn = S2C(session); + TAILQ_REMOVE(&conn->blockqh, block, q); if (block->name != NULL) __wt_free(session, block->name); @@ -80,7 +81,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block) __wt_spin_destroy(session, &block->live_lock); - TAILQ_REMOVE(&conn->blockqh, block, q); + __wt_block_ext_cleanup(session, block); __wt_overwrite_and_free(session, block); @@ -93,8 +94,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block) */ int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, - const char *config, const char *cfg[], int forced_salvage, - WT_BLOCK **blockp) + const char *cfg[], int forced_salvage, WT_BLOCK **blockp) { WT_BLOCK *block; WT_CONFIG_ITEM cval; @@ -124,9 +124,38 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename, WT_ERR(__wt_strdup(session, filename, &block->name)); /* Get the allocation size. */ - WT_ERR(__wt_config_getones(session, config, "allocation_size", &cval)); + WT_ERR(__wt_config_gets(session, cfg, "allocation_size", &cval)); block->allocsize = (uint32_t)cval.val; + /* Configuration: optional OS buffer cache maximum size. 
*/ + WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval)); + block->os_cache_max = cval.val; +#ifdef HAVE_POSIX_FADVISE + if (conn->direct_io && block->os_cache_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_max not supported in combination with direct_io"); +#else + if (block->os_cache_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_max not supported if posix_fadvise not " + "available"); +#endif + + /* Configuration: optional immediate write scheduling flag. */ + WT_ERR(__wt_config_gets(session, cfg, "os_cache_dirty_max", &cval)); + block->os_cache_dirty_max = cval.val; +#ifdef HAVE_SYNC_FILE_RANGE + if (conn->direct_io && block->os_cache_dirty_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_dirty_max not supported in combination with " + "direct_io"); +#else + if (block->os_cache_dirty_max) + WT_ERR_MSG(session, EINVAL, + "os_cache_dirty_max not supported if sync_file_range not " + "available"); +#endif + /* Open the underlying file handle. */ WT_ERR(__wt_open(session, filename, 0, 0, 1, &block->fh)); @@ -258,8 +287,11 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) (desc->majorv == WT_BLOCK_MAJOR_VERSION && desc->minorv > WT_BLOCK_MINOR_VERSION)) WT_ERR_MSG(session, WT_ERROR, - "%s is an unsupported version of a WiredTiger file", - block->name); + "unsupported WiredTiger file version: this build only " + "supports major/minor versions up to %d/%d, and the file " + "is version %d/%d", + WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION, + desc->majorv, desc->minorv); err: __wt_scr_free(&buf); return (ret); @@ -270,7 +302,7 @@ err: __wt_scr_free(&buf); * Block statistics */ void -__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block) +__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats) { /* * We're looking inside the live system's structure, which normally @@ -279,11 +311,11 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block) * isn't like this is a common function for an application to call. 
*/ __wt_spin_lock(session, &block->live_lock); - WT_DSTAT_SET(session, block_allocsize, block->allocsize); - WT_DSTAT_SET(session, block_checkpoint_size, block->live.ckpt_size); - WT_DSTAT_SET(session, block_magic, WT_BLOCK_MAGIC); - WT_DSTAT_SET(session, block_major, WT_BLOCK_MAJOR_VERSION); - WT_DSTAT_SET(session, block_minor, WT_BLOCK_MINOR_VERSION); - WT_DSTAT_SET(session, block_size, block->fh->file_size); + WT_STAT_SET(stats, block_allocsize, block->allocsize); + WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size); + WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC); + WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION); + WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION); + WT_STAT_SET(stats, block_size, block->fh->file_size); __wt_spin_unlock(session, &block->live_lock); } diff --git a/src/block/block_read.c b/src/block/block_read.c index 4a5ba4c4478..20bd7c17b31 100644 --- a/src/block/block_read.c +++ b/src/block/block_read.c @@ -55,7 +55,22 @@ __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, } /* Read the block. */ - return (__wt_block_read_off(session, block, buf, offset, size, cksum)); + WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum)); + +#ifdef HAVE_POSIX_FADVISE + /* Optionally discard blocks from the system's buffer cache. */ + if (block->os_cache_max != 0 && + (block->os_cache += size) > block->os_cache_max) { + WT_DECL_RET; + + block->os_cache = 0; + if ((ret = posix_fadvise(block->fh->fd, + (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0) + WT_RET_MSG( + session, ret, "%s: posix_fadvise", block->name); + } +#endif + return (0); } /* diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c index 46df9dd210b..488278fd41a 100644 --- a/src/block/block_slvg.c +++ b/src/block/block_slvg.c @@ -52,7 +52,7 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block) * Start with the entire file on the allocation list, we'll "free" * any blocks we don't want as we process the file. 
*/ - WT_RET(__wt_block_insert_ext(session, &block->live.alloc, + WT_RET(__wt_block_insert_ext(session, block, &block->live.alloc, WT_BLOCK_DESC_SECTOR, len - WT_BLOCK_DESC_SECTOR)); return (0); diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c index d88f6d67fdd..7c06bb8b193 100644 --- a/src/block/block_vrfy.c +++ b/src/block/block_vrfy.c @@ -226,7 +226,7 @@ __wt_verify_ckpt_load( WT_RET(__wt_block_extlist_read( session, block, el, ci->file_size)); WT_RET(__wt_block_extlist_merge( - session, el, &block->verify_alloc)); + session, block, el, &block->verify_alloc)); __wt_block_extlist_free(session, el); } el = &ci->discard; @@ -235,7 +235,7 @@ __wt_verify_ckpt_load( session, block, el, ci->file_size)); WT_EXT_FOREACH(ext, el->off) WT_RET(__wt_block_off_remove_overlap(session, - &block->verify_alloc, ext->off, ext->size)); + block, &block->verify_alloc, ext->off, ext->size)); __wt_block_extlist_free(session, el); } @@ -247,7 +247,7 @@ __wt_verify_ckpt_load( * checkpoints. */ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_block_off_remove_overlap(session, + WT_RET(__wt_block_off_remove_overlap(session, block, &block->verify_alloc, ci->root_offset, ci->root_size)); /* diff --git a/src/block/block_write.c b/src/block/block_write.c index ce07bd6ae57..13cb0f25f0e 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -28,7 +28,8 @@ __wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep) { WT_UNUSED(session); - *sizep = WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize); + *sizep = (size_t) + WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize); return (0); } @@ -87,7 +88,7 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, * boundary, this is one of the reasons the btree layer must find out * from the block-manager layer the maximum size of the eventual write. 
*/ - align_size = WT_ALIGN32(buf->size, block->allocsize); + align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize); if (align_size > buf->memsize) { WT_ASSERT(session, align_size <= buf->memsize); WT_RET_MSG(session, EINVAL, @@ -141,6 +142,31 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_RET(ret); } +#ifdef HAVE_SYNC_FILE_RANGE + /* + * Optionally schedule writes for dirty pages in the system buffer + * cache. + */ + if (block->os_cache_dirty_max != 0 && + (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) { + block->os_cache_dirty = 0; + if ((ret = sync_file_range(block->fh->fd, + (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0) + WT_RET_MSG( + session, ret, "%s: sync_file_range", block->name); + } +#endif +#ifdef HAVE_POSIX_FADVISE + /* Optionally discard blocks from the system buffer cache. */ + if (block->os_cache_max != 0 && + (block->os_cache += align_size) > block->os_cache_max) { + block->os_cache = 0; + if ((ret = posix_fadvise(block->fh->fd, + (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0) + WT_RET_MSG( + session, ret, "%s: posix_fadvise", block->name); + } +#endif WT_CSTAT_INCR(session, block_write); WT_CSTAT_INCRV(session, block_byte_write, align_size); diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c index df16aa39c55..9e0a2817a11 100644 --- a/src/bloom/bloom.c +++ b/src/bloom/bloom.c @@ -65,7 +65,8 @@ static int __bloom_setup( WT_BLOOM *bloom, uint64_t n, uint64_t m, uint32_t factor, uint32_t k) { - WT_ASSERT(bloom->session, k > 1); + if (k < 2) + return (EINVAL); bloom->k = k; bloom->factor = factor; @@ -240,6 +241,7 @@ __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash) uint64_t h1, h2; uint8_t bit; + /* Get operations are only supported by finalized bloom filters. 
*/ WT_ASSERT(bloom->session, bloom->bitstring == NULL); wt_session = (WT_SESSION *)bloom->session; diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index b2123d0c0f0..c390d7347d9 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -402,7 +402,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, int discard) session = (WT_SESSION_IMPL *)cbt->iface.session; WT_DSTAT_INCR(session, cursor_next); - flags = 0; /* Tree walk flags. */ + flags = WT_TREE_SKIP_INTL; /* Tree walk flags. */ if (discard) LF_SET(WT_TREE_DISCARD); @@ -476,12 +476,11 @@ retry: WT_RET(__cursor_func_init(cbt, 0)); } cbt->page = NULL; - do { - WT_ERR(__wt_tree_walk(session, &page, flags)); - WT_ERR_TEST(page == NULL, WT_NOTFOUND); - } while ( - page->type == WT_PAGE_COL_INT || - page->type == WT_PAGE_ROW_INT); + WT_ERR(__wt_tree_walk(session, &page, flags)); + WT_ERR_TEST(page == NULL, WT_NOTFOUND); + WT_ASSERT(session, + page->type != WT_PAGE_COL_INT && + page->type != WT_PAGE_ROW_INT); cbt->page = page; /* Initialize the page's modification information */ diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index a0591937fa6..418e89dcecd 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -494,7 +494,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard) session = (WT_SESSION_IMPL *)cbt->iface.session; WT_DSTAT_INCR(session, cursor_prev); - flags = WT_TREE_PREV; /* Tree walk flags. */ + flags = WT_TREE_SKIP_INTL | WT_TREE_PREV; /* Tree walk flags. 
*/ if (discard) LF_SET(WT_TREE_DISCARD); @@ -559,12 +559,11 @@ retry: WT_RET(__cursor_func_init(cbt, 0)); } cbt->page = NULL; - do { - WT_ERR(__wt_tree_walk(session, &page, flags)); - WT_ERR_TEST(page == NULL, WT_NOTFOUND); - } while ( - page->type == WT_PAGE_COL_INT || - page->type == WT_PAGE_ROW_INT); + WT_ERR(__wt_tree_walk(session, &page, flags)); + WT_ERR_TEST(page == NULL, WT_NOTFOUND); + WT_ASSERT(session, + page->type != WT_PAGE_COL_INT && + page->type != WT_PAGE_ROW_INT); cbt->page = page; /* Initialize the page's modification information */ diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index eeb1a6dd1e1..277f46a76c1 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -48,13 +48,8 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) hp->page, hp->file, hp->line); } #endif - /* - * Pages without a memory footprint aren't associated with the cache - * and were never counted as "pages read". If the page has a memory - * footprint, update the cache information based on the discard. - */ - if (page->memory_footprint != 0) - __wt_cache_page_evict(session, page); + /* Update the cache's information. */ + __wt_cache_page_evict(session, page); /* Free the page modification information. */ if (page->modify != NULL) @@ -153,9 +148,6 @@ __free_page_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, ((WT_ADDR *)ref->addr)->addr); __wt_free(session, ref->addr); } - - /* Free the subtree-reference array. */ - __wt_free(session, page->u.intl.t); } /* @@ -165,9 +157,6 @@ __free_page_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) static void __free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) { - /* Free the in-memory index array. */ - __wt_free(session, page->u.col_var.d); - /* Free the RLE lookup array. */ __wt_free(session, page->u.col_var.repeats); } @@ -198,9 +187,6 @@ __free_page_row_int(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, ref->addr); } } - - /* Free the subtree-reference array. 
*/ - __wt_free(session, page->u.intl.t); } /* @@ -226,7 +212,6 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) if (ikey != NULL && __wt_off_page(page, ikey)) __wt_free(session, ikey); } - __wt_free(session, page->u.row.d); /* * Free the insert array. diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index df36f609369..38fca0db2d1 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -10,10 +10,11 @@ static void __evict_dirty_validate(WT_CONNECTION_IMPL *); static int __evict_file(WT_SESSION_IMPL *, int); static int __evict_file_request_walk(WT_SESSION_IMPL *); -static int __evict_init_candidate( +static void __evict_init_candidate( WT_SESSION_IMPL *, WT_EVICT_ENTRY *, WT_PAGE *); static int __evict_lru(WT_SESSION_IMPL *, int); static int __evict_lru_cmp(const void *, const void *); +static int __evict_page(WT_SESSION_IMPL *, WT_PAGE *); static int __evict_walk(WT_SESSION_IMPL *, uint32_t *, int); static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, int); static int __evict_worker(WT_SESSION_IMPL *); @@ -22,10 +23,10 @@ static int __evict_worker(WT_SESSION_IMPL *); * Tuning constants: I hesitate to call this tuning, but we want to review some * number of pages from each file's in-memory tree for each page we evict. */ -#define WT_EVICT_INT_SKEW (1<<20) /* Prefer leaf pages over internal +#define WT_EVICT_INT_SKEW (1<<12) /* Prefer leaf pages over internal pages by this many increments of the read generation. 
*/ -#define WT_EVICT_WALK_PER_FILE 5 /* Pages to visit per file */ +#define WT_EVICT_WALK_PER_FILE 10 /* Pages to visit per file */ #define WT_EVICT_WALK_BASE 100 /* Pages tracked across file visits */ #define WT_EVICT_WALK_INCR 100 /* Pages added each walk */ @@ -50,8 +51,15 @@ __evict_read_gen(const WT_EVICT_ENTRY *entry) return (0); read_gen = page->read_gen + entry->btree->evict_priority; - if (page->type == WT_PAGE_ROW_INT || - page->type == WT_PAGE_COL_INT) + + /* + * Skew the read generation for internal pages that aren't split merge + * pages. We want to consider leaf pages in preference to real internal + * pages, but merges are relatively cheap in-memory operations that make + * reads faster, so don't make them too unlikely. + */ + if ((page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) && + !__wt_btree_mergeable(page)) read_gen += WT_EVICT_INT_SKEW; return (read_gen); @@ -151,73 +159,72 @@ __wt_evict_list_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page) /* * __wt_evict_forced_page -- - * If a page matches the force criteria add it to the eviction queue and - * trigger the eviction server. + * If a page matches the force criteria, try to add it to the eviction + * queue and trigger the eviction server. Best effort only, so no error + * is returned if the page is busy. */ -int +void __wt_evict_forced_page(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - u_int count; + WT_PAGE *top; + u_int levels; conn = S2C(session); cache = conn->cache; + /* Don't queue a page for forced eviction if we already have one. */ + if (F_ISSET(cache, WT_EVICT_FORCE_PASS)) + return; + + /* + * Check if the page we have been asked to forcefully evict is at the + * bottom of a stack of split-merge pages. If so, lock the top of the + * stack instead. 
+ */ + for (top = page, levels = 0; + __wt_btree_mergeable(top->parent); + top = top->parent, ++levels) + ; + + if (levels >= WT_MERGE_STACK_MIN) + page = top; + /* * Try to lock the page. If this succeeds, we're going to queue * it for forced eviction. We don't go right to the EVICT_FORCED * state, because that is cleared by __wt_evict_list_clr_page. */ if (!WT_ATOMIC_CAS(page->ref->state, WT_REF_MEM, WT_REF_LOCKED)) - return (EBUSY); + return; /* If the page is already queued for ordinary eviction, clear it. */ __wt_evict_list_clr_page(session, page); __wt_spin_lock(session, &cache->evict_lock); - /* - * Add the page to the head of the eviction queue. Initialize the - * eviction array if necessary. - */ - if (cache->evict_allocated == 0) { - count = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR; - WT_ERR(__wt_realloc(session, &cache->evict_allocated, - count * sizeof(WT_EVICT_ENTRY), &cache->evict)); - cache->evict_entries = count; - } - WT_ERR(__evict_init_candidate(session, cache->evict, page)); + /* Add the page to the head of the eviction queue. */ + __evict_init_candidate(session, cache->evict, page); + /* Set the location in the eviction queue to the new entry. */ cache->evict_current = cache->evict; - /* - * If the candidate list was empty we are adding a candidate, in all - * other cases we are replacing an existing candidate. - */ - if (cache->evict_candidates == 0) - cache->evict_candidates++; /* * Lock the page so other threads cannot get new read locks on the * page - which makes it more likely that the next pass of the eviction * server will successfully evict the page. */ - if (!WT_ATOMIC_CAS(page->ref->state, WT_REF_LOCKED, WT_REF_EVICT_FORCE)) - WT_ERR(EBUSY); + WT_PUBLISH(page->ref->state, WT_REF_EVICT_FORCE); -err: __wt_spin_unlock(session, &cache->evict_lock); + F_SET(cache, WT_EVICT_FORCE_PASS); + __wt_spin_unlock(session, &cache->evict_lock); - /* - * Only wake the server if the page was successfully queued. - * Otherwise, unlock it. 
- */ - if (ret == 0) { - F_SET(S2C(session)->cache, WT_EVICT_FORCE_PASS); - ret = __wt_evict_server_wake(session); - } else - page->ref->state = WT_REF_MEM; - return (ret); + WT_CSTAT_INCR(session, cache_eviction_force); + WT_DSTAT_INCR(session, cache_eviction_force); + + /* Try to wake the server, but don't worry if that fails. */ + (void)__wt_evict_server_wake(session); } /* @@ -288,11 +295,14 @@ __wt_cache_evict_server(void *arg) conn = S2C(session); cache = conn->cache; - while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { + cache->evict_entries = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR; + WT_ERR(__wt_calloc_def(session, cache->evict_entries, &cache->evict)); + + while (F_ISSET(conn, WT_CONN_EVICTION_RUN)) { /* Evict pages from the cache as needed. */ WT_ERR(__evict_worker(session)); - if (!F_ISSET(conn, WT_CONN_SERVER_RUN)) + if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) break; WT_VERBOSE_ERR(session, evictserver, "sleeping"); @@ -304,13 +314,16 @@ __wt_cache_evict_server(void *arg) WT_VERBOSE_ERR(session, evictserver, "exiting"); if (ret == 0) { - if (__wt_cache_bytes_inuse(cache) != 0) { + if (cache->pages_inmem != cache->pages_evict) __wt_errx(session, - "cache server: exiting with %" PRIu64 " pages, " - "%" PRIu64 " bytes in use", - __wt_cache_pages_inuse(cache), - __wt_cache_bytes_inuse(cache)); - } + "cache server: exiting with %" PRIu64 " pages in " + "memory and %" PRIu64 " pages evicted", + cache->pages_inmem, cache->pages_evict); + if (cache->bytes_inmem != cache->bytes_evict) + __wt_errx(session, + "cache server: exiting with %" PRIu64 " bytes in " + "memory and %" PRIu64 " bytes evicted", + cache->bytes_inmem, cache->bytes_evict); } else err: WT_PANIC_ERR(session, ret, "eviction server error"); @@ -330,17 +343,21 @@ err: WT_PANIC_ERR(session, ret, "eviction server error"); static int __evict_worker(WT_SESSION_IMPL *session) { + WT_BTREE *force_btree; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; + WT_PAGE *force_page; uint64_t bytes_inuse, bytes_max, 
dirty_inuse; - int clean, force, loop; + int clean, loop; conn = S2C(session); cache = conn->cache; /* Evict pages from the cache. */ for (loop = 0;; loop++) { + force_page = NULL; + /* * Block out concurrent eviction while we are handling requests. */ @@ -350,22 +367,31 @@ __evict_worker(WT_SESSION_IMPL *session) while (ret == 0 && cache->sync_complete != cache->sync_request) ret = __evict_file_request_walk(session); - /* Check for forced eviction while we hold the lock. */ - force = F_ISSET(cache, WT_EVICT_FORCE_PASS) ? 1 : 0; - F_CLR(cache, WT_EVICT_FORCE_PASS); - - __wt_spin_unlock(session, &cache->evict_lock); - WT_RET(ret); - /* * If we've been awoken for forced eviction, just try to evict * the first page in the queue: don't do a walk and sort first. - * Sometimes the page won't be available for eviction because - * there is a reader still holding a hazard reference. Give up - * in that case, the application thread can add it again. */ - if (force) - (void)__wt_evict_lru_page(session, 0); + force_btree = NULL; + force_page = NULL; + if (ret == 0 && F_ISSET(cache, WT_EVICT_FORCE_PASS)) { + if (cache->evict->page != NULL && + WT_ATOMIC_CAS(cache->evict->page->ref->state, + WT_REF_EVICT_FORCE, WT_REF_LOCKED)) { + force_btree = cache->evict->btree; + force_page = cache->evict->page; + __evict_list_clr(session, cache->evict); + } + F_CLR(cache, WT_EVICT_FORCE_PASS); + } + + __wt_spin_unlock(session, &cache->evict_lock); + WT_RET(ret); + + if (force_page != NULL) { + WT_SET_BTREE_IN_SESSION(session, force_btree); + (void)__evict_page(session, force_page); + WT_CLEAR_BTREE_IN_SESSION(session); + } /* * Keep evicting until we hit the target cache usage and the @@ -393,16 +419,23 @@ __evict_worker(WT_SESSION_IMPL *session) if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) clean = 1; + /* + * Track whether pages are being evicted. This will be cleared + * by the next thread to successfully evict a page. 
+ */ + F_SET(cache, WT_EVICT_NO_PROGRESS); WT_RET(__evict_lru(session, clean)); __evict_dirty_validate(conn); + /* * If we're making progress, keep going; if we're not making - * any progress at all, go back to sleep, it's not something - * we can fix. + * any progress at all, mark the cache "stuck" and go back to + * sleep, it's not something we can fix. */ - if (clean && __wt_cache_bytes_inuse(cache) >= bytes_inuse) { + if (F_ISSET(cache, WT_EVICT_NO_PROGRESS)) { if (loop == 10) { + F_SET(cache, WT_EVICT_STUCK); WT_CSTAT_INCR(session, cache_eviction_slow); WT_VERBOSE_RET(session, evictserver, "unable to reach eviction goal"); @@ -474,13 +507,9 @@ __evict_page(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__wt_txn_init(session)); __wt_txn_get_evict_snapshot(session); - saved_txn.oldest_snap_min = txn->oldest_snap_min; txn->isolation = TXN_ISO_READ_COMMITTED; ret = __wt_rec_evict(session, page, 0); - /* Keep count of any failures. */ - saved_txn.eviction_fails = txn->eviction_fails; - if (was_running) { WT_ASSERT(session, txn->snapshot == NULL || txn->snapshot != saved_txn.snapshot); @@ -488,6 +517,9 @@ __evict_page(WT_SESSION_IMPL *session, WT_PAGE *page) } else __wt_txn_release_snapshot(session); + /* If the oldest transaction was updated, keep the newer value. */ + saved_txn.oldest_snap_min = txn->oldest_snap_min; + *txn = saved_txn; return (ret); } @@ -668,28 +700,44 @@ __wt_sync_file(WT_SESSION_IMPL *session, int syncop) WT_CACHE *cache; WT_DECL_RET; WT_PAGE *page; + WT_TXN *txn; uint32_t flags; btree = S2BT(session); cache = S2C(session)->cache; page = NULL; + txn = &session->txn; switch (syncop) { case WT_SYNC_CHECKPOINT: + case WT_SYNC_WRITE_LEAVES: /* * The first pass walks all cache leaf pages, waiting for * concurrent activity in a page to be resolved, acquiring * hazard references to prevent eviction. 
*/ - flags = WT_TREE_CACHE | WT_TREE_SKIP_INTL | WT_TREE_WAIT; + flags = WT_TREE_CACHE | WT_TREE_SKIP_INTL; + if (syncop == WT_SYNC_CHECKPOINT) + flags |= WT_TREE_WAIT; WT_ERR(__wt_tree_walk(session, &page, flags)); while (page != NULL) { - /* Write dirty pages. */ - if (__wt_page_is_modified(page)) - WT_ERR(__wt_rec_write(session, page, NULL, 0)); + /* Write dirty pages if nobody beat us to it. */ + if (__wt_page_is_modified(page)) { + if (txn->isolation == TXN_ISO_READ_COMMITTED) + __wt_txn_get_snapshot(session, + WT_TXN_NONE, WT_TXN_NONE, 0); + ret = __wt_rec_write(session, page, NULL, 0); + if (txn->isolation == TXN_ISO_READ_COMMITTED) + __wt_txn_release_snapshot(session); + WT_ERR(ret); + } + WT_ERR(__wt_tree_walk(session, &page, flags)); } + if (syncop == WT_SYNC_WRITE_LEAVES) + break; + /* * Pages cannot disappear from underneath internal pages when * internal pages are being reconciled by checkpoint; also, @@ -793,20 +841,20 @@ __evict_lru(WT_SESSION_IMPL *session, int clean) qsort(cache->evict, candidates, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp); - /* Find the bottom 25% */ while (candidates > 0 && cache->evict[candidates - 1].page == NULL) --candidates; + /* Find the bottom 25% of read generations. */ cutoff = (3 * __evict_read_gen(&cache->evict[0]) + __evict_read_gen(&cache->evict[candidates - 1])) / 4; /* - * Don't take more than half, regardless. That said, if there is only - * one candidate page, which is normal when populating an empty file, - * don't exclude it. + * Don't take less than 10% or more than 50% of candidates, regardless. + * That said, if there is only one candidate page, which is normal when + * populating an empty file, don't exclude it. 
*/ - for (i = 0; i < candidates / 2; i++) - if (cache->evict[i].page->read_gen > cutoff) + for (i = candidates / 10; i < candidates / 2; i++) + if (__evict_read_gen(&cache->evict[i]) > cutoff) break; cache->evict_candidates = i + 1; @@ -836,29 +884,14 @@ __evict_walk(WT_SESSION_IMPL *session, u_int *entriesp, int clean) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - u_int elem, file_count, i, retries; + u_int file_count, i, retries; conn = S2C(session); cache = S2C(session)->cache; retries = 0; - /* - * Resize the array in which we're tracking pages, as necessary, then - * get some pages from each underlying file. In practice, a realloc - * is rarely needed, so it is worth avoiding the LRU lock. - */ - elem = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR; - if (elem > cache->evict_entries) { - __wt_spin_lock(session, &cache->evict_lock); - /* Save the offset of the eviction point. */ - i = (u_int)(cache->evict_current - cache->evict); - WT_ERR(__wt_realloc(session, &cache->evict_allocated, - elem * sizeof(WT_EVICT_ENTRY), &cache->evict)); - cache->evict_entries = elem; - if (cache->evict_current != NULL) - cache->evict_current = cache->evict + i; - __wt_spin_unlock(session, &cache->evict_lock); - } + /* Update the oldest transaction ID -- we use it to filter pages. */ + __wt_txn_get_oldest(session); /* * NOTE: we don't hold the schema lock: files can't be removed without @@ -903,15 +936,11 @@ retry: file_count = 0; } cache->evict_file_next = (btree == NULL) ? 0 : file_count; - /* In the extreme case, all of the pages have to come from one file. */ - if (ret == 0 && i < cache->evict_entries && - retries++ < WT_EVICT_WALK_INCR / WT_EVICT_WALK_PER_FILE) + /* Walk the files a few times if we don't find enough pages. 
*/ + if (ret == 0 && i < cache->evict_entries && retries++ < 10) goto retry; *entriesp = i; - if (0) { -err: __wt_spin_unlock(session, &cache->evict_lock); - } return (ret); } @@ -919,7 +948,7 @@ err: __wt_spin_unlock(session, &cache->evict_lock); * __evict_init_candidate -- * Initialize a WT_EVICT_ENTRY structure with a given page. */ -static int +static void __evict_init_candidate( WT_SESSION_IMPL *session, WT_EVICT_ENTRY *evict, WT_PAGE *page) { @@ -930,7 +959,6 @@ __evict_init_candidate( /* Mark the page on the list */ F_SET_ATOMIC(page, WT_PAGE_EVICT_LRU); - return (0); } /* @@ -945,7 +973,8 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) WT_DECL_RET; WT_EVICT_ENTRY *end, *evict, *start; WT_PAGE *page; - int modified, restarts; + wt_txnid_t oldest_txn; + int modified, restarts, levels; btree = S2BT(session); cache = S2C(session)->cache; @@ -953,6 +982,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) end = start + WT_EVICT_WALK_PER_FILE; if (end > cache->evict + cache->evict_entries) end = cache->evict + cache->evict_entries; + oldest_txn = session->txn.oldest_snap_min; /* * Get some more eviction candidate pages. @@ -974,47 +1004,93 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) continue; } + WT_CSTAT_INCR(session, cache_eviction_walk); + + /* Ignore root pages entirely. */ + if (WT_PAGE_IS_ROOT(page)) + continue; + + /* Look for a split-merge (grand)parent page to merge. */ + levels = 0; + if (__wt_btree_mergeable(page)) + for (levels = 1; + levels < WT_MERGE_STACK_MIN && + __wt_btree_mergeable(page->parent); + page = page->parent, levels++) + ; + else if (page->modify != NULL && + F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)) + continue; + /* - * Skip root pages and split-merge pages: they can't be evicted. - * (Split-merge pages are always merged into their parents.) 
- * Don't skip empty or split pages: updates after their last - * reconciliation may have changed their state and only the - * reconciliation/eviction code can confirm if they should be - * skipped. + * Only look for a parent at exactly the right height above: if + * the stack is deep enough, we'll find it eventually, and we + * don't want to do too much work on every level. * - * Use the EVICT_LRU flag to avoid putting pages onto the list - * multiple times. + * !!! + * Don't restrict ourselves to only the top-most page (that is, + * don't require that page->parent is not mergeable). If there + * is a big, busy enough split-merge tree, the top-level merge + * will only happen if we can lock the whole subtree + * exclusively. Consider smaller merges in case locking the + * whole tree fails. */ - if (WT_PAGE_IS_ROOT(page) || - (page->modify != NULL && - F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)) || - F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) + if (levels != 0 && levels != WT_MERGE_STACK_MIN) continue; /* - * If the file is being checkpointed, there's a period of time - * where we can't discard any page with a modification - * structure because it might race with the checkpointing - * thread. - * - * During this phase, there is little point trying to evict - * dirty pages: we might be lucky and find an internal page - * that has not yet been checkpointed, but much more likely is - * that we will waste effort considering dirty leaf pages that - * cannot be evicted because they have modifications more - * recent than the checkpoint. + * If this page has never been considered for eviction, set its + * read generation to a little bit in the future and move on, + * give readers a chance to start updating the read generation. */ - modified = __wt_page_is_modified(page); - if (modified && btree->checkpointing) + if (page->read_gen == WT_READ_GEN_NOTSET) { + page->read_gen = __wt_cache_read_gen_set(session); continue; + } - /* Optionally ignore clean pages. 
*/ - if (!modified && !clean) + /* + * Use the EVICT_LRU flag to avoid putting pages onto the list + * multiple times. + */ + if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) continue; + /* The following checks apply to eviction but not merges. */ + if (levels == 0) { + /* + * If the file is being checkpointed, there's a period + * of time where we can't discard any page with a + * modification structure because it might race with + * the checkpointing thread. + * + * During this phase, there is little point trying to + * evict dirty pages: we might be lucky and find an + * internal page that has not yet been checkpointed, + * but much more likely is that we will waste effort + * considering dirty leaf pages that cannot be evicted + * because they have modifications more recent than the + * checkpoint. + */ + modified = __wt_page_is_modified(page); + if (modified && btree->checkpointing) + continue; + + /* Optionally ignore clean pages. */ + if (!modified && !clean) + continue; + + /* + * If the oldest transaction hasn't changed since the + * last time this page was written, there's no chance + * to make progress... + */ + if (modified && + TXNID_LE(oldest_txn, page->modify->disk_txn)) + continue; + } + WT_ASSERT(session, evict->page == NULL); - if (__evict_init_candidate(session, evict, page) != 0) - continue; + __evict_init_candidate(session, evict, page); ++evict; WT_VERBOSE_RET(session, evictserver, @@ -1029,7 +1105,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) * __evict_get_page -- * Get a page for eviction. */ -static void +static int __evict_get_page( WT_SESSION_IMPL *session, int is_app, WT_BTREE **btreep, WT_PAGE **pagep) { @@ -1042,6 +1118,18 @@ __evict_get_page( *btreep = NULL; *pagep = NULL; + /* + * A pathological case: if we're the oldest transaction in the system + * and the eviction server is stuck trying to find space, abort the + * transaction to give up all hazard references before trying again. 
+ */ + if (is_app && F_ISSET(cache, WT_EVICT_STUCK) && + __wt_txn_am_oldest(session)) { + F_CLR(cache, WT_EVICT_STUCK); + WT_CSTAT_INCR(session, txn_fail_cache); + return (WT_DEADLOCK); + } + candidates = cache->evict_candidates; /* The eviction server only considers half of the entries. */ if (!is_app && candidates > 1) @@ -1057,7 +1145,7 @@ __evict_get_page( for (;;) { if (cache->evict_current == NULL || cache->evict_current >= cache->evict + candidates) - return; + return (WT_NOTFOUND); if (__wt_spin_trylock(session, &cache->evict_lock) == 0) break; __wt_yield(); @@ -1081,7 +1169,7 @@ __evict_get_page( * unlocked the page and some other thread may have evicted it * by the time we look at it. */ - evict->page->read_gen = __wt_cache_read_gen(session); + evict->page->read_gen = __wt_cache_read_gen_set(session); /* * Lock the page while holding the eviction mutex to prevent @@ -1124,6 +1212,8 @@ __evict_get_page( if (is_app && *pagep == NULL) cache->evict_current = NULL; __wt_spin_unlock(session, &cache->evict_lock); + + return ((*pagep == NULL) ? 
WT_NOTFOUND : 0); } /* @@ -1135,12 +1225,11 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app) { WT_BTREE *btree; WT_DATA_HANDLE *saved_dhandle; + WT_CACHE *cache; WT_DECL_RET; WT_PAGE *page; - __evict_get_page(session, is_app, &btree, &page); - if (page == NULL) - return (WT_NOTFOUND); + WT_RET(__evict_get_page(session, is_app, &btree, &page)); WT_ASSERT(session, page->ref->state == WT_REF_LOCKED); @@ -1155,6 +1244,10 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app) WT_CLEAR_BTREE_IN_SESSION(session); session->dhandle = saved_dhandle; + cache = S2C(session)->cache; + if (ret == 0 && F_ISSET(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK)) + F_CLR(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK); + return (ret); } diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 6723d177f6c..33788f248e1 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -9,38 +9,18 @@ static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt); static int __btree_get_last_recno(WT_SESSION_IMPL *); -static int __btree_page_sizes(WT_SESSION_IMPL *, const char *); +static int __btree_page_sizes(WT_SESSION_IMPL *); static int __btree_tree_open_empty(WT_SESSION_IMPL *, int); static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t); static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, uint32_t); /* - * __wt_btree_create -- - * Create a Btree. - */ -int -__wt_btree_create(WT_SESSION_IMPL *session, const char *filename) -{ - return (__wt_block_manager_create(session, filename)); -} - -/* - * __wt_btree_truncate -- - * Truncate a Btree. - */ -int -__wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename) -{ - return (__wt_block_manager_truncate(session, filename)); -} - -/* * __wt_btree_open -- * Open a Btree. 
*/ int -__wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[]) +__wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) { WT_BM *bm; WT_BTREE *btree; @@ -75,12 +55,9 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[]) /* Handle salvage configuration. */ forced_salvage = 0; - if (F_ISSET(btree, WT_BTREE_SALVAGE) && cfg != NULL) { - ret = __wt_config_gets(session, cfg, "force", &cval); - if (ret != 0 && ret != WT_NOTFOUND) - WT_ERR(ret); - if (ret == 0 && cval.val != 0) - forced_salvage = 1; + if (F_ISSET(btree, WT_BTREE_SALVAGE)) { + WT_ERR(__wt_config_gets(session, op_cfg, "force", &cval)); + forced_salvage = (cval.val != 0); } /* Initialize and configure the WT_BTREE structure. */ @@ -91,8 +68,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[]) if (!WT_PREFIX_SKIP(filename, "file:")) WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI"); - WT_ERR(__wt_block_manager_open(session, - filename, dhandle->config, cfg, forced_salvage, &btree->bm)); + WT_ERR(__wt_block_manager_open( + session, filename, dhandle->cfg, forced_salvage, &btree->bm)); bm = btree->bm; /* @@ -179,7 +156,6 @@ __wt_btree_close(WT_SESSION_IMPL *session) __wt_free(session, btree->value_format); if (btree->val_ovfl_lock != NULL) WT_TRET(__wt_rwlock_destroy(session, &btree->val_ovfl_lock)); - __wt_free(session, dhandle->stats); btree->bulk_load_ok = 0; @@ -200,14 +176,14 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_NAMED_COMPRESSOR *ncomp; uint32_t bitcnt; int fixed; - const char *config; + const char **cfg; btree = S2BT(session); conn = S2C(session); - config = btree->dhandle->config; + cfg = btree->dhandle->cfg; /* Validate file types and check the data format plan. 
*/ - WT_RET(__wt_config_getones(session, config, "key_format", &cval)); + WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL)); if (WT_STRING_MATCH("r", cval.str, cval.len)) btree->type = BTREE_COL_VAR; @@ -215,12 +191,12 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) btree->type = BTREE_ROW; WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->key_format)); - WT_RET(__wt_config_getones(session, config, "value_format", &cval)); + WT_RET(__wt_config_gets(session, cfg, "value_format", &cval)); WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format)); /* Row-store key comparison and key gap for prefix compression. */ if (btree->type == BTREE_ROW) { - WT_RET(__wt_config_getones(session, config, "collator", &cval)); + WT_RET(__wt_config_gets(session, cfg, "collator", &cval)); if (cval.len > 0) { TAILQ_FOREACH(ncoll, &conn->collqh, q) { if (WT_STRING_MATCH( @@ -234,7 +210,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) "unknown collator '%.*s'", (int)cval.len, cval.str); } - WT_RET(__wt_config_getones(session, config, "key_gap", &cval)); + WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval)); btree->key_gap = (uint32_t)cval.val; } /* Check for fixed-size data. */ @@ -252,14 +228,13 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) } /* Page sizes */ - WT_RET(__btree_page_sizes(session, config)); + WT_RET(__btree_page_sizes(session)); /* Eviction; the metadata file is never evicted. 
*/ if (WT_IS_METADATA(btree->dhandle)) F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD); else { - WT_RET(__wt_config_getones( - session, config, "cache_resident", &cval)); + WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval)); if (cval.val) F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD); else @@ -267,7 +242,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) } /* Checksums */ - WT_RET(__wt_config_getones(session, config, "checksum", &cval)); + WT_RET(__wt_config_gets(session, cfg, "checksum", &cval)); if (WT_STRING_MATCH("on", cval.str, cval.len)) btree->checksum = CKSUM_ON; else if (WT_STRING_MATCH("off", cval.str, cval.len)) @@ -276,7 +251,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) btree->checksum = CKSUM_UNCOMPRESSED; /* Huffman encoding */ - WT_RET(__wt_btree_huffman_open(session, config)); + WT_RET(__wt_btree_huffman_open(session)); /* * Reconciliation configuration: @@ -290,25 +265,21 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) case BTREE_COL_FIX: break; case BTREE_ROW: - WT_RET(__wt_config_getones( - session, config, "internal_key_truncate", &cval)); + WT_RET(__wt_config_gets( + session, cfg, "internal_key_truncate", &cval)); btree->internal_key_truncate = cval.val == 0 ? 0 : 1; - WT_RET(__wt_config_getones( - session, config, "prefix_compression", &cval)); + WT_RET(__wt_config_gets( + session, cfg, "prefix_compression", &cval)); btree->prefix_compression = cval.val == 0 ? 
0 : 1; /* FALLTHROUGH */ case BTREE_COL_VAR: - WT_RET( - __wt_config_getones(session, config, "dictionary", &cval)); + WT_RET(__wt_config_gets(session, cfg, "dictionary", &cval)); btree->dictionary = (u_int)cval.val; break; } - WT_RET(__wt_config_getones(session, config, "split_pct", &cval)); - btree->split_pct = (u_int)cval.val; - - WT_RET(__wt_config_getones(session, config, "block_compressor", &cval)); + WT_RET(__wt_config_gets(session, cfg, "block_compressor", &cval)); if (cval.len > 0) { TAILQ_FOREACH(ncomp, &conn->compqh, q) if (WT_STRING_MATCH(ncomp->name, cval.str, cval.len)) { @@ -325,7 +296,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_RET(__wt_rwlock_alloc( session, "btree overflow lock", &btree->val_ovfl_lock)); - WT_RET(__wt_stat_alloc_dsrc_stats(session, &btree->dhandle->stats)); + __wt_stat_init_dsrc_stats(&btree->dhandle->stats); btree->write_gen = ckpt->write_gen; /* Write generation */ btree->modified = 0; /* Clean */ @@ -403,29 +374,25 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) * __wt_page_out on error, we require a correct page setup at each point * where we might fail. 
*/ - WT_ERR(__wt_calloc_def(session, 1, &root)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: - root->type = WT_PAGE_COL_INT; + WT_ERR(__wt_page_alloc(session, WT_PAGE_COL_INT, 1, &root)); root->u.intl.recno = 1; - WT_ERR(__wt_calloc_def(session, 1, &root->u.intl.t)); ref = root->u.intl.t; WT_ERR(__wt_btree_leaf_create(session, root, ref, &leaf)); - ref->page = leaf; ref->addr = NULL; ref->state = WT_REF_MEM; ref->u.recno = 1; break; case BTREE_ROW: - root->type = WT_PAGE_ROW_INT; - WT_ERR(__wt_calloc_def(session, 1, &root->u.intl.t)); + WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_INT, 1, &root)); ref = root->u.intl.t; WT_ERR(__wt_btree_leaf_create(session, root, ref, &leaf)); - ref->page = leaf; ref->addr = NULL; ref->state = WT_REF_MEM; - WT_ERR(__wt_row_ikey_alloc(session, 0, "", 1, &ref->u.key)); + WT_ERR( + __wt_row_ikey_incr(session, root, 0, "", 1, &ref->u.key)); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -474,7 +441,7 @@ err: if (leaf != NULL) /* * __wt_btree_leaf_create -- - * Create an empty leaf page. + * Create an empty leaf page and link it into a reference in its parent. */ int __wt_btree_leaf_create( @@ -485,62 +452,38 @@ __wt_btree_leaf_create( btree = S2BT(session); - WT_RET(__wt_calloc_def(session, 1, &leaf)); switch (btree->type) { case BTREE_COL_FIX: + WT_RET(__wt_page_alloc(session, WT_PAGE_COL_FIX, 0, &leaf)); leaf->u.col_fix.recno = 1; - leaf->type = WT_PAGE_COL_FIX; break; case BTREE_COL_VAR: + WT_RET(__wt_page_alloc(session, WT_PAGE_COL_VAR, 0, &leaf)); leaf->u.col_var.recno = 1; - leaf->type = WT_PAGE_COL_VAR; break; case BTREE_ROW: - leaf->type = WT_PAGE_ROW_LEAF; + WT_RET(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, &leaf)); break; + WT_ILLEGAL_VALUE(session); } leaf->entries = 0; - leaf->ref = ref; - leaf->parent = parent; + WT_LINK_PAGE(parent, ref, leaf); *pagep = leaf; return (0); } /* - * __wt_btree_get_memsize -- - * Access the size of an in-memory tree with a single leaf page. 
+ * __wt_btree_no_eviction -- + * Setup or release a cache-resident tree. */ -int -__wt_btree_get_memsize( - WT_SESSION_IMPL *session, WT_BTREE *btree, uint32_t **memsizep) +void +__wt_btree_evictable(WT_SESSION_IMPL *session, int on) { - WT_PAGE *root, *child; - - WT_UNUSED(session); - root = btree->root_page; - child = root->u.intl.t->page; - - if (root->entries != 1 || child == NULL) { - *memsizep = NULL; - return (WT_ERROR); - } - - *memsizep = &child->memory_footprint; - F_SET(btree, WT_BTREE_NO_EVICTION); - return (0); -} - -/* - * __wt_btree_release_memsize -- - * Release a cache-resident tree. - */ -int -__wt_btree_release_memsize(WT_SESSION_IMPL *session, WT_BTREE *btree) -{ - WT_UNUSED(session); - F_CLR(btree, WT_BTREE_NO_EVICTION); - return (0); + if (on) + F_CLR(S2BT(session), WT_BTREE_NO_EVICTION); + else + F_SET(S2BT(session), WT_BTREE_NO_EVICTION); } /* @@ -566,50 +509,41 @@ __btree_get_last_recno(WT_SESSION_IMPL *session) /* * __btree_page_sizes -- - * Verify the page sizes. + * Verify the page sizes. Some of these sizes are automatically checked + * using limits defined in the API, don't duplicate the logic here. 
*/ static int -__btree_page_sizes(WT_SESSION_IMPL *session, const char *config) +__btree_page_sizes(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CONFIG_ITEM cval; - uint32_t intl_split_size, leaf_split_size, split_pct; + uint32_t intl_split_size, leaf_split_size; + const char **cfg; btree = S2BT(session); + cfg = btree->dhandle->cfg; - WT_RET(__wt_config_getones(session, config, "allocation_size", &cval)); + WT_RET(__wt_config_gets(session, cfg, "allocation_size", &cval)); btree->allocsize = (uint32_t)cval.val; - WT_RET( - __wt_config_getones(session, config, "internal_page_max", &cval)); + WT_RET(__wt_config_gets(session, cfg, "internal_page_max", &cval)); btree->maxintlpage = (uint32_t)cval.val; - WT_RET(__wt_config_getones( - session, config, "internal_item_max", &cval)); + WT_RET(__wt_config_gets(session, cfg, "internal_item_max", &cval)); btree->maxintlitem = (uint32_t)cval.val; - WT_RET(__wt_config_getones(session, config, "leaf_page_max", &cval)); + WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval)); btree->maxleafpage = (uint32_t)cval.val; - WT_RET(__wt_config_getones( - session, config, "leaf_item_max", &cval)); + WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval)); btree->maxleafitem = (uint32_t)cval.val; + WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval)); + btree->split_pct = (u_int)cval.val; + /* * When a page is forced to split, we want at least 50 entries on its * parent. */ - WT_RET(__wt_config_getones(session, config, "memory_page_max", &cval)); + WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval)); btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * btree->maxleafpage); - /* - * Limit allocation units to 128MB, and page sizes to 512MB. There's no - * reason we couldn't support larger values (any value up to the smaller - * of an off_t and a size_t should work), but an application specifying - * larger allocation units or page sizes is likely making a mistake. 
The - * API checked this, but we assert it anyway. - */ - WT_ASSERT(session, btree->allocsize >= WT_BTREE_ALLOCATION_SIZE_MIN); - WT_ASSERT(session, btree->allocsize <= WT_BTREE_ALLOCATION_SIZE_MAX); - WT_ASSERT(session, btree->maxintlpage <= WT_BTREE_PAGE_SIZE_MAX); - WT_ASSERT(session, btree->maxleafpage <= WT_BTREE_PAGE_SIZE_MAX); - /* Allocation sizes must be a power-of-two, nothing else makes sense. */ if (!__wt_ispo2(btree->allocsize)) WT_RET_MSG(session, @@ -628,12 +562,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session, const char *config) * Set the split percentage: reconciliation splits to a smaller-than- * maximum page size so we don't split every time a new entry is added. */ - WT_RET(__wt_config_getones(session, config, "split_pct", &cval)); - split_pct = (uint32_t)cval.val; - intl_split_size = WT_SPLIT_PAGE_SIZE( - btree->maxintlpage, btree->allocsize, split_pct); - leaf_split_size = WT_SPLIT_PAGE_SIZE( - btree->maxleafpage, btree->allocsize, split_pct); + intl_split_size = __wt_split_page_size(btree, btree->maxintlpage); + leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage); /* * Default values for internal and leaf page items: make sure at least @@ -667,14 +597,45 @@ __btree_page_sizes(WT_SESSION_IMPL *session, const char *config) */ if (btree->maxintlitem > intl_split_size / 2) return (pse2(session, "internal", - btree->maxintlpage, btree->maxintlitem, split_pct)); + btree->maxintlpage, btree->maxintlitem, btree->split_pct)); if (btree->maxleafitem > leaf_split_size / 2) return (pse2(session, "leaf", - btree->maxleafpage, btree->maxleafitem, split_pct)); + btree->maxleafpage, btree->maxleafitem, btree->split_pct)); return (0); } +/* + * __wt_split_page_size -- + * Split page size calculation: we don't want to repeatedly split every + * time a new entry is added, so we split to a smaller-than-maximum page size. 
+ */ +uint32_t +__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + uintmax_t a; + uint32_t split_size; + + /* + * Ideally, the split page size is some percentage of the maximum page + * size rounded to an allocation unit (round to an allocation unit so + * we don't waste space when we write). + */ + a = maxpagesize; /* Don't overflow. */ + split_size = + (uint32_t)WT_ALIGN((a * btree->split_pct) / 100, btree->allocsize); + + /* + * If the result of that calculation is the same as the allocation unit + * (that happens if the maximum size is the same size as an allocation + * unit, use a percentage of the maximum page size). + */ + if (split_size == btree->allocsize) + split_size = (uint32_t)((a * btree->split_pct) / 100); + + return (split_size); +} + static int pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl) { diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c index 6385524f2f1..867fcdfe93d 100644 --- a/src/btree/bt_huffman.c +++ b/src/btree/bt_huffman.c @@ -132,19 +132,20 @@ static int __wt_huffman_read(WT_SESSION_IMPL *, * Configure Huffman encoding for the tree. 
*/ int -__wt_btree_huffman_open(WT_SESSION_IMPL *session, const char *config) +__wt_btree_huffman_open(WT_SESSION_IMPL *session) { struct __wt_huffman_table *table; WT_BTREE *btree; WT_CONFIG_ITEM key_conf, value_conf; WT_DECL_RET; + const char **cfg; u_int entries, numbytes; btree = S2BT(session); + cfg = btree->dhandle->cfg; - WT_RET(__wt_config_getones(session, config, "huffman_key", &key_conf)); - WT_RET(__wt_config_getones( - session, config, "huffman_value", &value_conf)); + WT_RET(__wt_config_gets(session, cfg, "huffman_key", &key_conf)); + WT_RET(__wt_config_gets(session, cfg, "huffman_value", &value_conf)); if (key_conf.len == 0 && value_conf.len == 0) return (0); diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c index 1bbae2b0ebb..dee629aea0b 100644 --- a/src/btree/bt_ovfl.c +++ b/src/btree/bt_ovfl.c @@ -275,7 +275,7 @@ __wt_val_ovfl_cache(WT_SESSION_IMPL *session, * a snapshot transaction after the item was deleted from a page that's * subsequently been checkpointed, where the checkpoint must know about * the freed blocks. We don't have any way to delay a free of the - * underlying blocks until a particular set of transactions exit(and + * underlying blocks until a particular set of transactions exit (and * this isn't a common scenario), so cache the overflow value in memory. 
* * This gets hard because the snapshot transaction reader might: diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 4622cae269b..3d326a238da 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -7,11 +7,13 @@ #include "wt_internal.h" -static int __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *); -static int __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *); +static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *); +static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *); static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *); static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *); -static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, size_t *); +static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *); +static int __inmem_row_leaf_entries( + WT_SESSION_IMPL *, WT_PAGE_HEADER *, uint32_t *); /* * __wt_page_in -- @@ -68,7 +70,8 @@ __wt_page_in_func( break; page = ref->page; - WT_ASSERT(session, !WT_PAGE_IS_ROOT(page)); + WT_ASSERT(session, + page != NULL && !WT_PAGE_IS_ROOT(page)); /* * Ensure the page doesn't have ancient updates on it. @@ -79,7 +82,7 @@ __wt_page_in_func( */ if (page->modify != NULL && __wt_txn_ancient(session, page->modify->first_id)) { - page->read_gen = 0; + page->read_gen = WT_READ_GEN_OLDEST; WT_RET(__wt_hazard_clear(session, page)); WT_RET(__wt_evict_server_wake(session)); break; @@ -91,7 +94,15 @@ __wt_page_in_func( return (ret); } - page->read_gen = __wt_cache_read_gen(session); + /* + * If this page has ever been considered for eviction, + * and its generation is aging, update it. + */ + if (page->read_gen != WT_READ_GEN_NOTSET && + page->read_gen < __wt_cache_read_gen(session)) + page->read_gen = + __wt_cache_read_gen_set(session); + return (0); WT_ILLEGAL_VALUE(session); } @@ -102,6 +113,74 @@ __wt_page_in_func( } /* + * __wt_page_alloc -- + * Create or read a page into the cache. 
+ */ +int +__wt_page_alloc(WT_SESSION_IMPL *session, + uint8_t type, uint32_t alloc_entries, WT_PAGE **pagep) +{ + WT_CACHE *cache; + WT_PAGE *page; + size_t size; + void *p; + + *pagep = NULL; + + cache = S2C(session)->cache; + + /* + * Allocate a page, and for most page types, the additional information + * it needs to describe the disk image. + */ + size = sizeof(WT_PAGE); + switch (type) { + case WT_PAGE_COL_FIX: + break; + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + size += alloc_entries * sizeof(WT_REF); + break; + case WT_PAGE_COL_VAR: + size += alloc_entries * sizeof(WT_COL); + break; + case WT_PAGE_ROW_LEAF: + size += alloc_entries * sizeof(WT_ROW); + break; + WT_ILLEGAL_VALUE(session); + } + + WT_RET(__wt_calloc(session, 1, size, &page)); + p = (uint8_t *)page + sizeof(WT_PAGE); + + switch (type) { + case WT_PAGE_COL_FIX: + break; + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + page->u.intl.t = p; + break; + case WT_PAGE_COL_VAR: + page->u.col_var.d = p; + break; + case WT_PAGE_ROW_LEAF: + page->u.row.d = p; + break; + WT_ILLEGAL_VALUE(session); + } + + /* Increment the cache statistics. */ + __wt_cache_page_inmem_incr(session, page, size); + (void)WT_ATOMIC_ADD(cache->pages_inmem, 1); + + /* The one page field we set is the type. */ + page->type = type; + + *pagep = page; + return (0); +} + +/* * __wt_page_inmem -- * Build in-memory page information. */ @@ -112,54 +191,100 @@ __wt_page_inmem( { WT_DECL_RET; WT_PAGE *page; - size_t inmem_size; - - WT_ASSERT_RET(session, dsk->u.entries > 0); + uint32_t alloc_entries; + size_t size; + alloc_entries = 0; *pagep = NULL; /* - * Allocate and initialize the WT_PAGE. - * Set the LRU so the page is not immediately selected for eviction. - * Set the read generation (which can't match a search where the write - * generation wasn't set, that is, remained 0). + * Figure out how many underlying objects the page references so + * we can allocate them along with the page. 
*/ - WT_RET(__wt_calloc_def(session, 1, &page)); - page->parent = parent; - page->ref = parent_ref; + switch (dsk->type) { + case WT_PAGE_COL_FIX: + break; + case WT_PAGE_COL_INT: + /* + * Column-store internal page entries map one-to-one to the + * number of physical entries on the page (each physical entry + * is an offset object). + */ + alloc_entries = dsk->u.entries; + break; + case WT_PAGE_COL_VAR: + /* + * Column-store leaf page entries map one-to-one to the number + * of physical entries on the page (each physical entry is a + * data item). + */ + alloc_entries = dsk->u.entries; + break; + case WT_PAGE_ROW_INT: + /* + * Row-store internal page entries map one-to-two to the number + * of physical entries on the page (each in-memory entry is a + * key item and location cookie). + */ + alloc_entries = dsk->u.entries / 2; + break; + case WT_PAGE_ROW_LEAF: + /* + * Row-store leaf page entries map in an indeterminate way to + * the physical entries on the page, we have to walk the page + * to figure it out. + */ + WT_RET(__inmem_row_leaf_entries(session, dsk, &alloc_entries)); + break; + WT_ILLEGAL_VALUE(session); + } + + /* Allocate and initialize a new WT_PAGE. */ + WT_RET(__wt_page_alloc(session, dsk->type, alloc_entries, &page)); page->dsk = dsk; - page->read_gen = __wt_cache_read_gen(session); - page->type = dsk->type; + page->read_gen = WT_READ_GEN_NOTSET; if (disk_not_alloc) F_SET_ATOMIC(page, WT_PAGE_DISK_NOT_ALLOC); - inmem_size = sizeof(WT_PAGE); - if (!disk_not_alloc) - inmem_size += dsk->mem_size; + /* + * Track the memory allocated to build this page so we can update the + * cache statistics in a single call. + */ + size = disk_not_alloc ? 
0 : dsk->mem_size; switch (page->type) { case WT_PAGE_COL_FIX: + page->entries = dsk->u.entries; page->u.col_fix.recno = dsk->recno; - WT_ERR(__inmem_col_fix(session, page)); + __inmem_col_fix(session, page); break; case WT_PAGE_COL_INT: + page->entries = dsk->u.entries; page->u.intl.recno = dsk->recno; - WT_ERR(__inmem_col_int(session, page, &inmem_size)); + __inmem_col_int(session, page); break; case WT_PAGE_COL_VAR: + page->entries = dsk->u.entries; page->u.col_var.recno = dsk->recno; - WT_ERR(__inmem_col_var(session, page, &inmem_size)); + WT_ERR(__inmem_col_var(session, page, &size)); break; case WT_PAGE_ROW_INT: - WT_ERR(__inmem_row_int(session, page, &inmem_size)); + page->entries = dsk->u.entries / 2; + WT_ERR(__inmem_row_int(session, page, &size)); break; case WT_PAGE_ROW_LEAF: - WT_ERR(__inmem_row_leaf(session, page, &inmem_size)); + page->entries = alloc_entries; + WT_ERR(__inmem_row_leaf(session, page)); break; WT_ILLEGAL_VALUE_ERR(session); } - __wt_cache_page_read(session, page, inmem_size); + /* Update the page's in-memory size and the cache statistics. */ + __wt_cache_page_inmem_incr(session, page, size); + + /* Link the new page into the parent. */ + if (parent_ref != NULL) + WT_LINK_PAGE(parent, parent_ref, page); *pagep = page; return (0); @@ -172,7 +297,7 @@ err: __wt_page_out(session, &page); * __inmem_col_fix -- * Build in-memory index for fixed-length column-store leaf pages. */ -static int +static void __inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; @@ -182,16 +307,14 @@ __inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page) dsk = page->dsk; page->u.col_fix.bitf = WT_PAGE_HEADER_BYTE(btree, dsk); - page->entries = dsk->u.entries; - return (0); } /* * __inmem_col_int -- * Build in-memory index for column-store internal pages. 
*/ -static int -__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) +static void +__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_CELL *cell; @@ -205,14 +328,6 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) unpack = &_unpack; /* - * Column-store page entries map one-to-one to the number of physical - * entries on the page (each physical entry is a offset object). - */ - WT_RET(__wt_calloc_def( - session, (size_t)dsk->u.entries, &page->u.intl.t)); - *inmem_sizep += dsk->u.entries * sizeof(*page->u.intl.t); - - /* * Walk the page, building references: the page contains value items. * The value items are on-page items (WT_CELL_VALUE). */ @@ -223,9 +338,6 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) ref->u.recno = unpack->v; ++ref; } - - page->entries = dsk->u.entries; - return (0); } /* @@ -234,7 +346,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) * column-store trees. */ static int -__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) +__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) { WT_BTREE *btree; WT_COL *cip; @@ -254,20 +366,12 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) recno = page->u.col_var.recno; /* - * Column-store page entries map one-to-one to the number of physical - * entries on the page (each physical entry is a data item). - */ - WT_RET(__wt_calloc_def( - session, (size_t)dsk->u.entries, &page->u.col_var.d)); - *inmem_sizep += dsk->u.entries * sizeof(*page->u.col_var.d); - - /* * Walk the page, building references: the page contains unsorted value * items. The value items are on-page (WT_CELL_VALUE), overflow items * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL). 
*/ - cip = page->u.col_var.d; indx = 0; + cip = page->u.col_var.d; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { __wt_cell_unpack(cell, unpack); (cip++)->__value = WT_PAGE_DISK_OFFSET(page, cell); @@ -292,11 +396,10 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) indx++; recno += rle; } + *sizep += bytes_allocated; page->u.col_var.repeats = repeats; page->u.col_var.nrepeats = nrepeats; - page->entries = dsk->u.entries; - *inmem_sizep += bytes_allocated; return (0); } @@ -305,7 +408,7 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) * Build in-memory index for row-store internal pages. */ static int -__inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) +__inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) { WT_BTREE *btree; WT_CELL *cell; @@ -316,7 +419,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) WT_ITEM *tmp; WT_PAGE_HEADER *dsk; WT_REF *ref; - uint32_t i, nindx, prefix; + uint32_t i, prefix; void *huffman; btree = S2BT(session); @@ -328,22 +431,6 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) WT_ERR(__wt_scr_alloc(session, 0, &last)); /* - * Internal row-store page entries map one-to-two to the number of - * physical entries on the page (each in-memory entry is a key item - * and location cookie). - */ - nindx = dsk->u.entries / 2; - WT_ERR((__wt_calloc_def(session, (size_t)nindx, &page->u.intl.t))); - *inmem_sizep += nindx * sizeof(*page->u.intl.t); - - /* - * Set the number of elements now -- we're about to allocate memory, - * and if we fail in the middle of the page, we want to discard that - * memory properly. - */ - page->entries = nindx; - - /* * Walk the page, instantiating keys: the page contains sorted key and * location cookie pairs. Keys are on-page/overflow items and location * cookies are WT_CELL_ADDR items. 
@@ -446,10 +533,10 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) * for reconciliation, the row-store reconciliation function * depends on keys always be instantiated. */ - WT_ERR(__wt_row_ikey_alloc(session, + WT_ERR(__wt_row_ikey(session, WT_PAGE_DISK_OFFSET(page, cell), current->data, current->size, &ref->u.key)); - *inmem_sizep += sizeof(WT_IKEY) + current->size; + *sizep += sizeof(WT_IKEY) + current->size; /* * Swap buffers if it's not an overflow key, we have a new @@ -468,21 +555,19 @@ err: __wt_scr_free(¤t); } /* - * __inmem_row_leaf -- - * Build in-memory index for row-store leaf pages. + * __inmem_row_leaf_entries -- + * Return the number of entries for row-store leaf pages. */ static int -__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) +__inmem_row_leaf_entries( + WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, uint32_t *nindxp) { WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; - WT_PAGE_HEADER *dsk; - WT_ROW *rip; uint32_t i, nindx; btree = S2BT(session); - dsk = page->dsk; unpack = &_unpack; /* @@ -519,10 +604,29 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) */ WT_ASSERT(session, cell == (WT_CELL *)((uint8_t *)dsk + dsk->mem_size)); - WT_RET((__wt_calloc_def(session, (size_t)nindx, &page->u.row.d))); - *inmem_sizep += nindx * sizeof(*page->u.row.d); + *nindxp = nindx; + return (0); +} - /* Walk the page again, building indices. */ +/* + * __inmem_row_leaf -- + * Build in-memory index for row-store leaf pages. + */ +static int +__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK *unpack, _unpack; + WT_PAGE_HEADER *dsk; + WT_ROW *rip; + uint32_t i; + + btree = S2BT(session); + dsk = page->dsk; + unpack = &_unpack; + + /* Walk the page, building indices. 
*/ rip = page->u.row.d; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { __wt_cell_unpack(cell, unpack); @@ -539,14 +643,12 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep) } } - page->entries = nindx; - /* * If the keys are Huffman encoded, instantiate some set of them. It * doesn't matter if we are randomly searching the page or scanning a * cursor through it, there isn't a fast-path to getting keys off the * page. */ - return (btree->huffman_key == NULL ? - 0 : __wt_row_leaf_keys(session, page)); + return ( + btree->huffman_key == NULL ? 0 : __wt_row_leaf_keys(session, page)); } diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 3c599a2c129..37155beb0dd 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -49,8 +49,8 @@ __cache_read_row_deleted( upd->txnid = ref->txnid; } - __wt_cache_page_inmem_incr( - session, page, sizeof(WT_UPDATE) * page->entries); + __wt_cache_page_inmem_incr(session, page, + page->entries * (sizeof(WT_UPDATE *) + sizeof(WT_UPDATE))); return (0); } @@ -116,7 +116,6 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref) WT_VERBOSE_ERR(session, read, "page %p: %s", page, __wt_page_type_string(page->type)); - ref->page = page; WT_PUBLISH(ref->state, WT_REF_MEM); return (0); diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 04268162073..d13be782394 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -376,7 +376,7 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) * checksum and still be broken, but paranoia is healthy in * salvage. Regardless, verify does return failure because * it detects failures we'd expect to see in a corrupted file, - * like overflow references past the the end of the file or + * like overflow references past the end of the file or * overflow references to non-existent pages, might as well * discard these pages now. 
*/ @@ -1087,17 +1087,13 @@ __slvg_col_build_internal( WT_TRACK *trk; uint32_t i; - /* Allocate a column-store internal page. */ - WT_RET(__wt_calloc_def(session, 1, &page)); - WT_ERR(__wt_calloc_def(session, (size_t)leaf_cnt, &page->u.intl.t)); - - /* Fill it in. */ + /* Allocate a column-store root (internal) page and fill it in. */ + WT_RET(__wt_page_alloc(session, WT_PAGE_COL_INT, leaf_cnt, &page)); page->parent = NULL; /* Root page */ page->ref = NULL; - page->read_gen = 0; + page->read_gen = WT_READ_GEN_NOTSET; page->u.intl.recno = 1; page->entries = leaf_cnt; - page->type = WT_PAGE_COL_INT; WT_ERR(__slvg_modify_init(session, page)); for (ref = page->u.intl.t, i = 0; i < ss->pages_next; ++i) { @@ -1665,16 +1661,12 @@ __slvg_row_build_internal( WT_TRACK *trk; uint32_t i; - /* Allocate a row-store internal page. */ - WT_RET(__wt_calloc_def(session, 1, &page)); - WT_ERR(__wt_calloc_def(session, (size_t)leaf_cnt, &page->u.intl.t)); - - /* Fill it in. */ - page->parent = NULL; /* Root page */ + /* Allocate a row-store root (internal) page and fill it in. */ + WT_RET(__wt_page_alloc(session, WT_PAGE_ROW_INT, leaf_cnt, &page)); + page->parent = NULL; page->ref = NULL; - page->read_gen = 0; + page->read_gen = WT_READ_GEN_NOTSET; page->entries = leaf_cnt; - page->type = WT_PAGE_ROW_INT; WT_ERR(__slvg_modify_init(session, page)); for (ref = page->u.intl.t, i = 0; i < ss->pages_next; ++i) { @@ -1706,9 +1698,8 @@ __slvg_row_build_internal( WT_ERR(__slvg_row_build_leaf( session, trk, page, ref, ss)); } else - WT_ERR(__wt_row_ikey_alloc(session, 0, - trk->row_start.data, - trk->row_start.size, + WT_ERR(__wt_row_ikey_incr(session, page, 0, + trk->row_start.data, trk->row_start.size, &ref->u.key)); ++ref; } @@ -1814,13 +1805,10 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session, ++skip_stop; } - /* - * I believe it's no longer possible for a salvaged page to be entirely - * empty, that is, if we selected the page for salvage, there is at - * least one cell on the page we want. 
This is a change from previous - * behavior, so I'm asserting it. - */ - WT_ASSERT_ERR(session, skip_start + skip_stop < page->entries); + /* We should have selected some entries, but not the entire page. */ + WT_ASSERT(session, + skip_start + skip_stop > 0 && + skip_start + skip_stop < page->entries); /* * Take a copy of this page's first key to define the start of @@ -1829,8 +1817,8 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session, */ rip = page->u.row.d + skip_start; WT_ERR(__wt_row_key(session, page, rip, key, 0)); - WT_ERR( - __wt_row_ikey_alloc(session, 0, key->data, key->size, &ref->u.key)); + WT_ERR(__wt_row_ikey_incr( + session, parent, 0, key->data, key->size, &ref->u.key)); /* * Discard backing overflow pages for any items being discarded that diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 36854553397..337cce7983a 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -7,9 +7,9 @@ #include "wt_internal.h" -static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *); -static int __stat_page_col_var(WT_SESSION_IMPL *, WT_PAGE *); -static int __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *); +static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); +static int __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *); +static int __stat_page_row_leaf(WT_PAGE *, WT_DSRC_STATS *); /* * __wt_btree_stat_init -- @@ -21,26 +21,28 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, uint32_t flags) WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; + WT_DSRC_STATS *stats; WT_PAGE *page; btree = S2BT(session); bm = btree->bm; + stats = &btree->dhandle->stats; - WT_RET(bm->stat(bm, session)); + WT_RET(bm->stat(bm, session, stats)); - WT_DSTAT_SET(session, btree_fixed_len, btree->bitcnt); - WT_DSTAT_SET(session, btree_maximum_depth, btree->maximum_depth); - WT_DSTAT_SET(session, btree_maxintlitem, btree->maxintlitem); - WT_DSTAT_SET(session, btree_maxintlpage, btree->maxintlpage); - WT_DSTAT_SET(session, btree_maxleafitem, btree->maxleafitem); - 
WT_DSTAT_SET(session, btree_maxleafpage, btree->maxleafpage); + WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt); + WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth); + WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem); + WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage); + WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem); + WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage); page = NULL; if (LF_ISSET(WT_STATISTICS_FAST)) return (0); while ((ret = __wt_tree_walk(session, &page, 0)) == 0 && page != NULL) - WT_RET(__stat_page(session, page)); + WT_RET(__stat_page(session, page, stats)); return (ret == WT_NOTFOUND ? 0 : ret); } @@ -49,7 +51,7 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, uint32_t flags) * Stat any Btree page. */ static int -__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page) +__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) { /* * All internal pages and overflow pages are trivial, all we track is @@ -57,25 +59,25 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page) */ switch (page->type) { case WT_PAGE_COL_FIX: - WT_DSTAT_INCR(session, btree_column_fix); - WT_DSTAT_INCRV(session, btree_entries, page->entries); + WT_STAT_INCR(stats, btree_column_fix); + WT_STAT_INCRV(stats, btree_entries, page->entries); break; case WT_PAGE_COL_INT: - WT_DSTAT_INCR(session, btree_column_internal); - WT_DSTAT_INCRV(session, btree_entries, page->entries); + WT_STAT_INCR(stats, btree_column_internal); + WT_STAT_INCRV(stats, btree_entries, page->entries); break; case WT_PAGE_COL_VAR: - WT_RET(__stat_page_col_var(session, page)); + WT_RET(__stat_page_col_var(page, stats)); break; case WT_PAGE_OVFL: - WT_DSTAT_INCR(session, btree_overflow); + WT_STAT_INCR(stats, btree_overflow); break; case WT_PAGE_ROW_INT: - WT_DSTAT_INCR(session, btree_row_internal); - WT_DSTAT_INCRV(session, btree_entries, page->entries); + WT_STAT_INCR(stats, btree_row_internal); + WT_STAT_INCRV(stats, btree_entries, 
page->entries); break; case WT_PAGE_ROW_LEAF: - WT_RET(__stat_page_row_leaf(session, page)); + WT_RET(__stat_page_row_leaf(page, stats)); break; WT_ILLEGAL_VALUE(session); } @@ -87,7 +89,7 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page) * Stat a WT_PAGE_COL_VAR page. */ static int -__stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) +__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) { WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; @@ -99,7 +101,7 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) unpack = &_unpack; - WT_DSTAT_INCR(session, btree_column_variable); + WT_STAT_INCR(stats, btree_column_variable); /* * Walk the page, counting regular and overflow data items, and checking @@ -111,12 +113,12 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) WT_COL_FOREACH(page, cip, i) { if ((cell = WT_COL_PTR(page, cip)) == NULL) { orig_deleted = 1; - WT_DSTAT_INCR(session, btree_column_deleted); + WT_STAT_INCR(stats, btree_column_deleted); } else { orig_deleted = 0; __wt_cell_unpack(cell, unpack); - WT_DSTAT_INCRV( - session, btree_entries, __wt_cell_rle(unpack)); + WT_STAT_INCRV( + stats, btree_entries, __wt_cell_rle(unpack)); } /* @@ -128,13 +130,13 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) if (WT_UPDATE_DELETED_ISSET(upd)) { if (orig_deleted) continue; - WT_DSTAT_INCR(session, btree_column_deleted); - WT_DSTAT_DECR(session, btree_entries); + WT_STAT_INCR(stats, btree_column_deleted); + WT_STAT_DECR(stats, btree_entries); } else { if (!orig_deleted) continue; - WT_DSTAT_DECR(session, btree_column_deleted); - WT_DSTAT_INCR(session, btree_entries); + WT_STAT_DECR(stats, btree_column_deleted); + WT_STAT_INCR(stats, btree_entries); } } } @@ -146,14 +148,14 @@ __stat_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) * Stat a WT_PAGE_ROW_LEAF page. 
*/ static int -__stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) +__stat_page_row_leaf(WT_PAGE *page, WT_DSRC_STATS *stats) { WT_INSERT *ins; WT_ROW *rip; WT_UPDATE *upd; uint32_t cnt, i; - WT_DSTAT_INCR(session, btree_row_leaf); + WT_STAT_INCR(stats, btree_row_leaf); /* * Stat any K/V pairs inserted into the page before the first from-disk @@ -176,7 +178,7 @@ __stat_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) ++cnt; } - WT_DSTAT_INCRV(session, btree_entries, cnt); + WT_STAT_INCRV(stats, btree_entries, cnt); return (0); } diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 12ffc0bba34..3fdfd8b7b56 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -62,10 +62,9 @@ __wt_bt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op) switch (op) { case WT_SYNC_CHECKPOINT: - WT_ERR(__wt_sync_file(session, WT_SYNC_CHECKPOINT)); - break; case WT_SYNC_COMPACT: - WT_ERR(__wt_sync_file(session, WT_SYNC_COMPACT)); + case WT_SYNC_WRITE_LEAVES: + WT_ERR(__wt_sync_file(session, op)); break; case WT_SYNC_DISCARD: case WT_SYNC_DISCARD_NOWRITE: diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 0cc9a7ce3f6..323876d8c04 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -162,7 +162,7 @@ int __wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t flags) { WT_BTREE *btree; - WT_PAGE *page, *parent; + WT_PAGE *couple, *page; WT_REF *ref; uint32_t slot; int cache, compact, discard, eviction, prev, set_read_gen; @@ -184,6 +184,29 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t flags) page = *pagep; *pagep = NULL; + /* + * If not the eviction thread, we're hazard-pointer coupling through the + * tree and that's OK (hazard pointers can't deadlock, so there's none + * of the usual problems found when logically locking up a btree). If + * the eviction thread tries to evict the active page, it fails because + * of our hazard pointer. 
If eviction tries to evict our parent, that + * fails because the parent has a child page that can't be discarded. + * We do play one game: don't couple up to our parent and then back down + * to a new leaf, couple to the next page to which we're descending, it + * saves a hazard-pointer swap for each cursor page movement. + * + * !!! + * NOTE: we don't bother checking if we're hazard-pointer coupling when + * setting the variable couple in this code. We never actually use the + * variable couple if the variable eviction is true. + * + * NOTE: we depend on the fact it's OK to release a page we don't hold, + * that is, it's OK to release couple, when couple is set to NULL. + * + * Remember the hazard pointer we're currently holding. + */ + couple = page; + /* If no page is active, begin a walk from the start of the tree. */ if (page == NULL) { if ((page = btree->root_page) == NULL) @@ -192,13 +215,12 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t flags) goto descend; } -ascend: /* If the active page was the root, we've reached the walk's end. */ +ascend: /* + * If the active page was the root, we've reached the walk's end. + * Release any hazard-pointer we're holding. + */ if (WT_PAGE_IS_ROOT(page)) - return (0); - - /* Figure out the current slot in the parent page's WT_REF array. */ - parent = page->parent; - slot = (uint32_t)(page->ref - parent->u.intl.t); + return (eviction ? 0 : __wt_page_release(session, couple)); /* If the eviction thread, clear the page's walk status. */ if (eviction) @@ -206,41 +228,43 @@ ascend: /* If the active page was the root, we've reached the walk's end. */ page->ref->state = WT_REF_MEM; /* - * Move to the parent. - * - * If not the eviction thread, swap our hazard pointer for the hazard - * pointer of our parent, if it's not the root page (we could access - * it directly because we know it's in memory, but we need a hazard as - * we climb the tree). Don't leave a hazard pointer dangling on error. 
- * - * We're hazard-pointer coupling up the tree and that's OK: first, - * hazard pointers can't deadlock, so there's none of the usual - * problems found when logically locking up a Btree; second, we don't - * release our current hazard pointer until we have our parent's - * hazard pointer. If the eviction thread tries to evict the active - * page, that fails because of our hazard pointer. If eviction tries - * to evict our parent, that fails because the parent has a child page - * that can't be discarded. + * Figure out the current slot in the parent page's WT_REF array and + * switch to the parent. */ - if (!eviction) { - if (WT_PAGE_IS_ROOT(parent)) - WT_RET(__wt_page_release(session, page)); - else - WT_RET( - __wt_page_swap(session, page, parent, parent->ref)); - } - page = parent; + slot = (uint32_t)(page->ref - page->parent->u.intl.t); + page = page->parent; - /* - * If we're at the last/first slot on the page, return this page in - * post-order traversal. Otherwise we move to the next/prev slot - * and left/right-most element in its subtree. - */ for (;;) { + /* + * If we're at the last/first slot on the page, return this + * page in post-order traversal. Otherwise we move to the + * next/prev slot and left/right-most element in its subtree. + */ if ((prev && slot == 0) || (!prev && slot == page->entries - 1)) { + /* Optionally skip internal pages. */ if (skip_intl) goto ascend; + + /* + * We've ascended the tree and are returning an internal + * page. If it's the root, discard any hazard pointer + * we have, otherwise, swap any hazard pointer we have + * for the page we'll return. We could keep the hazard + * pointer we have as it's sufficient to pin any page in + * our page stack, but we have no place to store it and + * it's simpler if callers just know they hold a hazard + * pointer on any page they're using. 
+ */ + if (!eviction) { + if (WT_PAGE_IS_ROOT(page)) + WT_RET( + __wt_page_release(session, couple)); + else + WT_RET(__wt_page_swap( + session, couple, page, page->ref)); + } + *pagep = page; return (0); } @@ -275,7 +299,6 @@ descend: for (;;) { * another thread. The other cases get hazard pointers * and protect the page from eviction that way. */ - set_read_gen = 0; if (eviction) { retry: if (ref->state != WT_REF_MEM || !WT_ATOMIC_CAS(ref->state, @@ -315,7 +338,7 @@ retry: if (ref->state != WT_REF_MEM || ref->state == WT_REF_DISK) break; WT_RET( - __wt_page_swap(session, page, page, ref)); + __wt_page_swap(session, couple, page, ref)); } else if (discard) { /* * If deleting a range, try to delete the page @@ -326,7 +349,7 @@ retry: if (ref->state != WT_REF_MEM || if (skip) break; WT_RET( - __wt_page_swap(session, page, page, ref)); + __wt_page_swap(session, couple, page, ref)); } else { /* * If iterating a cursor (or doing compaction), @@ -341,11 +364,11 @@ retry: if (ref->state != WT_REF_MEM || * we don't want to read it if it won't help. * * Pages read for compaction aren't "useful"; - * reset the page generation to 0 so the page - * is quickly chosen for eviction. (This can - * race of course, but it's unlikely and will - * only result in an incorrectly low page read - * generation.) + * reset the page generation to a low value so + * the page is quickly chosen for eviction. + * (This can race of course, but it's unlikely + * and will only result in an incorrectly low + * page read generation and possible eviction.) */ set_read_gen = 0; if (compact) { @@ -357,12 +380,12 @@ retry: if (ref->state != WT_REF_MEM || ref->state == WT_REF_DISK ? 1 : 0; } WT_RET( - __wt_page_swap(session, page, page, ref)); + __wt_page_swap(session, couple, page, ref)); if (set_read_gen) - page->read_gen = 0; + page->read_gen = WT_READ_GEN_OLDEST; } - page = ref->page; + couple = page = ref->page; slot = prev ? 
page->entries - 1 : 0; } } diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index 0e3e130062d..b05c2281c30 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -38,9 +38,15 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify) WT_ASSERT(session, ref == NULL || ref->u.recno == page->u.intl.recno); + /* Fast path appends. */ + base = page->entries; + ref = &page->u.intl.t[base - 1]; + if (recno >= ref->u.recno) + goto descend; + /* Binary search of internal pages. */ - for (base = 0, - limit = page->entries; limit != 0; limit >>= 1) { + for (base = 0, ref = NULL, + limit = page->entries - 1; limit != 0; limit >>= 1) { indx = base + (limit >> 1); ref = page->u.intl.t + indx; @@ -51,7 +57,8 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify) base = indx + 1; --limit; } - WT_ASSERT(session, ref != NULL); + +descend: WT_ASSERT(session, ref != NULL); /* * Reference the slot used for next step down the tree. diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c index b82140b2a6d..5f35327201f 100644 --- a/src/btree/rec_evict.c +++ b/src/btree/rec_evict.c @@ -13,7 +13,7 @@ static void __rec_discard_tree(WT_SESSION_IMPL *, WT_PAGE *, int); static void __rec_excl_clear(WT_SESSION_IMPL *); static void __rec_page_clean_update(WT_SESSION_IMPL *, WT_PAGE *); static int __rec_page_dirty_update(WT_SESSION_IMPL *, WT_PAGE *); -static int __rec_review(WT_SESSION_IMPL *, WT_REF *, WT_PAGE *, int, int); +static int __rec_review(WT_SESSION_IMPL *, WT_REF *, WT_PAGE *, int, int, int); static void __rec_root_update(WT_SESSION_IMPL *); /* @@ -25,6 +25,7 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive) { WT_DECL_RET; WT_PAGE_MODIFY *mod; + int merge; WT_VERBOSE_RET(session, evict, "page %p (%s)", page, __wt_page_type_string(page->type)); @@ -32,19 +33,17 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive) WT_ASSERT(session, session->excl_next == 0); /* - * 
Split-merge pages cannot be evicted, they're always merged into their - * parent; split-merge pages are ignored by the eviction thread, we - * never get a split-merge page to evict. Check out of sheer paranoia. - * Split pages are NOT included in this test, because a split page can - * be separately evicted, at which point it's replaced in its parent by - * a reference to a split-merge page. That's a normal part of the leaf - * page life-cycle if it grows too large and must be pushed out of the - * cache. + * If we get a split-merge page during normal eviction, try to collapse + * it. During close, it will be merged into its parent. */ mod = page->modify; - if (mod != NULL && F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) + merge = __wt_btree_mergeable(page); + if (merge && exclusive) return (EBUSY); + WT_ASSERT(session, merge || mod == NULL || + !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)); + /* * Get exclusive access to the page and review the page and its subtree * for conditions that would block our eviction of the page. If the @@ -54,10 +53,14 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive) * not disallowed anywhere. * * Note that page->ref may be NULL in some cases (e.g., for root pages - * or during salvage). That's OK if WT_REC_SINGLE is set: we won't - * check hazard pointers in that case. + * or during salvage). That's OK if exclusive is set: we won't check + * hazard pointers in that case. */ - WT_ERR(__rec_review(session, page->ref, page, exclusive, 1)); + WT_ERR(__rec_review(session, page->ref, page, exclusive, merge, 1)); + + /* Try to merge internal pages. */ + if (merge) + WT_ERR(__wt_merge_tree(session, page)); /* * Update the page's modification reference, reconciliation might have @@ -66,7 +69,7 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive) mod = page->modify; /* Count evictions of internal pages during normal operation. 
*/ - if (!exclusive && + if (!exclusive && !merge && (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) { WT_CSTAT_INCR(session, cache_eviction_internal); WT_DSTAT_INCR(session, cache_eviction_internal); @@ -291,17 +294,15 @@ __rec_discard_page(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive) */ static int __rec_review(WT_SESSION_IMPL *session, - WT_REF *ref, WT_PAGE *page, int exclusive, int top) + WT_REF *ref, WT_PAGE *page, int exclusive, int merge, int top) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE_MODIFY *mod; WT_PAGE *t; - WT_TXN *txn; uint32_t i; btree = S2BT(session); - txn = &session->txn; /* * Get exclusive access to the page if our caller doesn't have the tree @@ -322,8 +323,8 @@ __rec_review(WT_SESSION_IMPL *session, case WT_REF_DELETED: /* On-disk, deleted */ break; case WT_REF_MEM: /* In-memory */ - WT_RET(__rec_review( - session, ref, ref->page, exclusive, 0)); + WT_RET(__rec_review(session, + ref, ref->page, exclusive, merge, 0)); break; case WT_REF_EVICT_WALK: /* Walk point */ case WT_REF_EVICT_FORCE: /* Forced evict */ @@ -374,15 +375,18 @@ __rec_review(WT_SESSION_IMPL *session, * we find a page which can't be merged into its parent, and failing if * we never find such a page. 
*/ - if (btree->checkpointing && __wt_page_is_modified(page)) + if (btree->checkpointing && !merge && __wt_page_is_modified(page)) { +ckpt: WT_CSTAT_INCR(session, cache_eviction_checkpoint); + WT_DSTAT_INCR(session, cache_eviction_checkpoint); return (EBUSY); + } if (btree->checkpointing && top) for (t = page->parent;; t = t->parent) { if (t == NULL || t->ref == NULL) /* root */ - return (EBUSY); + goto ckpt; if (t->ref->state != WT_REF_MEM) /* scary */ - return (EBUSY); + goto ckpt; if (t->modify == NULL || /* not merged */ !F_ISSET(t->modify, WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)) @@ -390,6 +394,13 @@ __rec_review(WT_SESSION_IMPL *session, } /* + * If we are merging internal pages, we just need exclusive access, we + * don't need to write everything. + */ + if (merge) + return (0); + + /* * Fail if any page in the top-level page's subtree won't be merged into * its parent, the page that cannot be merged must be evicted first. * The test is necessary but should not fire much: the eviction code is @@ -435,20 +446,6 @@ __rec_review(WT_SESSION_IMPL *session, WT_VERBOSE_RET(session, evict, "eviction failed, reconciled page not clean"); - /* - * A pathological case: if we're the oldest transaction - * in the system and we're stuck trying to find space, - * abort the transaction to give up all hazard - * references before trying again. - */ - if (F_ISSET(txn, TXN_RUNNING) && - __wt_txn_am_oldest(session) && - ++txn->eviction_fails >= 100) { - txn->eviction_fails = 0; - ret = WT_DEADLOCK; - WT_CSTAT_INCR(session, txn_fail_cache); - } - /* * We may be able to discard any "update" memory the * page no longer needs. 
@@ -466,7 +463,6 @@ __rec_review(WT_SESSION_IMPL *session, WT_RET(ret); WT_ASSERT(session, __wt_page_is_modified(page) == 0); - txn->eviction_fails = 0; } /* diff --git a/src/btree/rec_merge.c b/src/btree/rec_merge.c new file mode 100644 index 00000000000..caac7c77215 --- /dev/null +++ b/src/btree/rec_merge.c @@ -0,0 +1,538 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * WT_VISIT_STATE -- + * The state maintained across calls to the "visit" callback functions: + * the number of refs visited, the maximum depth, and the current page and + * reference when moving reference into the new tree. + */ +typedef struct { + WT_SESSION_IMPL *session; + WT_PAGE *first, *page, *second; /* New pages to be populated. */ + WT_REF *ref, *second_ref; /* Insert and split point. */ + + uint64_t refcnt, split; /* Ref count, split point. */ + uint64_t first_live, last_live; /* First/last in-memory ref. */ + u_int maxdepth; /* Maximum subtree depth. */ + int seen_live; /* Has a ref been live? */ +} WT_VISIT_STATE; + +/* + * __merge_walk -- + * Visit all of the child references in a locked subtree and apply a + * callback function to them. + */ +static int +__merge_walk(WT_SESSION_IMPL *session, WT_PAGE *page, u_int depth, + void (*visit)(WT_PAGE *, WT_REF *, WT_VISIT_STATE *), + WT_VISIT_STATE *state) +{ + WT_PAGE *child; + WT_REF *ref; + uint32_t i; + + if (depth > state->maxdepth) + state->maxdepth = depth; + + WT_REF_FOREACH(page, ref, i) + switch (ref->state) { + case WT_REF_LOCKED: + child = ref->page; + + /* + * Visit internal pages recursively. This must match + * the walk in __rec_review: if the merge succeeds, we + * have to unlock everything. 
+ */ + if (child->type == page->type && + __wt_btree_mergeable(child)) { + WT_RET(__merge_walk( + session, child, depth + 1, visit, state)); + break; + } + /* FALLTHROUGH */ + + case WT_REF_DELETED: + case WT_REF_DISK: + (*visit)(page, ref, state); + break; + + case WT_REF_EVICT_FORCE: + case WT_REF_EVICT_WALK: + case WT_REF_MEM: + case WT_REF_READING: + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * __merge_count -- + * A callback function that counts the number of references as well as + * the first/last "live" reference. + */ +static void +__merge_count(WT_PAGE *parent, WT_REF *ref, WT_VISIT_STATE *state) +{ + WT_UNUSED(parent); + + if (ref->state == WT_REF_LOCKED) { + /* Prevent eviction until it is hooked into the new tree. */ + __wt_evict_list_clr_page(state->session, ref->page); + + if (!state->seen_live) { + state->first_live = state->refcnt; + state->seen_live = 1; + } + state->last_live = state->refcnt; + } + + /* + * Sanity check that we don't overflow the counts. We can't put more + * than 2**32 keys on one page anyway. + */ + ++state->refcnt; +} + +/* + * __merge_copy_ref -- + * Copy a child reference from the locked subtree to a new page. + */ +static void +__merge_copy_ref(WT_PAGE *parent, WT_REF *ref, WT_VISIT_STATE *state) +{ + WT_REF *newref; + + WT_UNUSED(parent); + + if (state->split != 0 && state->refcnt++ == state->split) + state->ref = state->second_ref; + + newref = state->ref++; + *newref = *ref; +} + +/* + * __merge_unlock -- + * Unlock all pages under an internal page being merged. + */ +static void +__merge_unlock(WT_PAGE *page) +{ + WT_REF *ref; + uint32_t i; + + WT_REF_FOREACH(page, ref, i) + if (ref->state == WT_REF_LOCKED) { + if (ref->page->type == WT_PAGE_ROW_INT || + ref->page->type == WT_PAGE_COL_INT) + __merge_unlock(ref->page); + WT_PUBLISH(ref->state, WT_REF_MEM); + } +} + +/* + * __merge_transfer_footprint -- + * Transfer the size of references from an old page to a new page. 
+ * + * Note that both pages are locked and there is no net change, so avoid + * __wt_cache_page_inmem_incr. + */ +static void +__merge_transfer_footprint(WT_SESSION_IMPL *session, + WT_PAGE *newpage, WT_PAGE *oldpage, uint32_t size) +{ + WT_ASSERT(session, size < oldpage->memory_footprint); + oldpage->memory_footprint -= size; + newpage->memory_footprint += size; +} + +/* + * __merge_switch_page -- + * Switch a page from the locked tree into the new tree. + */ +static void +__merge_switch_page(WT_PAGE *parent, WT_REF *ref, WT_VISIT_STATE *state) +{ + WT_PAGE *child; + WT_PAGE_MODIFY *modify; + WT_REF *newref; + + if (state->split != 0 && state->refcnt++ == state->split) { + state->page = state->second; + state->ref = state->second_ref; + } + + newref = state->ref++; + + if (ref->addr != NULL) + __merge_transfer_footprint( + state->session, state->page, parent, + (uint32_t)sizeof(WT_ADDR) + ((WT_ADDR *)ref->addr)->size); + + if (parent->type == WT_PAGE_ROW_INT) + __merge_transfer_footprint( + state->session, state->page, parent, + (uint32_t)sizeof(WT_IKEY) + ((WT_IKEY *)ref->u.key)->size); + + if (ref->state == WT_REF_LOCKED) { + child = ref->page; + + /* + * If the child has been split, update the split page to point + * into the new tree. That way, if the split-merge page is + * later swapped into place, it will point to the new parent. + * + * The order here is important: the parent page should point to + * the original child page, so we link that in last. + */ + if ((modify = child->modify) != NULL && + F_ISSET(modify, WT_PM_REC_SPLIT)) + WT_LINK_PAGE(state->page, newref, modify->u.split); + + WT_LINK_PAGE(state->page, newref, child); + + /* + * If we have a child that is a live internal page, its subtree + * was locked by __rec_review. We're swapping it into the new + * tree, unlock it now. 
+ */ + if (child->type == WT_PAGE_ROW_INT || + child->type == WT_PAGE_COL_INT) + __merge_unlock(child); + + newref->state = WT_REF_MEM; + } + + WT_CLEAR(*ref); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __merge_check_discard -- + * Make sure we are only discarding split-merge pages. + */ +static void +__merge_check_discard(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_REF *ref; + uint32_t i; + + WT_ASSERT(session, page->type == WT_PAGE_ROW_INT || + page->type == WT_PAGE_COL_INT); + WT_ASSERT(session, page->modify != NULL && + F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)); + + WT_REF_FOREACH(page, ref, i) { + if (ref->state == WT_REF_DISK || + ref->state == WT_REF_DELETED) + continue; + + WT_ASSERT(session, ref->state == WT_REF_LOCKED); + __merge_check_discard(session, ref->page); + } +} +#endif + +/* + * __merge_new_page -- + * Create a new in-memory internal page. + */ +static int +__merge_new_page(WT_SESSION_IMPL *session, + uint8_t type, uint32_t entries, int merge, WT_PAGE **pagep) +{ + WT_DECL_RET; + WT_PAGE *newpage; + + /* Allocate a new internal page and fill it in. */ + WT_RET(__wt_page_alloc(session, type, entries, &newpage)); + newpage->read_gen = WT_READ_GEN_NOTSET; + newpage->entries = entries; + + WT_ERR(__wt_page_modify_init(session, newpage)); + if (merge) + F_SET(newpage->modify, WT_PM_REC_SPLIT_MERGE); + else + __wt_page_modify_set(session, newpage); + + *pagep = newpage; + return (0); + +err: __wt_page_out(session, &newpage); + return (ret); +} + +/* + * __merge_promote_key -- + * Copy a key from a child page into the reference in its parent, so it + * can be found by searches. 
+ */ +static int +__merge_promote_key(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_IKEY *ikey; + WT_PAGE *page; + WT_REF *child_ref; + + page = ref->page; + switch (page->type) { + case WT_PAGE_COL_INT: + child_ref = &page->u.intl.t[0]; + ref->u.recno = page->u.intl.recno = child_ref->u.recno; + return (0); + + case WT_PAGE_ROW_INT: + child_ref = &page->u.intl.t[0]; + ikey = child_ref->u.key; + WT_ASSERT(session, ikey != NULL); + return (__wt_row_ikey_incr(session, + page, 0, WT_IKEY_DATA(ikey), ikey->size, &ref->u.key)); + + WT_ILLEGAL_VALUE(session); + } +} + +/* + * __wt_merge_tree -- + * Attempt to collapse a stack of split-merge pages in memory into a + * shallow tree. If enough keys are found, create a real internal node + * that can be evicted (and, if necessary, split further). + * + * This code is designed to deal with workloads that otherwise create + * arbitrarily deep (and slow) trees in memory. + */ +int +__wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) +{ + WT_DECL_RET; + WT_PAGE *lchild, *newtop, *rchild; + WT_REF *newref; + WT_VISIT_STATE visit_state; + uint32_t refcnt, split; + int promote; + u_int levels; + uint8_t page_type; + + WT_CLEAR(visit_state); + visit_state.session = session; + lchild = newtop = rchild = NULL; + page_type = top->type; + + WT_ASSERT(session, __wt_btree_mergeable(top)); + WT_ASSERT(session, top->ref->state == WT_REF_LOCKED); + + /* + * Walk the subtree, count the references at the bottom level and + * calculate the maximum depth. + */ + WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state)); + + /* If there aren't enough useful levels, give up. */ + if (visit_state.maxdepth < WT_MERGE_STACK_MIN) + return (EBUSY); + + /* Pages cannot grow larger than 2**32, but that should never happen. */ + if (visit_state.refcnt > UINT32_MAX) + return (ENOMEM); + + /* Make sure the top page isn't queued for eviction. */ + __wt_evict_list_clr_page(session, top); + + /* Clear the eviction walk: it may be in our subtree. 
*/ + __wt_evict_clear_tree_walk(session, NULL); + + /* + * Now we either collapse the internal pages into one split-merge page, + * or if there are "enough" keys, we split into two equal internal + * pages, each of which can be evicted independently. + * + * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it + * isn't big enough to justify the cost of evicting it. If splits + * continue, it will be merged again until it gets over this limit. + */ + promote = 0; + refcnt = (uint32_t)visit_state.refcnt; + if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) { + /* + * In the normal case where there are live children spread + * through the subtree, create two child pages. + * + * Handle the case where the only live child is first / last + * specially: put the live child into the top-level page. + * + * Set SPLIT_MERGE on the internal pages if there are any live + * children: they can't be evicted, so there is no point + * permanently deepening the tree. + */ + if (visit_state.first_live == visit_state.last_live && + (visit_state.first_live == 0 || + visit_state.first_live == refcnt - 1)) + split = (visit_state.first_live == 0) ? 1 : refcnt - 1; + else + split = (refcnt + 1) / 2; + + /* Only promote if we can create a real page. */ + if (split == 1 || split == refcnt - 1) + promote = 1; + else if (split >= WT_MERGE_FULL_PAGE && + visit_state.first_live >= split) + promote = 1; + else if (refcnt - split >= WT_MERGE_FULL_PAGE && + visit_state.last_live < split) + promote = 1; + } + + if (promote) { + /* Create a new top-level split-merge page with two entries. */ + WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop)); + + visit_state.split = split; + + /* Left split. */ + if (split == 1) + visit_state.first = newtop; + else { + WT_ERR(__merge_new_page(session, page_type, split, + visit_state.first_live < split, &lchild)); + visit_state.first = lchild; + } + + /* Right split. 
*/ + if (split == refcnt - 1) { + visit_state.second = newtop; + visit_state.second_ref = &newtop->u.intl.t[1]; + } else { + WT_ERR(__merge_new_page(session, page_type, + refcnt - split, visit_state.last_live >= split, + &rchild)); + visit_state.second = rchild; + visit_state.second_ref = + &visit_state.second->u.intl.t[0]; + } + } else { + /* + * Create a new split-merge page for small merges, or if the + * page above is a split merge page. When we do a big enough + * merge, we create a real page at the top and don't consider + * it as a merge candidate again. Over time with an insert + * workload the tree will grow deeper, but that's inevitable, + * and this keeps individual merges small. + */ + WT_ERR(__merge_new_page(session, page_type, refcnt, + refcnt < WT_MERGE_FULL_PAGE || + __wt_btree_mergeable(top->parent), + &newtop)); + + visit_state.first = newtop; + } + + /* + * Copy the references into the new tree, but don't update anything in + * the locked tree in case there is an error and we need to back out. + * We do this in a separate pass so that we can figure out the key for + * the split point: that allocates memory and so it could still fail. + */ + visit_state.page = visit_state.first; + visit_state.ref = visit_state.page->u.intl.t; + visit_state.refcnt = 0; + WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state)); + + if (promote) { + /* Promote keys into the top-level page. */ + if (lchild != NULL) { + newref = &newtop->u.intl.t[0]; + WT_LINK_PAGE(newtop, newref, lchild); + newref->state = WT_REF_MEM; + WT_ERR(__merge_promote_key(session, newref)); + } + + if (rchild != NULL) { + newref = &newtop->u.intl.t[1]; + WT_LINK_PAGE(newtop, newref, rchild); + newref->state = WT_REF_MEM; + WT_ERR(__merge_promote_key(session, newref)); + } + } + + /* + * We have copied everything into place and allocated all of the memory + * we need. Now link all pages into the new tree and unlock them. 
+ * + * The only way this could fail is if a reference state has been + * changed by another thread since they were locked. Panic in that + * case: that should never happen. + */ + visit_state.page = visit_state.first; + visit_state.ref = visit_state.page->u.intl.t; + visit_state.refcnt = 0; + ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state); + + if (ret != 0) + WT_ERR(__wt_illegal_value(session, "__wt_merge_tree")); + + newtop->u.intl.recno = top->u.intl.recno; + newtop->parent = top->parent; + newtop->ref = top->ref; + +#ifdef HAVE_DIAGNOSTIC + /* + * Before swapping in the new tree, walk the pages we are discarding, + * check that everything looks right. + */ + __merge_check_discard(session, top); +#endif + + /* + * Set up the new top-level page as a split so that it will be swapped + * into place by our caller. + */ + top->modify->flags = WT_PM_REC_SPLIT; + top->modify->u.split = newtop; + + WT_VERBOSE_ERR(session, evict, + "Successfully %s %" PRIu32 + " split-merge pages containing %" PRIu32 " keys\n", + promote ? "promoted" : "merged", visit_state.maxdepth, refcnt); + + /* Queue new child pages for forced eviction, if possible. */ + if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE)) + __wt_evict_forced_page(session, lchild); + if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE)) + __wt_evict_forced_page(session, rchild); + + /* Update statistics. */ + WT_CSTAT_INCR(session, cache_eviction_merge); + WT_DSTAT_INCR(session, cache_eviction_merge); + + /* How many levels did we remove? */ + levels = visit_state.maxdepth - (promote ? 
2 : 1); + WT_CSTAT_INCRV(session, cache_eviction_merge_levels, levels); + WT_DSTAT_INCRV(session, cache_eviction_merge_levels, levels); + + return (0); + +err: WT_VERBOSE_TRET(session, evict, + "Failed to merge %" PRIu32 + " split-merge pages containing %" PRIu32 " keys\n", + visit_state.maxdepth, refcnt); + + WT_CSTAT_INCR(session, cache_eviction_merge_fail); + WT_DSTAT_INCR(session, cache_eviction_merge_fail); + + if (newtop != NULL) + __wt_page_out(session, &newtop); + if (lchild != NULL) + __wt_page_out(session, &lchild); + if (rchild != NULL) + __wt_page_out(session, &rchild); + return (ret); +} diff --git a/src/btree/rec_track.c b/src/btree/rec_track.c index beeeeb43461..2f0f284fbd1 100644 --- a/src/btree/rec_track.c +++ b/src/btree/rec_track.c @@ -237,17 +237,8 @@ __wt_rec_track_onpage_srch( /* * __wt_rec_track_onpage_addr -- - * Search for a permanently tracked object (based on an addr/size pair), - * and add it if it isn't already tracked. - * - * __wt_rec_track_onpage_ref -- - * Search for a permanently tracked object (based on a page and ref), - * and add it if it isn't already tracked. - * - * These functions are short-hand for "search the on-page records, and if the - * address is not already listed as an object, add it". Note there is no - * possibility of object re-use, the object is discarded when reconciliation - * completes. + * Search the on-page records for a permanently tracked object (based on + * an addr/size pair), and add it if it isn't already tracked. */ int __wt_rec_track_onpage_addr(WT_SESSION_IMPL *session, @@ -256,21 +247,14 @@ __wt_rec_track_onpage_addr(WT_SESSION_IMPL *session, if (__wt_rec_track_onpage_srch(page, addr, addr_size)) return (0); + /* + * Note there is no possibility of object re-use, the object is + * discarded when reconciliation completes. 
+ */ return (__wt_rec_track( session, page, addr, addr_size, NULL, 0, WT_TRK_ONPAGE)); } -int -__wt_rec_track_onpage_ref( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE *refpage, WT_REF *ref) -{ - uint32_t size; - const uint8_t *addr; - - __wt_get_addr(refpage, ref, &addr, &size); - return (__wt_rec_track_onpage_addr(session, page, addr, size)); -} - /* * __wt_rec_track_ovfl_reuse -- * Search for a matching overflow record and reactivate it. diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index 9116679e98e..8809ee9e8c6 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -133,6 +133,7 @@ typedef struct { int already_compressed; } *bnd; /* Saved boundaries */ uint32_t bnd_next; /* Next boundary slot */ + uint32_t bnd_next_max; /* Maximum boundary slots used */ uint32_t bnd_entries; /* Total boundary slots */ size_t bnd_allocated; /* Bytes allocated */ @@ -177,7 +178,6 @@ typedef struct { * these fields work. */ int cell_zero; /* Row-store internal page 0th key */ - WT_REF *merge_ref; /* Row-store merge correction key */ /* * WT_DICTIONARY -- @@ -278,7 +278,9 @@ __wt_rec_write(WT_SESSION_IMPL *session, WT_DECL_RET; /* We're shouldn't get called with a clean page, that's an error. */ - WT_ASSERT_RET(session, __wt_page_is_modified(page)); + if (!__wt_page_is_modified(page)) + WT_RET_MSG(session, WT_ERROR, + "Attempt to reconcile a clean page."); /* * We can't do anything with a split-merge page, it must be merged into @@ -289,9 +291,12 @@ __wt_rec_write(WT_SESSION_IMPL *session, WT_VERBOSE_RET( session, reconcile, "%s", __wt_page_type_string(page->type)); + WT_CSTAT_INCR(session, rec_pages); WT_DSTAT_INCR(session, rec_pages); - if (LF_ISSET(WT_EVICTION_SERVER_LOCKED)) + if (LF_ISSET(WT_EVICTION_SERVER_LOCKED)) { + WT_CSTAT_INCR(session, rec_pages_eviction); WT_DSTAT_INCR(session, rec_pages_eviction); + } /* Initialize the reconciliation structure for each new run. 
*/ WT_RET(__rec_write_init(session, page, flags, &session->reconcile)); @@ -574,6 +579,7 @@ __rec_txn_skip_chk(WT_SESSION_IMPL *session, WT_RECONCILE *r) WT_PANIC_RETX( session, "reconciliation illegally skipped an update"); case WT_SKIP_UPDATE_QUIT: + WT_CSTAT_INCR(session, rec_skipped_update); WT_DSTAT_INCR(session, rec_skipped_update); return (EBUSY); case 0: @@ -808,10 +814,11 @@ __rec_child_deleted(WT_SESSION_IMPL *session, * If no such transactions exist, we can discard the leaf page to the * block manager and no cell needs to be written at all. We do this * outside of the underlying tracking routines because this action is - * permanent and irrevocable. (Setting the WT_REF.addr value to NULL - * means we've lost track of the disk address in a permanent way. If - * we ever read into this chunk of the name space again, the cache read - * function instantiates a new page.) + * permanent and irrevocable. (Clearing the address means we've lost + * track of the disk address in a permanent way. This is safe because + * there's no path to reading the leaf page again: if reconciliation + * fails, and we ever read into this part of the name space again, the + * cache read function instantiates a new page.) * * One final note: if the WT_REF transaction ID is set to WT_TXN_NONE, * it means this WT_REF is the re-creation of a deleted node (we wrote @@ -992,16 +999,22 @@ __rec_key_state_update(WT_RECONCILE *r, int ovfl_key) static int __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r) { + uint32_t incr; + /* * Make sure there's enough room in which to save another boundary. * * The calculation is actually +1, because we save the start point one - * past the current entry -- make it +20 so we don't grow slot-by-slot. + * past the current entry; normal reconciliation generally doesn't use + * a lot of buffers, but we grow aggressively anyway, bulk load eats up + * a lot of these entries because we have an entry for each page that's + * created by the bulk load. 
*/ if (r->bnd_next + 1 >= r->bnd_entries) { - WT_RET(__wt_realloc(session, &r->bnd_allocated, - (r->bnd_entries + 20) * sizeof(*r->bnd), &r->bnd)); - r->bnd_entries += 20; + incr = r->bnd_entries + r->bnd_entries / 2 + 20; + WT_RET(__wt_realloc(session, + &r->bnd_allocated, incr * sizeof(*r->bnd), &r->bnd)); + r->bnd_entries = incr; } return (0); } @@ -1055,9 +1068,8 @@ __rec_split_init(WT_SESSION_IMPL *session, * split pages, because otherwise we could end up splitting one large * packed page over and over. We don't want to pick the minimum size * either, because that penalizes an application that did a bulk load - * and subsequently inserted a few items into packed pages. Currently, - * I'm using 75%, but I have no empirical evidence that's a good value. - * We should leave this as a tuning variable, but probably undocumented. + * and subsequently inserted a few items into packed pages. Currently + * defaulted to 75%, but I have no empirical evidence that's "correct". * * The maximum page size may be a multiple of the split page size (for * example, there's a maximum page size of 128KB, but because the table @@ -1083,10 +1095,9 @@ __rec_split_init(WT_SESSION_IMPL *session, if (r->raw_compression) r->split_size = 0; else if (page->type == WT_PAGE_COL_FIX) - r->split_size = r->page_size; + r->split_size = r->page_size_max; else - r->split_size = WT_SPLIT_PAGE_SIZE( - r->page_size, btree->allocsize, btree->split_pct); + r->split_size = __wt_split_page_size(btree, r->page_size_max); /* * If the maximum page size is the same as the split page size, either @@ -1181,8 +1192,9 @@ __rec_split_row_promote_cell( */ cell = WT_PAGE_HEADER_BYTE(btree, dsk); __wt_cell_unpack(cell, unpack); - WT_ASSERT_RET(session, - unpack->raw != WT_CELL_VALUE_COPY && unpack->prefix == 0); + WT_ASSERT(session, + unpack->prefix == 0 && unpack->raw != WT_CELL_VALUE_COPY); + WT_RET(__wt_cell_unpack_copy(session, unpack, copy)); return (0); } @@ -1203,18 +1215,11 @@ 
__rec_split_row_promote(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint8_t type) * length byte string, get a copy. * * This function is called from the split code at each split boundary, - * but that means we're not called before the first boundary. When we - * do the split work at the second boundary, we need to copy the key - * for the first boundary from the page we're building. Alternatively, - * we could store a copy of the first key we put on a page somewhere, - * perhaps while building the keys for a page, but that's likely to be - * even uglier. - */ - if (r->bnd_next == 1) - WT_RET(__rec_split_row_promote_cell( - session, r->dsk.mem, &r->bnd[0].key)); - - /* + * but that means we're not called before the first boundary. It's OK + * we never do the work for the first boundary because that key cannot + * come from the page, it has to come from the parent. See the comment + * in the code that creates the row-store split-merge page for details. + * * For the current slot, take the last key we built, after doing suffix * compression. The "last key we built" describes some process: before * calling the split code, we must place the last key on the page before @@ -1446,8 +1451,14 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final) * Set the promotion key for the chunk. Repeated each time we * try and split, which might be wasted work, but detecting * repeated key-building is probably more complicated than it's - * worth. + * worth. Don't bother doing the work for the first boundary, + * that key cannot come from the page, it has to come from the + * parent. See the comment in the code that creates the row- + * store split-merge page for details. 
*/ + if (r->bnd_next == 0) + break; + WT_RET(__rec_split_row_promote_cell(session, dsk, &bnd->key)); break; WT_ILLEGAL_VALUE(session); @@ -1579,8 +1590,8 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final) */ memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP); WT_ERR(compressor->compress_raw(compressor, wt_session, - r->page_size_max, WT_BLOCK_COMPRESS_SKIP, - (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, + r->page_size_max, btree->split_pct, + WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, r->raw_offsets, slots, (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP, result_len, final, &result_len, &result_slots)); @@ -1787,10 +1798,8 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) * entries to 0, is because there's another entry to write, which then * sets entries to 1). If the page was empty, we eventually delete it. */ - if (r->entries == 0) { - WT_ASSERT_RET(session, r->bnd_next == 0); + if (r->entries == 0) return (0); - } return (r->raw_compression ? __rec_split_finish_raw(session, r) : @@ -1827,7 +1836,7 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) * WT_PAGE_HEADER header onto the scratch buffer, most of the header * information remains unchanged between the pages. */ - WT_RET(__wt_scr_alloc(session, r->split_size, &tmp)); + WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp)); dsk = tmp->mem; memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE); @@ -1859,8 +1868,9 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) * Fix up our caller's information. 
*/ len = WT_PTRDIFF32(r->first_free, bnd->start); - WT_ASSERT_ERR( - session, len < r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)); + if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) + WT_PANIC_ERR(session, ret = WT_PANIC, + "Reconciliation remnant too large for the split buffer"); dsk = r->dsk.mem; dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); @@ -2969,37 +2979,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * Modified child. * The page may be emptied or internally created during a split. * Deleted/split pages are merged into the parent and discarded. - * - * There's one special case we have to handle here: the internal - * page being merged has a potentially incorrect first key and - * we need to replace it with the one we have. The problem is - * caused by the fact that the page search algorithm coerces the - * 0th key on any internal page to be smaller than any search - * key. We do that because we don't want to have to update the - * internal pages every time a new "smallest" key is inserted - * into the tree. But, if a new "smallest" key is inserted into - * our split-created subtree, and we don't update the internal - * page, when we merge that internal page into its parent page, - * the key may be incorrect (or more likely, have been coerced - * to a single byte because it's an internal page's 0th key). - * Imagine the following tree: - * - * 2 5 40 internal page - * | - * 10 | 20 split-created internal page - * | - * 6 inserted smallest key - * - * after a simple merge, we'd have corruption: - * - * 2 10 20 40 merged internal page - * | - * 6 key sorts before parent's key - * - * To fix this problem, we take the higher-level page's key as - * our first key, because that key sorts before any possible - * key inserted into the subtree, and discard whatever 0th key - * is on the split-created internal page. 
*/ if (state == WT_CHILD_MODIFIED) switch (F_ISSET(rp->modify, WT_PM_REC_MASK)) { @@ -3040,7 +3019,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) session, page, kpack->data, kpack->size)); - r->merge_ref = ref; WT_RET(__rec_row_merge(session, r, F_ISSET(rp->modify, WT_PM_REC_SPLIT_MERGE) ? rp : rp->modify->u.split)); @@ -3082,7 +3060,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* * Build key cell. - * * Truncate any 0th key, internal pages don't need 0th keys. */ if (onpage_ovfl) { @@ -3202,14 +3179,6 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) break; case WT_PM_REC_SPLIT: case WT_PM_REC_SPLIT_MERGE: - /* - * If we have a merge key set, we're working our - * way down a merge tree. If we have not set a - * merge key, we're starting descent of a new - * merge tree, set the merge key. - */ - if (r->merge_ref == NULL) - r->merge_ref = ref; WT_RET(__rec_row_merge(session, r, F_ISSET(rp->modify, WT_PM_REC_SPLIT_MERGE) ? rp : rp->modify->u.split)); @@ -3240,14 +3209,10 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __rec_cell_build_addr(r, p, size, vtype, 0); /* - * Build the key cell. If this is the first key in a "to be - * merged" subtree, use the merge correction key saved in the - * top-level parent page when this function was called. - * + * Build the key cell. * Truncate any 0th key, internal pages don't need 0th keys. */ - ikey = r->merge_ref == NULL ? ref->u.key : r->merge_ref->u.key; - r->merge_ref = NULL; + ikey = ref->u.key; WT_RET(__rec_cell_build_key(session, r, WT_IKEY_DATA(ikey), r->cell_zero ? 
1 : ikey->size, 1, &ovfl_key)); r->cell_zero = 0; @@ -3764,8 +3729,10 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_BTREE *btree; WT_BOUNDARY *bnd; WT_PAGE_MODIFY *mod; - uint32_t page_size; + WT_REF *ref; + uint32_t size; int was_modified; + const uint8_t *addr; btree = S2BT(session); bm = btree->bm; @@ -3779,17 +3746,34 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ switch (F_ISSET(mod, WT_PM_REC_MASK)) { case 0: /* - * The page has never been reconciled before, track the original - * address blocks (if any). The "if any" is for empty trees we - * create when a new tree is opened, and for previously deleted - * pages that are instantiated in memory. + * The page has never been reconciled before, free the original + * address blocks (if any). The "if any" is for empty trees + * created when a new tree is opened, previously deleted pages + * instantiated in memory, or pages reconciled into split-merge + * pages and then replaced by other pages because the tree grew + * too deep. * * The exception is root pages are never tracked or free'd, they * are checkpoints, and must be explicitly dropped. */ - if (!WT_PAGE_IS_ROOT(page) && page->ref->addr != NULL) - WT_RET(__wt_rec_track_onpage_ref( - session, page, page->parent, page->ref)); + if (WT_PAGE_IS_ROOT(page)) + break; + + ref = page->ref; + if (ref->addr != NULL) { + /* + * Free the page and clear the address (so we don't free + * it twice). Logically, this is the same as adding the + * address to the reconciliation tracking information + * and freeing it when reconciliation ends as part of + * cleaning up the track information, but that is going + * to happen right at the end of this switch statement, + * might as well save the work. 
+ */ + __wt_get_addr(page->parent, ref, &addr, &size); + WT_RET(bm->free(bm, session, addr, size)); + ref->addr = NULL; + } break; case WT_PM_REC_EMPTY: /* Page deleted */ break; @@ -3807,7 +3791,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Discard the replacement page's address. */ __wt_free(session, mod->u.replace.addr); - mod->u.replace.addr = NULL; mod->u.replace.size = 0; break; case WT_PM_REC_SPLIT: /* Page split */ @@ -3928,6 +3911,11 @@ err: __wt_scr_free(&tkey); WT_RET(ret); } + if (r->bnd_next > r->bnd_next_max) { + r->bnd_next_max = r->bnd_next; + WT_DSTAT_SET(session, rec_split_max, r->bnd_next_max); + } + switch (page->type) { case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: @@ -3967,12 +3955,15 @@ err: __wt_scr_free(&tkey); */ if (!r->upd_skipped) { was_modified = __wt_page_is_modified(page); - WT_ORDERED_READ(page_size, page->memory_footprint); + WT_ORDERED_READ(size, page->memory_footprint); mod->disk_gen = r->orig_write_gen; if (was_modified && !__wt_page_is_modified(page)) - __wt_cache_dirty_decr(session, page_size); + __wt_cache_dirty_decr(session, size); } + /* Record the most recent transaction ID we could have written. */ + mod->disk_txn = session->txn.snap_min; + return (0); } @@ -4000,36 +3991,36 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) if (bnd->addr.addr != NULL) { WT_TRET(bm->free( bm, session, bnd->addr.addr, bnd->addr.size)); - bnd->addr.addr = NULL; + __wt_free(session, bnd->addr.addr); } return (ret); } /* - * __rec_split_row -- - * Split a row-store page, creating a new internal page. + * __rec_split_merge_new -- + * Create a split-merge page. 
*/ static int -__rec_split_row( - WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *orig, WT_PAGE **splitp) +__rec_split_merge_new(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_PAGE *orig, WT_PAGE **pagep, uint8_t type) { - WT_ADDR *addr; - WT_BOUNDARY *bnd; - WT_DECL_RET; WT_PAGE *page; - WT_REF *ref; - uint32_t i; - /* Allocate a row-store internal page. */ - WT_RET(__wt_calloc_def(session, 1, &page)); - WT_ERR(__wt_calloc_def(session, (size_t)r->bnd_next, &page->u.intl.t)); - - /* Fill it in. */ + /* + * Allocate a new internal page and fill it in. + * + * Our caller cleans up, make sure we return a valid page reference, + * even on error. + */ + WT_RET(__wt_page_alloc(session, type, r->bnd_next, pagep)); + page = *pagep; page->parent = orig->parent; page->ref = orig->ref; - page->read_gen = __wt_cache_read_gen(session); + if (type == WT_PAGE_COL_INT) + page->u.intl.recno = r->bnd[0].recno; + page->read_gen = WT_READ_GEN_NOTSET; page->entries = r->bnd_next; - page->type = WT_PAGE_ROW_INT; + page->flags_atomic = WT_PAGE_DISK_NOT_ALLOC; /* * We don't re-write parent pages when child pages split, which means @@ -4050,27 +4041,100 @@ __rec_split_row( * its memory discarded, but the newly created split page cannot be * evicted, it can only be merged into its parent. */ - WT_ERR(__wt_page_modify_init(session, page)); + WT_RET(__wt_page_modify_init(session, page)); F_SET(page->modify, WT_PM_REC_SPLIT_MERGE); - /* Enter each split page into the new, internal page. */ + return (0); +} + +/* + * __rec_split_row -- + * Split a row-store page, creating a new internal page. + */ +static int +__rec_split_row( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *orig, WT_PAGE **splitp) +{ + WT_ADDR *addr; + WT_BOUNDARY *bnd; + WT_DECL_RET; + WT_IKEY *ikey; + WT_PAGE *page; + WT_REF *ref; + size_t size; + uint32_t i; + + /* Allocate a split-merge page. 
*/ + WT_ERR(__rec_split_merge_new(session, r, orig, &page, WT_PAGE_ROW_INT)); + + /* + * The "parent" key for each split chunk is the first key on the chunk, + * except for the 0th chunk, which cannot come from the page itself as + * it might not be small enough. If the existing key for the page is + * smaller than the first key on the chunk we can lose after the merge. + * Imagine the following tree, where an internal page has keys 2, 5 and + * 40. The page with key 5 splits into two chunks, and 10 is the first + * key in the first chunk. + * + * 2 5 40 internal page + * | + * 10 | 20 split-created internal page + * + * If we subsequently insert a key 6, it works because the page search + * algorithm coerces the 0th key of an internal page to be smaller than + * any search key. (We do that because we don't want to have to update + * internal pages every time a new "smallest" key is inserted into the + * tree.) Anyway, that results in the following tree: + * + * 2 5 40 internal page + * | + * 10 | 20 split-created internal page + * | + * 6 inserted smallest key + * + * after a simple merge where we replace page 5 with pages 10 and 20, + * we'd have corruption: + * + * 2 10 20 40 merged internal page + * | + * 6 key sorts before parent's key + * + * To fix this problem, we take the original parent page's key as the + * first chunk's key because that key sorts before any possible key + * inserted into the subtree. + */ + if (WT_PAGE_IS_ROOT(orig)) + WT_ERR(__wt_buf_set(session, &r->bnd[0].key, "", 1)); + else { + ikey = orig->ref->u.key; + WT_ERR(__wt_buf_set( + session, &r->bnd[0].key, WT_IKEY_DATA(ikey), ikey->size)); + } + + /* Enter each split child page into the new internal page. 
*/ + size = 0; for (ref = page->u.intl.t, bnd = r->bnd, i = 0; i < r->bnd_next; ++ref, ++bnd, ++i) { WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr)); *addr = bnd->addr; bnd->addr.addr = NULL; + size += bnd->addr.size; ref->page = NULL; - WT_ERR(__wt_row_ikey_alloc(session, 0, + WT_ERR(__wt_row_ikey(session, 0, bnd->key.data, bnd->key.size, &ref->u.key)); + size += sizeof(WT_IKEY) + bnd->key.size; ref->addr = addr; ref->state = WT_REF_DISK; } + __wt_cache_page_inmem_incr( + session, page, r->bnd_next * sizeof(WT_ADDR) + size); *splitp = page; return (0); -err: __wt_page_out(session, &page); +err: if (page != NULL) + __wt_page_out(session, &page); return (ret); } @@ -4089,25 +4153,10 @@ __rec_split_col( WT_REF *ref; uint32_t i; - /* Allocate a column-store internal page. */ - WT_RET(__wt_calloc_def(session, 1, &page)); - WT_ERR(__wt_calloc_def(session, (size_t)r->bnd_next, &page->u.intl.t)); - - /* Fill it in. */ - page->parent = orig->parent; - page->ref = orig->ref; - page->read_gen = __wt_cache_read_gen(session); - page->u.intl.recno = r->bnd[0].recno; - page->entries = r->bnd_next; - page->type = WT_PAGE_COL_INT; - - /* - * See the comment above in __rec_split_row(). - */ - WT_ERR(__wt_page_modify_init(session, page)); - F_SET(page->modify, WT_PM_REC_SPLIT_MERGE); + /* Allocate a split-merge page. */ + WT_ERR(__rec_split_merge_new(session, r, orig, &page, WT_PAGE_COL_INT)); - /* Enter each split page into the new, internal page. */ + /* Enter each split child page into the new internal page. 
*/ for (ref = page->u.intl.t, bnd = r->bnd, i = 0; i < r->bnd_next; ++ref, ++bnd, ++i) { WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr)); @@ -4119,11 +4168,14 @@ __rec_split_col( ref->addr = addr; ref->state = WT_REF_DISK; } + __wt_cache_page_inmem_incr( + session, page, r->bnd_next * sizeof(WT_ADDR)); *splitp = page; return (0); -err: __wt_page_out(session, &page); +err: if (page != NULL) + __wt_page_out(session, &page); return (ret); } diff --git a/src/btree/row_key.c b/src/btree/row_key.c index 043cbace5ee..1142bffbdcc 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -309,7 +309,7 @@ next: switch (direction) { /* If still needed, instantiate the key. */ key = WT_ROW_KEY_COPY(rip_arg); if (!__wt_off_page(page, key)) { - WT_ERR(__wt_row_ikey_alloc(session, + WT_ERR(__wt_row_ikey(session, WT_PAGE_DISK_OFFSET(page, key), retb->data, retb->size, &ikey)); @@ -380,11 +380,27 @@ __wt_row_value(WT_PAGE *page, WT_ROW *rip) } /* - * __wt_row_ikey_alloc -- + * __wt_row_ikey_incr -- + * Instantiate a key in a WT_IKEY structure and increment the page's + * memory footprint. + */ +int +__wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, + uint32_t cell_offset, const void *key, uint32_t size, void *ikeyp) +{ + WT_RET(__wt_row_ikey(session, cell_offset, key, size, ikeyp)); + + __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size); + + return (0); +} + +/* + * __wt_row_ikey -- * Instantiate a key in a WT_IKEY structure. 
*/ int -__wt_row_ikey_alloc(WT_SESSION_IMPL *session, +__wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, uint32_t size, void *ikeyp) { WT_IKEY *ikey; diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index aa3f6fc1c58..d031315e7bb 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -223,7 +223,8 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session, void *args) /* * Check the page's write-generation: if that fails, check whether we * are still in the expected position, and no item has been added where - * our insert belongs. + * our insert belongs. Take extra care at the beginning and end of the + * list (at each level): retry if we race there. */ WT_RET(__wt_page_write_gen_wrapped_check(page)); @@ -233,8 +234,8 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session, void *args) *ins_stack[i] != next_stack[i]) return (WT_RESTART); if (next_stack[i] == NULL && - inshead->tail[i] != NULL && - ins_stack[i] != &inshead->tail[i]->next[i]) + (inshead->tail[i] == NULL || + ins_stack[i] != &inshead->tail[i]->next[i])) return (WT_RESTART); } } @@ -352,7 +353,6 @@ __wt_update_alloc(WT_SESSION_IMPL *session, WT_UPDATE * __wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) { - WT_TXN *txn; WT_UPDATE *next; /* @@ -360,13 +360,7 @@ __wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) * the rest of the chain; because this routine is called from inside * a serialization function, the caller has responsibility for actually * freeing the memory. - */ - txn = &session->txn; - if (txn->isolation != TXN_ISO_SNAPSHOT && - txn->isolation != TXN_ISO_READ_COMMITTED) - return (NULL); - - /* + * * Walk the list of updates, looking for obsolete updates. If we find * an update no session will ever move past, we can discard any updates * that appear after it. 
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 23860831451..d8de7f118a9 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -129,9 +129,27 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify) item = &_item; for (depth = 2, page = btree->root_page; page->type == WT_PAGE_ROW_INT; ++depth) { + /* + * Fast-path internal pages with one child, a common case for + * the root page in new trees. + */ + base = page->entries; + ref = &page->u.intl.t[base - 1]; + if (base == 1) + goto descend; + + /* Fast-path appends. */ + ikey = ref->u.key; + item->data = WT_IKEY_DATA(ikey); + item->size = ikey->size; + + WT_ERR(WT_BTREE_CMP(session, btree, srch_key, item, cmp)); + if (cmp >= 0) + goto descend; + /* Binary search of internal pages. */ for (base = 0, ref = NULL, - limit = page->entries; limit != 0; limit >>= 1) { + limit = page->entries - 1; limit != 0; limit >>= 1) { indx = base + (limit >> 1); ref = page->u.intl.t + indx; @@ -157,7 +175,8 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify) base = indx + 1; --limit; } - WT_ASSERT(session, ref != NULL); + +descend: WT_ASSERT(session, ref != NULL); /* * Reference the slot used for next step down the tree. 
diff --git a/src/config/config.c b/src/config/config.c index 141a969895b..f0c413b0624 100644 --- a/src/config/config.c +++ b/src/config/config.c @@ -343,7 +343,7 @@ __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) WT_CONFIG_ITEM *out = key; int utf8_remain = 0; static const WT_CONFIG_ITEM true_value = { - "", 0, 1, ITEM_NUM + "", 0, 1, ITEM_BOOL }; key->len = 0; @@ -499,10 +499,10 @@ __config_process_value(WT_CONFIG *conf, WT_CONFIG_ITEM *value) if (value->type == ITEM_ID) { if (strncasecmp(value->str, "true", value->len) == 0) { - value->type = ITEM_NUM; + value->type = ITEM_BOOL; value->val = 1; } else if (strncasecmp(value->str, "false", value->len) == 0) { - value->type = ITEM_NUM; + value->type = ITEM_BOOL; value->val = 0; } } else if (value->type == ITEM_NUM) { @@ -547,10 +547,10 @@ __config_process_value(WT_CONFIG *conf, WT_CONFIG_ITEM *value) } /* - * If we parsed the the whole string but the number is out of - * range, report an error. Don't report an error for strings - * that aren't well-formed integers: if an integer is expected, - * that will be caught by __wt_config_check. + * If we parsed the whole string but the number is out of range, + * report an error. Don't report an error for strings that + * aren't well-formed integers: if an integer is expected, that + * will be caught by __wt_config_check. */ if (value->type == ITEM_NUM && errno == ERANGE) goto range; @@ -574,12 +574,12 @@ __wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) } /* - * __wt_config_getraw -- + * __config_getraw -- * Given a config parser, find the final value for a given key. 
*/ -int -__wt_config_getraw( - WT_CONFIG *cparser, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) +static int +__config_getraw( + WT_CONFIG *cparser, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, int top) { WT_CONFIG sparser; WT_CONFIG_ITEM k, v, subk; @@ -601,14 +601,16 @@ __wt_config_getraw( WT_RET(__wt_config_initn( cparser->session, &sparser, v.str, v.len)); if ((ret = - __wt_config_getraw(&sparser, &subk, value)) == 0) + __config_getraw(&sparser, &subk, value, 0)) == 0) found = 1; WT_RET_NOTFOUND_OK(ret); } } + WT_RET_NOTFOUND_OK(ret); - return ((found && ret == WT_NOTFOUND) ? - __config_process_value(cparser, value) : ret); + if (!found) + return (WT_NOTFOUND); + return (top ? __config_process_value(cparser, value) : 0); } /* @@ -626,7 +628,7 @@ __wt_config_get(WT_SESSION_IMPL *session, for (found = 0; *cfg != NULL; cfg++) { WT_RET(__wt_config_init(session, &cparser, *cfg)); - if ((ret = __wt_config_getraw(&cparser, key, value)) == 0) + if ((ret = __config_getraw(&cparser, key, value, 1)) == 0) found = 1; else if (ret != WT_NOTFOUND) return (ret); @@ -660,7 +662,7 @@ __wt_config_getone(WT_SESSION_IMPL *session, WT_CONFIG cparser; WT_RET(__wt_config_init(session, &cparser, config)); - return (__wt_config_getraw(&cparser, key, value)); + return (__config_getraw(&cparser, key, value, 1)); } /* @@ -675,7 +677,7 @@ __wt_config_getones(WT_SESSION_IMPL *session, WT_CONFIG_ITEM key_item = { key, strlen(key), 0, ITEM_STRING }; WT_RET(__wt_config_init(session, &cparser, config)); - return (__wt_config_getraw(&cparser, &key_item, value)); + return (__config_getraw(&cparser, &key_item, value, 1)); } /* @@ -723,7 +725,7 @@ __wt_config_subgetraw(WT_SESSION_IMPL *session, WT_CONFIG cparser; WT_RET(__wt_config_initn(session, &cparser, cfg->str, cfg->len)); - return (__wt_config_getraw(&cparser, key, value)); + return (__config_getraw(&cparser, key, value, 1)); } /* diff --git a/src/config/config_check.c b/src/config/config_check.c index 9f4628611dd..718547149c4 100644 --- 
a/src/config/config_check.c +++ b/src/config/config_check.c @@ -57,8 +57,9 @@ __wt_config_check(WT_SESSION_IMPL *session, if (strcmp(checks[i].type, "int") == 0) badtype = (v.type != ITEM_NUM); else if (strcmp(checks[i].type, "boolean") == 0) - badtype = (v.type != ITEM_NUM || - (v.val != 0 && v.val != 1)); + badtype = (v.type != ITEM_BOOL && + (v.type != ITEM_NUM || + (v.val != 0 && v.val != 1))); else if (strcmp(checks[i].type, "list") == 0) badtype = (v.len > 0 && v.type != ITEM_STRUCT); else if (strcmp(checks[i].type, "category") == 0) { diff --git a/src/config/config_def.c b/src/config/config_def.c index 6ff187ab824..d99ac800bbb 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -86,13 +86,13 @@ const char * __wt_confdfl_connection_reconfigure = "cache_size=100MB,error_prefix=,eviction_dirty_target=80," "eviction_target=80,eviction_trigger=95,shared_cache=(chunk=10MB," - "name=,reserve=0,size=500MB),verbose="; + "name=pool,reserve=0,size=500MB),statistics=0,verbose="; WT_CONFIG_CHECK __wt_confchk_shared_cache_subconfigs[] = { { "chunk", "int", "min=1MB,max=10TB", NULL }, { "name", "string", NULL, NULL }, - { "reserve", "string", NULL, NULL }, + { "reserve", "int", NULL, NULL }, { "size", "int", "min=1MB,max=10TB", NULL }, { NULL, NULL, NULL, NULL } }; @@ -106,6 +106,7 @@ __wt_confchk_connection_reconfigure[] = { { "eviction_trigger", "int", "min=10,max=99", NULL}, { "shared_cache", "category", NULL, __wt_confchk_shared_cache_subconfigs}, + { "statistics", "boolean", NULL, NULL}, { "verbose", "list", "choices=[\"block\",\"shared_cache\",\"ckpt\",\"evict\"," "\"evictserver\",\"fileops\",\"hazard\",\"lsm\",\"mutex\",\"read\"," @@ -129,11 +130,9 @@ __wt_confdfl_file_meta = "checksum=on,collator=,columns=,dictionary=0,format=btree," "huffman_key=,huffman_value=,internal_item_max=0," "internal_key_truncate=,internal_page_max=2KB,key_format=u,key_gap=10" - ",leaf_item_max=0,leaf_page_max=1MB,lsm_bloom=,lsm_bloom_bit_count=8," - 
"lsm_bloom_config=,lsm_bloom_hash_count=4,lsm_bloom_newest=0," - "lsm_bloom_oldest=0,lsm_chunk_size=2MB,lsm_merge_max=15," - "lsm_merge_threads=1,memory_page_max=5MB,prefix_compression=," - "split_pct=75,value_format=u,version=(major=0,minor=0)"; + ",leaf_item_max=0,leaf_page_max=1MB,memory_page_max=5MB," + "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=,split_pct=75" + ",value_format=u,version=(major=0,minor=0)"; WT_CONFIG_CHECK __wt_confchk_file_meta[] = { @@ -157,16 +156,9 @@ __wt_confchk_file_meta[] = { { "key_gap", "int", "min=0", NULL}, { "leaf_item_max", "int", "min=0", NULL}, { "leaf_page_max", "int", "min=512B,max=512MB", NULL}, - { "lsm_bloom", "boolean", NULL, NULL}, - { "lsm_bloom_bit_count", "int", "min=2,max=1000", NULL}, - { "lsm_bloom_config", "string", NULL, NULL}, - { "lsm_bloom_hash_count", "int", "min=2,max=100", NULL}, - { "lsm_bloom_newest", "boolean", NULL, NULL}, - { "lsm_bloom_oldest", "boolean", NULL, NULL}, - { "lsm_chunk_size", "int", "min=512K,max=500MB", NULL}, - { "lsm_merge_max", "int", "min=2,max=100", NULL}, - { "lsm_merge_threads", "int", "min=1,max=10", NULL}, { "memory_page_max", "int", "min=512B,max=10TB", NULL}, + { "os_cache_dirty_max", "int", "min=0", NULL}, + { "os_cache_max", "int", "min=0", NULL}, { "prefix_compression", "boolean", NULL, NULL}, { "split_pct", "int", "min=25,max=100", NULL}, { "value_format", "format", NULL, NULL}, @@ -255,8 +247,9 @@ __wt_confdfl_session_create = ",leaf_item_max=0,leaf_page_max=1MB,lsm_bloom=,lsm_bloom_bit_count=8," "lsm_bloom_config=,lsm_bloom_hash_count=4,lsm_bloom_newest=0," "lsm_bloom_oldest=0,lsm_chunk_size=2MB,lsm_merge_max=15," - "lsm_merge_threads=1,memory_page_max=5MB,prefix_compression=,source=," - "split_pct=75,type=file,value_format=u"; + "lsm_merge_threads=1,memory_page_max=5MB,os_cache_dirty_max=0," + "os_cache_max=0,prefix_compression=,source=,split_pct=75,type=file," + "value_format=u"; WT_CONFIG_CHECK __wt_confchk_session_create[] = { @@ -291,6 +284,8 @@ 
__wt_confchk_session_create[] = { { "lsm_merge_max", "int", "min=2,max=100", NULL}, { "lsm_merge_threads", "int", "min=1,max=10", NULL}, { "memory_page_max", "int", "min=512B,max=10TB", NULL}, + { "os_cache_dirty_max", "int", "min=0", NULL}, + { "os_cache_max", "int", "min=0", NULL}, { "prefix_compression", "boolean", NULL, NULL}, { "source", "string", NULL, NULL}, { "split_pct", "int", "min=25,max=100", NULL}, @@ -423,17 +418,39 @@ __wt_confchk_table_meta[] = { const char * __wt_confdfl_wiredtiger_open = - "buffer_alignment=-1,cache_size=100MB,create=0,direct_io=," - "error_prefix=,eviction_dirty_target=80,eviction_target=80," - "eviction_trigger=95,extensions=,hazard_max=1000,logging=0,lsm_merge=" - ",mmap=,multiprocess=0,session_max=50,shared_cache=(chunk=10MB,name=," - "reserve=0,size=500MB),sync=,transactional=,use_environment_priv=0," - "verbose="; + "buffer_alignment=-1,cache_size=100MB," + "checkpoint=(name=\"WiredTigerCheckpoint\",wait=0),create=0," + "direct_io=,error_prefix=,eviction_dirty_target=80,eviction_target=80" + ",eviction_trigger=95,extensions=,hazard_max=1000,logging=0," + "lsm_merge=,mmap=,multiprocess=0,session_max=50," + "shared_cache=(chunk=10MB,name=pool,reserve=0,size=500MB)," + "statistics=0,statistics_log=(clear=,path=\"WiredTigerStat.%H\"," + "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),sync=,transactional=," + "use_environment_priv=0,verbose="; + +WT_CONFIG_CHECK +__wt_confchk_checkpoint_subconfigs[] = { + { "name", "string", NULL, NULL }, + { "wait", "int", "min=1,max=100000", NULL }, + { NULL, NULL, NULL, NULL } +}; + +WT_CONFIG_CHECK +__wt_confchk_statistics_log_subconfigs[] = { + { "clear", "boolean", NULL, NULL }, + { "path", "string", NULL, NULL }, + { "sources", "list", NULL, NULL }, + { "timestamp", "string", NULL, NULL }, + { "wait", "int", "min=5,max=100000", NULL }, + { NULL, NULL, NULL, NULL } +}; WT_CONFIG_CHECK __wt_confchk_wiredtiger_open[] = { { "buffer_alignment", "int", "min=-1,max=1MB", NULL}, { "cache_size", 
"int", "min=1MB,max=10TB", NULL}, + { "checkpoint", "category", NULL, + __wt_confchk_checkpoint_subconfigs}, { "create", "boolean", NULL, NULL}, { "direct_io", "list", "choices=[\"data\",\"log\"]", NULL}, { "error_prefix", "string", NULL, NULL}, @@ -449,6 +466,9 @@ __wt_confchk_wiredtiger_open[] = { { "session_max", "int", "min=1", NULL}, { "shared_cache", "category", NULL, __wt_confchk_shared_cache_subconfigs}, + { "statistics", "boolean", NULL, NULL}, + { "statistics_log", "category", NULL, + __wt_confchk_statistics_log_subconfigs}, { "sync", "boolean", NULL, NULL}, { "transactional", "boolean", NULL, NULL}, { "use_environment_priv", "boolean", NULL, NULL}, diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index fcb759f0fdf..3946fd93483 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -319,6 +319,16 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config) if (!F_ISSET(s, WT_SESSION_INTERNAL)) __wt_free(session, s->hazard); + /* + * Shut down server threads other than the eviction server, which is + * needed later to close btree handles. Some of these threads access + * btree handles, so take care in ordering shutdown to make sure they + * exit before files are closed. + */ + F_CLR(conn, WT_CONN_SERVER_RUN); + WT_TRET(__wt_checkpoint_destroy(conn)); + WT_TRET(__wt_statlog_destroy(conn)); + /* Clean up open LSM handles. */ WT_ERR(__wt_lsm_cleanup(&conn->iface)); @@ -338,6 +348,7 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config) __conn_remove_data_source(conn, ndsrc); WT_TRET(__wt_connection_close(conn)); + /* We no longer have a session, don't try to update it. 
*/ session = NULL; @@ -351,23 +362,37 @@ err: API_END_NOTFOUND_MAP(session, ret); static int __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) { + WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; + + /* + * Special version of cfg that doesn't include the default config: used + * to limit changes to values that the application sets explicitly. + * Note that any function using this value has to be prepared to handle + * not-found as a valid option return. + */ const char *raw_cfg[] = { config, NULL }; conn = (WT_CONNECTION_IMPL *)wt_conn; CONNECTION_API_CALL(conn, session, reconfigure, config, cfg); - WT_UNUSED(cfg); - /* - * Don't include the default config: only override values the - * application sets explicitly. - */ + /* Turning on statistics clears any existing values. */ + if ((ret = + __wt_config_gets(session, raw_cfg, "statistics", &cval)) == 0) { + conn->statistics = cval.val == 0 ? 0 : 1; + if (conn->statistics) + __wt_stat_clear_connection_stats(&conn->stats); + } + WT_ERR_NOTFOUND_OK(ret); + WT_ERR(__wt_conn_cache_pool_config(session, cfg)); WT_ERR(__wt_cache_config(conn, raw_cfg)); - WT_ERR(__conn_verbose_config(session, cfg)); + + WT_ERR(__conn_verbose_config(session, raw_cfg)); + /* Wake up the cache pool server so any changes are noticed. */ if (F_ISSET(conn, WT_CONN_CACHE_POOL)) WT_ERR(__wt_cond_signal( @@ -526,7 +551,6 @@ __conn_config_file(WT_SESSION_IMPL *session, const char **cfg, WT_ITEM **cbufp) #if 0 fprintf(stderr, "file config: {%s}\n", (const char *)cbuf->data); - exit(0); #endif /* Check the configuration string. */ @@ -750,7 +774,8 @@ __conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); - WT_RET_NOTFOUND_OK(__wt_config_gets(session, cfg, "verbose", &cval)); + if ((ret = __wt_config_gets(session, cfg, "verbose", &cval)) != 0) + return (ret == WT_NOTFOUND ? 
0 : ret); for (ft = verbtypes; ft->name != NULL; ft++) { if ((ret = __wt_config_subgets( session, &cval, ft->name, &sval)) == 0 && sval.val != 0) @@ -882,6 +907,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, "buffer_alignment requires posix_memalign"); #endif + /* + * Configuration: direct_io, mmap, statistics. + */ WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval)); for (ft = directio_types; ft->name != NULL; ft++) { ret = __wt_config_subgets(session, &cval, ft->name, &sval); @@ -891,10 +919,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, } else if (ret != WT_NOTFOUND) goto err; } - - /* Configure mmap. */ WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval)); conn->mmap = cval.val == 0 ? 0 : 1; + WT_ERR(__wt_config_gets(session, cfg, "statistics", &cval)); + conn->statistics = cval.val == 0 ? 0 : 1; /* Load any extensions referenced in the config. */ WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval)); @@ -963,7 +991,11 @@ err: if (cbuf != NULL) __wt_buf_free(session, &exconfig); if (ret != 0 && conn != NULL) - WT_TRET(__wt_connection_destroy(conn)); + WT_TRET(__wt_connection_close(conn)); + + /* Let the server threads proceed. 
*/ + if (ret == 0) + conn->connection_initialized = 1; return (ret); } diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 24b03592de0..53a0d4c33e5 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -32,7 +32,7 @@ __wt_cache_config(WT_CONNECTION_IMPL *conn, const char *cfg[]) if (F_ISSET(conn, WT_CONN_CACHE_POOL) && (ret = __wt_config_gets(session, cfg, - "shared_cache.reserved", &cval)) == 0) + "shared_cache.reserve", &cval)) == 0 && cval.val != 0) cache->cp_reserved = (uint64_t)cval.val; else if ((ret = __wt_config_gets(session, cfg, "shared_cache.chunk", &cval)) == 0) @@ -98,10 +98,10 @@ __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) __wt_spin_init(session, &cache->evict_walk_lock); /* - * We pull some values from the cache statistics (rather than have two - * copies). Set them. + * We get/set some values in the cache statistics (rather than have + * two copies), configure them. */ - __wt_cache_stats_update(conn, 0); + __wt_cache_stats_update(session); return (0); err: WT_RET(__wt_cache_destroy(conn)); @@ -113,22 +113,25 @@ err: WT_RET(__wt_cache_destroy(conn)); * Update the cache statistics for return to the application. */ void -__wt_cache_stats_update(WT_CONNECTION_IMPL *conn, uint32_t flags) +__wt_cache_stats_update(WT_SESSION_IMPL *session) { WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + WT_CONNECTION_STATS *stats; - WT_UNUSED(flags); + conn = S2C(session); cache = conn->cache; + stats = &conn->stats; - WT_STAT_SET(conn->stats, cache_bytes_max, conn->cache_size); - WT_STAT_SET( - conn->stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache)); - WT_STAT_SET( - conn->stats, cache_pages_inuse, __wt_cache_pages_inuse(cache)); - WT_STAT_SET( - conn->stats, cache_bytes_dirty, __wt_cache_bytes_dirty(cache)); - WT_STAT_SET( - conn->stats, cache_pages_dirty, __wt_cache_pages_dirty(cache)); + /* + * Some statistics are always set, regardless of the configuration of + * run-time statistics in the system. 
+ */ + WT_STAT_SET(stats, cache_bytes_max, conn->cache_size); + WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache)); + WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache)); + WT_STAT_SET(stats, cache_bytes_dirty, __wt_cache_bytes_dirty(cache)); + WT_STAT_SET(stats, cache_pages_dirty, __wt_cache_pages_dirty(cache)); } /* diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index 76ea6914c17..02f54e8bad0 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -25,7 +25,7 @@ static int __cache_pool_balance(void); /* * __wt_conn_cache_pool_config -- - * Parse and setup and cache pool options. + * Parse and setup the cache pool options. */ int __wt_conn_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) @@ -47,10 +47,12 @@ __wt_conn_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) if (F_ISSET(conn, WT_CONN_CACHE_POOL)) reconfiguring = 1; else { + /* Only setup if a shared cache was explicitly configured. */ + if (__wt_config_gets(session, WT_SKIP_DEFAULT_CONFIG(cfg), + "shared_cache", &cval) == WT_NOTFOUND) + return (0); WT_RET_NOTFOUND_OK( __wt_config_gets(session, cfg, "shared_cache.name", &cval)); - if (cval.len == 0) - return (0); /* * NOTE: The allocations made when configuring and opening a * cache pool don't really belong to the connection that @@ -238,27 +240,39 @@ __wt_conn_cache_pool_destroy(WT_CONNECTION_IMPL *conn) break; } - if (!found) { + /* + * If there was an error during open, we may not have made it onto the + * queue. We did increment the reference count, so proceed regardless. + */ + if (found) { + WT_VERBOSE_TRET(session, shared_cache, + "Removing %s from cache pool.", entry->home); + TAILQ_REMOVE(&cp->cache_pool_qh, entry, cpq); + + /* Give the connection's resources back to the pool. 
*/ + WT_ASSERT(session, cp->currently_used >= conn->cache_size); + cp->currently_used -= conn->cache_size; + } + + /* + * If there are no references, we are cleaning up after a failed + * wiredtiger_open, there is nothing further to do. + */ + if (cp->refs < 1) { __wt_spin_unlock(session, &cp->cache_pool_lock); return (0); } - WT_VERBOSE_TRET(session, shared_cache, - "Removing %s from cache pool.", entry->home); - TAILQ_REMOVE(&cp->cache_pool_qh, entry, cpq); - - /* Give the connection's resources back to the pool. */ - WT_ASSERT(session, cp->currently_used >= conn->cache_size); - cp->currently_used -= conn->cache_size; - --cp->refs; - if (cp->refs == 0 && TAILQ_EMPTY(&cp->cache_pool_qh)) + if (--cp->refs == 0) { + WT_ASSERT(session, TAILQ_EMPTY(&cp->cache_pool_qh)); F_CLR(cp, WT_CACHE_POOL_RUN); + } /* * Free the connection pool session if it was created by this * connection. A new one will be created by the next balance pass. */ - if (cp->session != NULL && entry == S2C(cp->session)) { + if (cp->session != NULL && conn == S2C(cp->session)) { WT_VERBOSE_TRET(cp->session, shared_cache, "Freeing a cache pool session due to connection close."); wt_session = &cp->session->iface; @@ -287,9 +301,11 @@ __wt_conn_cache_pool_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_unlock(session, &__wt_process.spinlock); __wt_spin_unlock(session, &cp->cache_pool_lock); - /* Shut down the cache pool worker. */ - WT_TRET(__wt_cond_signal(session, cp->cache_pool_cond)); - WT_TRET(__wt_thread_join(session, cp->cache_pool_tid)); + if (found) { + /* Shut down the cache pool worker. */ + WT_TRET(__wt_cond_signal(session, cp->cache_pool_cond)); + WT_TRET(__wt_thread_join(session, cp->cache_pool_tid)); + } /* Now free the pool. */ __wt_free(session, cp->name); diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c new file mode 100644 index 00000000000..41e0cd5c640 --- /dev/null +++ b/src/conn/conn_ckpt.c @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. 
+ * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __ckpt_server_config -- + * Parse and setup the checkpoint server options. + */ +static int +__ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *runp) +{ + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + + conn = S2C(session); + + /* + * The checkpoint configuration requires a wait time -- if it's not set, + * we're not running at all. + */ + WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval)); + if (cval.val == 0) { + *runp = 0; + return (0); + } + conn->ckpt_usecs = (long)cval.val * 1000000; + *runp = 1; + + WT_RET(__wt_config_gets(session, cfg, "checkpoint.name", &cval)); + + if (!WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) { + WT_RET(__wt_scr_alloc(session, cval.len + 20, &tmp)); + strcpy((char *)tmp->data, "name="); + strncat((char *)tmp->data, cval.str, cval.len); + ret = __wt_strndup(session, + tmp->data, strlen("name=") + cval.len, &conn->ckpt_config); + __wt_scr_free(&tmp); + WT_RET(ret); + } + + return (0); +} + +/* + * __ckpt_server -- + * The checkpoint server thread. + */ +static void * +__ckpt_server(void *arg) +{ + struct timespec ts; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION *wt_session; + WT_SESSION_IMPL *session; + + session = arg; + conn = S2C(session); + wt_session = (WT_SESSION *)session; + + /* + * The checkpoint server may be running before the database is created, + * and checkpoints would fail. Wait for the wiredtiger_open call. + */ + while (!conn->connection_initialized) + __wt_sleep(1, 0); + + while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { + /* Get the current local time of day. */ + WT_ERR(__wt_epoch(session, &ts)); + + /* Checkpoint the database. */ + WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config)); + + /* Wait... 
*/ + WT_ERR( + __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs)); + } + + if (0) { +err: __wt_err(session, ret, "checkpoint server error"); + } + return (NULL); +} + +/* + * __wt_checkpoint_create - + * Start the checkpoint server thread. + */ +int +__wt_checkpoint_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) +{ + WT_SESSION_IMPL *session; + int run; + + session = conn->default_session; + + /* Handle configuration. */ + WT_RET(__ckpt_server_config(session, cfg, &run)); + + /* If not configured, we're done. */ + if (!run) + return (0); + + /* The checkpoint server gets its own session. */ + WT_RET(__wt_open_session(conn, 1, NULL, NULL, &conn->ckpt_session)); + conn->ckpt_session->name = "checkpoint-server"; + + WT_RET( + __wt_cond_alloc(session, "checkpoint server", 0, &conn->ckpt_cond)); + + /* + * Start the thread. + */ + WT_RET(__wt_thread_create( + session, &conn->ckpt_tid, __ckpt_server, conn->ckpt_session)); + conn->ckpt_tid_set = 1; + + return (0); +} + +/* + * __wt_checkpoint_destroy - + * Destroy the checkpoint server thread. + */ +int +__wt_checkpoint_destroy(WT_CONNECTION_IMPL *conn) +{ + WT_DECL_RET; + WT_SESSION *wt_session; + WT_SESSION_IMPL *session; + + session = conn->default_session; + + if (conn->ckpt_tid_set) { + WT_TRET(__wt_cond_signal(session, conn->ckpt_cond)); + WT_TRET(__wt_thread_join(session, conn->ckpt_tid)); + conn->ckpt_tid_set = 0; + } + if (conn->ckpt_cond != NULL) + WT_TRET(__wt_cond_destroy(session, conn->ckpt_cond)); + + __wt_free(session, conn->ckpt_config); + + /* Close the server thread's session, free its hazard array. 
*/ + if (conn->ckpt_session != NULL) { + wt_session = &conn->ckpt_session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + __wt_free(session, conn->ckpt_session->hazard); + } + + return (ret); +} diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 3a5bfa430eb..fdd0966232e 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -201,12 +201,78 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session) } /* + * __conn_btree_config_clear -- + * Clear the underlying object's configuration information. + */ +static void +__conn_btree_config_clear(WT_SESSION_IMPL *session) +{ + WT_DATA_HANDLE *dhandle; + const char **a; + + dhandle = session->dhandle; + + if (dhandle->cfg == NULL) + return; + for (a = dhandle->cfg; *a != NULL; ++a) + __wt_free(session, *a); + __wt_free(session, dhandle->cfg); +} + +/* + * __conn_btree_config_set -- + * Set up a btree handle's configuration information. + */ +static int +__conn_btree_config_set(WT_SESSION_IMPL *session) +{ + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + const char *metaconf; + + dhandle = session->dhandle; + + /* + * Read the object's entry from the metadata file, we're done if we + * don't find one. + */ + if ((ret = + __wt_metadata_read(session, dhandle->name, &metaconf)) != 0) { + if (ret == WT_NOTFOUND) + ret = ENOENT; + WT_RET(ret); + } + + /* + * The defaults are included because underlying objects have persistent + * configuration information stored in the metadata file. If defaults + * are included in the configuration, we can add new configuration + * strings without upgrading the metadata file or writing special code + * in case a configuration string isn't initialized, as long as the new + * configuration string has an appropriate default value. + * + * The error handling is a little odd, but be careful: we're holding a + * chunk of allocated memory in metaconf. 
If we fail before we copy a + * reference to it into the object's configuration array, we must free + * it, after the copy, we don't want to free it. + */ + WT_ERR(__wt_calloc_def(session, 3, &dhandle->cfg)); + WT_ERR(__wt_strdup(session, __wt_confdfl_file_meta, &dhandle->cfg[0])); + dhandle->cfg[1] = metaconf; + metaconf = NULL; + return (0); + +err: __wt_free(session, metaconf); + return (ret); +} + +/* * __conn_btree_open -- * Open the current btree handle. */ static int -__conn_btree_open(WT_SESSION_IMPL *session, - const char *config, const char *cfg[], uint32_t flags) +__conn_btree_open( + WT_SESSION_IMPL *session, const char *op_cfg[], uint32_t flags) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; @@ -219,10 +285,6 @@ __conn_btree_open(WT_SESSION_IMPL *session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && !LF_ISSET(WT_DHANDLE_LOCK_ONLY)); - /* Open the underlying file, free any old config. */ - __wt_free(session, dhandle->config); - dhandle->config = config; - /* * If the handle is already open, it has to be closed so it can be * reopened with a new configuration. We don't need to check again: @@ -232,11 +294,15 @@ __conn_btree_open(WT_SESSION_IMPL *session, if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) WT_RET(__wt_conn_btree_sync_and_close(session)); + /* Discard any previous configuration, set up the new configuration. */ + __conn_btree_config_clear(session); + WT_RET(__conn_btree_config_set(session)); + /* Set any special flags on the handle. 
*/ F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS)); do { - WT_ERR(__wt_btree_open(session, cfg)); + WT_ERR(__wt_btree_open(session, op_cfg)); F_SET(dhandle, WT_DHANDLE_OPEN); /* * Checkpoint handles are read only, so eviction calculations @@ -268,11 +334,10 @@ err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); */ int __wt_conn_btree_get(WT_SESSION_IMPL *session, - const char *name, const char *ckpt, const char *cfg[], uint32_t flags) + const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags) { WT_DATA_HANDLE *dhandle; WT_DECL_RET; - const char *treeconf; WT_CSTAT_INCR(session, file_open); @@ -281,19 +346,11 @@ __wt_conn_btree_get(WT_SESSION_IMPL *session, if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) && (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || - LF_ISSET(WT_BTREE_SPECIAL_FLAGS))) { - if ((ret = __wt_metadata_read(session, name, &treeconf)) != 0) { - if (ret == WT_NOTFOUND) - ret = ENOENT; - goto err; + LF_ISSET(WT_BTREE_SPECIAL_FLAGS))) + if ((ret = __conn_btree_open(session, op_cfg, flags)) != 0) { + F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); + WT_TRET(__wt_rwunlock(session, dhandle->rwlock)); } - ret = __conn_btree_open(session, treeconf, cfg, flags); - } - -err: if (ret != 0) { - F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - WT_TRET(__wt_rwunlock(session, dhandle->rwlock)); - } WT_ASSERT(session, ret != 0 || LF_ISSET(WT_DHANDLE_EXCLUSIVE) == @@ -379,13 +436,11 @@ int __wt_conn_btree_close(WT_SESSION_IMPL *session, int locked) { WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; int inuse; btree = S2BT(session); - conn = S2C(session); dhandle = session->dhandle; WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); @@ -413,7 +468,7 @@ __wt_conn_btree_close(WT_SESSION_IMPL *session, int locked) */ WT_ASSERT(session, btree != session->metafile || - session == conn->default_session); + session == S2C(session)->default_session); if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) WT_TRET(__wt_conn_btree_sync_and_close(session)); @@ -511,18 +566,20 @@ 
__wt_conn_dhandle_discard_single( { WT_DECL_RET; - if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) { - session->dhandle = dhandle; + session->dhandle = dhandle; + + if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) WT_TRET(__wt_conn_btree_sync_and_close(session)); - session->dhandle = NULL; - } + WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock)); - __wt_free(session, dhandle->config); __wt_free(session, dhandle->name); __wt_free(session, dhandle->checkpoint); + __conn_btree_config_clear(session); __wt_free(session, dhandle->handle); __wt_overwrite_and_free(session, dhandle); + WT_CLEAR_BTREE_IN_SESSION(session); + return (ret); } diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index df5bac8d0b9..fb59812d75d 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -28,7 +28,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) TAILQ_INIT(&conn->lsmqh); /* WT_LSM_TREE list */ /* Statistics. */ - WT_RET(__wt_stat_alloc_connection_stats(session, &conn->stats)); + __wt_stat_init_connection_stats(&conn->stats); /* Locks. */ __wt_spin_init(session, &conn->api_lock); @@ -59,12 +59,12 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_DECL_RET; WT_SESSION_IMPL *session; - session = conn->default_session; - /* Check there's something to destroy. */ if (conn == NULL) return (0); + session = conn->default_session; + /* * Close remaining open files (before discarding the mutex, the * underlying file-close code uses the mutex to guard lists of @@ -91,7 +91,6 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) /* Free allocated memory. */ __wt_free(session, conn->home); __wt_free(session, conn->sessions); - __wt_free(session, conn->stats); __wt_free(NULL, conn); return (ret); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index ae5d0d24172..2bafe877767 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -38,7 +38,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) WT_WRITE_BARRIER(); /* Start worker threads. 
*/ - F_SET(conn, WT_CONN_SERVER_RUN); + F_SET(conn, WT_CONN_EVICTION_RUN | WT_CONN_SERVER_RUN); /* * Start the eviction thread. @@ -51,6 +51,13 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) evict_session->name = "eviction-server"; WT_ERR(__wt_thread_create(session, &conn->cache_evict_tid, __wt_cache_evict_server, evict_session)); + conn->cache_evict_tid_set = 1; + + /* Start the optional checkpoint thread. */ + WT_ERR(__wt_checkpoint_create(conn, cfg)); + + /* Start the optional statistics thread. */ + WT_ERR(__wt_statlog_create(conn, cfg)); return (0); @@ -86,11 +93,12 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) fh = TAILQ_FIRST(&conn->fhqh); } - /* Shut down the server threads. */ - F_CLR(conn, WT_CONN_SERVER_RUN); - if (conn->cache_evict_tid != 0) { + /* Shut down the eviction server thread. */ + F_CLR(conn, WT_CONN_EVICTION_RUN); + if (conn->cache_evict_tid_set) { WT_TRET(__wt_evict_server_wake(session)); WT_TRET(__wt_thread_join(session, conn->cache_evict_tid)); + conn->cache_evict_tid_set = 0; } /* Disconnect from shared cache - must be before cache destroy. */ diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 6ca3be4659a..0660ab2435c 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -7,6 +7,15 @@ #include "wt_internal.h" +#ifdef __GNUC__ +/* + * !!! + * GCC with -Wformat-nonliteral complains about calls to strftime in this file. + * There's nothing wrong, this makes the warning go away. + */ +#pragma GCC diagnostic ignored "-Wformat-nonliteral" +#endif + /* * __wt_conn_stat_init -- * Initialize the per-connection statistics. @@ -14,9 +23,316 @@ void __wt_conn_stat_init(WT_SESSION_IMPL *session, uint32_t flags) { + WT_UNUSED(flags); + + __wt_cache_stats_update(session); +} + +/* + * __wt_statlog_config -- + * Parse and setup the statistics server options. 
+ */ +static int +__statlog_config(WT_SESSION_IMPL *session, const char **cfg, int *runp) +{ + WT_CONFIG objectconf; + WT_CONFIG_ITEM cval, k, v; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + int cnt; + + conn = S2C(session); + + /* + * The statistics logging configuration requires a wait time -- if it's + * not set, we're not running at all. + */ + WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval)); + if (cval.val == 0) { + *runp = 0; + return (0); + } + conn->stat_usecs = (long)cval.val * 1000000; + + /* Statistics logging implies statistics. */ + conn->statistics = *runp = 1; + + WT_RET(__wt_config_gets(session, cfg, "statistics_log.clear", &cval)); + conn->stat_clear = cval.val != 0; + + WT_RET(__wt_config_gets(session, cfg, "statistics_log.sources", &cval)); + WT_RET(__wt_config_subinit(session, &objectconf, &cval)); + for (cnt = 0; (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt) + ; + WT_RET_NOTFOUND_OK(ret); + if (cnt != 0) { + WT_RET( + __wt_calloc_def(session, cnt * 2 + 1, &conn->stat_sources)); + WT_RET(__wt_config_subinit(session, &objectconf, &cval)); + for (cnt = 0; + (ret = __wt_config_next(&objectconf, &k, &v)) == 0;) { + /* + * We close and re-open each statistics cursor each time + * we dump statistics (the object may or may not exist + * underneath at any point, and I don't want this code + * to break if/when the lifetime of an underlying object + * changes). Create pairs of strings: the first is the + * object uri, written into the output, the second is + * the enhanced uri used to open the statistics cursor. 
+ */ + WT_RET(__wt_strndup(session, + k.str, k.len, &conn->stat_sources[cnt])); + ++cnt; + + WT_RET(__wt_calloc_def(session, + strlen("statistics:") + k.len + 1, + &conn->stat_sources[cnt])); + strcpy(conn->stat_sources[cnt], "statistics:"); + strncat(conn->stat_sources[cnt], k.str, k.len); + ++cnt; + } + WT_RET_NOTFOUND_OK(ret); + } + + WT_RET(__wt_config_gets(session, cfg, "statistics_log.path", &cval)); + WT_RET(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path)); + + WT_RET(__wt_config_gets( + session, cfg, "statistics_log.timestamp", &cval)); + WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->stat_stamp)); + + return (0); +} + +/* + * __stat_server_dump -- + * Dump a single set of statistics. + */ +static int +__stat_server_dump(WT_SESSION_IMPL *session, + const char *name, const char *cursor_uri, const char *stamp, FILE *fp) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + WT_SESSION *wt_session; + uint64_t value; + const char *config, *desc, *pdesc; + + wt_session = (WT_SESSION *)session; + config = S2C(session)->stat_clear ? + "statistics_clear,statistics_fast" : "statistics_fast"; + + /* + * If we don't find an underlying object, silently ignore it, the object + * may exist only intermittently. User-level APIs return ENOENT instead + * of WT_NOTFOUND for missing files, check both, as well as for EBUSY if + * the handle is exclusively locked at the moment. + */ + ret = wt_session->open_cursor( + wt_session, cursor_uri, NULL, config, &cursor); + if (ret == EBUSY || ret == ENOENT || ret == WT_NOTFOUND) + return (0); + WT_RET(ret); + + while ((ret = cursor->next(cursor)) == 0 && + (ret = cursor->get_value(cursor, &desc, &pdesc, &value)) == 0) + WT_ERR_TEST((fprintf(fp, + "%s %" PRIu64 " %s %s\n", + stamp, value, name, desc) < 0), __wt_errno()); + WT_ERR_NOTFOUND_OK(ret); + +err: WT_TRET(cursor->close(cursor)); + + return (ret); +} + +/* + * __stat_server -- + * The statistics server thread. 
+ */ +static void * +__stat_server(void *arg) +{ + struct timespec ts; + struct tm *tm, _tm; + FILE *fp; WT_CONNECTION_IMPL *conn; + WT_ITEM path, tmp; + WT_DECL_RET; + WT_SESSION_IMPL *session; + char **p; + session = arg; conn = S2C(session); - __wt_cache_stats_update(conn, flags); + WT_CLEAR(path); + WT_CLEAR(tmp); + fp = NULL; + + /* + * We need a temporary place to build a path and an entry prefix. + * The length of the path plus 128 should be more than enough. + * + * We also need a place to store the current path, because that's + * how we know when to close/re-open the file. + */ + WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128)); + WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128)); + + /* + * The statistics log server may be running before the database is + * created (it should run fine because we're looking at statistics + * structures that have already been allocated, but it doesn't make + * sense and we have the information we need to wait). Wait for + * the wiredtiger_open call. + */ + while (!conn->connection_initialized) + __wt_sleep(1, 0); + + while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { + /* + * If statistics are turned off, wait until it's time to output + * statistics and check again. + */ + if (conn->statistics == 0) { + WT_ERR(__wt_cond_wait( + session, conn->stat_cond, conn->stat_usecs)); + continue; + } + + /* Get the current local time of day. */ + WT_ERR(__wt_epoch(session, &ts)); + tm = localtime_r(&ts.tv_sec, &_tm); + + /* Create the logging path name for this time of day. */ + if (strftime(tmp.mem, tmp.memsize, conn->stat_path, tm) == 0) + WT_ERR_MSG( + session, ENOMEM, "strftime path conversion"); + + /* If the path has changed, close/open the new log file. 
*/ + if (fp == NULL || strcmp(tmp.mem, path.mem) != 0) { + if (fp != NULL) { + (void)fclose(fp); + fp = NULL; + } + + (void)strcpy(path.mem, tmp.mem); + WT_ERR_TEST( + (fp = fopen(path.mem, "a")) == NULL, __wt_errno()); + } + + /* Create the entry prefix for this time of day. */ + if (strftime(tmp.mem, tmp.memsize, conn->stat_stamp, tm) == 0) + WT_ERR_MSG( + session, ENOMEM, "strftime timestamp conversion"); + + /* Dump the connection statistics. */ + WT_ERR(__stat_server_dump( + session, conn->home, "statistics:", tmp.mem, fp)); + + /* Dump the object list statistics. */ + if ((p = conn->stat_sources) != NULL) + for (; *p != NULL; p += 2) + WT_ERR(__stat_server_dump( + session, p[0], p[1], tmp.mem, fp)); + + /* Flush. */ + WT_ERR(fflush(fp) == 0 ? 0 : __wt_errno()); + + /* Wait until the next event. */ + WT_ERR( + __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs)); + } + + if (0) { +err: __wt_err(session, ret, "statistics log server error"); + } + if (fp != NULL) + WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno()); + __wt_buf_free(session, &path); + __wt_buf_free(session, &tmp); + return (NULL); +} + +/* + * __wt_statlog_create - + * Start the statistics server thread. + */ +int +__wt_statlog_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) +{ + WT_SESSION_IMPL *session; + int run; + + session = conn->default_session; + + /* Handle configuration. */ + WT_RET(__statlog_config(session, cfg, &run)); + + /* If not configured, we're done. */ + if (!run) + return (0); + + /* The statistics log server gets its own session. */ + WT_RET(__wt_open_session(conn, 1, NULL, NULL, &conn->stat_session)); + conn->stat_session->name = "statlog-server"; + + WT_RET(__wt_cond_alloc( + session, "statistics log server", 0, &conn->stat_cond)); + + /* + * Start the thread. + * + * Statistics logging creates a thread per database, rather than using + * a single thread to do logging for all of the databases. 
If we ever + * see lots of databases at a time, doing statistics logging, and we + * want to reduce the number of threads, there's no reason we have to + * have more than one thread, I just didn't feel like writing the code + * to figure out the scheduling. + */ + WT_RET(__wt_thread_create( + session, &conn->stat_tid, __stat_server, conn->stat_session)); + conn->stat_tid_set = 1; + + return (0); +} + +/* + * __wt_statlog_destroy - + * Destroy the statistics server thread. + */ +int +__wt_statlog_destroy(WT_CONNECTION_IMPL *conn) +{ + WT_DECL_RET; + WT_SESSION *wt_session; + WT_SESSION_IMPL *session; + char **p; + + session = conn->default_session; + + if (conn->stat_tid_set) { + WT_TRET(__wt_cond_signal(session, conn->stat_cond)); + WT_TRET(__wt_thread_join(session, conn->stat_tid)); + conn->stat_tid_set = 0; + } + if (conn->stat_cond != NULL) + WT_TRET(__wt_cond_destroy(session, conn->stat_cond)); + + if ((p = conn->stat_sources) != NULL) { + for (; *p != NULL; ++p) + __wt_free(session, *p); + __wt_free(session, conn->stat_sources); + } + __wt_free(session, conn->stat_path); + __wt_free(session, conn->stat_stamp); + + /* Close the server thread's session, free its hazard array. 
*/ + if (conn->stat_session != NULL) { + wt_session = &conn->stat_session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + __wt_free(session, conn->stat_session->hazard); + } + + return (ret); } diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index ba49cb6927c..3557dc13026 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -359,7 +359,8 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, flags = 0; WT_RET(__wt_config_gets_defno(session, cfg, "bulk", &cval)); - if (cval.type == ITEM_NUM && (cval.val == 0 || cval.val == 1)) { + if (cval.type == ITEM_BOOL || + (cval.type == ITEM_NUM && (cval.val == 0 || cval.val == 1))) { bitmap = 0; bulk = (cval.val != 0); } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len)) diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 91ae5e2c8f2..e40bfa509f6 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -292,6 +292,9 @@ __curindex_close(WT_CURSOR *cursor) if (cindex->child != NULL) WT_TRET(cindex->child->close(cindex->child)); + + if (cindex->table != NULL) + __wt_schema_release_table(session, cindex->table); /* The URI is owned by the index. */ cursor->uri = NULL; WT_TRET(__wt_cursor_close(cursor)); @@ -408,7 +411,10 @@ __wt_curindex_open(WT_SESSION_IMPL *session, * using only the primary's recno as the index key. Disallow that for * now. */ - WT_ASSERT(session, !WT_CURSOR_RECNO(cursor)); + if (WT_CURSOR_RECNO(cursor)) + WT_ERR_MSG(session, WT_ERROR, + "Column store indexes based on a record number primary " + "key are not supported."); /* Handle projections. 
*/ if (columns != NULL) { diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index 8e93be354bb..4c344340c6c 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -307,8 +307,8 @@ __curstat_conn_init( cst->btree = NULL; cst->notpositioned = 1; - cst->stats_first = (WT_STATS *)S2C(session)->stats; - cst->stats_count = sizeof(*S2C(session)->stats) / sizeof(WT_STATS); + cst->stats_first = (WT_STATS *)&S2C(session)->stats; + cst->stats_count = sizeof(S2C(session)->stats) / sizeof(WT_STATS); cst->clear_func = LF_ISSET(WT_STATISTICS_CLEAR) ? __wt_stat_clear_connection_stats : NULL; } @@ -329,7 +329,7 @@ __curstat_file_init(WT_SESSION_IMPL *session, cst->btree = btree; cst->notpositioned = 1; - cst->stats_first = (WT_STATS *)btree->dhandle->stats; + cst->stats_first = (WT_STATS *)&btree->dhandle->stats; cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(WT_STATS); cst->clear_func = LF_ISSET(WT_STATISTICS_CLEAR) ? __wt_stat_clear_dsrc_stats : NULL; @@ -441,7 +441,8 @@ __wt_curstat_open(WT_SESSION_IMPL *session, WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); if (0) { -err: __wt_free(session, cst); +err: __wt_free(session, cst->stats); + __wt_free(session, cst); } return (ret); diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 63b0a8c29ee..ab4f48cbf83 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -623,6 +623,7 @@ __curtable_close(WT_CURSOR *cursor) __wt_free(session, cursor->value_format); __wt_free(session, ctable->cg_cursors); __wt_free(session, ctable->idx_cursors); + __wt_schema_release_table(session, ctable->table); /* The URI is owned by the table. */ cursor->uri = NULL; WT_TRET(__wt_cursor_close(cursor)); @@ -742,10 +743,14 @@ __wt_curtable_open(WT_SESSION_IMPL *session, size = WT_PTRDIFF(columns, tablename); WT_RET(__wt_schema_get_table(session, tablename, size, 0, &table)); - if (table->is_simple) + if (table->is_simple) { /* Just return a cursor on the underlying data source. 
*/ - return (__wt_open_cursor(session, - table->cgroups[0]->source, NULL, cfg, cursorp)); + ret = __wt_open_cursor(session, + table->cgroups[0]->source, NULL, cfg, cursorp); + + __wt_schema_release_table(session, table); + return (ret); + } WT_RET(__wt_calloc_def(session, 1, &ctable)); diff --git a/src/docs/bulk-load.dox b/src/docs/bulk-load.dox new file mode 100644 index 00000000000..f65e1f81ca0 --- /dev/null +++ b/src/docs/bulk-load.dox @@ -0,0 +1,23 @@ +/*! @page bulk_load Bulk-load + +WiredTiger cursors can be configured for bulk-load using the \c bulk +configuration keyword to WT_SESSION::open_cursor. Bulk-load is a "fast +path" for quickly loading a large number of rows. Bulk-load may only +be used on newly created objects, and an object being bulk-loaded is not +accessible from other cursors. + +Cursors configured for bulk-load only support the WT_CURSOR::insert and +WT_CURSOR::close methods. + +When bulk-loading row-store objects, keys must be loaded in sorted +order. + +When bulk-loading fixed-length column store objects, the \c bulk +configuration string value \c bitmap allows chunks of a memory resident +bitmap to be loaded directly into an object by passing a WT_ITEM to +WT_CURSOR::set_value, where the size field indicates the number of +records in the bitmap (as specified by the object's \c value_format +configuration). Bulk-loaded bitmap values must end on a byte boundary +relative to the bit count (except for the last set of values loaded). + + */ diff --git a/src/docs/cache-configuration.dox b/src/docs/cache-configuration.dox index 0faf0eaf144..3eda7a4fbdc 100644 --- a/src/docs/cache-configuration.dox +++ b/src/docs/cache-configuration.dox @@ -1,6 +1,6 @@ /*! @page cache_configuration Cache configuration -@section cache_basic Overview of WiredTiger cache configuration. +@section cache_basic Cache configuration The WiredTiger cache implements an approximation of a least recently used algorithm. 
Ideally the cache should be configured to be large enough to @@ -10,7 +10,7 @@ The WiredTiger cache size can be configured when first opening a database via @ref wiredtiger_open or changed after open using the WT_CONNECTION::reconfigure method. -@section shared_cache Overview of WiredTiger shared cache configuration. +@section shared_cache Shared cache configuration WiredTiger supports sharing a single cache among multiple databases within a process. @@ -50,7 +50,7 @@ WiredTiger shared cache tuning options can be configured when first opening a database via @ref wiredtiger_open or changed after open using the WT_CONNECTION::reconfigure method. -@section cache_eviction Overview of WiredTiger eviction configuration. +@section cache_eviction Eviction configuration WiredTiger provides several configuration options for tuning how aggressively pages are evicted from the cache. Different values will result in better diff --git a/src/docs/community.dox b/src/docs/community.dox new file mode 100644 index 00000000000..ee7674850dc --- /dev/null +++ b/src/docs/community.dox @@ -0,0 +1,23 @@ +/*! @page community WiredTiger community and contact information + +WiredTiger Inc., and the WiredTiger community maintain and develop +<a href="https://github.com/wiredtiger">WiredTiger on GitHub</a>, +and all contributors are welcome! + +All source code and revision histories for this software are available +in the <a href="https://github.com/wiredtiger/wiredtiger"> WiredTiger +source tree repository</a>. + +Please submit any feature suggestions and bug reports at +<a href="https://github.com/wiredtiger/wiredtiger/issues"> +WiredTiger's Issues page</a>. + +Discussion of WiredTiger issues and development can also be found in +the +<a href="http://groups.google.com/group/wiredtiger-users">WiredTiger +Google Group</a>. + +To contact WiredTiger, Inc. please send email to +<a mailto="info@wiredtiger.com">info@wiredtiger.com</a>. 
+ +*/ diff --git a/src/docs/cursors.dox b/src/docs/cursors.dox index fa25e0148a0..1fd3646603e 100644 --- a/src/docs/cursors.dox +++ b/src/docs/cursors.dox @@ -57,21 +57,21 @@ The following are some of the common builtin cursor types: @hrow{URI, Type, Notes} @row{<tt>backup:</tt>, hot backup cursor, See also: @ref hot_backup} - @row{<tt>colgroup:\<tablename\>.\<columnset\></tt>, + @row{<tt>colgroup:\<table name\>:\<column group name\></tt>, column group cursor,} @row{<tt>config:[\<uri\>]</tt>, object configuration cursor, (key=config string\, value=config value)} - @row{<tt>file:\<filename\></tt>, + @row{<tt>file:\<file name\></tt>, file cursor (key=file key\, value=file value),} - @row{<tt>index:\<tablename\>.\<indexname\></tt>, + @row{<tt>index:\<table name\>:\<index name\></tt>, index cursor (key=index key\, value=table value),} @row{<tt>lsm:\<name\></tt>, LSM cursor (key=LSM key\, value=LSM value), See also: @ref lsm} - @row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>, + @row{<tt>statistics:[\<data source URI\>]</tt>, database or file statistics (key=(int)\, value=(string)description\, (string)value\, (uint64_t)value),} - @row{<tt>table:\<tablename\></tt>, + @row{<tt>table:\<table name\></tt>, table cursor (key=table key\, value=table value),} </table> diff --git a/src/docs/data_sources.dox b/src/docs/data_sources.dox index 458786a7617..2b9a33caddc 100644 --- a/src/docs/data_sources.dox +++ b/src/docs/data_sources.dox @@ -19,23 +19,23 @@ The following are the builtin cursor types: @hrow{URI, Type, Notes} @row{<tt>backup:</tt>, hot backup cursor, See also: @ref hot_backup} - @row{<tt>colgroup:\<tablename\>.\<columnset\></tt>, + @row{<tt>colgroup:\<table name\>:\<column group name\></tt>, column group cursor,} @row{<tt>config:[\<uri\>]</tt>, object configuration cursor (key=config string\, value=config value),} - @row{<tt>file:\<filename\></tt>, + @row{<tt>file:\<file name\></tt>, file cursor (key=file key\, value=file value),} - 
@row{<tt>index:\<tablename\>.\<indexname\></tt>, + @row{<tt>index:\<table name\>:\<index name\></tt>, index cursor (key=index key\, value=table value),} @row{<tt>join:\<cursor1\>\&\<cursor2\>[&\<cursor3\>...]</tt>, join cursor, @notyet{join cursors}} @row{<tt>lsm:\<name\></tt>, LSM cursor (key=LSM key\, value=LSM value), See also: @ref lsm} - @row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>, -database or file statistics (key=(int)\, + @row{<tt>statistics:[\<data source URI\>]</tt>, +database or data source statistics (key=(int)\, value=(string)description\, (string)value\, (uint64_t)value),} - @row{<tt>table:\<tablename\></tt>, + @row{<tt>table:\<table name\></tt>, table cursor (key=table key\, value=table value),} </table> diff --git a/src/docs/install.dox b/src/docs/install.dox index dbf4712ba1d..263ebc541b7 100644 --- a/src/docs/install.dox +++ b/src/docs/install.dox @@ -45,7 +45,7 @@ To rebuild from scratch, discard any previous configuration by cleaning out the build area: @code -make realclean +make distclean @endcode To see additional configuration options, run: diff --git a/src/docs/license.dox b/src/docs/license.dox index 2644daa22e2..2c1f9f8eeb4 100644 --- a/src/docs/license.dox +++ b/src/docs/license.dox @@ -1,7 +1,8 @@ /*! @page license WiredTiger license -The WiredTiger software is Open Source software: you may redistribute -it and modify it under the terms of version 3 of the +The complete WiredTiger software package is Open Source software: you +are welcome to modify and redistribute it under the terms of version 3 +of the <a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html"> <b>GNU General Public License</b></a> as published by the Free Software Foundation. This program is @@ -11,6 +12,13 @@ FITNESS FOR A PARTICULAR PURPOSE. See the <a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html"> <b>GNU General Public License</b></a> for details. 
+For a license to use the WiredTiger software under conditions other than +those described above, or for technical support for this software, please +contact WiredTiger, Inc. at +<a mailto="info@wiredtiger.com">info@wiredtiger.com</a>. + +@section library 3rd party software included in the WiredTiger library binary + The WiredTiger library binary includes software copyrighted under the terms of the <a href="http://www.opensource.org/licenses/BSD-3-Clause"> @@ -20,13 +28,6 @@ and the <b>MIT License</b></a>. Any redistribution should comply with these copyrights. -For a license to use the WiredTiger software under conditions other than -those described above, or for technical support for this software, please -contact WiredTiger, Inc. at -<a mailto="info@wiredtiger.com">info@wiredtiger.com</a>. - -@section library 3rd party software included in the WiredTiger library binary - The WiredTiger library binary includes the following 3rd party software, distributed under the following licenses: @@ -51,10 +52,9 @@ sources, please review the copyright notices and LICENSE files included in the WiredTiger distribution for the terms and conditions of such redistribution. -@section pd Public domain software +@section public_domain Public domain software -Portions of this program are public domain software. Public domain -files have copyright notices releasing the software into the public -domain and may be freely used. +Portions of this program are public domain software. Public domain files have +notices releasing the software into the public domain and may be freely used. */ diff --git a/src/docs/namespace.dox b/src/docs/namespace.dox index 67702c1f1bb..a307185c93e 100644 --- a/src/docs/namespace.dox +++ b/src/docs/namespace.dox @@ -1,4 +1,4 @@ -/*! @page name_space Programmatic name spaces +/*! 
@page name_space Name spaces @section env Process' environment name space diff --git a/src/docs/programming.dox b/src/docs/programming.dox index 90034b203b0..2b84792f075 100644 --- a/src/docs/programming.dox +++ b/src/docs/programming.dox @@ -5,27 +5,27 @@ WiredTiger applications: - @subpage basic_api - @subpage config_strings -- @subpage error_handling - - @subpage cursors -- @subpage threads +- @subpage transactions +- @subpage error_handling - @subpage schema -- @subpage file_formats - @subpage lsm +- @subpage file_formats -- @subpage transactions +- @subpage bulk_load - @subpage cache_configuration - @subpage checkpoints -- @subpage hot_backup - - @subpage compression +- @subpage hot_backup +- @subpage statistics +- @subpage threads +- @subpage tuning - @subpage home - @subpage database_config - @subpage name_space - @subpage security - @subpage signals -- @subpage tuning */ diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 7e1b8fa2d04..62551104925 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -41,6 +41,7 @@ WiredTigerCheckpoint aR ack'ed alloc +allocator allocsize ao api @@ -97,6 +98,7 @@ del desc destructor destructors +distclean dl dlp dontlock @@ -116,6 +118,7 @@ env eof erlang errno +fadvise failchk fd's fieldname @@ -135,6 +138,7 @@ getopt getter gid github +gnuplot hb hotbackup href @@ -201,6 +205,7 @@ mutexes mutexing mvcc mygcc +mytable namespace ndary ndbm @@ -223,6 +228,7 @@ objectsin ol oltp oob +os ovfl pcoll pdf @@ -238,6 +244,7 @@ printvalue priv pthread pthreads +py qnx rdbms rdlock @@ -270,8 +277,10 @@ spinlocks sql src startsync +statlog str strerror +strftime struct structs subdatabases diff --git a/src/docs/statistics.dox b/src/docs/statistics.dox new file mode 100644 index 00000000000..0258ae2d59d --- /dev/null +++ b/src/docs/statistics.dox @@ -0,0 +1,74 @@ +/*! @page statistics Statistics + +WiredTiger can be configured to maintain a variety of run-time +statistics. 
The \c statistics configuration boolean must be set for +statistics to be maintained; see @ref data_statistics for information +about accessing the statistics. The following example configures +WiredTiger to maintain statistics: + +@snippet ex_all.c Statistics configuration + +Note that maintaining statistics involves updating shared-memory data +structures and may decrease application performance. + +@section statistics_log Statistics logging +WiredTiger will optionally log the current database statistics into a +file when the \c statistics_log.log configuration string of +the ::wiredtiger_open function is set. + +The following example logs statistics every 30 seconds: + +@snippet ex_all.c Statistics logging + +Each record is formatted as a space-separated timestamp, unsigned 64-bit +value and a variable length string which describes the statistic. + +The timestamp format may be changed with the \c statistics_log.timestamp +configuration string. The \c timestamp value may contain ISO C90 standard +strftime conversion specifications. + +By default, only the system's connection statistics are logged, but +statistics may be optionally reported for underlying objects by adding +a list of URIs to the \c statistics_log configuration string: + +@snippet ex_all.c Statistics logging with objects + +When database statistics are logged, the database home will be the first +space-separated entry for each record in the log file. For example: + +@code +Mar 08 11:38:23 463 /database/home pthread mutex condition wait calls +Mar 08 11:38:23 0 /database/home files currently open +Mar 08 11:38:23 1855437 /database/home total heap memory allocations +Mar 08 11:38:23 1856622 /database/home total heap memory frees +Mar 08 11:38:23 1 /database/home total heap memory re-allocations +Mar 08 11:38:23 472 /database/home total read I/Os +@endcode + +When object statistics are logged, the object URI will be the first +space-separated entry for each record in the log file. 
For example: + +@code +Mar 20 10:42:36 21 table:mytable compressed pages written +Mar 20 10:42:36 0 table:mytable page written failed to compress +Mar 20 10:42:36 5 table:mytable page written was too small to compress +Mar 20 10:42:36 586 table:mytable cursor insert calls +Mar 20 10:42:36 0 table:mytable bulk-loaded cursor-insert calls +@endcode + +The location of the log files may be changed with the \c statistics_log.path +configuration string. The \c path value may contain ISO C90 standard +strftime conversion specifications. WiredTiger will not create non-existent +directories in the path, they must exist before ::wiredtiger_open is called. + +The following example logs statistics into files named with the month, +day and year: + +@snippet ex_all.c Statistics logging with path + +A Python script that parses the default logging output and uses the +<a href="http://www.gnuplot.info/">gnuplot</a> utility to generate +Portable Network Graphics (PNG) format graphs is included in the +WiredTiger distribution in the file \c tools/statlog.py. 
+ +*/ diff --git a/src/docs/style/DoxygenLayout.xml b/src/docs/style/DoxygenLayout.xml index 1d7363f48db..fba7c189509 100644 --- a/src/docs/style/DoxygenLayout.xml +++ b/src/docs/style/DoxygenLayout.xml @@ -19,6 +19,7 @@ <tab type="globals" visible="yes" title="" intro=""/> </tab> <tab type="examples" visible="yes" title="" intro=""/> + <tab type="user" url="community.html" visible="yes" title="Community"/> <tab type="user" url="license.html" visible="yes" title="License"/> </navindex> diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox index 49e6b2f75ff..9293c21fc67 100644 --- a/src/docs/top/main.dox +++ b/src/docs/top/main.dox @@ -6,7 +6,10 @@ WiredTiger is an high performance, scalable, production quality, NoSQL, @section releases Releases <table> -@row{<b>WiredTiger 1.4.2</b> (current), +@row{<b>WiredTiger 1.5.0</b> (current), + <a href="releases/wiredtiger-1.5.0.tar.bz2"><b>[Release package]</b></a>, + <a href="1.5.0/index.html"><b>[Documentation]</b></a>} +@row{<b>WiredTiger 1.4.2</b>, <a href="releases/wiredtiger-1.4.2.tar.bz2"><b>[Release package]</b></a>, <a href="1.4.2/index.html"><b>[Documentation]</b></a>} @row{<b>Development branch</b>, diff --git a/src/docs/tuning.dox b/src/docs/tuning.dox index 202256b6ce5..ebfbd4d57a1 100644 --- a/src/docs/tuning.dox +++ b/src/docs/tuning.dox @@ -1,6 +1,8 @@ /*! @page tuning Performance Tuning -@section tuning_cache_size Cache size +@section tuning_cache WiredTiger's cache + +@subsection tuning_cache_size Cache size The cache size for the database is configurable by setting the \c cache_size configuration string when calling the ::wiredtiger_open @@ -13,18 +15,7 @@ An example of setting a cache size to 500MB: @snippet ex_config.c configure cache size -@section tuning_memory_allocation Memory allocation - -The performance of heavily-threaded WiredTiger applications can be -dominated by memory allocation because the WiredTiger engine has to free -and re-allocate memory as part of many queries. 
Replacing the system's -malloc implementation with one that has better threaded performance (for -example, Google's -<a href="http://goog-perftools.sourceforge.net/doc/tcmalloc.html">tcmalloc</a>, -or <a href="http://www.canonware.com/jemalloc">jemalloc</a>), -can dramatically improve throughput. - -@section tuning_read_only_objects Read-only objects +@subsection tuning_read_only_objects Read-only objects Cursors opened on checkpoints (either named, or using the special "last checkpoint" name "WiredTigerCheckpoint") are read-only objects. Unless @@ -45,7 +36,7 @@ string "checkpoint" with the name "WiredTigerCheckpoint" to the WT_SESSION::open_cursor method: @snippet ex_all.c open the default checkpoint -@section tuning_cache_resident Cache resident objects +@subsection tuning_cache_resident Cache resident objects Cache resident objects (objects never considered for the purposes of cache eviction), can be configured with the WT_SESSION::create @@ -63,6 +54,36 @@ An example of configuring a cache-resident object: @snippet ex_all.c Create a cache-resident object +@section tuning_memory_allocator Memory allocator + +The performance of heavily-threaded WiredTiger applications can be +dominated by memory allocation because the WiredTiger engine has to free +and re-allocate memory as part of many queries. Replacing the system's +malloc implementation with one that has better threaded performance (for +example, Google's +<a href="http://goog-perftools.sourceforge.net/doc/tcmalloc.html">tcmalloc</a>, +or <a href="http://www.canonware.com/jemalloc">jemalloc</a>), +can dramatically improve throughput. + +@section tuning_cursor_persistence Cursor persistence + +Opening a new cursor is a relatively expensive operation in WiredTiger +(especially in table objects and Log-Structured Merge Trees (LSM) trees, +where a logical cursor may require multiple, underlying object cursors), +and caching cursors can improve performance. 
On the other hand, cursors +hold positions in objects, and therefore long-lived cursor positions can +decrease performance. The best combination is to cache cursors, but use +the WT_CURSOR::reset method to discard the cursor's position in the +object when the position is no longer needed. + +Additionally, cursors are automatically reset whenever a transaction +boundary is crossed; when a transaction is started with the +WT_SESSION::begin_transaction or ended with either +WT_SESSION::commit_transaction or WT_SESSION::rollback_transaction, all +open cursors are automatically reset, there is no need to call the +WT_CURSOR::reset method explicitly, and the cursor can be immediately +reused. + @section tuning_page_size Page and overflow sizes There are four page and item size configuration values: \c internal_page_max, @@ -140,6 +161,82 @@ An example of configuring page sizes: @snippet ex_file.c file create +@section tuning_system_buffer_cache System buffer cache + +@subsection tuning_system_buffer_cache_direct_io Direct I/O + +WiredTiger optionally supports direct I/O. Configuring direct I/O may +be useful for applications wanting to: +- minimize the operating system cache effects of I/O to and from +WiredTiger's buffer cache, +- avoid double-buffering of blocks in WiredTiger's cache and the +operating system buffer cache, and +- avoid stalling underlying solid-state drives by writing a large number +of dirty blocks. + +Direct I/O is configured using the "direct_io" configuration string to +the ::wiredtiger_open function. An example of configuring direct I/O +for WiredTiger's data files: + +@snippet ex_all.c Configure direct_io for data files + +Direct I/O implies a writing thread waits for the write to complete +(which is a slower operation than writing into the system buffer cache), +and configuring direct I/O is likely to decrease overall application +performance. 
+ +Direct I/O is based on the non-standard \c O_DIRECT flag to the POSIX +1003.1 open system call and may not be available on all platforms. + +@subsection tuning_system_buffer_cache_os_cache_dirty_max os_cache_dirty_max + +As well as direct I/O, WiredTiger supports two additional configuration +options related to the system buffer cache: + +The first is \c os_cache_dirty_max, the maximum dirty bytes an object +is allowed to have in the system buffer cache. Once this many bytes +from an object are written into the system buffer cache, WiredTiger will +attempt to schedule writes for all of the dirty blocks the object has +in the system buffer cache. This configuration option allows +applications to flush dirty blocks from the object, avoiding stalling +any underlying drives when the object is subsequently flushed to disk +as part of a durability operation. + +An example of configuring \c os_cache_dirty_max: + +@snippet ex_all.c os_cache_dirty_max configuration + +The \c os_cache_dirty_max configuration may not be used in combination +with direct I/O. + +The \c os_cache_dirty_max configuration is based on the non-standard +Linux \c sync_file_range system call and may not be available on all +platforms. + +@subsection tuning_system_buffer_cache_os_cache_max os_cache_max + +The second configuration option related to the system buffer cache is +\c os_cache_max, the maximum bytes an object is allowed to have in the +system buffer cache. Once this many bytes from an object are either +read into or written from the system buffer cache, WiredTiger will +attempt to evict all of the object's blocks from the buffer cache. This +configuration option allows applications to evict blocks from the system +buffer cache to limit double-buffering and system buffer cache overhead. + +An example of configuring \c os_cache_max: + +@snippet ex_all.c os_cache_max configuration + +The \c os_cache_max configuration may not be used in combination with +direct I/O. 
+ +The \c os_cache_max configuration is based on the POSIX 1003.1 standard +\c posix_fadvise system call and may not be available on all platforms. + +Configuring direct I/O, \c os_cache_dirty_max or \c os_cache_max all +have the side effect of turning off memory-mapping of objects in +WiredTiger. + @section tuning_checksums Checksums WiredTiger checksums file reads and writes, by default. In read-only @@ -158,45 +255,41 @@ blocks which are not compressed: @snippet ex_all.c Configure checksums to uncompressed -@section tuning_direct_io Direct I/O - -WiredTiger optionally supports direct I/O, based on the non-standard \c -O_DIRECT flag to the POSIX 1003.1 open system call. Configuring direct -I/O may be useful for applications wanting to minimize the operating -system cache effects of I/O to and from WiredTiger's buffer cache. - -Direct I/O is configured using the "direct_io" configuration string to -the ::wiredtiger_open function. An example of configuring direct I/O -for WiredTiger's data files: - -@snippet ex_all.c Configure direct_io for data files - @section tuning_compression Compression -WiredTiger configures key prefix compression for row-store objects, and -column-store compression for both row-store and column-store objects, -by default. -These forms of compression minimize in-memory and on-disk space, but at -some CPU cost when rows are read and written. Turning these forms of -compression off may increase application throughput. +WiredTiger configures key prefix compression for row-store objects by +default. Additional forms of compression for both row- and column-store +objects, including dictionary and block compression, and Huffman +encoding, are optional. Compression minimizes in-memory and on-disk +resource requirements and decreases the amount of I/O, at some CPU cost +when rows are read and written. + +Configuring compression on or off may change application throughput. 
+For example, in applications using solid-state drives (where I/O is less +expensive), turning off compression may increase application performance +by reducing CPU costs; in applications where I/O costs are more +expensive, turning on compression may increase application performance +by reducing the overall number of I/O operations. For example, turning off row-store key prefix compression: @snippet ex_all.c Configure key prefix compression off -For example, turning off row-store or column-store dictionary compression: +For example, turning on row-store or column-store dictionary compression: -@snippet ex_all.c Configure dictionary compression off +@snippet ex_all.c Configure dictionary compression on -WiredTiger does not configure Huffman encoding or block compression by -default, but these forms of compression can also impact overall -throughput. See @ref file_formats_compression for more information. +See @ref file_formats_compression for more information. @section tuning_statistics Performance monitoring with statistics -WiredTiger maintains a variety of statistics that can be read with a -cursor. See @ref data_statistics for general information about accessing -statistics. +WiredTiger optionally maintains a variety of statistics, when the +\c statistics configuration string is specified to ::wiredtiger_open; +see @ref statistics for general information about statistics, and +@ref data_statistics for information about accessing the statistics. + +Note that maintaining run-time statistics involves updating +shared-memory data structures and may decrease application performance. The statistics gathered by WiredTiger can be combined to derive information about the system's behavior. For example, a cursor can be opened on the diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 5aab6284a76..c9ae4b62099 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,5 +1,16 @@ /*! 
@page upgrading Upgrading WiredTiger applications +@section version_143 Upgrading to Version 1.4.3 +<dl> +<dt>Statistics</dt> +<dd> +WiredTiger statistics are no longer maintained by default; to configure +statistics, use the \c statistics configuration string to the +::wiredtiger_open function. +</dd> + +</dl> +<hr> @section version_139 Upgrading to Version 1.3.9 <dl> diff --git a/src/include/api.h b/src/include/api.h index 6e7534af908..e0943fe4d94 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -74,7 +74,7 @@ (ret) != WT_DUPLICATE_KEY) \ F_SET(&(s)->txn, TXN_ERROR); \ break; \ -} while (1) +} while (ret == 0) /* * If a session or connection method is about to return WT_NOTFOUND (some diff --git a/src/include/block.h b/src/include/block.h index bb86d799349..0d4f3275a4c 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -159,7 +159,8 @@ struct __wt_bm { (WT_BM *, WT_SESSION_IMPL *, uint8_t *, uint32_t *, int *); int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *); int (*salvage_valid)(WT_BM *, WT_SESSION_IMPL *, uint8_t *, uint32_t); - int (*stat)(WT_BM *, WT_SESSION_IMPL *); + int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats); + int (*sync)(WT_BM *, WT_SESSION_IMPL *); int (*verify_addr) (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, uint32_t); int (*verify_end)(WT_BM *, WT_SESSION_IMPL *); @@ -194,7 +195,12 @@ struct __wt_block { /* Configuration information, set when the file is opened. */ uint32_t allocsize; /* Allocation size */ - u_int block_header; /* Header length */ + u_int block_header; /* Header length */ + + int64_t os_cache; /* System buffer cache flush max */ + int64_t os_cache_max; + int64_t os_cache_dirty; /* System buffer cache write max */ + int64_t os_cache_dirty_max; /* * There is only a single checkpoint in a file that can be written. 
The @@ -206,6 +212,13 @@ struct __wt_block { WT_SPINLOCK live_lock; /* Live checkpoint lock */ WT_BLOCK_CKPT live; /* Live checkpoint */ + /* + * Array of free WT_EXTLIST structures, if we're doing lots of I/O, + * a cache avoids an allocation/free while holding the spin lock. + */ + WT_EXT *free_ext; /* List of free entries */ + u_int free_ext_cnt; /* Limit the number we cache */ + /* Salvage support */ off_t slvg_off; /* Salvage file offset */ diff --git a/src/include/btmem.h b/src/include/btmem.h index 8f2ec8e8fa9..d6b2deb6255 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -133,6 +133,14 @@ struct __wt_page_modify { */ uint32_t disk_gen; + /* + * Track the highest transaction ID at which the page was written to + * disk. This can be used to avoid trying to write the page multiple + * times if a snapshot is keeping old versions pinned (e.g., in a + * checkpoint). + */ + wt_txnid_t disk_txn; + union { WT_PAGE *split; /* Resulting split */ WT_ADDR replace; /* Resulting replacement */ @@ -224,7 +232,7 @@ struct __wt_page { */ struct { uint64_t recno; /* Starting recno */ - WT_REF *t; /* Subtree */ + WT_REF *t; /* Subtree */ } intl; /* Row-store leaf page. */ @@ -237,10 +245,9 @@ struct __wt_page { * WT_PAGE structure as small as possible for read-only * pages. For consistency, we could move the row-store * modification structures into WT_PAGE_MODIFY too, but - * it doesn't shrink WT_PAGE any further, and avoiding - * ugly naming in WT_PAGE_MODIFY to avoid growing it - * won't be pretty. So far, avoiding ugly naming has - * overridden consistency. + * that doesn't shrink WT_PAGE any further and it would + * require really ugly naming inside of WT_PAGE_MODIFY + * to avoid growing that structure. 
*/ WT_INSERT_HEAD **ins; /* Inserts */ WT_UPDATE **upd; /* Updates */ @@ -274,17 +281,29 @@ struct __wt_page { WT_PAGE_MODIFY *modify; /* - * The read generation is incremented each time the page is searched, - * and acts as an LRU value for each page in the tree; it is read by - * the eviction server thread to select pages to be discarded from the - * in-memory tree. + * The page's read generation acts as an LRU value for each page in the + * tree; it is used by the eviction server thread to select pages to be + * discarded from the in-memory tree. * - * The read generation is a 64-bit value; incremented every time the - * page is searched, a 32-bit value could overflow. + * The read generation is a 64-bit value, if incremented frequently, a + * 32-bit value could overflow. * - * The read-generation is not declared volatile: read-generation is set - * a lot (on every access), and we don't want to write it that much. + * The read generation is a piece of shared memory potentially accessed + * by many threads. We don't want to update page read generations for + * in-cache workloads and suffer the cache misses, so we don't simply + * increment the read generation value on every access. Instead, the + * read generation is initialized to 0, then set to a real value if the + * page is ever considered for eviction. Once set to a real value, the + * read generation is potentially incremented every time the page is + * accessed. To try and avoid incrementing the page at a fast rate in + * this case, the read generation is incremented to a future point. + * + * The read generation is not declared volatile or published: the read + * generation is set a lot, and we don't want to write it that much. */ +#define WT_READ_GEN_NOTSET 0 +#define WT_READ_GEN_OLDEST 1 +#define WT_READ_GEN_STEP 1000 uint64_t read_gen; /* @@ -420,6 +439,28 @@ struct __wt_ref { (ref) = (page)->u.intl.t; (i) > 0; ++(ref), --(i)) /* + * WT_LINK_PAGE -- + * Link a child page into a reference in its parent. 
+ */ +#define WT_LINK_PAGE(ppage, pref, cpage) do { \ + (pref)->page = (cpage); \ + (cpage)->parent = (ppage); \ + (cpage)->ref = (pref); \ +} while (0) + +/* + * WT_MERGE_STACK_MIN -- + * When stacks of in-memory pages become this deep, they are considered for + * merging. + * + * WT_MERGE_FULL_PAGE -- + * When the result of a merge contains more than this number of keys, it is + * considered "done" and will not be merged again. + */ +#define WT_MERGE_STACK_MIN 3 +#define WT_MERGE_FULL_PAGE 100 + +/* * WT_ROW -- * Each in-memory page row-store leaf page has an array of WT_ROW structures: * this is created from on-page data when a page is read from the file. It's diff --git a/src/include/btree.h b/src/include/btree.h index 4eff4851348..49b814266ba 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -9,12 +9,9 @@ #define WT_BTREE_MINOR_VERSION 0 /* - * The minimum btree leaf and internal page sizes are 512B, the maximum 512MB. - * (The maximum of 512MB is enforced by the software, it could be set as high - * as 4GB.) + * The maximum btree leaf and internal page size is 512MB. (The maximum of + * 512MB is enforced by the software, it could be set as high as 4GB.) */ -#define WT_BTREE_ALLOCATION_SIZE_MIN 512 -#define WT_BTREE_ALLOCATION_SIZE_MAX (128 * WT_MEGABYTE) #define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE) /* @@ -40,13 +37,6 @@ #define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */ /* - * Split page size calculation -- we don't want to repeatedly split every time - * a new entry is added, so we split to a smaller-than-maximum page size. 
- */ -#define WT_SPLIT_PAGE_SIZE(pagesize, allocsize, pct) \ - WT_ALIGN32(((uintmax_t)(pagesize) * (pct)) / 100, allocsize) - -/* * XXX * The server threads use their own WT_SESSION_IMPL handles because they may * want to block (for example, the eviction server calls reconciliation, and @@ -70,12 +60,12 @@ struct __wt_data_handle { const char *name; /* Object name as a URI */ const char *checkpoint; /* Checkpoint name (or NULL) */ - const char *config; /* Configuration string */ + const char **cfg; /* Configuration information */ WT_DATA_SOURCE *dsrc; /* Data source for this handle */ void *handle; /* Generic handle */ - WT_DSRC_STATS *stats; /* Data source statistics */ + WT_DSRC_STATS stats; /* Data-source statistics */ /* Flags values over 0xff are reserved for WT_BTREE_* */ #define WT_DHANDLE_DISCARD 0x01 /* Discard on release */ diff --git a/src/include/btree.i b/src/include/btree.i index a04777169ee..c3c1c720fcb 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -20,23 +20,35 @@ __wt_page_is_modified(WT_PAGE *page) * __wt_eviction_page_force -- * Add a page for forced eviction if it matches the criteria. */ -static inline int +static inline void __wt_eviction_page_force(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; btree = S2BT(session); - if (btree != NULL && !F_ISSET(btree, WT_BTREE_NO_EVICTION) && + /* + * Ignore internal pages (check read-only information first to the + * extent possible, this is shared data). + */ + if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) + return; + + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) && __wt_page_is_modified(page) && - page->type != WT_PAGE_ROW_INT && page->type != WT_PAGE_COL_INT && page->memory_footprint > btree->maxmempage) - return (__wt_evict_forced_page(session, page)); - - return (0); + __wt_evict_forced_page(session, page); } /* + * Estimate the per-allocation overhead. All implementations of malloc / free + * have some kind of header and pad for alignment. 
We can't know for sure what + * that adds up to, but this is an estimate based on some measurements of heap + * size versus bytes in use. + */ +#define WT_ALLOC_OVERHEAD 32 + +/* * __wt_cache_page_inmem_incr -- * Increment a page's memory footprint in the cache. */ @@ -45,6 +57,8 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) { WT_CACHE *cache; + size += WT_ALLOC_OVERHEAD; + cache = S2C(session)->cache; (void)WT_ATOMIC_ADD(cache->bytes_inmem, size); (void)WT_ATOMIC_ADD(page->memory_footprint, WT_STORE_SIZE(size)); @@ -61,6 +75,8 @@ __wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) { WT_CACHE *cache; + size += WT_ALLOC_OVERHEAD; + cache = S2C(session)->cache; (void)WT_ATOMIC_SUB(cache->bytes_inmem, size); (void)WT_ATOMIC_SUB(page->memory_footprint, WT_STORE_SIZE(size)); @@ -96,31 +112,6 @@ __wt_cache_dirty_decr(WT_SESSION_IMPL *session, size_t size) } /* - * __wt_cache_page_read -- - * Read pages into the cache. - */ -static inline void -__wt_cache_page_read(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) -{ - WT_CACHE *cache; - - cache = S2C(session)->cache; - WT_ASSERT(session, size != 0); - (void)WT_ATOMIC_ADD(cache->pages_read, 1); - (void)WT_ATOMIC_ADD(cache->bytes_read, size); - (void)WT_ATOMIC_ADD(page->memory_footprint, WT_STORE_SIZE(size)); - - /* - * It's unusual, but possible, that the page is already dirty. - * For example, when reading an in-memory page with references to - * deleted leaf pages, the internal page may be marked dirty. If so, - * update the total bytes dirty here. - */ - if (__wt_page_is_modified(page)) - (void)WT_ATOMIC_ADD(cache->bytes_dirty, size); -} - -/* * __wt_cache_page_evict -- * Evict pages from the cache. 
*/ @@ -130,7 +121,9 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) WT_CACHE *cache; cache = S2C(session)->cache; + WT_ASSERT(session, page->memory_footprint != 0); + (void)WT_ATOMIC_ADD(cache->pages_evict, 1); (void)WT_ATOMIC_ADD(cache->bytes_evict, page->memory_footprint); @@ -140,7 +133,22 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) static inline uint64_t __wt_cache_read_gen(WT_SESSION_IMPL *session) { - return (++S2C(session)->cache->read_gen); + return (S2C(session)->cache->read_gen); +} + +static inline uint64_t +__wt_cache_read_gen_set(WT_SESSION_IMPL *session) +{ + /* + * We return read-generations from the future (where "the future" is + * measured by increments of the global read generation). The reason + * is because when acquiring a new hazard reference on a page, we can + * check its read generation, and if the read generation isn't less + * than the current global generation, we don't bother updating the + * page. In other words, the goal is to avoid some number of updates + * immediately after each update we have to make. + */ + return (++S2C(session)->cache->read_gen + WT_READ_GEN_STEP); } /* @@ -158,7 +166,7 @@ __wt_cache_pages_inuse(WT_CACHE *cache) * (although "interesting" corruption is vanishingly unlikely, these * values just increment over time). */ - pages_in = cache->pages_read; + pages_in = cache->pages_inmem; pages_out = cache->pages_evict; return (pages_in > pages_out ? pages_in - pages_out : 0); } @@ -178,7 +186,7 @@ __wt_cache_bytes_inuse(WT_CACHE *cache) * (although "interesting" corruption is vanishingly unlikely, these * values just increment over time). */ - bytes_in = cache->bytes_read + cache->bytes_inmem; + bytes_in = cache->bytes_inmem; bytes_out = cache->bytes_evict; return (bytes_in > bytes_out ? 
bytes_in - bytes_out : 0); } @@ -219,9 +227,13 @@ __wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page) /* * Multiple threads of control may be searching and deciding to modify - * a page, if we don't do the update, discard the memory. + * a page. If our modify structure is used, update the page's memory + * footprint, else discard the modify structure, another thread did the + * work. */ - if (!WT_ATOMIC_CAS(page->modify, NULL, modify)) + if (WT_ATOMIC_CAS(page->modify, NULL, modify)) + __wt_cache_page_inmem_incr(session, page, sizeof(*modify)); + else __wt_free(session, modify); return (0); } @@ -237,7 +249,15 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) (void)WT_ATOMIC_ADD(S2C(session)->cache->pages_dirty, 1); (void)WT_ATOMIC_ADD( S2C(session)->cache->bytes_dirty, page->memory_footprint); + + /* + * The page can never end up with changes older than the oldest + * running transaction. + */ + if (F_ISSET(&session->txn, TXN_RUNNING)) + page->modify->disk_txn = session->txn.snap_min - 1; } + /* * Publish: there must be a barrier to ensure all changes to the page * are flushed before we update the page's write generation, otherwise @@ -475,7 +495,7 @@ __wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page) /* * No lock is required because the session array is fixed size, but it - * it may contain inactive entries. We must review any active session + * may contain inactive entries. We must review any active session * that might contain a hazard pointer, so insert a barrier before * reading the active session count. That way, no matter what sessions * come or go, we'll check the slots for all of the sessions that could @@ -509,6 +529,36 @@ __wt_skip_choose_depth(void) } /* + * __wt_btree_size_overflow -- + * Check if the size of an in-memory tree with a single leaf page is + * over a specified maximum. 
If called on anything other than a simple + * tree with a single leaf page, returns true so the calling code will + * switch to a new tree. + */ +static inline int +__wt_btree_size_overflow(WT_SESSION_IMPL *session, uint32_t maxsize) +{ + WT_BTREE *btree; + WT_PAGE *child, *root; + + btree = S2BT(session); + root = btree->root_page; + + if (btree == NULL || root == NULL || + (child = root->u.intl.t->page) == NULL) + return (0); + + /* Make sure this is a simple tree, or LSM should switch. */ + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) || + root->entries != 1 || + root->u.intl.t->state != WT_REF_MEM || + child->type != WT_PAGE_ROW_LEAF) + return (1); + + return (child->memory_footprint > maxsize); +} + +/* * __wt_btree_lex_compare -- * Lexicographic comparison routine. * @@ -545,3 +595,18 @@ __wt_btree_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item) (((cmp) = __wt_btree_lex_compare((k1), (k2))), 0) : \ (bt)->collator->compare((bt)->collator, &(s)->iface, \ (k1), (k2), &(cmp))) + +/* + * __wt_btree_mergeable -- + * Determines whether the given page is a candidate for merging. + */ +static inline int +__wt_btree_mergeable(WT_PAGE *page) +{ + if (WT_PAGE_IS_ROOT(page) || + page->modify == NULL || + !F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)) + return (0); + + return (!WT_PAGE_IS_ROOT(page->parent)); +} diff --git a/src/include/cache.h b/src/include/cache.h index ce88994b733..a9d2af5dc61 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -25,9 +25,8 @@ struct __wt_cache { * be exact, they can't be garbage, we track what comes in and what goes * out and calculate the difference as needed. 
*/ - uint64_t bytes_read; /* Bytes/pages read by read server */ - uint64_t pages_read; - uint64_t bytes_inmem; /* Bytes/pages created in memory */ + uint64_t bytes_inmem; /* Bytes/pages in memory */ + uint64_t pages_inmem; uint64_t bytes_evict; /* Bytes/pages discarded by eviction */ uint64_t pages_evict; uint64_t bytes_dirty; /* Bytes/pages currently dirty */ @@ -54,7 +53,6 @@ struct __wt_cache { */ WT_EVICT_ENTRY *evict; /* LRU pages being tracked */ WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */ - size_t evict_allocated; /* LRU list bytes allocated */ uint32_t evict_entries; /* LRU list eviction slots */ uint32_t evict_candidates; /* LRU list pages to evict */ u_int evict_file_next; /* LRU: next file to search */ @@ -77,6 +75,8 @@ struct __wt_cache { * Flags. */ #define WT_EVICT_FORCE_PASS 0x01 /* Ignore the eviction trigger */ +#define WT_EVICT_NO_PROGRESS 0x02 /* Check if pages are being evicted */ +#define WT_EVICT_STUCK 0x04 /* Eviction server is stuck */ uint32_t flags; }; diff --git a/src/include/cache.i b/src/include/cache.i index 1c370655fc4..bf29f728181 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -63,7 +63,8 @@ __wt_cache_full_check(WT_SESSION_IMPL *session) if (!lockout || F_ISSET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED)) return (0); - if (F_ISSET(btree, WT_BTREE_BULK | WT_BTREE_NO_EVICTION)) + if (btree != NULL && + F_ISSET(btree, WT_BTREE_BULK | WT_BTREE_NO_EVICTION)) return (0); if ((ret = __wt_evict_lru_page(session, 1)) == EBUSY) __wt_yield(); diff --git a/src/include/cell.i b/src/include/cell.i index 1ef5059536a..637ed50de4d 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -448,11 +448,13 @@ __wt_cell_type_raw(WT_CELL *cell) static inline int __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) { - WT_DECL_RET; uint64_t v; const uint8_t *p; uint32_t saved_len; uint64_t saved_v; + int copied; + + copied = 0; /* * The verification code specifies an end 
argument, a pointer to 1 past @@ -467,7 +469,8 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) return (WT_ERROR); \ } while (0) - memset(unpack, 0, sizeof(*unpack)); +restart: + WT_CLEAR(*unpack); unpack->cell = cell; /* @@ -560,11 +563,8 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) saved_len = WT_PTRDIFF32(p, cell); saved_v = unpack->v; cell = (WT_CELL *)((uint8_t *)cell - v); - ret = __wt_cell_unpack_safe(cell, unpack, end); - unpack->raw = WT_CELL_VALUE_COPY; - unpack->__len = saved_len; - unpack->v = saved_v; - return (ret); + copied = 1; + goto restart; case WT_CELL_KEY_OVFL: case WT_CELL_VALUE_OVFL: @@ -600,6 +600,11 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) * we need the right length). */ done: CHK(cell, unpack->__len); + if (copied) { + unpack->raw = WT_CELL_VALUE_COPY; + unpack->__len = saved_len; + unpack->v = saved_v; + } return (0); } diff --git a/src/include/config.h b/src/include/config.h index 7d5e64e715a..cfbebcb239f 100644 --- a/src/include/config.h +++ b/src/include/config.h @@ -19,7 +19,7 @@ struct __wt_config_item { const char *str; size_t len; int64_t val; - enum { ITEM_STRING, ITEM_ID, ITEM_NUM, ITEM_STRUCT } type; + enum { ITEM_STRING, ITEM_BOOL, ITEM_ID, ITEM_NUM, ITEM_STRUCT } type; }; struct __wt_config_check { diff --git a/src/include/connection.h b/src/include/connection.h index 21a40238ff0..8fb137802a0 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -76,8 +76,6 @@ struct __wt_connection_impl { WT_SPINLOCK schema_lock; /* Schema operation spinlock */ WT_SPINLOCK serial_lock; /* Serial function call spinlock */ - int ckpt_backup; /* Backup: don't delete checkpoints */ - /* Connection queue */ TAILQ_ENTRY(__wt_connection_impl) q; /* Cache pool queue */ @@ -86,9 +84,12 @@ struct __wt_connection_impl { const char *home; /* Database home */ int is_new; /* Connection created database */ + int connection_initialized; /* 
Connection is initialized */ + WT_FH *lock_fh; /* Lock file handle */ - pthread_t cache_evict_tid; /* Cache eviction server thread ID */ + pthread_t cache_evict_tid; /* Eviction server thread ID */ + int cache_evict_tid_set; /* Eviction server thread ID set */ /* Locked: data handle list */ TAILQ_HEAD(__wt_dhandle_qh, __wt_data_handle) dhqh; @@ -129,9 +130,28 @@ struct __wt_connection_impl { WT_CACHE *cache; /* Page cache */ uint64_t cache_size; - WT_TXN_GLOBAL txn_global; /* Global transaction state. */ + WT_TXN_GLOBAL txn_global; /* Global transaction state */ - WT_CONNECTION_STATS *stats; /* Connection statistics */ + int ckpt_backup; /* Backup: don't delete checkpoints */ + + WT_SESSION_IMPL *ckpt_session; /* Checkpoint thread session */ + pthread_t ckpt_tid; /* Checkpoint thread */ + int ckpt_tid_set; /* Checkpoint thread set */ + WT_CONDVAR *ckpt_cond; /* Checkpoint wait mutex */ + const char *ckpt_config; /* Checkpoint configuration */ + long ckpt_usecs; /* Checkpoint period */ + + WT_CONNECTION_STATS stats; /* Connection statistics */ + int statistics; /* Global statistics configuration */ + WT_SESSION_IMPL *stat_session; /* Statistics log session */ + pthread_t stat_tid; /* Statistics log thread */ + int stat_tid_set; /* Statistics log thread set */ + WT_CONDVAR *stat_cond; /* Statistics log wait mutex */ + int stat_clear; /* Statistics log clear */ + const char *stat_path; /* Statistics log path format */ + char **stat_sources; /* Statistics log list of objects */ + const char *stat_stamp; /* Statistics log timestamp format */ + long stat_usecs; /* Statistics log period */ WT_FH *log_fh; /* Logging file handle */ @@ -150,6 +170,8 @@ struct __wt_connection_impl { /* If non-zero, all buffers used for I/O will be aligned to this. 
*/ size_t buffer_alignment; + uint32_t schema_gen; /* Schema generation number */ + uint32_t direct_io; /* O_DIRECT configuration */ int mmap; /* mmap configuration */ uint32_t verbose; diff --git a/src/include/cursor.h b/src/include/cursor.h index 6d1cf1f4d8e..e144c19de32 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -226,7 +226,7 @@ struct __wt_cursor_stat { uint64_t v; /* Current stats value */ WT_ITEM pv; /* Current stats value (string) */ - void (*clear_func)(WT_STATS *); /* Function to clear stats. */ + void (*clear_func)(void *); /* Function to clear stats. */ WT_BTREE *btree; /* Pinned btree handle. */ }; diff --git a/src/include/cursor.i b/src/include/cursor.i index 6e289058376..2a8ed9c2dcf 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -75,18 +75,18 @@ __cursor_leave(WT_CURSOR_BTREE *cbt) if (F_ISSET(cbt, WT_CBT_ACTIVE)) { WT_ASSERT(session, session->ncursors > 0); - if (--session->ncursors == 0) { + if (--session->ncursors == 0) __wt_txn_read_last(session); - - /* - * We no longer have any active cursors, check if our - * operation overflowed the cache. We don't care if we - * fail to evict pages: our operation is done - * regardless. - */ - (void)__wt_cache_full_check(session); - } F_CLR(cbt, WT_CBT_ACTIVE); + + /* + * If this is an autocommit operation that is just getting + * started, check that the cache isn't full. We may have other + * cursors open, but the one we just closed might help eviction + * make progress. + */ + if (F_ISSET(&session->txn, TXN_AUTOCOMMIT)) + WT_RET(__wt_cache_full_check(session)); } return (0); } diff --git a/src/include/error.h b/src/include/error.h index 842e7abe2ba..08efcfa1690 100644 --- a/src/include/error.h +++ b/src/include/error.h @@ -97,25 +97,17 @@ } while (0) /* - * WT_ASSERT, WT_ASSERT_ERR, WT_ASSERT_RET -- - * Assert an expression, abort in diagnostic mode, otherwise, optionally - * return an error. + * WT_ASSERT + * Assert an expression, aborting in diagnostic mode. 
Otherwise, + * "use" the session to keep the compiler quiet and don't evaluate the + * expression. */ +#ifdef HAVE_DIAGNOSTIC #define WT_ASSERT(session, exp) do { \ if (!(exp)) \ __wt_assert(session, 0, __FILE__, __LINE__, "%s", #exp);\ } while (0) -#define WT_ASSERT_ERR(session, exp) do { \ - if (!(exp)) { \ - __wt_assert( \ - session, WT_ERROR, __FILE__, __LINE__, "%s", #exp); \ - WT_ERR(WT_ERROR); \ - } \ -} while (0) -#define WT_ASSERT_RET(session, exp) do { \ - if (!(exp)) { \ - __wt_assert( \ - session, WT_ERROR, __FILE__, __LINE__, "%s", #exp); \ - return (WT_ERROR); \ - } \ -} while (0) +#else +#define WT_ASSERT(session, exp) \ + WT_UNUSED(session) +#endif diff --git a/src/include/extern.h b/src/include/extern.h index 902e21395a7..79057e17dae 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -58,12 +58,14 @@ extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session, const uint8_t *addr, uint32_t addr_size, int *skipp); +extern void __wt_block_ext_cleanup(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *tag, off_t offset, uint32_t size); -extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session, +extern int __wt_block_off_remove_overlap(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size); @@ -85,10 +87,12 @@ extern int __wt_block_extlist_check( WT_SESSION_IMPL *session, extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci); -extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, +extern int __wt_block_extlist_merge( WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *a, WT_EXTLIST *b); -extern int __wt_block_insert_ext( WT_SESSION_IMPL *session, +extern int __wt_block_insert_ext(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_EXTLIST *el, off_t off, off_t size); @@ -112,9 +116,16 @@ extern int __wt_block_extlist_init(WT_SESSION_IMPL *session, const char 
*name, const char *extname); extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el); +extern int __wt_block_map( WT_SESSION_IMPL *session, + WT_BLOCK *block, + void *mapp, + size_t *maplenp); +extern int __wt_block_unmap( WT_SESSION_IMPL *session, + WT_BLOCK *block, + void *map, + size_t maplen); extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, - const char *config, const char *cfg[], int forced_salvage, WT_BM **bmp); @@ -124,13 +135,14 @@ extern int __wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename); extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, - const char *config, const char *cfg[], int forced_salvage, WT_BLOCK **blockp); extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh); -extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block); +extern void __wt_block_stat(WT_SESSION_IMPL *session, + WT_BLOCK *block, + WT_DSRC_STATS *stats); extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, @@ -250,16 +262,14 @@ extern int __wt_debug_page(WT_SESSION_IMPL *session, const char *ofile); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); extern void __wt_evict_list_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page); -extern int __wt_evict_forced_page(WT_SESSION_IMPL *session, WT_PAGE *page); +extern void __wt_evict_forced_page(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); extern int __wt_sync_file_serial_func(WT_SESSION_IMPL *session, void *args); extern void *__wt_cache_evict_server(void *arg); extern void __wt_evict_clear_tree_walk(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_sync_file(WT_SESSION_IMPL *session, int syncop); extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app); -extern int __wt_btree_create(WT_SESSION_IMPL *session, const char 
*filename); -extern int __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename); -extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]); extern int __wt_btree_close(WT_SESSION_IMPL *session); extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, @@ -268,13 +278,9 @@ extern int __wt_btree_leaf_create( WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, WT_PAGE **pagep); -extern int __wt_btree_get_memsize( WT_SESSION_IMPL *session, - WT_BTREE *btree, - uint32_t **memsizep); -extern int __wt_btree_release_memsize(WT_SESSION_IMPL *session, - WT_BTREE *btree); -extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session, - const char *config); +extern void __wt_btree_evictable(WT_SESSION_IMPL *session, int on); +extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize); +extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session); extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session); extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, @@ -313,6 +319,10 @@ __wt_page_in_func( , const char *file, int line #endif ); +extern int __wt_page_alloc(WT_SESSION_IMPL *session, + uint8_t type, + uint32_t alloc_entries, + WT_PAGE **pagep); extern int __wt_page_inmem( WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref, @@ -351,6 +361,7 @@ extern int __wt_col_search(WT_SESSION_IMPL *session, extern int __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive); +extern int __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top); extern int __wt_rec_track(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, @@ -369,10 +380,6 @@ extern int __wt_rec_track_onpage_addr(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, uint32_t addr_size); -extern int __wt_rec_track_onpage_ref( WT_SESSION_IMPL *session, - WT_PAGE *page, - WT_PAGE *refpage, - WT_REF *ref); extern int 
__wt_rec_track_ovfl_reuse( WT_SESSION_IMPL *session, WT_PAGE *page, const void *data, @@ -401,7 +408,13 @@ extern int __wt_row_key_copy( WT_SESSION_IMPL *session, WT_ROW *rip_arg, WT_ITEM *retb); extern WT_CELL *__wt_row_value(WT_PAGE *page, WT_ROW *rip); -extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, +extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, + WT_PAGE *page, + uint32_t cell_offset, + const void *key, + uint32_t size, + void *ikeyp); +extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, uint32_t size, @@ -450,9 +463,6 @@ extern int __wt_config_subinit( WT_SESSION_IMPL *session, extern int __wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value); -extern int __wt_config_getraw( WT_CONFIG *cparser, - WT_CONFIG_ITEM *key, - WT_CONFIG_ITEM *value); extern int __wt_config_get(WT_SESSION_IMPL *session, const char **cfg, WT_CONFIG_ITEM *key, @@ -551,12 +561,14 @@ extern WT_CONFIG_CHECK __wt_confchk_session_verify[]; extern const char *__wt_confdfl_table_meta; extern WT_CONFIG_CHECK __wt_confchk_table_meta[]; extern const char *__wt_confdfl_wiredtiger_open; +extern WT_CONFIG_CHECK __wt_confchk_checkpoint_subconfigs[]; +extern WT_CONFIG_CHECK __wt_confchk_statistics_log_subconfigs[]; extern WT_CONFIG_CHECK __wt_confchk_wiredtiger_open[]; extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session); extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, - const char *cfg[], + const char *op_cfg[], uint32_t flags); extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, int (*func)(WT_SESSION_IMPL *, @@ -575,18 +587,22 @@ extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, extern int __wt_conn_dhandle_discard(WT_CONNECTION_IMPL *conn); extern int __wt_cache_config(WT_CONNECTION_IMPL *conn, const char *cfg[]); extern int __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]); -extern void 
__wt_cache_stats_update(WT_CONNECTION_IMPL *conn, uint32_t flags); +extern void __wt_cache_stats_update(WT_SESSION_IMPL *session); extern int __wt_cache_destroy(WT_CONNECTION_IMPL *conn); extern int __wt_conn_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg); extern int __wt_conn_cache_pool_open(WT_SESSION_IMPL *session); extern int __wt_conn_cache_pool_destroy(WT_CONNECTION_IMPL *conn); extern void *__wt_cache_pool_server(void *arg); +extern int __wt_checkpoint_create(WT_CONNECTION_IMPL *conn, const char *cfg[]); +extern int __wt_checkpoint_destroy(WT_CONNECTION_IMPL *conn); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn); extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn); extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]); extern int __wt_connection_close(WT_CONNECTION_IMPL *conn); extern void __wt_conn_stat_init(WT_SESSION_IMPL *session, uint32_t flags); +extern int __wt_statlog_create(WT_CONNECTION_IMPL *conn, const char *cfg[]); +extern int __wt_statlog_destroy(WT_CONNECTION_IMPL *conn); extern int __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], @@ -950,13 +966,16 @@ extern int __wt_schema_get_table(WT_SESSION_IMPL *session, size_t namelen, int ok_incomplete, WT_TABLE **tablep); +extern void __wt_schema_release_table(WT_SESSION_IMPL *session, + WT_TABLE *table); extern void __wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP *colgroup); extern void __wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx); extern void __wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table); -extern int __wt_schema_remove_table( WT_SESSION_IMPL *session, WT_TABLE *table); -extern int __wt_schema_close_tables(WT_SESSION_IMPL *session); +extern void __wt_schema_remove_table( WT_SESSION_IMPL *session, + WT_TABLE *table); +extern void __wt_schema_close_tables(WT_SESSION_IMPL *session); extern int __wt_schema_colgroup_name(WT_SESSION_IMPL *session, 
WT_TABLE *table, const char *cgname, @@ -1117,9 +1136,14 @@ extern void __wt_assert(WT_SESSION_IMPL *session, extern int __wt_panic(WT_SESSION_IMPL *session); extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name); extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri); +extern int __wt_absolute_path(const char *path); extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, const char **path); +extern int __wt_nfilename(WT_SESSION_IMPL *session, + const char *name, + size_t namelen, + const char **path); extern int __wt_library_init(void); extern int __wt_breakpoint(void); extern void __wt_attach(WT_SESSION_IMPL *session); @@ -1214,18 +1238,17 @@ extern void *__wt_scr_alloc_ext(WT_SESSION *wt_session, size_t size); extern void __wt_scr_free_ext(WT_SESSION *wt_session, void *p); extern void __wt_session_dump_all(WT_SESSION_IMPL *session); extern void __wt_session_dump(WT_SESSION_IMPL *session); -extern int __wt_stat_alloc_dsrc_stats(WT_SESSION_IMPL *session, - WT_DSRC_STATS **statsp); -extern void __wt_stat_clear_dsrc_stats(WT_STATS *stats_arg); -extern int __wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session, - WT_CONNECTION_STATS **statsp); -extern void __wt_stat_clear_connection_stats(WT_STATS *stats_arg); +extern void __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats); +extern void __wt_stat_clear_dsrc_stats(void *stats_arg); +extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats); +extern void __wt_stat_clear_connection_stats(void *stats_arg); extern int __wt_txnid_cmp(const void *v1, const void *v2); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); extern void __wt_txn_get_oldest(WT_SESSION_IMPL *session); extern void __wt_txn_get_snapshot( WT_SESSION_IMPL *session, wt_txnid_t my_id, - wt_txnid_t max_id); + wt_txnid_t max_id, + int force); extern void __wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session); extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char 
*cfg[]); extern void __wt_txn_release(WT_SESSION_IMPL *session); diff --git a/src/include/flags.h b/src/include/flags.h index e08724d1bfe..340bd924985 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -3,7 +3,8 @@ * flags section: BEGIN */ #define WT_CACHE_POOL_RUN 0x00000001 -#define WT_CONN_CACHE_POOL 0x00000020 +#define WT_CONN_CACHE_POOL 0x00000040 +#define WT_CONN_EVICTION_RUN 0x00000020 #define WT_CONN_LSM_MERGE 0x00000010 #define WT_CONN_PANIC 0x00000008 #define WT_CONN_SERVER_RUN 0x00000004 @@ -18,10 +19,11 @@ #define WT_SESSION_SCHEMA_LOCKED 0x00000001 #define WT_SKIP_UPDATE_ERR 0x00000002 #define WT_SKIP_UPDATE_QUIT 0x00000001 -#define WT_SYNC_CHECKPOINT 0x00000008 -#define WT_SYNC_COMPACT 0x00000004 -#define WT_SYNC_DISCARD 0x00000002 -#define WT_SYNC_DISCARD_NOWRITE 0x00000001 +#define WT_SYNC_CHECKPOINT 0x00000010 +#define WT_SYNC_COMPACT 0x00000008 +#define WT_SYNC_DISCARD 0x00000004 +#define WT_SYNC_DISCARD_NOWRITE 0x00000002 +#define WT_SYNC_WRITE_LEAVES 0x00000001 #define WT_TREE_CACHE 0x00000080 #define WT_TREE_COMPACT 0x00000040 #define WT_TREE_DISCARD 0x00000020 diff --git a/src/include/lsm.h b/src/include/lsm.h index 396363e9e44..3f288449de5 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -65,10 +65,9 @@ struct __wt_lsm_tree { WT_RWLOCK *rwlock; TAILQ_ENTRY(__wt_lsm_tree) q; - WT_DSRC_STATS *stats; /* LSM statistics */ + WT_DSRC_STATS stats; /* LSM statistics */ uint64_t dsk_gen; - uint32_t *memsizep; /* Configuration parameters */ uint32_t bloom_bit_count; diff --git a/src/include/misc.h b/src/include/misc.h index e42620b1d5f..53dbdc6544b 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -35,14 +35,9 @@ * Align an unsigned value of any type to a specified power-of-2, including the * offset result of a pointer subtraction; do the calculation using the largest * unsigned integer type available. - * - * Optionally cast the result to a uint32_t because that's the size of a piece - * of data in the WiredTiger engine. 
*/ #define WT_ALIGN(n, v) \ ((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1)) -#define WT_ALIGN32(n, v) \ - ((uint32_t)WT_ALIGN(n, v)) /* Min, max. */ #define WT_MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -165,6 +160,13 @@ #define WT_DECL_RET int ret = 0 /* + * Skip the default configuration string in a list of configurations. The + * default config is always the first entry in the array, and the array always + * has an explicit NULL terminator, so this is safe. + */ +#define WT_SKIP_DEFAULT_CONFIG(c) &(c)[1] + +/* * In diagnostic mode we track the locations from which hazard pointers and * scratch buffers were acquired. */ diff --git a/src/include/mutex.h b/src/include/mutex.h index b33c3b5b8a2..ca2b3346c5c 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -72,7 +72,7 @@ #if defined(_lint) #define WT_ATOMIC_ADD(v, val) ((v) += (val), (v)) #define WT_ATOMIC_CAS(v, oldv, newv) \ - ((v) == (oldv) || (v) == (newv) ? 1 : 0) + ((v) == (oldv) && (v) = (newv) ? 1 : 0) #define WT_ATOMIC_SUB(v, val) ((v) -= (val), (v)) #define WT_FULL_BARRIER() #define WT_READ_BARRIER() diff --git a/src/include/os.h b/src/include/os.h index f56fb939c43..ad2932cb403 100644 --- a/src/include/os.h +++ b/src/include/os.h @@ -30,12 +30,12 @@ } while (0) struct __wt_fh { + u_int refcnt; /* Reference count */ TAILQ_ENTRY(__wt_fh) q; /* List of open handles */ - off_t file_size; /* File size */ - char *name; /* File name */ + off_t file_size; /* File size */ int fd; /* POSIX file handle */ - u_int refcnt; /* Reference count */ + int direct_io; /* O_DIRECT configured */ }; diff --git a/src/include/packing.i b/src/include/packing.i index b5f50ebf56e..a47df42d56b 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -12,13 +12,6 @@ * gory details. The short version is that we have less cases to deal with * because the compiler promotes shorter types to int or unsigned int.
*/ - -typedef struct { - WT_SESSION_IMPL *session; - const char *cur, *end, *orig; - unsigned long repeats; -} WT_PACK; - typedef struct { union { int64_t i; @@ -31,6 +24,13 @@ typedef struct { char type; } WT_PACK_VALUE; +typedef struct { + WT_SESSION_IMPL *session; + const char *cur, *end, *orig; + unsigned long repeats; + WT_PACK_VALUE lastv; +} WT_PACK; + static inline int __pack_initn( WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt, size_t len) @@ -59,6 +59,7 @@ __pack_next(WT_PACK *pack, WT_PACK_VALUE *pv) char *endsize; if (pack->repeats > 0) { + *pv = pack->lastv; --pack->repeats; return (0); } @@ -66,13 +67,17 @@ __pack_next(WT_PACK *pack, WT_PACK_VALUE *pv) next: if (pack->cur == pack->end) return (WT_NOTFOUND); - pv->size = WT_STORE_SIZE(strtoul(pack->cur, &endsize, 10)); - pv->havesize = (endsize > pack->cur); - if (!pv->havesize) + if (isdigit(*pack->cur)) { + pv->havesize = 1; + pv->size = WT_STORE_SIZE(strtoul(pack->cur, &endsize, 10)); + pack->cur = endsize; + } else { + pv->havesize = 0; pv->size = 1; - pack->cur = endsize; - pack->repeats = 0; + } + pv->type = *pack->cur++; + pack->repeats = 0; switch (pv->type) { case 'S': @@ -107,6 +112,7 @@ next: if (pack->cur == pack->end) if (pv->size == 0) goto next; pack->repeats = pv->size - 1; + pack->lastv = *pv; return (0); default: WT_RET_MSG(pack->session, EINVAL, @@ -156,9 +162,8 @@ next: if (pack->cur == pack->end) case 'R': \ pv.u.u = va_arg(ap, uint64_t); \ break; \ - default: \ - WT_ASSERT(session, pv.type != pv.type); \ - break; \ + /* User format strings have already been validated. */ \ + WT_ILLEGAL_VALUE(session); \ } \ } while (0) @@ -436,8 +441,7 @@ __unpack_read(WT_SESSION_IMPL *session, case 'R': \ *va_arg(ap, uint64_t *) = pv.u.u; \ break; \ - default: \ - WT_ASSERT(session, pv.type != pv.type); \ - break; \ + /* User format strings have already been validated. 
*/ \ + WT_ILLEGAL_VALUE(session); \ } \ } while (0) diff --git a/src/include/schema.h b/src/include/schema.h index 8312ec72836..68eb046e815 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -5,12 +5,12 @@ * See the file LICENSE for redistribution information. */ -/* Character constants for projection plans. */ -#define WT_PROJ_KEY 'k' /* Go to key in cursor <arg>. */ -#define WT_PROJ_NEXT 'n' /* Process the next item (<arg> repeats). */ -#define WT_PROJ_REUSE 'r' /* Reuse the previous item (<arg> repeats). */ -#define WT_PROJ_SKIP 's' /* Skip a column in the cursor (<arg> repeats). */ -#define WT_PROJ_VALUE 'v' /* Go to the value in cursor <arg>. */ +/* Character constants for projection plans */ +#define WT_PROJ_KEY 'k' /* Go to key in cursor <arg> */ +#define WT_PROJ_NEXT 'n' /* Process the next item (<arg> repeats) */ +#define WT_PROJ_REUSE 'r' /* Reuse the previous item (<arg> repeats) */ +#define WT_PROJ_SKIP 's' /* Skip a column in the cursor (<arg> repeats) */ +#define WT_PROJ_VALUE 'v' /* Go to the value in cursor <arg> */ struct __wt_colgroup { const char *name; /* Logical name */ @@ -32,7 +32,7 @@ struct __wt_index { const char *key_plan; /* Key projection plan */ const char *value_plan; /* Value projection plan */ - int need_value; /* Index must have a non-empty value. 
*/ + int need_value; /* Index must have a non-empty value */ }; /* @@ -56,6 +56,9 @@ struct __wt_table { int cg_complete, idx_complete, is_simple; u_int ncolgroups, nindices, nkey_columns; + + uint32_t refcnt; /* Number of open cursors */ + uint32_t schema_gen; /* Cached schema generation number */ }; /* diff --git a/src/include/session.h b/src/include/session.h index 24f0c1b3860..293129b7f9e 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -38,7 +38,8 @@ typedef enum { #define S2C(session) ((WT_CONNECTION_IMPL *)(session)->iface.connection) /* Get the btree for a session */ -#define S2BT(session) ((WT_BTREE *)(session)->dhandle->handle) +#define S2BT(session) ((session)->dhandle == NULL ? \ + NULL : (WT_BTREE *)(session)->dhandle->handle) /* * WT_SESSION_IMPL -- diff --git a/src/include/stat.h b/src/include/stat.h index 36a69068622..d4f276b50bb 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -28,46 +28,40 @@ struct __wt_stats { (stats)->fld.v = (uint64_t)(value); \ } while (0) -#define WT_STAT_CHECK_SESSION(session) \ - ((session) != NULL && (session) != S2C(session)->default_session) - /* Connection statistics. 
*/ #define WT_CSTAT_DECR(session, fld) do { \ - if (WT_STAT_CHECK_SESSION(session)) { \ - WT_STAT_DECR(S2C(session)->stats, fld); \ - } \ + if (S2C(session)->statistics) \ + WT_STAT_DECR(&S2C(session)->stats, fld); \ } while (0) #define WT_CSTAT_INCR(session, fld) do { \ - if (WT_STAT_CHECK_SESSION(session)) { \ - WT_STAT_INCR(S2C(session)->stats, fld); \ - } \ + if (S2C(session)->statistics) \ + WT_STAT_INCR(&S2C(session)->stats, fld); \ } while (0) #define WT_CSTAT_INCRV(session, fld, v) do { \ - if (WT_STAT_CHECK_SESSION(session)) { \ - WT_STAT_INCRV(S2C(session)->stats, fld, v); \ - } \ + if (S2C(session)->statistics) \ + WT_STAT_INCRV(&S2C(session)->stats, fld, v); \ +} while (0) +#define WT_CSTAT_SET(session, fld, v) do { \ + if (S2C(session)->statistics) \ + WT_STAT_SET(&S2C(session)->stats, fld, v); \ } while (0) /* Data-source statistics. */ +#define WT_DSTAT_DECR(session, fld) do { \ + if (S2C(session)->statistics) \ + WT_STAT_DECR(&(session)->btree->stats, fld); \ +} while (0) #define WT_DSTAT_INCR(session, fld) do { \ - if (WT_STAT_CHECK_SESSION(session)) { \ - WT_STAT_INCR(session->dhandle->stats, fld); \ - } \ + if (S2C(session)->statistics) \ + WT_STAT_INCR(&session->dhandle->stats, fld); \ } while (0) #define WT_DSTAT_INCRV(session, fld, v) do { \ - if (WT_STAT_CHECK_SESSION(session)) { \ - WT_STAT_INCRV(session->dhandle->stats, fld, v); \ - } \ -} while (0) -#define WT_DSTAT_DECR(session, fld) do { \ - if (WT_STAT_CHECK_SESSION(session)) { \ - WT_STAT_DECR(session->dhandle->stats, fld); \ - } \ + if (S2C(session)->statistics) \ + WT_STAT_INCRV(&session->dhandle->stats, fld, v); \ } while (0) #define WT_DSTAT_SET(session, fld, v) do { \ - if (WT_STAT_CHECK_SESSION(session)) { \ - WT_STAT_SET(session->dhandle->stats, fld, v); \ - } \ + if (S2C(session)->statistics) \ + WT_STAT_SET(&session->dhandle->stats, fld, v); \ } while (0) /* Flags used by statistics initialization. 
*/ @@ -116,11 +110,16 @@ struct __wt_dsrc_stats { WT_STATS btree_row_leaf; WT_STATS cache_bytes_read; WT_STATS cache_bytes_write; + WT_STATS cache_eviction_checkpoint; WT_STATS cache_eviction_clean; WT_STATS cache_eviction_dirty; WT_STATS cache_eviction_fail; + WT_STATS cache_eviction_force; WT_STATS cache_eviction_hazard; WT_STATS cache_eviction_internal; + WT_STATS cache_eviction_merge; + WT_STATS cache_eviction_merge_fail; + WT_STATS cache_eviction_merge_levels; WT_STATS cache_overflow_value; WT_STATS cache_read; WT_STATS cache_read_overflow; @@ -157,6 +156,7 @@ struct __wt_dsrc_stats { WT_STATS rec_skipped_update; WT_STATS rec_split_intl; WT_STATS rec_split_leaf; + WT_STATS rec_split_max; WT_STATS session_compact; WT_STATS txn_update_conflict; WT_STATS txn_write_conflict; @@ -177,12 +177,18 @@ struct __wt_connection_stats { WT_STATS cache_bytes_max; WT_STATS cache_bytes_read; WT_STATS cache_bytes_write; + WT_STATS cache_eviction_checkpoint; WT_STATS cache_eviction_clean; WT_STATS cache_eviction_dirty; WT_STATS cache_eviction_fail; + WT_STATS cache_eviction_force; WT_STATS cache_eviction_hazard; WT_STATS cache_eviction_internal; + WT_STATS cache_eviction_merge; + WT_STATS cache_eviction_merge_fail; + WT_STATS cache_eviction_merge_levels; WT_STATS cache_eviction_slow; + WT_STATS cache_eviction_walk; WT_STATS cache_pages_dirty; WT_STATS cache_pages_inuse; WT_STATS cache_read; @@ -191,7 +197,11 @@ struct __wt_connection_stats { WT_STATS file_open; WT_STATS memory_allocation; WT_STATS memory_free; + WT_STATS memory_grow; WT_STATS read_io; + WT_STATS rec_pages; + WT_STATS rec_pages_eviction; + WT_STATS rec_skipped_update; WT_STATS rwlock_read; WT_STATS rwlock_write; WT_STATS txn_ancient; diff --git a/src/include/txn.h b/src/include/txn.h index 483bc1de289..e87df4bae5d 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -49,6 +49,7 @@ struct __wt_txn_state { struct __wt_txn_global { volatile wt_txnid_t current; /* Current transaction ID. 
*/ + volatile uint32_t gen; /* Completed transaction generation */ WT_TXN_STATE *states; /* Per-session transaction states */ }; @@ -79,6 +80,10 @@ struct __wt_txn { */ wt_txnid_t oldest_snap_min; + /* Saved global state, to avoid repeating scans. */ + wt_txnid_t last_id, last_oldest_id; + uint32_t last_gen, last_oldest_gen; + /* * Arrays of txn IDs in WT_UPDATE or WT_REF structures created or * modified by this transaction. @@ -91,12 +96,6 @@ struct __wt_txn { size_t modref_alloc; u_int modref_count; - /* - * Count of unsuccessful eviction attempts, used to abort if the cache - * is full and no progress can be made. - */ - u_int eviction_fails; - #define TXN_AUTOCOMMIT 0x01 #define TXN_ERROR 0x02 #define TXN_OLDEST 0x04 diff --git a/src/include/txn.i b/src/include/txn.i index 4e470a50eb3..45c89fe278b 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -276,7 +276,7 @@ __wt_txn_read_first(WT_SESSION_IMPL *session) if (txn->isolation == TXN_ISO_READ_COMMITTED || (!F_ISSET(txn, TXN_RUNNING) && txn->isolation == TXN_ISO_SNAPSHOT)) - __wt_txn_get_snapshot(session, WT_TXN_NONE, WT_TXN_NONE); + __wt_txn_get_snapshot(session, WT_TXN_NONE, WT_TXN_NONE, 0); else if (!F_ISSET(txn, TXN_RUNNING)) txn_state->snap_min = txn_global->current; } diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 3c3ad74d1df..d61871af60a 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -509,26 +509,26 @@ struct __wt_session { * @row{<tt>backup:</tt>, * hot backup cursor, * key=<code>string</code>\, see @ref hot_backup for details} - * @row{<tt>colgroup:\<tablename\>.\<columnset\></tt>, + * @row{<tt>colgroup:\<table name\>:\<column group name\></tt>, * column group cursor, * table key\, column group value(s)} * @row{<tt>config:[\<uri\>]</tt>, * object configuration cursor, (key=config string\, * value=config value)} - * @row{<tt>file:\<filename\></tt>, + * @row{<tt>file:\<file name\></tt>, * file cursor, * file key\, file value(s)} - * 
@row{<tt>index:\<tablename\>.\<indexname\></tt>, + * @row{<tt>index:\<table name\>:\<index name\></tt>, * index cursor, * key=index key\, value=table value(s)} * @row{<tt>lsm:\<name\></tt>, * LSM cursor (key=LSM key\, value=LSM value), See also: @ref lsm} - * @row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>, - * database or file statistics cursor, + * @row{<tt>statistics:[\<data source URI\>]</tt>, + * database or data source statistics cursor, * key=<code>int id</code>\, value=(<code>string description\, * string value\, uint64_t value</code>)\, * see @ref data_statistics for details} - * @row{<tt>table:\<tablename\></tt>, + * @row{<tt>table:\<table name\></tt>, * table cursor, * table key\, table value(s)} * </table> @@ -537,17 +537,18 @@ struct __wt_session { * @config{append, append the value as a new record\, creating a new * record number key; valid only for cursors with record number keys.,a * boolean flag; default \c false.} - * @config{bulk, configure the cursor for bulk loads\, a fast\, initial - * load path. Bulk load may only be used for newly created objects\, - * and in the case of row-store objects\, key/value items must be loaded - * in sorted order. Cursors configured for bulk load only support the - * WT_CURSOR::insert and WT_CURSOR::close methods. The value is usually - * a true/false flag\, but the the special value \c "bitmap" is for use - * with fixed-length column stores\, and allows chunks of a memory - * resident bitmap to be loaded directly into a file by passing a \c - * WT_ITEM to WT_CURSOR::set_value where the \c size field indicates the - * number of records in the bitmap (as specified by the file's \c - * value_format). Bulk load bitmap values must end on a byte boundary + * @config{bulk, configure the cursor for bulk-loading\, a fast\, + * initial load path (see @ref bulk_load for more information). 
+ * Bulk-load may only be used for newly created objects and cursors + * configured for bulk-load only support the WT_CURSOR::insert and + * WT_CURSOR::close methods. When bulk-loading row-store objects\, keys + * must be loaded in sorted order. The value is usually a true/false + * flag; when bulk-loading fixed-length column store objects\, the + * special value \c bitmap allows chunks of a memory resident bitmap to + * be loaded directly into a file by passing a \c WT_ITEM to + * WT_CURSOR::set_value where the \c size field indicates the number of + * records in the bitmap (as specified by the object's \c value_format + * configuration). Bulk-loaded bitmap values must end on a byte boundary * relative to the bit count (except for the last set of values * loaded).,a string; default \c false.} * @config{checkpoint, the name of a checkpoint to open (the reserved @@ -580,7 +581,7 @@ struct __wt_session { * @config{target, if non-empty\, backup the list of objects; valid only * for a backup data source.,a list of strings; default empty.} * @configend - * @param cursorp a pointer to the newly opened cursor + * @param[out] cursorp a pointer to the newly opened cursor * @errors */ int __F(open_cursor)(WT_SESSION *session, @@ -596,7 +597,9 @@ struct __wt_session { * @snippet ex_all.c Create a table * * @param session the session handle - * @param name the URI of the object to create, such as \c "table:stock" + * @param name the URI of the object to create, such as + * \c "table:stock". For a description of URI formats + * see @ref data_sources. * @configstart{session.create, see dist/api_data.py} * @config{allocation_size, the file unit allocation size\, in bytes\, * must a power-of-two; smaller values decrease the file space required @@ -718,19 +721,34 @@ struct __wt_session { * adjusted to a lower bound of <code>50 * leaf_page_max</code>. 
This * limit is soft - it is possible for pages to be temporarily larger * than this value.,an integer between 512B and 10TB; default \c 5MB.} + * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\, + * in bytes. If non-zero\, schedule writes for dirty blocks belonging + * to this object in the system buffer cache after that many bytes from + * this object are written into the buffer cache.,an integer greater + * than or equal to 0; default \c 0.} + * @config{os_cache_max, maximum system buffer cache usage\, in bytes. + * If non-zero\, evict object blocks from the system buffer cache after + * that many bytes from this object are read or written into the buffer + * cache.,an integer greater than or equal to 0; default \c 0.} * @config{prefix_compression, configure row-store format key prefix * compression.,a boolean flag; default \c true.} - * @config{source, override the default data source URI derived from the - * object name.,a string; default empty.} + * @config{source, set a custom data source URI for a column group\, + * index or simple table. By default\, the data source URI is derived + * from the \c type and the column group or index name. Applications + * can create tables from existing data sources by supplying a \c source + * configuration.,a string; default empty.} * @config{split_pct, the Btree page split size as a percentage of the * maximum Btree page size\, that is\, when a Btree page is split\, it * will be split into smaller pages\, where each page is the specified * percentage of the maximum Btree page size.,an integer between 25 and * 100; default \c 75.} - * @config{type, set the data source type. This setting overrides the - * URI prefix for the data source\, if no \c source configuration - * setting is provided.,a string\, chosen from the following options: \c - * "file"\, \c "lsm"; default \c file.} + * @config{type, set the type of data source used to store a column + * group\, index or simple table. 
By default\, a \c "file:" URI is + * derived from the object name. The \c type configuration can be used + * to switch to a different storage format\, such as LSM. Ignored if an + * explicit URI is supplied with a \c source configuration.,a string\, + * chosen from the following options: \c "file"\, \c "lsm"; default \c + * file.} * @config{value_format, the format of the data packed into value items. * See @ref schema_format_types for details. By default\, the * value_format is \c 'u' and applications use a WT_ITEM structure to @@ -1052,13 +1070,15 @@ struct __wt_connection { * shared cache is redistributed.,an integer between 1MB and 10TB; * default \c 10MB.}@config{ reserve, amount of * cache this database is guaranteed to have available from the shared - * cache. This setting is per database. Defaults to the chunk size.,a - * string; default \c 0.}@config{ name, name of a - * cache that is shared between databases.,a string; default - * empty.}@config{ size, maximum memory to + * cache. This setting is per database. Defaults to the chunk size.,an + * integer; default \c 0.}@config{ name, name of + * a cache that is shared between databases.,a string; default \c + * pool.}@config{ size, maximum memory to * allocate for the shared cache. Setting this will update the value if * one is already set.,an integer between 1MB and 10TB; default \c * 500MB.}@config{ ),,} + * @config{statistics, Maintain database statistics that may impact + * performance.,a boolean flag; default \c false.} * @config{verbose, enable messages for various events. 
Options are * given as a list\, such as * <code>"verbose=[evictserver\,read]"</code>.,a list\, with values @@ -1108,7 +1128,7 @@ struct __wt_connection { * "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c * read-committed.} * @configend - * @param sessionp the new session handle + * @param[out] sessionp the new session handle * @errors */ int __F(open_session)(WT_CONNECTION *connection, @@ -1224,6 +1244,13 @@ struct __wt_connection { * @config{cache_size, maximum heap memory to allocate for the cache. A database * should configure either a cache_size or a shared_cache not both.,an integer * between 1MB and 10TB; default \c 100MB.} + * @config{checkpoint = (, periodically checkpoint the database.,a set of + * related configuration options defined + * below.}@config{ name, the checkpoint name.,a string; + * default \c "WiredTigerCheckpoint".}@config{ wait, + * seconds to wait between each checkpoint; setting this value configures + * periodic checkpoints.,an integer between 1 and 100000; default \c 0.}@config{ + * ),,} * @config{create, create the database if it does not exist.,a boolean flag; * default \c false.} * @config{direct_io, Use \c O_DIRECT to access files. Options are given as a @@ -1265,12 +1292,33 @@ struct __wt_connection { * cache is redistributed.,an integer between 1MB and 10TB; default \c * 10MB.}@config{ reserve, amount of cache this database * is guaranteed to have available from the shared cache. This setting is per - * database. Defaults to the chunk size.,a string; default \c + * database. Defaults to the chunk size.,an integer; default \c * 0.}@config{ name, name of a cache that is shared - * between databases.,a string; default - * empty.}@config{ size, maximum memory to allocate for + * between databases.,a string; default \c + * pool.}@config{ size, maximum memory to allocate for * the shared cache. 
Setting this will update the value if one is already * set.,an integer between 1MB and 10TB; default \c 500MB.}@config{ ),,} + * @config{statistics, Maintain database statistics that may impact + * performance.,a boolean flag; default \c false.} + * @config{statistics_log = (, log database connection statistics into a file + * when the \c statistics configuration value is set to true. See @ref + * statistics_log for more information.,a set of related configuration options + * defined below.}@config{ clear, reset statistics + * counters after each set of log records are written.,a boolean flag; default + * \c true.}@config{ path, the pathname to a file into + * which the log records are written\, may contain strftime conversion + * specifications. If the value is not an absolute path name\, the file is + * created relative to the database home.,a string; default \c + * "WiredTigerStat.%H".}@config{ sources, if non-empty\, + * include statistics for the list of data source URIs. No statistics that + * require traversing a tree are reported\, as if the \c statistics_fast + * configuration string were set.,a list of strings; default + * empty.}@config{ timestamp, a timestamp prepended to + * each log record\, may contain strftime conversion specifications.,a string; + * default \c "%b %d %H:%M:%S".}@config{ wait, seconds to + * wait between each write of the log records; setting this value configures \c + * statistics and statistics logging.,an integer between 5 and 100000; default + * \c 0.}@config{ ),,} * @config{sync, flush files to stable storage when closing or writing * checkpoints.,a boolean flag; default \c true.} * @config{transactional, support transactional semantics.,a boolean flag; @@ -1291,7 +1339,7 @@ struct __wt_connection { * for details). Configuration values specified in the \c config argument to * the ::wiredtiger_open function override configuration values specified in * the \c WiredTiger.config file. 
- * @param connectionp A pointer to the newly opened connection handle + * @param[out] connectionp A pointer to the newly opened connection handle * @errors */ int wiredtiger_open(const char *home, @@ -1356,6 +1404,9 @@ struct __wt_event_handler { const char *operation, uint64_t progress); }; +/*! @name Data packing and unpacking + * @{ + */ /*! Pack a structure into a buffer. * * See @ref packing for a description of the permitted format strings. @@ -1391,7 +1442,7 @@ int wiredtiger_struct_pack( * @snippet ex_all.c Get the packed size * * @param session the session handle - * @param sizep a location where the the number of bytes needed for the + * @param sizep a location where the number of bytes needed for the * matching call to ::wiredtiger_struct_pack is returned * @param format the data format, see @ref packing * @errors @@ -1414,6 +1465,136 @@ int wiredtiger_struct_size( int wiredtiger_struct_unpack(WT_SESSION *session, const void *buffer, size_t size, const char *format, ...); +#if !defined(SWIG) + +/*! + * Streaming interface to packing. + * + * This allows applications to pack or unpack records one field at a time. + * This is an opaque handle returned by ::wiredtiger_pack_start or + * ::wiredtiger_unpack_start. It must be closed with ::wiredtiger_pack_close. + */ +typedef struct __wt_pack_stream WT_PACK_STREAM; + +/*! + * Start a packing operation into a buffer with the given format string. This + * should be followed by a series of calls to ::wiredtiger_pack_item, + * ::wiredtiger_pack_int, ::wiredtiger_pack_str or ::wiredtiger_pack_uint + * to fill in the values. + * + * @param session the session handle + * @param format the data format, see @ref packing + * @param buffer a pointer to memory to hold the packed data + * @param size the size of the buffer + * @param[out] psp the new packing stream handle + * @errors + */ +int wiredtiger_pack_start(WT_SESSION *session, + const char *format, void *buffer, size_t size, WT_PACK_STREAM **psp); + +/*! 
+ * Start an unpacking operation from a buffer with the given format string. + * This should be followed by a series of calls to ::wiredtiger_unpack_item, + * ::wiredtiger_unpack_int, ::wiredtiger_unpack_str or ::wiredtiger_unpack_uint + * to retrieve the packed values. + * + * @param session the session handle + * @param format the data format, see @ref packing + * @param buffer a pointer to memory holding the packed data + * @param size the size of the buffer + * @param[out] psp the new packing stream handle + * @errors + */ +int wiredtiger_unpack_start(WT_SESSION *session, + const char *format, const void *buffer, size_t size, WT_PACK_STREAM **psp); + +/*! + * Close a packing stream. + * + * @param ps the packing stream handle + * @param[out] usedp the number of bytes in the buffer used by the stream + * @errors + */ +int wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp); + +/*! + * Pack an item into a packing stream. + * + * @param ps the packing stream handle + * @param item an item to pack + * @errors + */ +int wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item); + +/*! + * Pack a signed integer into a packing stream. + * + * @param ps the packing stream handle + * @param i a signed integer to pack + * @errors + */ +int wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i); + +/*! + * Pack a string into a packing stream. + * + * @param ps the packing stream handle + * @param s a string to pack + * @errors + */ +int wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s); + +/*! + * Pack an unsigned integer into a packing stream. + * + * @param ps the packing stream handle + * @param u an unsigned integer to pack + * @errors + */ +int wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u); + +/*! + * Unpack an item from a packing stream. + * + * @param ps the packing stream handle + * @param item an item to unpack + * @errors + */ +int wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item); + +/*! + * Unpack a signed integer from a packing stream. 
+ * + * @param ps the packing stream handle + * @param[out] ip the unpacked signed integer + * @errors + */ +int wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip); + +/*! + * Unpack a string from a packing stream. + * + * @param ps the packing stream handle + * @param[out] sp the unpacked string + * @errors + */ +int wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp); + +/*! + * Unpack an unsigned integer from a packing stream. + * + * @param ps the packing stream handle + * @param[out] up the unpacked unsigned integer + * @errors + */ +int wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up); + +#endif /* !defined(SWIG) */ + +/*! + * @} + */ + /*! Get version information. * * @snippet ex_all.c Get the WiredTiger library version #1 @@ -1596,6 +1777,11 @@ struct __wt_compressor { * either the \c internal_page_max or \c leaf_page_max value specified * to WT_SESSION::create when the object was created.) * + * On entry, \c split_pct is the configured Btree page split size for + * this object. (This value is provided for convenience, and will be + * the \c split_pct value specified to WT_SESSION::create when the + * object was created.) + * * On entry, \c extra is a count of additional bytes that will be added * to the encoded representation before it is written. In other words, * if the target write size is 8KB, the returned encoded representation @@ -1661,6 +1847,7 @@ struct __wt_compressor { * applicable, the WT_COMPRESSOR::compress callback is used instead. * * @param[in] page_max the configured maximum page size for this object + * @param[in] split_pct the configured page split size for this object * @param[in] extra the count of the additional bytes * @param[in] src the data to compress * @param[in] offsets the byte offsets of the byte strings in src @@ -1673,7 +1860,7 @@ struct __wt_compressor { * @returns zero for success, non-zero to indicate an error. 
*/ int (*compress_raw)(WT_COMPRESSOR *compressor, WT_SESSION *session, - size_t page_max, size_t extra, + size_t page_max, u_int split_pct, size_t extra, uint8_t *src, uint32_t *offsets, uint32_t slots, uint8_t *dst, size_t dst_len, int final, @@ -1886,54 +2073,74 @@ extern int wiredtiger_extension_init(WT_SESSION *session, #define WT_STAT_CONN_CACHE_BYTES_READ 9 /*! cache: bytes written from cache */ #define WT_STAT_CONN_CACHE_BYTES_WRITE 10 +/*! cache: checkpoint blocked page eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 11 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 11 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 12 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 12 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 13 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 13 -/*! cache: eviction unable to acquire hazard pointer */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 14 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 14 +/*! cache: pages queued for forced eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 15 +/*! cache: hazard pointer blocked page eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 16 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 15 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 17 +/*! cache: internal page merge operations completed */ +#define WT_STAT_CONN_CACHE_EVICTION_MERGE 18 +/*! cache: internal page merge attempts that could not complete */ +#define WT_STAT_CONN_CACHE_EVICTION_MERGE_FAIL 19 +/*! cache: internal levels merged */ +#define WT_STAT_CONN_CACHE_EVICTION_MERGE_LEVELS 20 /*! cache: eviction server unable to reach eviction goal */ -#define WT_STAT_CONN_CACHE_EVICTION_SLOW 16 +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 21 +/*! cache: pages walked for eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_WALK 22 /*! 
cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 17 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 23 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 18 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 24 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 19 +#define WT_STAT_CONN_CACHE_READ 25 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 20 +#define WT_STAT_CONN_CACHE_WRITE 26 /*! pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 21 +#define WT_STAT_CONN_COND_WAIT 27 /*! files currently open */ -#define WT_STAT_CONN_FILE_OPEN 22 +#define WT_STAT_CONN_FILE_OPEN 28 /*! total heap memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 23 +#define WT_STAT_CONN_MEMORY_ALLOCATION 29 /*! total heap memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 24 +#define WT_STAT_CONN_MEMORY_FREE 30 +/*! total heap memory re-allocations */ +#define WT_STAT_CONN_MEMORY_GROW 31 /*! total read I/Os */ -#define WT_STAT_CONN_READ_IO 25 +#define WT_STAT_CONN_READ_IO 32 +/*! page reconciliation calls */ +#define WT_STAT_CONN_REC_PAGES 33 +/*! page reconciliation calls for eviction */ +#define WT_STAT_CONN_REC_PAGES_EVICTION 34 +/*! reconciliation failed because an update could not be included */ +#define WT_STAT_CONN_REC_SKIPPED_UPDATE 35 /*! pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 26 +#define WT_STAT_CONN_RWLOCK_READ 36 /*! pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 27 +#define WT_STAT_CONN_RWLOCK_WRITE 37 /*! ancient transactions */ -#define WT_STAT_CONN_TXN_ANCIENT 28 +#define WT_STAT_CONN_TXN_ANCIENT 38 /*! transactions */ -#define WT_STAT_CONN_TXN_BEGIN 29 +#define WT_STAT_CONN_TXN_BEGIN 39 /*! transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 30 +#define WT_STAT_CONN_TXN_CHECKPOINT 40 /*! 
transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 31 +#define WT_STAT_CONN_TXN_COMMIT 41 /*! transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 32 +#define WT_STAT_CONN_TXN_FAIL_CACHE 42 /*! transactions rolled-back */ -#define WT_STAT_CONN_TXN_ROLLBACK 33 +#define WT_STAT_CONN_TXN_ROLLBACK 43 /*! total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 34 +#define WT_STAT_CONN_WRITE_IO 44 /*! * @} @@ -2007,95 +2214,107 @@ extern int wiredtiger_extension_init(WT_SESSION *session, #define WT_STAT_DSRC_CACHE_BYTES_READ 31 /*! bytes written from cache */ #define WT_STAT_DSRC_CACHE_BYTES_WRITE 32 +/*! cache: checkpoint blocked page eviction */ +#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 33 /*! unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 33 +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 34 /*! modified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 34 +#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 35 /*! data source pages selected for eviction unable to be evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 35 -/*! eviction unable to acquire hazard pointer */ -#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 36 +#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 36 +/*! cache: pages queued for forced eviction */ +#define WT_STAT_DSRC_CACHE_EVICTION_FORCE 37 +/*! cache: hazard pointer blocked page eviction */ +#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 38 /*! internal pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 37 +#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 39 +/*! cache: internal page merge operations completed */ +#define WT_STAT_DSRC_CACHE_EVICTION_MERGE 40 +/*! cache: internal page merge attempts that could not complete */ +#define WT_STAT_DSRC_CACHE_EVICTION_MERGE_FAIL 41 +/*! cache: internal levels merged */ +#define WT_STAT_DSRC_CACHE_EVICTION_MERGE_LEVELS 42 /*! 
overflow values cached in memory */ -#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 38 +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 43 /*! pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ 39 +#define WT_STAT_DSRC_CACHE_READ 44 /*! overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 40 +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 45 /*! pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 41 +#define WT_STAT_DSRC_CACHE_WRITE 46 /*! raw compression call failed (no additional data available) */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 42 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 47 /*! raw compression call failed (additional data available) */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 43 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 48 /*! raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 44 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 49 /*! compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 45 +#define WT_STAT_DSRC_COMPRESS_READ 50 /*! compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 46 +#define WT_STAT_DSRC_COMPRESS_WRITE 51 /*! page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 47 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 52 /*! page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 48 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 53 /*! cursor insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 49 +#define WT_STAT_DSRC_CURSOR_INSERT 54 /*! bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 50 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 55 /*! cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 51 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 56 /*! cursor next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 52 +#define WT_STAT_DSRC_CURSOR_NEXT 57 /*! cursor prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 53 +#define WT_STAT_DSRC_CURSOR_PREV 58 /*! 
cursor remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 54 +#define WT_STAT_DSRC_CURSOR_REMOVE 59 /*! cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 55 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 60 /*! cursor reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 56 +#define WT_STAT_DSRC_CURSOR_RESET 61 /*! cursor search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 57 +#define WT_STAT_DSRC_CURSOR_SEARCH 62 /*! cursor search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 58 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 63 /*! cursor update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 59 +#define WT_STAT_DSRC_CURSOR_UPDATE 64 /*! cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 60 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 65 /*! chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 61 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 66 /*! highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 62 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 67 /*! queries that could have benefited from a Bloom filter that did not * exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 63 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 68 /*! reconciliation dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 64 +#define WT_STAT_DSRC_REC_DICTIONARY 69 /*! reconciliation overflow keys written */ -#define WT_STAT_DSRC_REC_OVFL_KEY 65 +#define WT_STAT_DSRC_REC_OVFL_KEY 70 /*! reconciliation overflow values written */ -#define WT_STAT_DSRC_REC_OVFL_VALUE 66 +#define WT_STAT_DSRC_REC_OVFL_VALUE 71 /*! reconciliation pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 67 +#define WT_STAT_DSRC_REC_PAGE_DELETE 72 /*! reconciliation pages merged */ -#define WT_STAT_DSRC_REC_PAGE_MERGE 68 +#define WT_STAT_DSRC_REC_PAGE_MERGE 73 /*! page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 69 +#define WT_STAT_DSRC_REC_PAGES 74 /*! 
page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 70 -/*! page reconciliation failed when an update could not be included */ -#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 71 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 75 +/*! reconciliation failed because an update could not be included */ +#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 76 /*! reconciliation internal pages split */ -#define WT_STAT_DSRC_REC_SPLIT_INTL 72 +#define WT_STAT_DSRC_REC_SPLIT_INTL 77 /*! reconciliation leaf pages split */ -#define WT_STAT_DSRC_REC_SPLIT_LEAF 73 +#define WT_STAT_DSRC_REC_SPLIT_LEAF 78 +/*! reconciliation maximum number of splits created by for a page */ +#define WT_STAT_DSRC_REC_SPLIT_MAX 79 /*! object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 74 +#define WT_STAT_DSRC_SESSION_COMPACT 80 /*! update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 75 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 81 /*! write generation conflicts */ -#define WT_STAT_DSRC_TXN_WRITE_CONFLICT 76 +#define WT_STAT_DSRC_TXN_WRITE_CONFLICT 82 /*! @} */ /* * Statistics section: END diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index b3ca7fabcce..952f332d4e4 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -206,6 +206,7 @@ struct __wt_update; #include "posix.h" #include "txn.h" /* typedef for wt_txnid_t */ +#include "stat.h" /* WT_DSRC_STATS for data sources */ #include "api.h" #include "block.h" @@ -223,7 +224,6 @@ struct __wt_update; #include "meta.h" #include "os.h" #include "schema.h" -#include "stat.h" #include "session.h" /* required by connection.h */ #include "connection.h" diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 880243f85a6..87d2e891b71 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -214,14 +214,7 @@ __clsm_open_cursors( clsm->primary_chunk = chunk; (void)WT_ATOMIC_ADD(clsm->primary_chunk->ncursor, 1); - /* - * Peek into the btree layer to track the in-memory size. 
- * Ignore error returns since it is OK for the btree to be - * empty in this code path (and that is an error condition). - */ - if (lsm_tree->memsizep == NULL) - (void)__wt_btree_get_memsize( - session, S2BT(session), &lsm_tree->memsizep); + __wt_btree_evictable(session, 0); } clsm->dsk_gen = lsm_tree->dsk_gen; @@ -571,10 +564,10 @@ __clsm_search(WT_CURSOR *cursor) ret = __wt_bloom_hash_get(bloom, &bhash); if (ret == WT_NOTFOUND) { WT_STAT_INCR( - clsm->lsm_tree->stats, bloom_miss); + &clsm->lsm_tree->stats, bloom_miss); continue; } else if (ret == 0) - WT_STAT_INCR(clsm->lsm_tree->stats, bloom_hit); + WT_STAT_INCR(&clsm->lsm_tree->stats, bloom_hit); WT_ERR(ret); } c->set_key(c, &cursor->key); @@ -589,11 +582,11 @@ __clsm_search(WT_CURSOR *cursor) goto err; else if (bloom != NULL) WT_STAT_INCR( - clsm->lsm_tree->stats, bloom_false_positive); + &clsm->lsm_tree->stats, bloom_false_positive); /* The active chunk can't have a bloom filter. */ else if (clsm->primary_chunk == NULL || i != clsm->nchunks) WT_STAT_INCR( - clsm->lsm_tree->stats, lsm_lookup_no_bloom); + &clsm->lsm_tree->stats, lsm_lookup_no_bloom); } ret = WT_NOTFOUND; @@ -781,11 +774,10 @@ static inline int __clsm_put( WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, WT_ITEM *key, WT_ITEM *value) { - WT_BTREE *btree; + WT_DATA_HANDLE *saved_dhandle; WT_CURSOR *primary; WT_DECL_RET; WT_LSM_TREE *lsm_tree; - uint32_t *memsizep; lsm_tree = clsm->lsm_tree; @@ -826,13 +818,19 @@ __clsm_put( F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); clsm->current = primary; - if ((memsizep = lsm_tree->memsizep) != NULL && - *memsizep > lsm_tree->chunk_size) { + /* + * In LSM there are multiple btrees active at one time. The tree + * switch code needs to use btree API methods, and it wants to + * operate on the btree for the primary chunk. Set that up now. 
+ */ + saved_dhandle = session->dhandle; + WT_SET_BTREE_IN_SESSION(session, ((WT_CURSOR_BTREE *)primary)->btree); + if (__wt_btree_size_overflow(session, lsm_tree->chunk_size)) { /* * Take the LSM lock first: we can't acquire it while * holding the schema lock, or we will deadlock. */ - WT_RET(__wt_writelock(session, lsm_tree->rwlock)); + WT_ERR(__wt_writelock(session, lsm_tree->rwlock)); /* Make sure we don't race. */ if (clsm->dsk_gen == lsm_tree->dsk_gen) WT_WITH_SCHEMA_LOCK(session, @@ -844,12 +842,15 @@ __clsm_put( * in switching: if something went wrong, we should keep * trying to switch. */ - btree = ((WT_CURSOR_BTREE *)primary)->btree; - if (ret == 0) - ret = __wt_btree_release_memsize(session, btree); + if (ret == 0) { + WT_SET_BTREE_IN_SESSION(session, + ((WT_CURSOR_BTREE *)primary)->btree); + __wt_btree_evictable(session, 1); + } WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock)); } +err: session->dhandle = saved_dhandle; return (ret); } diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c index b3335089ae8..a738462ac69 100644 --- a/src/lsm/lsm_meta.c +++ b/src/lsm/lsm_meta.c @@ -19,15 +19,15 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_DECL_RET; WT_ITEM buf; WT_LSM_CHUNK *chunk; - const char *config; + const char *lsmconfig; size_t chunk_sz, alloc; u_int nchunks; WT_CLEAR(buf); chunk_sz = sizeof(WT_LSM_CHUNK); - WT_RET(__wt_metadata_read(session, lsm_tree->name, &config)); - WT_ERR(__wt_config_init(session, &cparser, config)); + WT_RET(__wt_metadata_read(session, lsm_tree->name, &lsmconfig)); + WT_ERR(__wt_config_init(session, &cparser, lsmconfig)); while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) { if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) { __wt_free(session, lsm_tree->bloom_config); @@ -146,7 +146,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) } WT_ERR_NOTFOUND_OK(ret); -err: __wt_free(session, config); +err: __wt_free(session, lsmconfig); return (ret); } diff --git 
a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index e4e36311862..fa0babfb4be 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -38,14 +38,15 @@ __wt_lsm_stat_init(WT_SESSION_IMPL *session, if (cst->stats != NULL) stats = (WT_DSRC_STATS *)cst->stats; else { - WT_ERR(__wt_stat_alloc_dsrc_stats(session, &stats)); + WT_ERR(__wt_calloc_def(session, 1, &stats)); + __wt_stat_init_dsrc_stats(stats); cst->stats_first = cst->stats = (WT_STATS *)stats; cst->stats_count = sizeof(*stats) / sizeof(WT_STATS); } - *stats = *lsm_tree->stats; + *stats = lsm_tree->stats; if (LF_ISSET(WT_STATISTICS_CLEAR)) - __wt_stat_clear_dsrc_stats((WT_STATS *)lsm_tree->stats); + __wt_stat_clear_dsrc_stats(&lsm_tree->stats); /* Hold the LSM lock so that we can safely walk through the chunks. */ WT_ERR(__wt_readlock(session, lsm_tree->rwlock)); diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 0746c9f4443..82493354e1e 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -37,8 +37,6 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (lsm_tree->rwlock != NULL) WT_TRET(__wt_rwlock_destroy(session, &lsm_tree->rwlock)); - __wt_free(session, lsm_tree->stats); - for (i = 0; i < lsm_tree->nchunks; i++) { if ((chunk = lsm_tree->chunk[i]) == NULL) continue; @@ -296,6 +294,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, WT_DECL_RET; WT_LSM_TREE *lsm_tree; const char *cfg[] = API_CONF_DEFAULTS(session, create, config); + const char *tmpconfig; /* If the tree is open, it already exists. */ if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) { @@ -304,9 +303,15 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, } WT_RET_NOTFOUND_OK(ret); - /* If the tree has metadata, it already exists. */ - if (__wt_metadata_read(session, uri, &config) == 0) { - __wt_free(session, config); + /* + * If the tree has metadata, it already exists. + * + * !!! 
+ * Use a local variable: we don't care what the existing configuration + * is, but we don't want to overwrite the real config. + */ + if (__wt_metadata_read(session, uri, &tmpconfig) == 0) { + __wt_free(session, tmpconfig); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); @@ -447,7 +452,7 @@ __lsm_tree_open( WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); WT_ERR(__wt_rwlock_alloc(session, "lsm tree", &lsm_tree->rwlock)); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); - WT_ERR(__wt_stat_alloc_dsrc_stats(session, &lsm_tree->stats)); + __wt_stat_init_dsrc_stats(&lsm_tree->stats); WT_ERR(__wt_lsm_meta_read(session, lsm_tree)); @@ -528,11 +533,7 @@ __wt_lsm_tree_switch( uint32_t new_id; new_id = WT_ATOMIC_ADD(lsm_tree->last, 1); - - WT_VERBOSE_RET(session, lsm, - "Tree switch to: %d because %d > %d", new_id, - (lsm_tree->memsizep == NULL ? 0 : (int)*lsm_tree->memsizep), - (int)lsm_tree->chunk_size); + WT_VERBOSE_RET(session, lsm, "Tree switch to: %d", new_id); if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) > lsm_tree->chunk_alloc) @@ -550,8 +551,6 @@ __wt_lsm_tree_switch( ++lsm_tree->dsk_gen; WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); - lsm_tree->memsizep = NULL; - err: /* TODO: mark lsm_tree bad on error(?) */ return (ret); } diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index ca66cceffcb..56511546f9b 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -341,9 +341,9 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if ((chunk = lsm_tree->old_chunks[i]) == NULL) continue; if (!locked) { - locked = 1; /* TODO: Do we need the lsm_tree lock for all drops? 
*/ WT_ERR(__wt_writelock(session, lsm_tree->rwlock)); + locked = 1; } if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) { WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_drop( diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c index 1f82bd3fb7a..7d790800148 100644 --- a/src/meta/meta_table.c +++ b/src/meta/meta_table.c @@ -210,6 +210,8 @@ __wt_metadata_read( WT_DECL_RET; const char *value; + *valuep = NULL; + if (__metadata_turtle(key)) return (__wt_meta_turtle_read(session, key, valuep)); diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c index d81f562aa4e..6e9cc1e6352 100644 --- a/src/meta/meta_turtle.c +++ b/src/meta/meta_turtle.c @@ -64,6 +64,8 @@ __wt_meta_turtle_read( int match; const char *path; + *valuep = NULL; + fp = NULL; path = NULL; diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c index 8a3fb45eec1..bc4ef463e2a 100644 --- a/src/os_posix/os_alloc.c +++ b/src/os_posix/os_alloc.c @@ -32,7 +32,7 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp) */ WT_ASSERT(session, number != 0 && size != 0); - if (session != NULL && S2C(session)->stats != NULL) + if (session != NULL) WT_CSTAT_INCR(session, memory_allocation); if ((p = calloc(number, size)) == NULL) @@ -56,21 +56,26 @@ __wt_realloc(WT_SESSION_IMPL *session, /* * !!! * This function MUST handle a NULL WT_SESSION_IMPL handle. - */ - WT_ASSERT(session, bytes_to_allocate != 0); - - /* + * * Sometimes we're allocating memory and we don't care about the * final length -- bytes_allocated_ret may be NULL. */ - bytes_allocated = (bytes_allocated_ret == NULL) ? - 0 : *bytes_allocated_ret; - WT_ASSERT(session, bytes_allocated < bytes_to_allocate); - p = *(void **)retp; + bytes_allocated = + (bytes_allocated_ret == NULL) ? 
0 : *bytes_allocated_ret; + WT_ASSERT(session, + (p == NULL && bytes_allocated == 0) || + (p != NULL && + (bytes_allocated_ret == NULL || bytes_allocated != 0))); + WT_ASSERT(session, bytes_to_allocate != 0); + WT_ASSERT(session, bytes_allocated < bytes_to_allocate); - if (p == NULL && session != NULL && S2C(session)->stats != NULL) - WT_CSTAT_INCR(session, memory_allocation); + if (session != NULL) { + if (p == NULL) + WT_CSTAT_INCR(session, memory_allocation); + else + WT_CSTAT_INCR(session, memory_grow); + } if ((p = realloc(p, bytes_to_allocate)) == NULL) WT_RET_MSG(session, __wt_errno(), "memory allocation"); @@ -114,21 +119,21 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session, void *p, *newp; size_t bytes_allocated; - WT_ASSERT(session, bytes_to_allocate != 0); - /* * Sometimes we're allocating memory and we don't care about the * final length -- bytes_allocated_ret may be NULL. */ - bytes_allocated = (bytes_allocated_ret == NULL) ? - 0 : *bytes_allocated_ret; - WT_ASSERT(session, bytes_allocated < bytes_to_allocate); - p = *(void **)retp; + bytes_allocated = + (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret; + WT_ASSERT(session, + (p == NULL && bytes_allocated == 0) || + (p != NULL && + (bytes_allocated_ret == NULL || bytes_allocated != 0))); + WT_ASSERT(session, bytes_to_allocate != 0); + WT_ASSERT(session, bytes_allocated < bytes_to_allocate); - WT_ASSERT(session, p == NULL || bytes_allocated != 0); - - if (p == NULL && session != NULL && S2C(session)->stats != NULL) + if (session != NULL) WT_CSTAT_INCR(session, memory_allocation); if ((ret = posix_memalign(&newp, @@ -207,22 +212,24 @@ __wt_free_int(WT_SESSION_IMPL *session, void *p_arg) { void *p; - /* - * !!! - * This function MUST handle a NULL WT_SESSION_IMPL handle. 
- */ - if (session != NULL && S2C(session)->stats != NULL) - WT_CSTAT_INCR(session, memory_free); + p = *(void **)p_arg; + if (p == NULL) /* ANSI C free semantics */ + return; /* * If there's a serialization bug we might race with another thread. * We can't avoid the race (and we aren't willing to flush memory), - * but we minimize the window by clearing the free address atomically, - * hoping a racing thread will see, and won't free, a NULL pointer. + * but we minimize the window by clearing the free address, hoping a + * racing thread will see, and won't free, a NULL pointer. */ - p = *(void **)p_arg; *(void **)p_arg = NULL; - if (p != NULL) /* ANSI C free semantics */ - free(p); + /* + * !!! + * This function MUST handle a NULL WT_SESSION_IMPL handle. + */ + if (session != NULL) + WT_CSTAT_INCR(session, memory_free); + + free(p); } diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c index abc71a73324..2871aa9a21f 100644 --- a/src/os_posix/os_fsync.c +++ b/src/os_posix/os_fsync.c @@ -19,8 +19,8 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) WT_VERBOSE_RET(session, fileops, "%s: fsync", fh->name); WT_SYSCALL_RETRY(fsync(fh->fd), ret); - if (ret == 0) - return (0); + if (ret != 0) + WT_RET_MSG(session, ret, "%s fsync error", fh->name); - WT_RET_MSG(session, ret, "%s fsync error", fh->name); + return (0); } diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index fd321b7759c..f35d976aab5 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -59,12 +59,13 @@ __wt_open(WT_SESSION_IMPL *session, WT_DECL_RET; WT_FH *fh; mode_t mode; - int f, fd, matched; + int direct_io, f, fd, matched; const char *path; conn = S2C(session); fh = NULL; fd = -1; + direct_io = 0; WT_VERBOSE_RET(session, fileops, "%s: open", name); @@ -113,13 +114,18 @@ __wt_open(WT_SESSION_IMPL *session, mode = 0; #ifdef O_DIRECT - if (is_tree && FLD_ISSET(conn->direct_io, WT_DIRECTIO_DATA)) + if (is_tree && FLD_ISSET(conn->direct_io, WT_DIRECTIO_DATA)) { f |= 
O_DIRECT; + direct_io = 1; + } #endif WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret); if (ret != 0) - WT_ERR_MSG(session, ret, "%s", name); + WT_ERR_MSG(session, ret, + direct_io ? + "%s: open failed with direct I/O configured, some " + "filesystem types do not support direct I/O" : "%s", name); #if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC) /* @@ -147,6 +153,11 @@ __wt_open(WT_SESSION_IMPL *session, fh->fd = fd; fh->refcnt = 1; +#ifdef O_DIRECT + if (f & O_DIRECT) + fh->direct_io = 1; +#endif + /* Set the file's size. */ WT_ERR(__wt_filesize(session, fh, &fh->file_size)); diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c index 026a1e9740b..ccd163b1ac6 100644 --- a/src/os_posix/os_remove.c +++ b/src/os_posix/os_remove.c @@ -8,23 +8,23 @@ #include "wt_internal.h" /* - * __wt_remove -- - * Remove a file. + * __remove_file_check -- + * Check if the file is currently open before removing it. */ -int -__wt_remove(WT_SESSION_IMPL *session, const char *name) +static inline void +__remove_file_check(WT_SESSION_IMPL *session, const char *name) { +#ifdef HAVE_DIAGNOSTIC WT_CONNECTION_IMPL *conn; - WT_DECL_RET; WT_FH *fh; - const char *path; conn = S2C(session); fh = NULL; - WT_VERBOSE_RET(session, fileops, "%s: remove", name); - - /* If the file is open, close/free it. */ + /* + * Check if the file is open: it's an error if it is, since a higher + * level should have closed it before removing. + */ __wt_spin_lock(session, &conn->fh_lock); TAILQ_FOREACH(fh, &conn->fhqh, q) { if (strcmp(name, fh->name) == 0) @@ -32,8 +32,26 @@ __wt_remove(WT_SESSION_IMPL *session, const char *name) } __wt_spin_unlock(session, &conn->fh_lock); - /* This should be caught at a higher level. */ WT_ASSERT(session, fh == NULL); +#else + WT_UNUSED(session); + WT_UNUSED(name); +#endif +} + +/* + * __wt_remove -- + * Remove a file. 
+ */ +int +__wt_remove(WT_SESSION_IMPL *session, const char *name) +{ + WT_DECL_RET; + const char *path; + + WT_VERBOSE_RET(session, fileops, "%s: remove", name); + + __remove_file_check(session, name); WT_RET(__wt_filename(session, name, &path)); diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c index c355b054cb6..2b83d961592 100644 --- a/src/os_posix/os_rw.c +++ b/src/os_posix/os_rw.c @@ -21,13 +21,13 @@ __wt_read(WT_SESSION_IMPL *session, "%s: read %" PRIu32 " bytes at offset %" PRIuMAX, fh->name, bytes, (uintmax_t)offset); - if (pread(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes) - return (0); + if (pread(fh->fd, buf, (size_t)bytes, offset) != (ssize_t)bytes) + WT_RET_MSG(session, __wt_errno(), + "%s read error: failed to read %" PRIu32 + " bytes at offset %" PRIuMAX, + fh->name, bytes, (uintmax_t)offset); - WT_RET_MSG(session, __wt_errno(), - "%s read error: failed to read %" PRIu32 " bytes at offset %" - PRIuMAX, - fh->name, bytes, (uintmax_t)offset); + return (0); } /* @@ -44,11 +44,11 @@ __wt_write(WT_SESSION_IMPL *session, "%s: write %" PRIu32 " bytes at offset %" PRIuMAX, fh->name, bytes, (uintmax_t)offset); - if (pwrite(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes) - return (0); + if (pwrite(fh->fd, buf, (size_t)bytes, offset) != (ssize_t)bytes) + WT_RET_MSG(session, __wt_errno(), + "%s write error: failed to write %" PRIu32 + " bytes at offset %" PRIuMAX, + fh->name, bytes, (uintmax_t)offset); - WT_RET_MSG(session, __wt_errno(), - "%s write error: failed to write %" PRIu32 " bytes at offset %" - PRIuMAX, - fh->name, bytes, (uintmax_t)offset); + return (0); } diff --git a/src/packing/packing_api.c b/src/packing/pack_api.c index 143eee445eb..143eee445eb 100644 --- a/src/packing/packing_api.c +++ b/src/packing/pack_api.c diff --git a/src/packing/packing.c b/src/packing/pack_impl.c index d0db6bf4128..a5a4a75ce67 100644 --- a/src/packing/packing.c +++ b/src/packing/pack_impl.c @@ -21,8 +21,6 @@ __wt_struct_check(WT_SESSION_IMPL 
*session, WT_PACK_VALUE pv; int fields; - WT_CLEAR(pv); /* -Wuninitialized. */ - WT_RET(__pack_initn(session, &pack, fmt, len)); for (fields = 0; (ret = __pack_next(&pack, &pv)) == 0; fields++) @@ -57,8 +55,6 @@ __wt_struct_sizev( WT_PACK_VALUE pv; size_t total; - WT_CLEAR(pv); /* -Wuninitialized */ - WT_RET(__pack_init(session, &pack, fmt)); for (total = 0; __pack_next(&pack, &pv) == 0;) { @@ -99,8 +95,6 @@ __wt_struct_packv(WT_SESSION_IMPL *session, WT_PACK_VALUE pv; uint8_t *p, *end; - WT_CLEAR(pv); /* -Wuninitialized */ - WT_RET(__pack_init(session, &pack, fmt)); p = buffer; @@ -111,6 +105,7 @@ __wt_struct_packv(WT_SESSION_IMPL *session, WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p))); } + /* Be paranoid - __pack_write should never overflow. */ WT_ASSERT(session, p <= end); if (ret != WT_NOTFOUND) @@ -154,13 +149,13 @@ __wt_struct_unpackv(WT_SESSION_IMPL *session, p = buffer; end = p + size; - WT_CLEAR(pv.u.item); /* GCC 4.6 lint */ while ((ret = __pack_next(&pack, &pv)) == 0) { WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); WT_UNPACK_PUT(session, pv, ap); } + /* Be paranoid - __pack_write should never overflow. */ WT_ASSERT(session, p <= end); if (ret != WT_NOTFOUND) diff --git a/src/packing/pack_stream.c b/src/packing/pack_stream.c new file mode 100644 index 00000000000..2e8c4a22040 --- /dev/null +++ b/src/packing/pack_stream.c @@ -0,0 +1,288 @@ +/*- + * Copyright (c) 2008-2013 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * Streaming interface to packing. + * + * This allows applications to pack or unpack records one field at a time. + */ +struct __wt_pack_stream { + WT_PACK pack; + uint8_t *end, *p, *start; +}; + +/* + * wiredtiger_pack_start -- + * Open a stream for packing. 
+ */ +int +wiredtiger_pack_start(WT_SESSION *wt_session, + const char *format, void *buffer, size_t len, WT_PACK_STREAM **psp) +{ + WT_DECL_RET; + WT_PACK_STREAM *ps; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + WT_RET(__wt_calloc_def(session, 1, &ps)); + WT_ERR(__pack_init(session, &ps->pack, format)); + ps->p = ps->start = buffer; + ps->end = ps->p + len; + *psp = ps; + + if (0) { +err: (void)wiredtiger_pack_close(ps, NULL); + } + return (ret); +} + +/* + * wiredtiger_unpack_start -- + * Open a stream for unpacking. + */ +int +wiredtiger_unpack_start(WT_SESSION *wt_session, const char *format, + const void *buffer, size_t size, WT_PACK_STREAM **psp) +{ + return (wiredtiger_pack_start( + wt_session, format, (void *)buffer, size, psp)); +} + +/* + * wiredtiger_pack_close -- + * Close a packing stream. + */ +int +wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp) +{ + if (usedp != NULL) + *usedp = WT_PTRDIFF(ps->p, ps->start); + + if (ps != NULL) + __wt_free(ps->pack.session, ps); + + return (0); +} + +/* + * wiredtiger_pack_item -- + * Pack an item. + */ +int +wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item) +{ + WT_PACK_VALUE pv; + WT_SESSION_IMPL *session; + + session = ps->pack.session; + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'U': + case 'u': + pv.u.item.data = item->data; + pv.u.item.size = item->size; + WT_RET(__pack_write( + session, &pv, &ps->p, (size_t)(ps->end - ps->p))); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_pack_int -- + * Pack a signed integer. 
+ */ +int +wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i) +{ + WT_PACK_VALUE pv; + WT_SESSION_IMPL *session; + + session = ps->pack.session; + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'b': + case 'h': + case 'i': + case 'l': + case 'q': + pv.u.i = i; + WT_RET(__pack_write( + session, &pv, &ps->p, (size_t)(ps->end - ps->p))); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_pack_str -- + * Pack a string. + */ +int +wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s) +{ + WT_PACK_VALUE pv; + WT_SESSION_IMPL *session; + + session = ps->pack.session; + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'S': + case 's': + pv.u.s = s; + WT_RET(__pack_write( + session, &pv, &ps->p, (size_t)(ps->end - ps->p))); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_pack_uint -- + * Pack an unsigned int. + */ +int +wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u) +{ + WT_PACK_VALUE pv; + WT_SESSION_IMPL *session; + + session = ps->pack.session; + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'B': + case 'H': + case 'I': + case 'L': + case 'Q': + case 'R': + case 'r': + case 't': + pv.u.u = u; + WT_RET(__pack_write( + session, &pv, &ps->p, (size_t)(ps->end - ps->p))); + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_unpack_item -- + * Unpack an item. + */ +int +wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item) +{ + WT_PACK_VALUE pv; + WT_SESSION_IMPL *session; + + session = ps->pack.session; + WT_RET(__pack_next(&ps->pack, &pv)); + switch (pv.type) { + case 'U': + case 'u': + WT_RET(__unpack_read(session, + &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p))); + item->data = pv.u.item.data; + item->size = pv.u.item.size; + break; + WT_ILLEGAL_VALUE(session); + } + + return (0); +} + +/* + * wiredtiger_unpack_int -- + * Unpack a signed integer. 
+ */
+int
+wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip)
+{
+	WT_SESSION_IMPL *session;
+	WT_PACK_VALUE val;
+	size_t avail;
+
+	session = ps->pack.session;
+
+	/* The next format element decides whether a signed value is legal. */
+	WT_RET(__pack_next(&ps->pack, &val));
+	switch (val.type) {
+	case 'b': case 'h': case 'i': case 'l': case 'q':
+		avail = (size_t)(ps->end - ps->p);
+		WT_RET(__unpack_read(
+		    session, &val, (const uint8_t **)&ps->p, avail));
+		*ip = val.u.i;
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+	return (0);
+}
+
+/*
+ * wiredtiger_unpack_str --
+ *	Read a string from the stream; the next format element must be 'S'
+ *	or 's'.  On success *sp is set to the string that was read.
+ */
+int
+wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp)
+{
+	WT_SESSION_IMPL *session;
+	WT_PACK_VALUE val;
+	size_t avail;
+
+	session = ps->pack.session;
+
+	WT_RET(__pack_next(&ps->pack, &val));
+	switch (val.type) {
+	case 'S': case 's':
+		avail = (size_t)(ps->end - ps->p);
+		WT_RET(__unpack_read(
+		    session, &val, (const uint8_t **)&ps->p, avail));
+		*sp = val.u.s;
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+	return (0);
+}
+
+/*
+ * wiredtiger_unpack_uint --
+ *	Read an unsigned integer from the stream; the next format element
+ *	must be one of 'B', 'H', 'I', 'L', 'Q', 'R', 'r' or 't'.
+ */
+int
+wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up)
+{
+	WT_SESSION_IMPL *session;
+	WT_PACK_VALUE val;
+	size_t avail;
+
+	session = ps->pack.session;
+
+	WT_RET(__pack_next(&ps->pack, &val));
+	switch (val.type) {
+	case 'B': case 'H': case 'I': case 'L': case 'Q':
+	case 'R': case 'r': case 't':
+		avail = (size_t)(ps->end - ps->p);
+		WT_RET(__unpack_read(
+		    session, &val, (const uint8_t **)&ps->p, avail));
+		*up = val.u.u;
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+	return (0);
+}
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index e401f5cf484..4a6b75d5dd3 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -14,11 +14,10 @@ __create_file(WT_SESSION_IMPL *session, WT_DECL_ITEM(val); WT_DECL_RET; int is_metadata; - const char *cfg[] = API_CONF_DEFAULTS(session, create, config); const char *filecfg[4] = API_CONF_DEFAULTS(file, meta, config); - const char *filename, *treeconf; + const char *fileconf, *filename; - treeconf = NULL; + fileconf = NULL; is_metadata = strcmp(uri, 
WT_METADATA_URI) == 0; @@ -28,14 +27,14 @@ __create_file(WT_SESSION_IMPL *session, /* Check if the file already exists. */ if (!is_metadata && (ret = - __wt_metadata_read(session, uri, &treeconf)) != WT_NOTFOUND) { + __wt_metadata_read(session, uri, &fileconf)) != WT_NOTFOUND) { if (exclusive) WT_TRET(EEXIST); goto err; } /* Create the file. */ - WT_ERR(__wt_btree_create(session, filename)); + WT_ERR(__wt_block_manager_create(session, filename)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_fileop(session, NULL, uri)); @@ -50,8 +49,8 @@ __create_file(WT_SESSION_IMPL *session, WT_BTREE_MAJOR_VERSION, WT_BTREE_MINOR_VERSION)); filecfg[2] = val->data; filecfg[3] = NULL; - WT_ERR(__wt_config_collapse(session, filecfg, &treeconf)); - if ((ret = __wt_metadata_insert(session, uri, treeconf)) != 0) { + WT_ERR(__wt_config_collapse(session, filecfg, &fileconf)); + if ((ret = __wt_metadata_insert(session, uri, fileconf)) != 0) { if (ret == WT_DUPLICATE_KEY) ret = EEXIST; goto err; @@ -59,20 +58,23 @@ __create_file(WT_SESSION_IMPL *session, } /* - * Open the file to check that it was setup correctly. + * Open the file to check that it was setup correctly. We don't need + * to pass the configuration, we just wrote the collapsed configuration + * into the metadata file, and it's going to be read/used by underlying + * functions. * * Keep the handle exclusive until it is released at the end of the * call, otherwise we could race with a drop. */ WT_ERR(__wt_conn_btree_get( - session, uri, NULL, cfg, WT_DHANDLE_EXCLUSIVE)); + session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_handle_lock(session, 1)); else WT_ERR(__wt_session_release_btree(session)); err: __wt_scr_free(&val); - __wt_free(session, treeconf); + __wt_free(session, fileconf); return (ret); } @@ -143,7 +145,7 @@ __create_colgroup(WT_SESSION_IMPL *session, /* Make sure the column group is referenced from the table. 
*/ if (cgname != NULL && (ret = __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0) - WT_RET_MSG(session, EINVAL, + WT_ERR_MSG(session, EINVAL, "Column group '%s' not found in table '%.*s'", cgname, (int)tlen, tablename); @@ -203,6 +205,8 @@ err: __wt_free(session, cgconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); + + __wt_schema_release_table(session, table); return (ret); } @@ -355,6 +359,8 @@ err: __wt_free(session, idxconf); __wt_buf_free(session, &extra_cols); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); + + __wt_schema_release_table(session, table); return (ret); } @@ -381,8 +387,10 @@ __create_table(WT_SESSION_IMPL *session, return (EINVAL); if ((ret = __wt_schema_get_table(session, - tablename, strlen(tablename), 0, &table)) == 0) + tablename, strlen(tablename), 0, &table)) == 0) { + __wt_schema_release_table(session, table); return (exclusive ? EEXIST : 0); + } WT_RET_NOTFOUND_OK(ret); WT_RET(__wt_config_gets(session, cfg, "colgroups", &cval)); @@ -416,9 +424,13 @@ __create_table(WT_SESSION_IMPL *session, } if (0) { -err: if (table != NULL) - WT_TRET(__wt_schema_remove_table(session, table)); +err: if (table != NULL) { + __wt_schema_remove_table(session, table); + table = NULL; + } } + if (table != NULL) + __wt_schema_release_table(session, table); __wt_free(session, cgname); __wt_free(session, tableconf); return (ret); diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c index c6109a6f020..db0fcbb3f5d 100644 --- a/src/schema/schema_drop.c +++ b/src/schema/schema_drop.c @@ -117,6 +117,7 @@ __drop_table( name = uri; (void)WT_PREFIX_SKIP(name, "table:"); + table = NULL; WT_ERR(__wt_schema_get_table(session, name, strlen(name), 1, &table)); /* Drop the column groups. 
*/ @@ -136,13 +137,16 @@ __drop_table( WT_ERR(__wt_schema_drop(session, idx->source, cfg)); } - WT_ERR(__wt_schema_remove_table(session, table)); + __wt_schema_remove_table(session, table); + table = NULL; /* Remove the metadata entry (ignore missing items). */ WT_ERR(__wt_metadata_remove(session, uri)); err: if (force && ret == WT_NOTFOUND) ret = 0; + if (table != NULL) + __wt_schema_release_table(session, table); return (ret); } @@ -185,6 +189,9 @@ __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) if (ret == WT_NOTFOUND) ret = force ? 0 : ENOENT; + /* Bump the schema generation so that stale data is ignored. */ + ++S2C(session)->schema_gen; + WT_TRET(__wt_meta_track_off(session, ret != 0)); return (ret); diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c index 61dc88288eb..6508d9696d4 100644 --- a/src/schema/schema_list.c +++ b/src/schema/schema_list.c @@ -19,6 +19,9 @@ __schema_add_table(WT_SESSION_IMPL *session, WT_RET(__wt_schema_open_table(session, name, namelen, &table)); + /* Copy the schema generation into the new table. */ + table->schema_gen = S2C(session)->schema_gen; + TAILQ_INSERT_HEAD(&session->tables, table, q); *tablep = table; @@ -36,10 +39,28 @@ __schema_find_table(WT_SESSION_IMPL *session, WT_TABLE *table; const char *tablename; +restart: TAILQ_FOREACH(table, &session->tables, q) { tablename = table->name; (void)WT_PREFIX_SKIP(tablename, "table:"); if (WT_STRING_MATCH(tablename, name, namelen)) { + /* + * Ignore stale tables. + * + * XXX: should be managed the same as btree handles, + * with a local cache in each session and a shared list + * in the connection. There is still a race here + * between checking the generation and opening the + * first column group. 
+ */ + if (table->schema_gen != S2C(session)->schema_gen) { + if (table->refcnt == 0) { + __wt_schema_remove_table( + session, table); + goto restart; + } + continue; + } *tablep = table; return (0); } @@ -59,7 +80,6 @@ __wt_schema_get_table(WT_SESSION_IMPL *session, WT_DECL_RET; WT_TABLE *table; - table = NULL; ret = __schema_find_table(session, name, namelen, &table); if (ret == WT_NOTFOUND) @@ -72,6 +92,7 @@ __wt_schema_get_table(WT_SESSION_IMPL *session, "until all column groups are created", table->name); + ++table->refcnt; *tablep = table; } @@ -79,6 +100,17 @@ __wt_schema_get_table(WT_SESSION_IMPL *session, } /* + * __wt_schema_release_table -- + * Release a table handle. + */ +void +__wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table) +{ + WT_ASSERT(session, table->refcnt > 0); + --table->refcnt; +} + +/* * __wt_schema_destroy_colgroup -- * Free a column group handle. */ @@ -147,28 +179,25 @@ __wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table) * __wt_schema_remove_table -- * Remove the table handle from the session, closing if necessary. */ -int +void __wt_schema_remove_table( WT_SESSION_IMPL *session, WT_TABLE *table) { + WT_ASSERT(session, table->refcnt <= 1); + TAILQ_REMOVE(&session->tables, table, q); __wt_schema_destroy_table(session, table); - - return (0); } /* * __wt_schema_close_tables -- * Close all of the tables in a session. 
*/ -int +void __wt_schema_close_tables(WT_SESSION_IMPL *session) { - WT_DECL_RET; WT_TABLE *table; while ((table = TAILQ_FIRST(&session->tables)) != NULL) - WT_TRET(__wt_schema_remove_table(session, table)); - - return (ret); + __wt_schema_remove_table(session, table); } diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c index c624ccd0797..c95ae0b97d8 100644 --- a/src/schema/schema_open.c +++ b/src/schema/schema_open.c @@ -431,17 +431,19 @@ __wt_schema_get_colgroup(WT_SESSION_IMPL *session, WT_RET(__wt_schema_get_table(session, tablename, WT_PTRDIFF(tend, tablename), 0, &table)); - if (tablep != NULL) - *tablep = table; - for (i = 0; i < WT_COLGROUPS(table); i++) { colgroup = table->cgroups[i]; if (strcmp(colgroup->name, uri) == 0) { *colgroupp = colgroup; + if (tablep != NULL) + *tablep = table; + else + __wt_schema_release_table(session, table); return (0); } } + __wt_schema_release_table(session, table); WT_RET_MSG(session, ENOENT, "%s not found in table", uri); } @@ -453,6 +455,7 @@ int __wt_schema_get_index(WT_SESSION_IMPL *session, const char *uri, WT_TABLE **tablep, WT_INDEX **indexp) { + WT_DECL_RET; WT_INDEX *idx; WT_TABLE *table; const char *tablename, *tend; @@ -468,22 +471,26 @@ __wt_schema_get_index(WT_SESSION_IMPL *session, WT_RET(__wt_schema_get_table(session, tablename, WT_PTRDIFF(tend, tablename), 0, &table)); - if (tablep != NULL) - *tablep = table; - /* Try to find the index in the table. */ for (i = 0; i < table->nindices; i++) { idx = table->indices[i]; if (strcmp(idx->name, uri) == 0) { + if (tablep != NULL) + *tablep = table; + else + __wt_schema_release_table(session, table); *indexp = idx; return (0); } } /* Otherwise, open it. 
*/ - WT_RET(__wt_schema_open_index( + WT_ERR(__wt_schema_open_index( session, table, tend + 1, strlen(tend + 1), indexp)); +err: __wt_schema_release_table(session, table); + WT_RET(ret); + if (*indexp != NULL) return (0); diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c index c336ff95dae..00dddbb4058 100644 --- a/src/schema/schema_rename.c +++ b/src/schema/schema_rename.c @@ -102,6 +102,7 @@ __rename_tree(WT_SESSION_IMPL *session, "expected a 'colgroup:' or 'index:' source: '%s'", name); suffix = strchr(name, ':'); + /* An existing table should have a well formed name. */ WT_ASSERT(session, suffix != NULL); suffix = strchr(suffix + 1, ':'); @@ -187,16 +188,17 @@ __rename_table(WT_SESSION_IMPL *session, /* Rename the column groups. */ for (i = 0; i < WT_COLGROUPS(table); i++) - WT_RET(__rename_tree(session, table, newuri, + WT_ERR(__rename_tree(session, table, newuri, table->cgroups[i]->name, cfg)); /* Rename the indices. */ - WT_RET(__wt_schema_open_indices(session, table)); + WT_ERR(__wt_schema_open_indices(session, table)); for (i = 0; i < table->nindices; i++) - WT_RET(__rename_tree(session, table, newuri, + WT_ERR(__rename_tree(session, table, newuri, table->indices[i]->name, cfg)); - WT_RET(__wt_schema_remove_table(session, table)); + __wt_schema_remove_table(session, table); + table = NULL; /* Rename the table. */ WT_ERR(__wt_scr_alloc(session, 0, &buf)); @@ -205,6 +207,8 @@ __rename_table(WT_SESSION_IMPL *session, WT_ERR(__wt_metadata_insert(session, newuri, value)); err: __wt_scr_free(&buf); + if (table != NULL) + __wt_schema_release_table(session, table); return (ret); } @@ -247,6 +251,9 @@ __wt_schema_rename(WT_SESSION_IMPL *session, } else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0) ret = dsrc->rename(dsrc, &session->iface, uri, newuri, cfg); + /* Bump the schema generation so that stale data is ignored. 
*/ + ++S2C(session)->schema_gen; + WT_TRET(__wt_meta_track_off(session, ret != 0)); /* If we didn't find a metadata entry, map that error to ENOENT. */ diff --git a/src/schema/schema_stat.c b/src/schema/schema_stat.c index 2631b97a1c1..4bec00165c0 100644 --- a/src/schema/schema_stat.c +++ b/src/schema/schema_stat.c @@ -74,15 +74,16 @@ __curstat_table_init(WT_SESSION_IMPL *session, WT_UNUSED(flags); name = uri + strlen("table:"); - WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table)); WT_RET(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_schema_get_table(session, name, strlen(name), 0, &table)); /* Clear the statistics we are about to recalculate. */ if (cst->stats != NULL) { __wt_stat_clear_dsrc_stats(cst->stats); stats = (WT_DSRC_STATS *)cst->stats; } else { - WT_ERR(__wt_stat_alloc_dsrc_stats(session, &stats)); + WT_ERR(__wt_calloc_def(session, 1, &stats)); + __wt_stat_init_dsrc_stats(stats); cst->stats_first = cst->stats = (WT_STATS *)stats; cst->stats_count = sizeof(*stats) / sizeof(WT_STATS); } @@ -127,6 +128,7 @@ __curstat_table_init(WT_SESSION_IMPL *session, } err: __wt_scr_free(&buf); + __wt_schema_release_table(session, table); return (ret); } diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c index 4caa4604a22..ece2ecbf049 100644 --- a/src/schema/schema_truncate.c +++ b/src/schema/schema_truncate.c @@ -25,7 +25,7 @@ __truncate_file(WT_SESSION_IMPL *session, const char *name) /* Delete the root address and truncate the file. 
*/ WT_RET(__wt_meta_checkpoint_clear(session, name)); - WT_RET(__wt_btree_truncate(session, filename)); + WT_RET(__wt_block_manager_truncate(session, filename)); return (0); } @@ -43,8 +43,8 @@ __truncate_table(WT_SESSION_IMPL *session, const char *name) const char *hname; u_int i; - WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table)); WT_RET(__wt_scr_alloc(session, 0, &namebuf)); + WT_ERR(__wt_schema_get_table(session, name, strlen(name), 0, &table)); /* Truncate the column groups. */ for (i = 0; i < WT_COLGROUPS(table); i++) { @@ -78,6 +78,7 @@ __truncate_table(WT_SESSION_IMPL *session, const char *name) } err: __wt_scr_free(&namebuf); + __wt_schema_release_table(session, table); return (ret); } diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index ad9480bc0fe..619432ea9ac 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -25,6 +25,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, const char *tablename; u_int i; + table = NULL; tablename = uri; /* Get the btree handle(s) and call the underlying function. 
*/ @@ -52,18 +53,20 @@ __wt_schema_worker(WT_SESSION_IMPL *session, for (i = 0; i < WT_COLGROUPS(table); i++) { colgroup = table->cgroups[i]; - WT_RET(__wt_schema_worker( + WT_ERR(__wt_schema_worker( session, colgroup->source, func, cfg, open_flags)); } - WT_RET(__wt_schema_open_indices(session, table)); + WT_ERR(__wt_schema_open_indices(session, table)); for (i = 0; i < table->nindices; i++) { idx = table->indices[i]; - WT_RET(__wt_schema_worker( + WT_ERR(__wt_schema_worker( session, idx->source, func, cfg, open_flags)); } } else return (__wt_bad_object_type(session, uri)); +err: if (table != NULL) + __wt_schema_release_table(session, table); return (ret); } diff --git a/src/session/session_api.c b/src/session/session_api.c index 5339dcf0800..8141a28e545 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -38,7 +38,7 @@ __session_close_cache(WT_SESSION_IMPL *session) while ((dhandle_cache = TAILQ_FIRST(&session->dhandles)) != NULL) WT_TRET(__wt_session_discard_btree(session, dhandle_cache)); - WT_TRET(__wt_schema_close_tables(session)); + __wt_schema_close_tables(session); return (ret); } @@ -595,6 +595,13 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config) WT_ERR(__session_reset_cursors(session)); + /* + * Now there are no cursors open and no transaction active in this + * thread. Check if the cache is full: if we have to block for + * eviction, this is the best time to do it. + */ + WT_ERR(__wt_cache_full_check(session)); + ret = __wt_txn_begin(session, cfg); err: API_END(session); diff --git a/src/support/filename.c b/src/support/filename.c index 1b2223a50ef..6b3d7ff17d2 100644 --- a/src/support/filename.c +++ b/src/support/filename.c @@ -8,12 +8,36 @@ #include "wt_internal.h" /* + * __wt_absolute_path -- + * Return if a filename is an absolute path. + */ +int +__wt_absolute_path(const char *path) +{ + return (path[0] == '/' ? 1 : 0); +} + +/* * __wt_filename -- - * Build a filename in a scratch buffer. 
+ * Build a file name in a scratch buffer, automatically calculate the + * length of the file name. */ int __wt_filename(WT_SESSION_IMPL *session, const char *name, const char **path) { + return __wt_nfilename(session, name, strlen(name), path); +} + +/* + * __wt_nfilename -- + * Build a file name in a scratch buffer. If the name is already an + * absolute path duplicate it, otherwise generate a path relative to the + * connection home directory. + */ +int +__wt_nfilename(WT_SESSION_IMPL *session, + const char *name, size_t namelen, const char **path) +{ WT_CONNECTION_IMPL *conn; size_t len; char *buf; @@ -21,10 +45,14 @@ __wt_filename(WT_SESSION_IMPL *session, const char *name, const char **path) conn = S2C(session); *path = NULL; - len = strlen(conn->home) + 1 + strlen(name) + 1; - WT_RET(__wt_calloc(session, 1, len, &buf)); - snprintf(buf, len, "%s/%s", conn->home, name); + if (__wt_absolute_path(name)) + WT_RET(__wt_strndup(session, name, namelen, path)); + else { + len = strlen(conn->home) + 1 + namelen + 1; + WT_RET(__wt_calloc(session, 1, len, &buf)); + snprintf(buf, len, "%s/%.*s", conn->home, (int)namelen, name); + *path = buf; + } - *path = buf; return (0); } diff --git a/src/support/hazard.c b/src/support/hazard.c index d5f49f89eef..9db7a5d6c95 100644 --- a/src/support/hazard.c +++ b/src/support/hazard.c @@ -88,7 +88,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp * page to be evicted and a different page read into the same * memory, so the pointer hasn't changed but the contents have. * That's OK, we found this page using the tree's key space, - * whatever page we find here is the page page for us to use.) + * whatever page we find here is the page for us to use.) */ if (ref->page == hp->page && (ref->state == WT_REF_MEM || @@ -153,7 +153,7 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page) * Perform the check here since we want to do it when * we are about to release the hazard reference. 
*/ - (void)__wt_eviction_page_force(session, page); + __wt_eviction_page_force(session, page); /* * We don't publish the hazard pointer clear in the diff --git a/src/support/scratch.c b/src/support/scratch.c index 9cda695ec89..e0e76348876 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -302,7 +302,7 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, /* * If we find a buffer that's not in-use, check its size: we - * want the the smallest buffer larger than the requested size, + * want the smallest buffer larger than the requested size, * or the largest buffer if none are large enough. */ if (best == NULL || diff --git a/src/support/stat.c b/src/support/stat.c index 08f7f2a9f30..7df21d4719e 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -2,13 +2,9 @@ #include "wt_internal.h" -int -__wt_stat_alloc_dsrc_stats(WT_SESSION_IMPL *session, WT_DSRC_STATS **statsp) +void +__wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats) { - WT_DSRC_STATS *stats; - - WT_RET(__wt_calloc_def(session, 1, &stats)); - stats->block_alloc.desc = "blocks allocated"; stats->block_allocsize.desc = "block manager file allocation unit size"; @@ -47,13 +43,23 @@ __wt_stat_alloc_dsrc_stats(WT_SESSION_IMPL *session, WT_DSRC_STATS **statsp) stats->btree_row_leaf.desc = "row-store leaf pages"; stats->cache_bytes_read.desc = "bytes read into cache"; stats->cache_bytes_write.desc = "bytes written from cache"; + stats->cache_eviction_checkpoint.desc = + "cache: checkpoint blocked page eviction"; stats->cache_eviction_clean.desc = "unmodified pages evicted"; stats->cache_eviction_dirty.desc = "modified pages evicted"; stats->cache_eviction_fail.desc = "data source pages selected for eviction unable to be evicted"; + stats->cache_eviction_force.desc = + "cache: pages queued for forced eviction"; stats->cache_eviction_hazard.desc = - "eviction unable to acquire hazard pointer"; + "cache: hazard pointer blocked page eviction"; stats->cache_eviction_internal.desc = "internal pages evicted"; 
+ stats->cache_eviction_merge.desc = + "cache: internal page merge operations completed"; + stats->cache_eviction_merge_fail.desc = + "cache: internal page merge attempts that could not complete"; + stats->cache_eviction_merge_levels.desc = + "cache: internal levels merged"; stats->cache_overflow_value.desc = "overflow values cached in memory"; stats->cache_read.desc = "pages read into cache"; stats->cache_read_overflow.desc = "overflow pages read into cache"; @@ -95,19 +101,18 @@ __wt_stat_alloc_dsrc_stats(WT_SESSION_IMPL *session, WT_DSRC_STATS **statsp) stats->rec_pages_eviction.desc = "page reconciliation calls for eviction"; stats->rec_skipped_update.desc = - "page reconciliation failed when an update could not be included"; + "reconciliation failed because an update could not be included"; stats->rec_split_intl.desc = "reconciliation internal pages split"; stats->rec_split_leaf.desc = "reconciliation leaf pages split"; + stats->rec_split_max.desc = + "reconciliation maximum number of splits created by for a page"; stats->session_compact.desc = "object compaction"; stats->txn_update_conflict.desc = "update conflicts"; stats->txn_write_conflict.desc = "write generation conflicts"; - - *statsp = stats; - return (0); } void -__wt_stat_clear_dsrc_stats(WT_STATS *stats_arg) +__wt_stat_clear_dsrc_stats(void *stats_arg) { WT_DSRC_STATS *stats; @@ -145,11 +150,16 @@ __wt_stat_clear_dsrc_stats(WT_STATS *stats_arg) stats->btree_row_leaf.v = 0; stats->cache_bytes_read.v = 0; stats->cache_bytes_write.v = 0; + stats->cache_eviction_checkpoint.v = 0; stats->cache_eviction_clean.v = 0; stats->cache_eviction_dirty.v = 0; stats->cache_eviction_fail.v = 0; + stats->cache_eviction_force.v = 0; stats->cache_eviction_hazard.v = 0; stats->cache_eviction_internal.v = 0; + stats->cache_eviction_merge.v = 0; + stats->cache_eviction_merge_fail.v = 0; + stats->cache_eviction_merge_levels.v = 0; stats->cache_overflow_value.v = 0; stats->cache_read.v = 0; stats->cache_read_overflow.v = 0; 
@@ -186,18 +196,15 @@ __wt_stat_clear_dsrc_stats(WT_STATS *stats_arg) stats->rec_skipped_update.v = 0; stats->rec_split_intl.v = 0; stats->rec_split_leaf.v = 0; + stats->rec_split_max.v = 0; stats->session_compact.v = 0; stats->txn_update_conflict.v = 0; stats->txn_write_conflict.v = 0; } -int -__wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session, WT_CONNECTION_STATS **statsp) +void +__wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) { - WT_CONNECTION_STATS *stats; - - WT_RET(__wt_calloc_def(session, 1, &stats)); - stats->block_byte_map_read.desc = "mapped bytes read by the block manager"; stats->block_byte_read.desc = "bytes read by the block manager"; @@ -211,15 +218,26 @@ __wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session, WT_CONNECTION_STATS * stats->cache_bytes_max.desc = "cache: maximum bytes configured"; stats->cache_bytes_read.desc = "cache: bytes read into cache"; stats->cache_bytes_write.desc = "cache: bytes written from cache"; + stats->cache_eviction_checkpoint.desc = + "cache: checkpoint blocked page eviction"; stats->cache_eviction_clean.desc = "cache: unmodified pages evicted"; stats->cache_eviction_dirty.desc = "cache: modified pages evicted"; stats->cache_eviction_fail.desc = "cache: pages selected for eviction unable to be evicted"; + stats->cache_eviction_force.desc = + "cache: pages queued for forced eviction"; stats->cache_eviction_hazard.desc = - "cache: eviction unable to acquire hazard pointer"; + "cache: hazard pointer blocked page eviction"; stats->cache_eviction_internal.desc = "cache: internal pages evicted"; + stats->cache_eviction_merge.desc = + "cache: internal page merge operations completed"; + stats->cache_eviction_merge_fail.desc = + "cache: internal page merge attempts that could not complete"; + stats->cache_eviction_merge_levels.desc = + "cache: internal levels merged"; stats->cache_eviction_slow.desc = "cache: eviction server unable to reach eviction goal"; + stats->cache_eviction_walk.desc = "cache: pages 
walked for eviction"; stats->cache_pages_dirty.desc = "cache: tracked dirty pages in the cache"; stats->cache_pages_inuse.desc = @@ -230,7 +248,13 @@ __wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session, WT_CONNECTION_STATS * stats->file_open.desc = "files currently open"; stats->memory_allocation.desc = "total heap memory allocations"; stats->memory_free.desc = "total heap memory frees"; + stats->memory_grow.desc = "total heap memory re-allocations"; stats->read_io.desc = "total read I/Os"; + stats->rec_pages.desc = "page reconciliation calls"; + stats->rec_pages_eviction.desc = + "page reconciliation calls for eviction"; + stats->rec_skipped_update.desc = + "reconciliation failed because an update could not be included"; stats->rwlock_read.desc = "pthread mutex shared lock read-lock calls"; stats->rwlock_write.desc = "pthread mutex shared lock write-lock calls"; @@ -242,13 +266,10 @@ __wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session, WT_CONNECTION_STATS * "transaction failures due to cache overflow"; stats->txn_rollback.desc = "transactions rolled-back"; stats->write_io.desc = "total write I/Os"; - - *statsp = stats; - return (0); } void -__wt_stat_clear_connection_stats(WT_STATS *stats_arg) +__wt_stat_clear_connection_stats(void *stats_arg) { WT_CONNECTION_STATS *stats; @@ -262,12 +283,18 @@ __wt_stat_clear_connection_stats(WT_STATS *stats_arg) stats->cache_bytes_dirty.v = 0; stats->cache_bytes_read.v = 0; stats->cache_bytes_write.v = 0; + stats->cache_eviction_checkpoint.v = 0; stats->cache_eviction_clean.v = 0; stats->cache_eviction_dirty.v = 0; stats->cache_eviction_fail.v = 0; + stats->cache_eviction_force.v = 0; stats->cache_eviction_hazard.v = 0; stats->cache_eviction_internal.v = 0; + stats->cache_eviction_merge.v = 0; + stats->cache_eviction_merge_fail.v = 0; + stats->cache_eviction_merge_levels.v = 0; stats->cache_eviction_slow.v = 0; + stats->cache_eviction_walk.v = 0; stats->cache_pages_dirty.v = 0; stats->cache_read.v = 0; 
stats->cache_write.v = 0; @@ -275,7 +302,11 @@ __wt_stat_clear_connection_stats(WT_STATS *stats_arg) stats->file_open.v = 0; stats->memory_allocation.v = 0; stats->memory_free.v = 0; + stats->memory_grow.v = 0; stats->read_io.v = 0; + stats->rec_pages.v = 0; + stats->rec_pages_eviction.v = 0; + stats->rec_skipped_update.v = 0; stats->rwlock_read.v = 0; stats->rwlock_write.v = 0; stats->txn_ancient.v = 0; diff --git a/src/txn/txn.c b/src/txn/txn.c index 918ac67d9da..6e2ba2d5e33 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -39,7 +39,7 @@ __txn_sort_snapshot(WT_SESSION_IMPL *session, txn->snapshot_count = n; txn->snap_min = (n == 0) ? id : txn->snapshot[0]; txn->snap_max = id; - WT_ASSERT(session, txn->snap_min != WT_TXN_NONE); + WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE); txn->oldest_snap_min = TXNID_LT(oldest_snap_min, txn->snap_min) ? oldest_snap_min : txn->snap_min; } @@ -51,13 +51,9 @@ __txn_sort_snapshot(WT_SESSION_IMPL *session, void __wt_txn_release_snapshot(WT_SESSION_IMPL *session) { - WT_TXN *txn; WT_TXN_STATE *txn_state; - txn = &session->txn; txn_state = &S2C(session)->txn_global.states[session->id]; - - txn->snapshot_count = 0; txn_state->snap_min = WT_TXN_NONE; } @@ -79,9 +75,17 @@ __wt_txn_get_oldest(WT_SESSION_IMPL *session) conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; + oldest_snap_min = (txn->id != WT_TXN_NONE) ? txn->id : txn_global->current; + /* If nothing has changed since last time, we're done. 
*/ + if (txn->last_oldest_gen == txn_global->gen && + txn->last_oldest_id == oldest_snap_min) + return; + txn->last_oldest_gen = txn_global->gen; + txn->last_oldest_id = oldest_snap_min; + WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; @@ -100,7 +104,7 @@ __wt_txn_get_oldest(WT_SESSION_IMPL *session) */ void __wt_txn_get_snapshot( - WT_SESSION_IMPL *session, wt_txnid_t my_id, wt_txnid_t max_id) + WT_SESSION_IMPL *session, wt_txnid_t my_id, wt_txnid_t max_id, int force) { WT_CONNECTION_IMPL *conn; WT_TXN *txn; @@ -114,9 +118,18 @@ __wt_txn_get_snapshot( txn_global = &conn->txn_global; txn_state = &txn_global->states[session->id]; + /* If nothing has changed since last time, we're done. */ + if (!force && txn->last_id == txn_global->current && + txn->last_gen == txn_global->gen) { + txn_state->snap_min = txn->snap_min; + return; + } + do { /* Take a copy of the current session ID. */ - current_id = oldest_snap_min = txn_global->current; + txn->last_gen = txn->last_oldest_gen = txn_global->gen; + txn->last_id = oldest_snap_min = current_id = + txn_global->current; /* Copy the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); @@ -153,39 +166,19 @@ __wt_txn_get_snapshot( /* * __wt_txn_get_evict_snapshot -- * Set up a snapshot in the current transaction for eviction. - * No changes that are invisible to any active transaction can be evicted. + * Only changes that are visible to all active transactions can be evicted. */ void __wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s; - wt_txnid_t current_id, id, oldest_snap_min; - uint32_t i, session_cnt; - - conn = S2C(session); - txn_global = &conn->txn_global; - - do { - /* Take a copy of the current session ID. */ - current_id = oldest_snap_min = txn_global->current; + WT_TXN *txn; - /* Walk the array of concurrent transactions. 
*/ - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) - if ((id = s->snap_min) != WT_TXN_NONE && - TXNID_LT(id, oldest_snap_min)) - oldest_snap_min = id; + txn = &session->txn; - /* - * Ensure the snapshot reads are scheduled before re-checking - * the global current ID. - */ - WT_READ_BARRIER(); - } while (current_id != txn_global->current); + __wt_txn_get_oldest(session); + __txn_sort_snapshot( + session, 0, txn->oldest_snap_min, txn->oldest_snap_min); - __txn_sort_snapshot(session, 0, oldest_snap_min, oldest_snap_min); /* * Note that we carefully don't update the global table with this * snap_min value: there is already a running transaction in this @@ -237,7 +230,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) * If two threads race to allocate an ID, only the latest ID * will proceed. The winning thread can be sure its snapshot * contains all of the earlier active IDs. Threads that race - * race and get an earlier ID may not appear in the snapshot, + * and get an earlier ID may not appear in the snapshot, * but they will loop and allocate a new ID before proceeding * to make any updates. * @@ -249,7 +242,6 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) txn->id = WT_ATOMIC_ADD(txn_global->current, 1); } while (txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED); WT_PUBLISH(txn_state->id, txn->id); - oldest_snap_min = txn->id; /* * If we are starting a snapshot isolation transaction, get @@ -260,6 +252,9 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) * visible. */ if (txn->isolation == TXN_ISO_SNAPSHOT) { + txn->last_gen = txn->last_oldest_gen = txn_global->gen; + oldest_snap_min = txn->id; + /* Copy the array of concurrent transactions. 
*/ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = n = 0, s = txn_global->states; @@ -297,11 +292,13 @@ void __wt_txn_release(WT_SESSION_IMPL *session) { WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; txn = &session->txn; txn->mod_count = txn->modref_count = 0; - txn_state = &S2C(session)->txn_global.states[session->id]; + txn_global = &S2C(session)->txn_global; + txn_state = &txn_global->states[session->id]; /* Clear the transaction's ID from the global table. */ WT_ASSERT(session, txn_state->id != WT_TXN_NONE && @@ -319,6 +316,9 @@ __wt_txn_release(WT_SESSION_IMPL *session) __wt_txn_release_snapshot(session); txn->isolation = session->isolation; F_CLR(txn, TXN_ERROR | TXN_OLDEST | TXN_RUNNING); + + /* Update the global generation number. */ + ++txn_global->gen; } /* @@ -347,7 +347,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) * commit. */ if (session->ncursors > 0) - __wt_txn_get_snapshot(session, txn->id, WT_TXN_NONE); + __wt_txn_get_snapshot(session, txn->id, WT_TXN_NONE, 1); __wt_txn_release(session); return (0); } @@ -463,5 +463,6 @@ __wt_txn_global_destroy(WT_CONNECTION_IMPL *conn) session = conn->default_session; txn_global = &conn->txn_global; - __wt_free(session, txn_global->states); + if (txn_global != NULL) + __wt_free(session, txn_global->states); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 39194819b90..ce957ceffd8 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -7,46 +7,24 @@ #include "wt_internal.h" +static int __checkpoint_sync(WT_SESSION_IMPL *, const char *[]); +static int __checkpoint_write_leaves(WT_SESSION_IMPL *, const char *[]); + /* - * __wt_txn_checkpoint -- - * Checkpoint a database or a list of objects in the database. + * __checkpoint_apply -- + * Apply an operation to all files involved in a checkpoint. 
*/ -int -__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) +static int +__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], + int (*op)(WT_SESSION_IMPL *, const char *[])) { - WT_CONNECTION_IMPL *conn; WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; - WT_DATA_HANDLE *dhandle, *saved_dhandle; WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_SESSION *wt_session; - WT_TXN *txn; - void *saved_meta_next; - int ckpt_closed, target_list, tracking; + int ckpt_closed, target_list; - conn = S2C(session); - target_list = tracking = 0; - txn = &session->txn; - - /* - * Only one checkpoint can be active at a time, and checkpoints must - * run in the same order as they update the metadata; we are using the - * schema lock to determine that ordering, so we can't move this to - * __session_checkpoint. - * - * Begin a transaction for the checkpoint. - */ - WT_ASSERT(session, - F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) && - !F_ISSET(txn, TXN_RUNNING)); - __wt_spin_lock(session, &conn->metadata_lock); - - wt_session = &session->iface; - WT_ERR(wt_session->begin_transaction(wt_session, "isolation=snapshot")); - - WT_ERR(__wt_meta_track_on(session)); - tracking = 1; + target_list = 0; /* Step through the list of targets and checkpoint each one. */ WT_ERR(__wt_config_gets(session, cfg, "target", &cval)); @@ -65,7 +43,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); if ((ret = __wt_schema_worker( - session, tmp->data, __wt_checkpoint, cfg, 0)) != 0) + session, tmp->data, op, cfg, 0)) != 0) WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data); } WT_ERR_NOTFOUND_OK(ret); @@ -91,10 +69,70 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) if (cval.len != 0) ckpt_closed = 1; WT_ERR(ckpt_closed ? 
- __wt_meta_btree_apply(session, __wt_checkpoint, cfg) : - __wt_conn_btree_apply(session, __wt_checkpoint, cfg)); + __wt_meta_btree_apply(session, op, cfg) : + __wt_conn_btree_apply(session, op, cfg)); } +err: __wt_scr_free(&tmp); + return (ret); +} + +/* + * __wt_txn_checkpoint -- + * Checkpoint a database or a list of objects in the database. + */ +int +__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle, *saved_dhandle; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_SESSION *wt_session; + WT_TXN *txn; + void *saved_meta_next; + int tracking; + + conn = S2C(session); + tracking = 0; + txn = &session->txn; + + /* + * Only one checkpoint can be active at a time, and checkpoints must + * run in the same order as they update the metadata; we are using the + * schema lock to determine that ordering, so we can't move this to + * __session_checkpoint. + * + * Begin a transaction for the checkpoint. + */ + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) && + !F_ISSET(txn, TXN_RUNNING)); + __wt_spin_lock(session, &conn->metadata_lock); + + /* Flush dirty leaf pages before we start the checkpoint. */ + txn->isolation = TXN_ISO_READ_COMMITTED; + WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_write_leaves)); + + WT_ERR(__wt_meta_track_on(session)); + tracking = 1; + + /* Start a snapshot transaction for the checkpoint. */ + wt_session = &session->iface; + WT_ERR(wt_session->begin_transaction(wt_session, "isolation=snapshot")); + + WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint)); + + /* Release the snapshot transaction, before syncing the file(s). */ + __wt_txn_release(session); + + /* + * Checkpoints have to hit disk (it would be reasonable to configure for + * lazy checkpoints, but we don't support them yet). + */ + if (F_ISSET(conn, WT_CONN_SYNC)) + WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_sync)); + /* Checkpoint the metadata file. 
*/ TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (WT_IS_METADATA(dhandle) || @@ -139,7 +177,8 @@ err: /* if (tracking) WT_TRET(__wt_meta_track_off(session, ret != 0)); - __wt_txn_release(session); + if (F_ISSET(txn, TXN_RUNNING)) + __wt_txn_release(session); __wt_spin_unlock(session, &conn->metadata_lock); __wt_scr_free(&tmp); @@ -628,11 +667,48 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) } /* + * __checkpoint_write_leaves -- + * Write dirty leaf pages before a checkpoint. + */ +static int +__checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_UNUSED(cfg); + + if (S2BT(session)->modified) + WT_RET(__wt_bt_cache_op( + session, NULL, WT_SYNC_WRITE_LEAVES)); + + return (0); +} + +/* + * __checkpoint_sync -- + * Sync a file that has been checkpointed. + */ +static int +__checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_BTREE *btree; + + WT_UNUSED(cfg); + btree = S2BT(session); + + /* Only sync ordinary handles: checkpoint handles are read-only. */ + if (btree->dhandle->checkpoint == NULL && btree->bm != NULL) + return (btree->bm->sync(btree->bm, session)); + return (0); +} + +/* * __wt_checkpoint_close -- * Checkpoint a file as part of a close. 
*/ int __wt_checkpoint_close(WT_SESSION_IMPL *session, const char *cfg[]) { - return (__checkpoint_worker(session, cfg, 0)); + WT_RET(__checkpoint_worker(session, cfg, 0)); + if (F_ISSET(S2C(session), WT_CONN_SYNC)) + WT_RET(__checkpoint_sync(session, cfg)); + return (0); } diff --git a/src/utilities/util.h b/src/utilities/util.h index cdf51946fc6..d231b0ac9c0 100644 --- a/src/utilities/util.h +++ b/src/utilities/util.h @@ -10,7 +10,7 @@ #define UTIL_COLGROUP_OK 0x01 /* colgroup: prefix OK */ #define UTIL_FILE_OK 0x02 /* file: prefix OK */ #define UTIL_INDEX_OK 0x04 /* index: prefix OK */ -#define UTIL_LSM_OK 0x04 /* lsm: prefix OK */ +#define UTIL_LSM_OK 0x08 /* lsm: prefix OK */ #define UTIL_TABLE_OK 0x10 /* table: prefix OK */ /* all known prefixes OK */ diff --git a/test/format/wts.c b/test/format/wts.c index 5e489a62b39..8c1f02a3ebe 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -75,9 +75,13 @@ wts_open(void) * override the standard configuration. */ snprintf(config, sizeof(config), - "create,error_prefix=\"%s\",cache_size=%" PRIu32 "MB,sync=false," - "extensions=[\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],%s,%s", - g.progname, g.c_cache, + "create,cache_size=%" PRIu32 "MB," + "error_prefix=\"%s\"," + "extensions=[\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"],%s,%s," + "statistics=true,statistics_log=(sources=(\"%s\"),wait=5)," + "sync=false,", + g.c_cache, + g.progname, REVERSE_PATH, access(BZIP_PATH, R_OK) == 0 ? BZIP_PATH : "", access(LZO_PATH, R_OK) == 0 ? LZO_PATH : "", @@ -85,7 +89,8 @@ wts_open(void) access(BZIP_PATH, R_OK) == 0) ? RAW_PATH : "", access(SNAPPY_PATH, R_OK) == 0 ? SNAPPY_PATH : "", g.c_config_open == NULL ? "" : g.c_config_open, - g.config_open == NULL ? "" : g.config_open); + g.config_open == NULL ? 
"" : g.config_open, + g.uri); if ((ret = wiredtiger_open("RUNDIR", &event_handler, config, &conn)) != 0) diff --git a/test/java/com/wiredtiger/test/CursorTest.java b/test/java/com/wiredtiger/test/CursorTest.java new file mode 100644 index 00000000000..f5608e96554 --- /dev/null +++ b/test/java/com/wiredtiger/test/CursorTest.java @@ -0,0 +1,117 @@ +/*- + * Public Domain 2008-2013 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +package com.wiredtiger.test; + +import com.wiredtiger.db.Connection; +import com.wiredtiger.db.Cursor; +import com.wiredtiger.db.Session; +import com.wiredtiger.db.WiredTigerPackingException; +import com.wiredtiger.db.wiredtiger; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; +import org.junit.Assert; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +public class CursorTest { + Connection conn; + Session s; + + @Test + public void cursor01() + throws WiredTigerPackingException { + String keyFormat = "S"; + String valueFormat = "u"; + setup(keyFormat, valueFormat); + + Cursor c = s.open_cursor("table:t", null, null); + c.putKeyString("bar"); + c.putValueByteArray("foo".getBytes()); + c.insert(); + c.close(); + teardown(); + } + + @Test + public void cursor02() + throws WiredTigerPackingException { + String keyFormat = "S"; + String valueFormat = "u"; + setup(keyFormat, valueFormat); + + Cursor c = s.open_cursor("table:t", null, null); + c.putKeyString("bar"); + c.putValueByteArray("foo".getBytes()); + c.insert(); + c.putKeyString("bar"); + c.search(); + Assert.assertEquals(c.getKeyString(), "bar"); + Assert.assertEquals(new String(c.getValueByteArray()), "foo"); + c.close(); + teardown(); + } + + @Test + public void cursor03() + throws WiredTigerPackingException { + String keyFormat = "S"; + String valueFormat = "uiSu"; + setup(keyFormat, valueFormat); + + Cursor c = s.open_cursor("table:t", null, null); + c.putKeyString("bar"); + c.putValueByteArray("aaaaa".getBytes()).putValueInt(123); + c.putValueString("eeeee").putValueByteArray("iiiii".getBytes()); + + c.insert(); + c.putKeyString("bar"); + c.search(); + Assert.assertEquals(c.getKeyString(), "bar"); + Assert.assertEquals(new String(c.getValueByteArray()), "aaaaa"); + Assert.assertEquals(c.getValueInt(), 123); + Assert.assertEquals(c.getValueString(), "eeeee"); + Assert.assertEquals(new String(c.getValueByteArray()), "iiiii"); + c.close(); + teardown(); + } + 
+ private void setup(String keyFormat, String valueFormat) { + conn = wiredtiger.open("WT_HOME", "create"); + s = conn.open_session(null); + s.create("table:t", + "key_format=" + keyFormat + ",value_format=" + valueFormat); + } + + private void teardown() { + s.drop("table:t", ""); + s.close(""); + conn.close(""); + } + +} diff --git a/test/java/com/wiredtiger/test/PackTest.java b/test/java/com/wiredtiger/test/PackTest.java new file mode 100644 index 00000000000..a16d03a1fb0 --- /dev/null +++ b/test/java/com/wiredtiger/test/PackTest.java @@ -0,0 +1,246 @@ +/*- + * Public Domain 2008-2013 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +package com.wiredtiger.test; + +import com.wiredtiger.db.PackOutputStream; +import com.wiredtiger.db.PackInputStream; +import com.wiredtiger.db.WiredTigerPackingException; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; +import org.junit.Assert; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +public class PackTest { + + // Some random numbers for testing. + static long[] testers = { + -12, -145, -14135, -1352308572, -1, 0, 1, 12, 145, 12314, + 873593485, -30194371, -4578928, 75452136, -28619244, 93580892, + 83350219, 27407091, -82413912, -727169, -3748613, 54046160, + -49539872, -4517158, 20397230, -68522195, 61663315, -6009306, + -57778143, -97631892, -62388819, 23581637, 2417807, -17761744, + -4174142, 92685293, 84597598, -83143925, 95302021, 90888796, + 88697909, -89601258, 93585507, 63705051, 51191732, 60900034, + -93016118, -68693051, -49366599, -90203871, 58404039, -79195628, + -98043222, 35325799, 47942822, 11582824, 93322027, 71799760, + 65114434, 42851643, 69146495, -86855643, 40073283, 1956899, + 28090147, 71274080, -95192279, -30641467, -1142067, -32599515, + 92478069, -90277478, -39370258, -77673053, 82435569, 88167012, + -39048877, 96895962, -8587864, -70095341, 49508886, 69912387, + 24311011, -58758419, 63228782, -52050021, 24687766, 34342885, + 97830395, 74658034, -9715954, -76120311, -63117710, -19312535, + 42829521, 32389638, -51273506, 16329653, -39061706, -9931233, + 42174615, 75412082, -26236331, 57741055, -17577762, 3605997, + -73993355, -54545904, -86580638, 84432898, -83573465, -1278, + 636, -9935, 9847, 8300, -5170, -2501, 6031, -6658, -9780, -5351, + 6573, -5582, -1994, -7498, -5190, 7710, -8125, -6478, 3670, 4293, + 1903, 2367, 3501, 841, -1718, -2303, -670, 9668, 8391, 3719, 1453, + 7203, -9693, 1294, -3549, -8941, -5455, 30, 2773, 8354, 7272, + -9794, -4806, -7091, -8404, 8297, -4093, -9890, -4948, -38, -66, + -12, 9, 50, -26, 4, -25, 62, 2, 47, -40, -22, -87, 75, -43, -51, + 65, 7, 
-17, -90, -27, 56, -60, 27, -2, 2, -3, 4, 7, 8, -8 + }; + + @Test + public void pack01() + throws WiredTigerPackingException { + String format = "b"; + PackOutputStream packer = new PackOutputStream(format); + packer.addByte((byte)8); + + Assert.assertEquals(format, packer.getFormat()); + byte[] packed = packer.getValue(); + + PackInputStream unpacker = new PackInputStream(format, packed); + Assert.assertEquals(unpacker.getByte(), (byte)8); + } + + @Test + public void pack02() + throws WiredTigerPackingException { + String format = "biqrhS"; + PackOutputStream packer = new PackOutputStream(format); + packer.addByte((byte)8); + packer.addInt(124); + packer.addLong(1240978); + packer.addRecord(5680234); + packer.addShort((short)8576); + packer.addString("Hello string"); + + Assert.assertEquals(format, packer.getFormat()); + byte[] packed = packer.getValue(); + + PackInputStream unpacker = new PackInputStream(format, packed); + Assert.assertEquals(unpacker.getByte(), (byte)8); + Assert.assertEquals(unpacker.getInt(), 124); + Assert.assertEquals(unpacker.getLong(), 1240978); + Assert.assertEquals(unpacker.getRecord(), 5680234); + Assert.assertEquals(unpacker.getShort(), 8576); + Assert.assertEquals(unpacker.getString(), "Hello string"); + } + + @Test + public void pack03() + throws WiredTigerPackingException { + String format = "SS"; + PackOutputStream packer = new PackOutputStream(format); + packer.addString("Hello 1"); + packer.addString("Hello 2"); + + byte[] packed = packer.getValue(); + PackInputStream unpacker = new PackInputStream(format, packed); + Assert.assertEquals(unpacker.getString(), "Hello 1"); + Assert.assertEquals(unpacker.getString(), "Hello 2"); + } + + @Test + public void pack04() + throws WiredTigerPackingException { + String format = "U"; + PackOutputStream packer = new PackOutputStream(format); + packer.addByteArray("Hello 1".getBytes()); + + byte[] packed = packer.getValue(); + PackInputStream unpacker = new PackInputStream(format, packed); + 
Assert.assertTrue(java.util.Arrays.equals( + unpacker.getByteArray(), "Hello 1".getBytes())); + } + + @Test + public void pack05() + throws WiredTigerPackingException { + String format = "uuu"; + PackOutputStream packer = new PackOutputStream(format); + packer.addByteArray("Hello 1".getBytes()); + packer.addByteArray("Hello 2".getBytes()); + packer.addByteArray("Hello 3".getBytes()); + + byte[] packed = packer.getValue(); + //printByteArray(packed, packed.length); + PackInputStream unpacker = new PackInputStream(format, packed); + Assert.assertTrue(java.util.Arrays.equals( + unpacker.getByteArray(), "Hello 1".getBytes())); + Assert.assertTrue(java.util.Arrays.equals( + unpacker.getByteArray(), "Hello 2".getBytes())); + Assert.assertTrue(java.util.Arrays.equals( + unpacker.getByteArray(), "Hello 3".getBytes())); + } + + @Test + public void pack06() + throws WiredTigerPackingException { + String format = "uiS"; + PackOutputStream packer = new PackOutputStream(format); + packer.addByteArray("Hello 1".getBytes()); + packer.addInt(12); + packer.addString("Hello 3"); + + byte[] packed = packer.getValue(); + PackInputStream unpacker = new PackInputStream(format, packed); + Assert.assertTrue(java.util.Arrays.equals( + unpacker.getByteArray(), "Hello 1".getBytes())); + Assert.assertEquals(unpacker.getInt(), 12); + Assert.assertEquals(unpacker.getString(), "Hello 3"); + } + + @Test + public void pack07() + throws WiredTigerPackingException { + String format = "4s"; + PackOutputStream packer = new PackOutputStream(format); + packer.addString("Hello 1"); + + byte[] packed = packer.getValue(); + PackInputStream unpacker = new PackInputStream(format, packed); + Assert.assertEquals(unpacker.getString(), "Hell"); + } + + @Test + public void packUnpackNumber01() + throws WiredTigerPackingException { + // Verify that we can pack and unpack single signed longs. 
+ for (long i:testers) { + PackOutputStream packer = new PackOutputStream("Q"); + packer.addLong(i); + PackInputStream unpacker = + new PackInputStream("Q", packer.getValue()); + long unpacked = unpacker.getLong(); + if (i != unpacked) + System.out.println( + i + " did not match " + unpacked); + } + } + + @Test + public void packUnpackNumber02() + throws WiredTigerPackingException { + // Verify that we can pack and unpack pairs of signed longs reliably. + // This is interesting because it ensures that we are tracking the + // number of bytes used by number packing correctly. + for (int i = 0; i + 1 < testers.length; i += 2) { + long val1 = testers[i]; + long val2 = testers[i+1]; + + PackOutputStream packer = new PackOutputStream("QQ"); + packer.addLong(val1); + packer.addLong(val2); + PackInputStream unpacker = + new PackInputStream("QQ", packer.getValue()); + long unpacked = unpacker.getLong(); + if (val1 != unpacked) { + System.out.println(i + " did not match " + unpacked); + } + unpacked = unpacker.getLong(); + if (val2 != unpacked) { + System.out.println(i + " did not match " + unpacked); + } + } + } + + // A debug helper method + private void printByteArray(byte[] bytes, int len) { + for (int i = 0; i < len; i++) { + System.out.println(String.format( + "\t%8s", Integer.toBinaryString( + bytes[i] & 0xff)).replace(' ', '0')); + } + } + + public static void main(String[] args) { + PackTest tester = new PackTest(); + try { + tester.pack01(); + tester.pack02(); + tester.packUnpackNumber01(); + } catch (WiredTigerPackingException wtpe) { + System.err.println("Packing exception: " + wtpe); + } + } +} diff --git a/test/java/com/wiredtiger/test/WiredTigerSuite.java b/test/java/com/wiredtiger/test/WiredTigerSuite.java new file mode 100644 index 00000000000..0bdb4308871 --- /dev/null +++ b/test/java/com/wiredtiger/test/WiredTigerSuite.java @@ -0,0 +1,41 @@ +/*- + * Public Domain 2008-2013 WiredTiger, Inc. 
+ * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package com.wiredtiger.test; + +import org.junit.runner.RunWith; +import org.junit.runners.Suite; + +@RunWith(Suite.class) +@Suite.SuiteClasses( { + CursorTest.class, + PackTest.class +}) + +public class WiredTigerSuite { + // the class remains empty, + // used only as a holder for the above annotations +} diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py index 929355042ba..95f8540f156 100644 --- a/test/suite/test_cursor_random.py +++ b/test/suite/test_cursor_random.py @@ -72,7 +72,7 @@ class test_cursor_random(wttest.WiredTigerTestCase): cursor.close # Check that next_random works in the presence of a larger set of values. 
- def test_cursor_random_single_record(self): + def test_cursor_random_multiple_records(self): uri = self.type + 'random' if self.type == 'file:': simple_populate( diff --git a/test/suite/test_drop_create.py b/test/suite/test_drop_create.py index cf9971d6efd..12638cc00ae 100644 --- a/test/suite/test_drop_create.py +++ b/test/suite/test_drop_create.py @@ -45,5 +45,28 @@ class test_drop_create(wttest.WiredTigerTestCase): self.assertEqual(s.create("table:test", config), 0) self.assertEqual(s.close(), 0) + def test_drop_create2(self): + s, self.session = self.session, None + self.assertEqual(s.close(), 0) + + # Test creating the same table with multiple sessions, to ensure + # that session table cache is working as expected. + s = self.conn.open_session() + s2 = self.conn.open_session() + self.assertEqual(s.drop("table:test", "force"), 0) + self.assertEqual(s.create("table:test", 'key_format=S,value_format=S,columns=(k,v)'), 0) + # Ensure the table cache for the second session knows about this table + c2 = s2.open_cursor("table:test", None, None) + c2.close() + self.assertEqual(s.drop("table:test"), 0) + # Create a table with the same name, but a different schema + self.assertEqual(s.create("table:test", 'key_format=S,value_format=l,columns=(k,v)'), 0) + c2 = s2.open_cursor("table:test", None, None) + c2.set_key("Hi") + c2.set_value(1) + c2.insert() + self.assertEqual(s.close(), 0) + self.assertEqual(s2.close(), 0) + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_reconfig.py b/test/suite/test_reconfig.py new file mode 100644 index 00000000000..a86ebecb115 --- /dev/null +++ b/test/suite/test_reconfig.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# +# Public Domain 2008-2013 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. 
+# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest + +# test_reconfig.py +# Smoke-test the connection reconfiguration operations. 
+class test_reconfig(wttest.WiredTigerTestCase): + + def test_reconfig_shared_cache(self): + self.conn.reconfigure("shared_cache=(size=300M)") + + def test_reconfig_statistics(self): + self.conn.reconfigure("statistics=true") + + def test_reconfig_verbose(self): + self.conn.reconfigure("verbose=[mutex]") + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_shared_cache.py b/test/suite/test_shared_cache.py index cb1edc46ded..7bf54432e3a 100644 --- a/test/suite/test_shared_cache.py +++ b/test/suite/test_shared_cache.py @@ -209,5 +209,15 @@ class test_shared_cache(wttest.WiredTigerTestCase): connection.reconfigure("shared_cache=(size=300M)") self.closeConnections() + # Test default config values + def test_shared_cache11(self): + nops = 1000 + self.openConnections(['WT_TEST1', 'WT_TEST2'], pool_opts=',shared_cache=()') + + for sess in self.sessions: + sess.create(self.uri, "key_format=S,value_format=S") + self.add_records(sess, 0, nops) + self.closeConnections() + if __name__ == '__main__': wttest.run() diff --git a/tools/statlog.py b/tools/statlog.py new file mode 100644 index 00000000000..ef74b2812aa --- /dev/null +++ b/tools/statlog.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# +# Public Domain 2008-2013 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. 
We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +import fileinput, os, shutil, sys +from collections import defaultdict +from subprocess import call + +# Plot a set of entries for a title. +def plot(title, entries, num): + # Ignore entries where the value never changes. + skip = 1 + v = entries[0][1] + for entry in entries: + if v != entry[1]: + skip = 0 + break + if skip == 1: + print '\tskipping ' + title + return + + print 'building ' + title + + # Write the raw data into a file for processing. + of = open("reports/report." + num + ".raw", "w") + for entry in sorted(entries): + of.write(" ".join(entry) + "\n") + of.close() + + # Write a command file for gnuplot. + of = open("gnuplot.cmd", "w") + of.write("set terminal png nocrop\n") + of.write("set autoscale\n") + of.write("set grid\n") + of.write("set style data linespoints\n") + of.write("set title \"" + title + "\"\n") + of.write("set xlabel \"Time\"\n") + of.write("set xtics rotate by -45\n") + of.write("set xdata time\n") + of.write("set timefmt \"%b %d %H:%M:%S\"\n") + of.write("set format x \"%b %d %H:%M:%S\"\n") + of.write("set ylabel \"Value\"\n") + of.write("set yrange [0:]\n") + of.write("set output 'reports/report." + num + ".png'\n") + of.write("plot \"reports/report." + num + ".raw\" using 1:4 notitle\n") + of.close() + + # Run gnuplot. + call(["gnuplot", "gnuplot.cmd"]) + + # Remove the command file. 
+ os.remove("gnuplot.cmd") + +# Remove and re-create the reports folder. +shutil.rmtree("reports", True) +os.mkdir("reports") + +# Read the input into a dictionary of lists. +if sys.argv[1:] == []: + print "usage: " + sys.argv[0] + " file ..." + sys.exit(1) +d = defaultdict(list) +for line in fileinput.input(sys.argv[1:]): + s = line.strip('\n').split(" ", 4) + d[s[4]].append([" ".join([s[0], s[1], s[2]]), s[3]]) + +# Plot each entry in the dictionary. +rno = 0 +for entry in sorted(d.iteritems()): + rno = rno + 1 + plot(entry[0], entry[1], "%03d" % rno) |