diff options
author | Dan Pasette <dan@10gen.com> | 2015-02-04 06:48:51 -0500 |
---|---|---|
committer | Dan Pasette <dan@mongodb.com> | 2015-02-04 06:55:52 -0500 |
commit | 7d9ec251cf0e70bc0f9bb246aacfb6e62226ad37 (patch) | |
tree | 5b436359112bc5610dcf9fd1e1f7331854b388d6 /src | |
parent | fc14926f9c8256edce8bbd15d439ca34667c6ebb (diff) | |
download | mongo-7d9ec251cf0e70bc0f9bb246aacfb6e62226ad37.tar.gz |
Import wiredtiger-wiredtiger-mongodb-2.8-rc7-105-g92325a3.tar.gz from wiredtiger branch mongodb-2.8
Diffstat (limited to 'src')
50 files changed, 703 insertions, 605 deletions
diff --git a/src/third_party/wiredtiger/.hgtags b/src/third_party/wiredtiger/.hgtags index 054227388cf..ad2212f950a 100644 --- a/src/third_party/wiredtiger/.hgtags +++ b/src/third_party/wiredtiger/.hgtags @@ -26,3 +26,4 @@ b5c9f28d72fe1f835d24fe427e211a539f8709fe 1.5.2 5c9acd2584f2657dec2a44fd8b54211bf9c21193 mongodb-2.8-rc5 f40795b146bd35a623ef57de5b875a817925b7c9 mongodb-2.8-rc5 c3cac38f1632e0e20d0279639bb59ff11700e692 mongodb-2.8-rc6 +9b61b870d26feafd8f8058cfffc6ac817890ce2b mongodb-2.8-rc7 diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct index b075afe696a..1c109c973c0 100644 --- a/src/third_party/wiredtiger/SConstruct +++ b/src/third_party/wiredtiger/SConstruct @@ -1,6 +1,10 @@ # -*- mode: python; -*- import re import os +import shutil +import subprocess +import sys +import tempfile import textwrap import distutils.sysconfig @@ -10,33 +14,51 @@ if not os.sys.platform == "win32": print ("SConstruct is only supported for Windows, use build_posix for other platforms") Exit(1) -AddOption("--with-berkeley-db", dest="bdb", type="string", nargs=1, action="store", - help="Berkeley DB install path, ie, /usr/local") +# Command line options +# +AddOption("--dynamic-crt", dest="dynamic-crt", action="store_true", default=False, + help="Link with the MSVCRT DLL version") -AddOption("--enable-zlib", dest="zlib", type="string", nargs=1, action="store", - help="Use zlib compression") +AddOption("--enable-attach", dest="attach", action="store_true", default=False, + help="Configure for debugger attach on failure.") + +AddOption("--enable-diagnostic", dest="diagnostic", action="store_true", default=False, + help="Configure WiredTiger to perform various run-time diagnostic tests. DO NOT configure this option in production environments.") + +AddOption("--enable-python", dest="lang-python", type="string", nargs=1, action="store", + help="Build Python extension, specify location of swig.exe binary") AddOption("--enable-snappy", dest="snappy", type="string", nargs=1, action="store", help="Use snappy compression") -AddOption("--enable-swig", dest="swig", type="string", nargs=1, action="store", - help="Build python extension, specify location of swig.exe binary") +AddOption("--enable-verbose", dest="verbose", action="store_true", default=False, + help="Configure WiredTiger to support the verbose configuration string to wiredtiger_open") -AddOption("--dynamic-crt", dest="dynamic-crt", action="store_true", default=False, - help="Link with the MSVCRT DLL version") +AddOption("--enable-zlib", dest="zlib", type="string", nargs=1, action="store", + help="Use zlib compression") +AddOption("--prefix", dest="prefix", type="string", nargs=1, action="store", default="package", + help="Install directory") + +AddOption("--with-berkeley-db", dest="bdb", type="string", nargs=1, action="store", + help="Berkeley DB install path, ie, /usr/local") + +# Get the swig binary from the command line option since SCONS cannot find it automatically +# +swig_binary = GetOption("lang-python") + +# Initialize environment +# env = Environment( CPPPATH = ["#/src/include/", "#/build_win", "#/test/windows", "#/.", - distutils.sysconfig.get_python_inc() ], - #CPPDEFINES = ["HAVE_DIAGNOSTIC", "HAVE_VERBOSE"], CFLAGS = [ "/Z7", # Generate debugging symbols "/wd4090", # Ignore warning about mismatched const qualifiers - "/wd4996", + "/wd4996", "/W3", # Warning level 3 "/we4013", # Error on undefined functions "/TC", # Compile as C code @@ -59,17 +81,12 @@ env = Environment( "/DYNAMICBASE", "/NXCOMPAT", ], - LIBPATH=[ distutils.sysconfig.PREFIX + r"\libs"], tools=["default", "swig", "textfile"], - SWIGFLAGS=['-python', - "-threads", - "-O", - "-nodefaultctor", - "-nodefaultdtor" - ], - SWIG=GetOption("swig") + SWIG=swig_binary ) +env['STATIC_AND_SHARED_OBJECTS_ARE_THE_SAME'] = 1 + useZlib = GetOption("zlib") useSnappy = GetOption("snappy") useBdb = GetOption("bdb") @@ -109,7 +126,24 @@ if useBdb: env = conf.Finish() +# Configure build environment variables +# +if GetOption("attach"): + env.Append(CPPDEFINES = ["HAVE_ATTACH"]) + +if GetOption("diagnostic"): + env.Append(CPPDEFINES = ["HAVE_DIAGNOSTIC"]) + +if GetOption("lang-python"): + env.Append(LIBPATH=[distutils.sysconfig.PREFIX + r"\libs"]) + env.Append(CPPPATH=[distutils.sysconfig.get_python_inc()]) +if GetOption("verbose"): + env.Append(CPPDEFINES = ["HAVE_VERBOSE"]) + + +# Build WiredTiger.h file +# version_file = 'build_posix/aclocal/version-set.m4' VERSION_MAJOR = None @@ -147,7 +181,7 @@ replacements = { '@wiredtiger_includes_decl@': wiredtiger_includes } -env.Substfile( +wtheader = env.Substfile( target='wiredtiger.h', source=[ 'src/include/wiredtiger.in', @@ -170,11 +204,27 @@ if useZlib: if useSnappy: wtsources.append("ext/compressors/snappy/snappy_compress.c") -wtlib = env.Library("wiredtiger", wtsources) +wt_objs = [env.Object(a) for a in wtsources] + +# Static Library - libwiredtiger.lib +# +wtlib = env.Library( + target="libwiredtiger", + source=wt_objs, LIBS=wtlibs) env.Depends(wtlib, [filelistfile, version_file]) -env.Program("wt", [ +# Dynamically Loaded Library - wiredtiger.dll +# +wtdll = env.SharedLibrary( + target="wiredtiger", + source=wt_objs + ['build_win/wiredtiger.def'], LIBS=wtlibs) + +env.Depends(wtdll, [filelistfile, version_file]) + +Default(wtlib, wtdll) + +wtbin = env.Program("wt", [ "src/utilities/util_backup.c", "src/utilities/util_cpyright.c", "src/utilities/util_compact.c", @@ -198,27 +248,48 @@ env.Program("wt", [ "src/utilities/util_write.c"], LIBS=[wtlib] + wtlibs) -if GetOption("swig"): - swiglib = env.SharedLibrary('_wiredtiger', +Default(wtbin) + +# Python SWIG wrapper for WiredTiger +if GetOption("lang-python"): + # Check that this version of python is 64-bit + # + if sys.maxsize < 2**32: + print "The Python Interpreter must be 64-bit in order to build the python bindings" + Exit(1) + + pythonEnv = env.Clone() + pythonEnv.Append(SWIGFLAGS=[ + "-python", + "-threads", + "-O", + "-nodefaultctor", + "-nodefaultdtor", + ]) + + swiglib = pythonEnv.SharedLibrary('_wiredtiger', [ 'lang\python\wiredtiger.i'], SHLIBSUFFIX=".pyd", - LIBS=[wtlib]) + LIBS=[wtlib] + wtlibs) - copySwig = env.Command( + copySwig = pythonEnv.Command( 'lang/python/wiredtiger/__init__.py', 'lang/python/wiredtiger.py', Copy('$TARGET', '$SOURCE')) - env.Depends(copySwig, swiglib) + pythonEnv.Depends(copySwig, swiglib) - env.Install('lang/python/wiredtiger/', swiglib) + swiginstall = pythonEnv.Install('lang/python/wiredtiger/', swiglib) + + Default(swiginstall, copySwig) # Shim library of functions to emulate POSIX on Windows shim = env.Library("window_shim", ["test/windows/windows_shim.c"]) -env.Program("t_bloom", +t = env.Program("t_bloom", "test/bloom/test_bloom.c", - LIBS=[wtlib]) + LIBS=[wtlib] + wtlibs) +Default(t) #env.Program("t_checkpoint", #["test/checkpoint/checkpointer.c", @@ -226,9 +297,10 @@ env.Program("t_bloom", #"test/checkpoint/workers.c"], #LIBS=[wtlib]) -env.Program("t_huge", +t = env.Program("t_huge", "test/huge/huge.c", - LIBS=[wtlib]) + LIBS=[wtlib] + wtlibs) +Default(t) #env.Program("t_fops", #["test/fops/file.c", @@ -241,7 +313,7 @@ if useBdb: benv.Append(CPPDEFINES=['BERKELEY_DB_PATH=\\"' + useBdb.replace("\\", "\\\\") + '\\"']) - benv.Program("t_format", + t = benv.Program("t_format", ["test/format/backup.c", "test/format/bdb.c", "test/format/bulk.c", @@ -252,7 +324,8 @@ if useBdb: "test/format/t.c", "test/format/util.c", "test/format/wts.c"], - LIBS=[wtlib, shim, "libdb61"]) + LIBS=[wtlib, shim, "libdb61"] + wtlibs) + Default(t) #env.Program("t_thread", #["test/thread/file.c", @@ -265,13 +338,14 @@ if useBdb: #["test/salvage/salvage.c"], #LIBS=[wtlib]) -env.Program("wtperf", [ +t = env.Program("wtperf", [ "bench/wtperf/config.c", "bench/wtperf/misc.c", "bench/wtperf/track.c", "bench/wtperf/wtperf.c", ], - LIBS=[wtlib, shim] ) + LIBS=[wtlib, shim] + wtlibs) +Default(t) examples = [ "ex_access", @@ -293,9 +367,52 @@ examples = [ "ex_thread", ] +# WiredTiger Smoke Test suppor +# Runs each test in a custom temporary directory +# +def run_smoke_test(x): + print "Running Smoke Test: " + x + + # Make temp dir + temp_dir = tempfile.mkdtemp(prefix="wt_home") + + try: + # Set WT_HOME environment variable for test + os.environ["WIREDTIGER_HOME"] = temp_dir + + # Run the test + ret = subprocess.call(x); + if( ret != 0): + sys.stderr.write("Bad exit code %d\n" % (ret)) + raise Exception() + + finally: + # Clean directory + # + shutil.rmtree(temp_dir) + +def builder_smoke_test(target, source, env): + run_smoke_test(source[0].abspath) + return None + +env.Append(BUILDERS={'SmokeTest' : Builder(action = builder_smoke_test)}) + for ex in examples: - if(ex in ['ex_async', 'ex_thread']): - env.Program(ex, "examples/c/" + ex + ".c", LIBS=[wtlib, shim]) + if(ex in ['ex_all', 'ex_async', 'ex_thread']): + exp = env.Program(ex, "examples/c/" + ex + ".c", LIBS=[wtlib, shim] + wtlibs) + Default(exp) + env.Alias("check", env.SmokeTest(exp)) else: - env.Program(ex, "examples/c/" + ex + ".c", LIBS=[wtlib]) + exp = env.Program(ex, "examples/c/" + ex + ".c", LIBS=[wtdll[1]] + wtlibs) + Default(exp) + if not ex == 'ex_log': + env.Alias("check", env.SmokeTest(exp)) +# Install Target +# +prefix = GetOption("prefix") +env.Alias("install", env.Install(os.path.join(prefix, "bin"), wtbin)) +env.Alias("install", env.Install(os.path.join(prefix, "bin"), wtdll[0])) # Just the dll +env.Alias("install", env.Install(os.path.join(prefix, "include"), wtheader)) +env.Alias("install", env.Install(os.path.join(prefix, "lib"), wtdll[1])) # Just the import lib +env.Alias("install", env.Install(os.path.join(prefix, "lib"), wtlib)) diff --git a/src/third_party/wiredtiger/build_win/wiredtiger.def b/src/third_party/wiredtiger/build_win/wiredtiger.def new file mode 100644 index 00000000000..02884e4fd65 --- /dev/null +++ b/src/third_party/wiredtiger/build_win/wiredtiger.def @@ -0,0 +1,21 @@ +LIBRARY WIREDTIGER +EXPORTS + wiredtiger_config_parser_open + wiredtiger_open + wiredtiger_pack_close + wiredtiger_pack_int + wiredtiger_pack_item + wiredtiger_pack_start + wiredtiger_pack_str + wiredtiger_pack_uint + wiredtiger_strerror + wiredtiger_strerror_r + wiredtiger_struct_pack + wiredtiger_struct_size + wiredtiger_struct_unpack + wiredtiger_unpack_int + wiredtiger_unpack_item + wiredtiger_unpack_start + wiredtiger_unpack_str + wiredtiger_unpack_uint + wiredtiger_version diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 7754a3a1d13..65af833c4a2 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -241,6 +241,12 @@ file_config = format_meta + [ minimum gain before prefix compression will be used on row-store leaf pages''', min=0), + Config('split_deepen_min_child', '0', r''' + minimum entries in a page to consider deepening the tree''', + type='int', undoc=True), + Config('split_deepen_per_child', '0', r''' + entries allocated per child when deepening the tree''', + type='int', undoc=True), Config('split_pct', '75', r''' the Btree page split size as a percentage of the maximum Btree page size, that is, when a Btree page is split, it will be diff --git a/src/third_party/wiredtiger/dist/s_all b/src/third_party/wiredtiger/dist/s_all index b63ed314453..96e69ddff04 100644 --- a/src/third_party/wiredtiger/dist/s_all +++ b/src/third_party/wiredtiger/dist/s_all @@ -66,6 +66,7 @@ run "sh ./s_tags" "building tags files" run "sh ./s_copyright" "checking copyright notices" run "sh ./s_define" "checking for unused #defines" +run "sh ./s_export" "checking external symbol names" run "sh ./s_funcs" "checking for unused functions" run "sh ./s_getopt" "checking for incorrect getopt usage" run "sh ./s_lang" "checking for SWIG generated name conflicts" @@ -74,7 +75,6 @@ run "sh ./s_stat" "checking for unused statistics fields" run "sh ./s_string" "checking string spelling" run "python style.py" "checking style (pass 1)" run "sh ./s_style" "checking style (pass 2)" -run "sh ./s_symbols" "checking external symbol names" run "sh ./s_typedef -c" "checking for unused typedefs" run "sh ./s_whitespace" "checking whitespace" run "sh ./s_win" "checking windows config" diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 7803b628917..91fbc971afa 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -34,11 +34,14 @@ WT_ATOMIC_SUB2 WT_BARRIER WT_BLOCK_DESC_SIZE WT_CACHE_LINE_ALIGNMENT +WT_COMPILER_TYPE_ALIGN WT_CONN_CHECK_PANIC WT_DEADLOCK WT_DEBUG_BYTE WT_HANDLE_CLOSED WT_HANDLE_NULLABLE +WT_PACKED_STRUCT_BEGIN +WT_PACKED_STRUCT_END WT_READ_BARRIER WT_REF_SIZE WT_SPINLOCK_MAX diff --git a/src/third_party/wiredtiger/dist/s_symbols b/src/third_party/wiredtiger/dist/s_export index e590ab6f62c..1212b5b2c1f 100644 --- a/src/third_party/wiredtiger/dist/s_symbols +++ b/src/third_party/wiredtiger/dist/s_export @@ -23,7 +23,7 @@ esac check() { - (sed -e '/^#/d' s_symbols.list && + (sed -e '/^#/d' s_export.list && eval $NM | sed 's/.* //' | egrep -v '^__wt') | diff --git a/src/third_party/wiredtiger/dist/s_symbols.list b/src/third_party/wiredtiger/dist/s_export.list index 8f469e94433..8f469e94433 100644 --- a/src/third_party/wiredtiger/dist/s_symbols.list +++ b/src/third_party/wiredtiger/dist/s_export.list diff --git a/src/third_party/wiredtiger/dist/s_prototypes b/src/third_party/wiredtiger/dist/s_prototypes index f29b96a1f55..603c0f5633d 100755 --- a/src/third_party/wiredtiger/dist/s_prototypes +++ b/src/third_party/wiredtiger/dist/s_prototypes @@ -31,7 +31,7 @@ for i in `sed -e '/^[a-z]/!d' filelist`; do -e 's/\* /\*/g' \ -e 's/ */ /g' \ -e 's/^/extern /' \ - -e 's/WT_GCC_FUNC_/WT_GCC_/' \ + -e 's/WT_GCC_FUNC_/WT_GCC_FUNC_DECL_/' \ -e 's/$/;/p' \ < ../$i done) > $t diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index cea96db2848..1658684313c 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -881,6 +881,7 @@ poptable pos posint posix +pragmas pre prealloc preload diff --git a/src/third_party/wiredtiger/dist/s_typedef b/src/third_party/wiredtiger/dist/s_typedef index 6b230223baa..2e206757f48 100644..100755 --- a/src/third_party/wiredtiger/dist/s_typedef +++ b/src/third_party/wiredtiger/dist/s_typedef @@ -20,8 +20,12 @@ build() { l=`ls ../src/include/*.[hi] ../src/include/*.in | sed -e '/wiredtiger.*/d' -e '/queue.h/d'` - egrep -h '^[ ]*(struct|union)[ ]*__.*[ ]*{' $l | \ - sed -e 's/^[ ]*//' -e 's/[ ]*{.*//' | sort | \ + egrep -h \ + '^[ ]*(((struct|union)[ ].*__wt_.*{)|WT_PACKED_STRUCT_BEGIN)' \ + $l | + sed -e 's/WT_PACKED_STRUCT_BEGIN(\(.*\))/struct \1 {/' \ + -e 's/WT_COMPILER_TYPE_ALIGN(.*)[ ]*//' \ + -e 's/^[ ]*//' -e 's/[ ]*{.*//' | sort | \ while read t n; do upper=`echo $n | sed -e 's/^__//' | tr [a-z] [A-Z]` echo "$t $n;" diff --git a/src/third_party/wiredtiger/dist/s_win b/src/third_party/wiredtiger/dist/s_win index 187de91e498..cdfc71a8a1e 100755 --- a/src/third_party/wiredtiger/dist/s_win +++ b/src/third_party/wiredtiger/dist/s_win @@ -26,6 +26,19 @@ win_config() } } +win_export() +{ + # Build the Windows list of exported symbols. + f='../build_win/wiredtiger.def' + (echo 'LIBRARY WIREDTIGER' + echo 'EXPORTS' + sed -e '/^$/d' \ + -e '/^#/d' \ + -e 's/^/ /') < s_export.list > $t + cmp $t $f > /dev/null 2>&1 || + (echo "Building $f" && rm -f $f && cp $t $f) +} + win_filelist() { f='../build_win/filelist.win' @@ -65,6 +78,7 @@ win_filelist() } win_config +win_export win_filelist exit 0 diff --git a/src/third_party/wiredtiger/dist/serial.py b/src/third_party/wiredtiger/dist/serial.py deleted file mode 100644 index ddadbbdb6be..00000000000 --- a/src/third_party/wiredtiger/dist/serial.py +++ /dev/null @@ -1,189 +0,0 @@ -# Output serialization functions. - -import textwrap -from dist import compare_srcfile - -class SerialArg: - def __init__(self, typestr, name, sized=0): - self.typestr = typestr - self.name = name - self.sized = sized - -class Serial: - def __init__(self, name, args): - self.name = name - self.args = args - -msgtypes = [ -Serial('col_append', [ - SerialArg('WT_INSERT_HEAD *', 'ins_head'), - SerialArg('WT_INSERT ***', 'ins_stack'), - SerialArg('WT_INSERT *', 'new_ins', 1), - SerialArg('uint64_t *', 'recnop'), - SerialArg('u_int', 'skipdepth'), - ]), - -Serial('insert', [ - SerialArg('WT_INSERT_HEAD *', 'ins_head'), - SerialArg('WT_INSERT ***', 'ins_stack'), - SerialArg('WT_INSERT *', 'new_ins', 1), - SerialArg('u_int', 'skipdepth'), - ]), - -Serial('update', [ - SerialArg('WT_UPDATE **', 'srch_upd'), - SerialArg('WT_UPDATE *', 'upd', 1), - ]), -] - -# decl -- -# Return a declaration for the variable. -def decl(l): - o = l.typestr - if o[-1] != '*': - o += ' ' - return o + l.name - -# decl_p -- -# Return a declaration for a reference to the variable, which requires -# another level of indirection. -def decl_p(l): - o = l.typestr - if o[-1] != '*': - o += ' ' - return o + '*' + l.name + 'p' - -# output -- -# Create serialized function calls. -def output(entry, f): - # Function declaration. - f.write('static inline int\n__wt_' + entry.name + '_serial(\n') - o = 'WT_SESSION_IMPL *session, WT_PAGE *page' - for l in entry.args: - if l.sized: - o += ', ' + decl_p(l) + ', size_t ' + l.name + '_size' - else: - o += ', ' + decl(l) - o += ')' - f.write('\n'.join('\t' + l for l in textwrap.wrap(o, 70))) - f.write('\n{') - - # Local variable declarations. - for l in entry.args: - if l.sized: - f.write(''' -\t''' + decl(l) + ''' = *''' + l.name + '''p; -\tWT_DECL_RET; -\tsize_t incr_mem; -''') - - # Clear memory references we now own. - for l in entry.args: - if l.sized: - f.write(''' -\t/* Clear references to memory we now own. */ -\t*''' + l.name + '''p = NULL; -''') - - # Check the page write generation hasn't wrapped. - f.write(''' -\t/* -\t * Check to see if the page's write generation is about to wrap (wildly -\t * unlikely as it implies 4B updates between clean page reconciliations, -\t * but technically possible), and fail the update. -\t * -\t * The check is outside of the serialization mutex because the page's -\t * write generation is going to be a hot cache line, so technically it's -\t * possible for the page's write generation to wrap between the test and -\t * our subsequent modification of it. However, the test is (4B-1M), and -\t * there cannot be a million threads that have done the test but not yet -\t * completed their modification. -\t */ -\t WT_RET(__page_write_gen_wrapped_check(page)); -''') - - # Call the worker function. - if entry.name != "update": - f.write(''' -\t/* Acquire the page's spinlock, call the worker function. */ -\tWT_PAGE_LOCK(session, page);''') - - f.write(''' -\tret = __''' + entry.name + '''_serial_func( -''') - o = 'session' - if entry.name == "update": - o += ', page' - for l in entry.args: - o += ', ' + l.name - o += ');' - f.write('\n'.join('\t ' + l for l in textwrap.wrap(o, 70))) - - if entry.name != "update": - f.write(''' -\tWT_PAGE_UNLOCK(session, page);''') - - f.write(''' - -\t/* Free unused memory on error. */ -\tif (ret != 0) { -''') - for l in entry.args: - if not l.sized: - continue - f.write( - '\t\t__wt_free(session, ' + l.name + ');\n') - f.write(''' -\t\treturn (ret); -\t} -''') - - f.write(''' -\t/* -\t * Increment in-memory footprint after releasing the mutex: that's safe -\t * because the structures we added cannot be discarded while visible to -\t * any running transaction, and we're a running transaction, which means -\t * there can be no corresponding delete until we complete. -\t */ -\tincr_mem = 0; -''') - for l in entry.args: - if not l.sized: - continue - f.write('\tWT_ASSERT(session, ' + - l.name + '_size != 0);\n') - f.write('\tincr_mem += ' + l.name + '_size;\n') - f.write('''\tif (incr_mem != 0) -\t\t__wt_cache_page_inmem_incr(session, page, incr_mem); - -\t/* Mark the page dirty after updating the footprint. */ -\t__wt_page_modify_set(session, page); - -\treturn (0); -} - -''') - -##################################################################### -# Update serial.i. -##################################################################### -tmp_file = '__tmp' -tfile = open(tmp_file, 'w') -skip = 0 -for line in open('../src/include/serial.i', 'r'): - if not skip: - tfile.write(line) - if line.count('Serialization function section: END'): - tfile.write(line) - skip = 0 - elif line.count('Serialization function section: BEGIN'): - tfile.write(' */\n\n') - skip = 1 - - for entry in msgtypes: - output(entry, tfile) - - tfile.write('/*\n') - -tfile.close() -compare_srcfile(tmp_file, '../src/include/serial.i') diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 704b258a7dd..1960e4605ef 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -782,7 +782,9 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_RET(__cursor_func_init(cbt, 1)); - WT_ERR(__wt_row_random(session, cbt)); + WT_WITH_PAGE_INDEX(session, + ret = __wt_row_random(session, cbt)); + WT_ERR(ret); if (__cursor_valid(cbt, &upd)) WT_ERR(__wt_kv_return(session, cbt, upd)); else @@ -948,7 +950,11 @@ __cursor_truncate(WT_SESSION_IMPL *session, } else { do { WT_RET(__wt_btcur_remove(start)); - for (;;) { + /* + * Reset ret each time through so that we don't loop + * forever in the cursor equals case. + */ + for (ret = 0;;) { if (stop != NULL && __cursor_equals(start, stop)) break; @@ -1009,7 +1015,11 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session, } else { do { WT_RET(__wt_btcur_remove(start)); - for (;;) { + /* + * Reset ret each time through so that we don't loop + * forever in the cursor equals case. + */ + for (ret = 0;;) { if (stop != NULL && __cursor_equals(start, stop)) break; diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index af9f6a669f2..e84a63695f9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -48,7 +48,7 @@ static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *); static int __debug_tree(WT_SESSION_IMPL *, WT_PAGE *, const char *, uint32_t); static void __debug_update(WT_DBG *, WT_UPDATE *, int); static void __dmsg(WT_DBG *, const char *, ...) - WT_GCC_ATTRIBUTE((format (printf, 2, 3))); + WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))); static void __dmsg_wrapup(WT_DBG *); /* @@ -548,7 +548,9 @@ __debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags) session = ds->session; /* Dump the page metadata. */ - WT_RET(__debug_page_metadata(ds, page)); + WT_WITH_PAGE_INDEX(session, + ret = __debug_page_metadata(ds, page)); + WT_RET(ret); /* Dump the page. */ switch (page->type) { diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index f0414c4e855..b47c9c897a6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -12,7 +12,7 @@ static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt); static int __btree_get_last_recno(WT_SESSION_IMPL *); static int __btree_page_sizes(WT_SESSION_IMPL *); static int __btree_preload(WT_SESSION_IMPL *); -static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int); +static int __btree_tree_open_empty(WT_SESSION_IMPL *, int); /* * __wt_btree_open -- @@ -100,8 +100,7 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) ckpt.raw.data, ckpt.raw.size, root_addr, &root_addr_size, readonly)); if (creation || root_addr_size == 0) - WT_ERR(__btree_tree_open_empty( - session, creation, readonly)); + WT_ERR(__btree_tree_open_empty(session, creation)); else { WT_ERR(__wt_btree_tree_open( session, root_addr, root_addr_size)); @@ -391,16 +390,17 @@ err: __wt_buf_free(session, &dsk); * Create an empty in-memory tree. */ static int -__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly) +__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) { WT_BTREE *btree; WT_DECL_RET; - WT_PAGE *root, *leaf; + WT_PAGE *leaf, *root; WT_PAGE_INDEX *pindex; WT_REF *ref; btree = S2BT(session); root = leaf = NULL; + ref = NULL; /* * Newly created objects can be used for cursor inserts or for bulk @@ -414,13 +414,10 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly) } /* - * A note about empty trees: the initial tree is a root page and a leaf - * page. We need a pair of pages instead of just a single page because - * we can reconcile the leaf page while the root stays pinned in memory. - * If the pair is evicted without being modified, that's OK, nothing is - * ever written. - * - * Create the root and leaf pages. + * A note about empty trees: the initial tree is a single root page. + * It has a single reference to a leaf page, marked deleted. The leaf + * page will be created by the first update. If the root is evicted + * without being modified, that's OK, nothing is ever written. * * !!! * Be cautious about changing the order of updates in this code: to call @@ -437,10 +434,9 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly) pindex = WT_INTL_INDEX_COPY(root); ref = pindex->index[0]; ref->home = root; - WT_ERR(__wt_btree_new_leaf_page(session, &leaf)); - ref->page = leaf; + ref->page = NULL; ref->addr = NULL; - ref->state = WT_REF_MEM; + ref->state = WT_REF_DELETED; ref->key.recno = 1; break; case BTREE_ROW: @@ -451,48 +447,20 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly) pindex = WT_INTL_INDEX_COPY(root); ref = pindex->index[0]; ref->home = root; - WT_ERR(__wt_btree_new_leaf_page(session, &leaf)); - ref->page = leaf; + ref->page = NULL; ref->addr = NULL; - ref->state = WT_REF_MEM; + ref->state = WT_REF_DELETED; WT_ERR(__wt_row_ikey_incr( session, root, 0, "", 1, &ref->key.ikey)); break; WT_ILLEGAL_VALUE_ERR(session); } - /* - * Mark the leaf page dirty: we didn't create an entirely valid root - * page (specifically, the root page's disk address isn't set, and it's - * the act of reconciling the leaf page that makes it work, we don't - * try and use the original disk address of modified pages). We could - * get around that by leaving the leaf page clean and building a better - * root page, but then we get into trouble because a checkpoint marks - * the root page dirty to force a write, and without reconciling the - * leaf page we won't realize there's no records to write, we'll write - * a root page, which isn't correct for an empty tree. - * - * Earlier versions of this code kept the leaf page clean, but with the - * "empty" flag set in the leaf page's modification structure; in that - * case, checkpoints works (forced reconciliation of a root with a - * single "empty" page wouldn't write any blocks). That version had - * memory leaks because the eviction code didn't correctly handle pages - * that were "clean" (and so never reconciled), yet "modified" with an - * "empty" flag. The goal of this code is to mimic a real tree that - * simply has no records, for whatever reason, and trust reconciliation - * to figure out it's empty and not write any blocks. - * - * We do not set the tree's modified flag because the checkpoint code - * skips unmodified files in closing checkpoints (checkpoints that - * don't require a write unless the file is actually dirty). There's - * no need to reconcile this file unless the application does a real - * checkpoint or it's actually modified. - * - * Only do this for a live tree, not for checkpoints. If we open an - * empty checkpoint, the leaf page cannot be dirty or eviction may try - * to write it, which will fail because checkpoints are read-only. - */ - if (!readonly) { + /* Bulk loads require a leaf page for reconciliation: create it now. */ + if (F_ISSET(btree, WT_BTREE_BULK)) { + WT_ERR(__wt_btree_new_leaf_page(session, &leaf)); + ref->page = leaf; + ref->state = WT_REF_MEM; WT_ERR(__wt_page_modify_init(session, leaf)); __wt_page_only_modify_set(session, leaf); } @@ -676,6 +644,22 @@ __btree_page_sizes(WT_SESSION_IMPL *session) leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage); /* + * In-memory split configuration. + */ + if (__wt_config_gets( + session, cfg, "split_deepen_min_child", &cval) == WT_NOTFOUND || + cval.val == 0) + btree->split_deepen_min_child = WT_SPLIT_DEEPEN_MIN_CHILD_DEF; + else + btree->split_deepen_min_child = (u_int)cval.val; + if (__wt_config_gets( + session, cfg, "split_deepen_per_child", &cval) == WT_NOTFOUND || + cval.val == 0) + btree->split_deepen_per_child = WT_SPLIT_DEEPEN_PER_CHILD_DEF; + else + btree->split_deepen_per_child = (u_int)cval.val; + + /* * Get the maximum internal/leaf page key/value sizes. * * In historic versions of WiredTiger, the maximum internal/leaf page diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c index c1cf3431c3b..c31b3f2fdf1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_huffman.c +++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c @@ -128,6 +128,30 @@ static const struct __wt_huffman_table __wt_huffman_nytenglish[] = { static int __wt_huffman_read(WT_SESSION_IMPL *, WT_CONFIG_ITEM *, struct __wt_huffman_table **, u_int *, u_int *); +#define WT_HUFFMAN_CONFIG_VALID(str, len) \ + (WT_STRING_CASE_MATCH("english", (str), (len)) || \ + WT_PREFIX_MATCH((str), "utf8") || WT_PREFIX_MATCH((str), "utf16")) + +/* + * __btree_huffman_config -- + * Verify the key or value strings passed in. + */ +static int +__btree_huffman_config(WT_SESSION_IMPL *session, + WT_CONFIG_ITEM *key_conf, WT_CONFIG_ITEM *value_conf) +{ + if (key_conf->len != 0 && + !WT_HUFFMAN_CONFIG_VALID(key_conf->str, key_conf->len)) + WT_RET_MSG( + session, EINVAL, "illegal Huffman key configuration"); + if (value_conf->len != 0 && + !WT_HUFFMAN_CONFIG_VALID(value_conf->str, value_conf->len)) + WT_RET_MSG( + session, EINVAL, "illegal Huffman value configuration"); + return (0); + +} + /* * __wt_btree_huffman_open -- * Configure Huffman encoding for the tree. @@ -150,6 +174,7 @@ __wt_btree_huffman_open(WT_SESSION_IMPL *session) __wt_config_gets_none(session, cfg, "huffman_value", &value_conf)); if (key_conf.len == 0 && value_conf.len == 0) return (0); + WT_RET(__btree_huffman_config(session, &key_conf, &value_conf)); switch (btree->type) { /* Check file type compatibility. */ case BTREE_COL_FIX: @@ -311,6 +336,8 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip, tp->frequency = (uint32_t)frequency; } + if (ret == EOF) + ret = 0; *entriesp = lineno - 1; *tablep = table; diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 561e1c19218..1e539b7caee 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -131,8 +131,8 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; - if ((ret = __wt_page_release_busy( - session, ref, flags)) == EBUSY) { + ret = __wt_page_release_evict(session, ref); + if (ret == EBUSY) { /* If forced eviction fails, stall. */ ret = 0; wait_cnt += 1000; @@ -285,6 +285,7 @@ err: if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) { /* Increment the cache statistics. */ __wt_cache_page_inmem_incr(session, page, size); + (void)WT_ATOMIC_ADD8(cache->bytes_read, size); (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1); *pagep = page; diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index fbc3890f23b..1cf616a2f6b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -294,12 +294,16 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) switch (ss->page_type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: - WT_ERR( - __slvg_col_build_internal(session, leaf_cnt, ss)); + WT_WITH_PAGE_INDEX(session, + ret = __slvg_col_build_internal( + session, leaf_cnt, ss)); + WT_ERR(ret); break; case WT_PAGE_ROW_LEAF: - WT_ERR( - __slvg_row_build_internal(session, leaf_cnt, ss)); + WT_WITH_PAGE_INDEX(session, + ret = __slvg_row_build_internal( + session, leaf_cnt, ss)); + WT_ERR(ret); break; } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index e4fe51ea28f..05af1a2f885 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -8,17 +8,10 @@ #include "wt_internal.h" -/* - * Track allocation increments, matching the cache calculations, which add an - * estimate of allocation overhead to every object. - */ -#define WT_MEMSIZE_ADD(total, len) do { \ - total += (len); \ -} while (0) -#define WT_MEMSIZE_TRANSFER(from_decr, to_incr, len) do { \ +#define WT_MEM_TRANSFER(from_decr, to_incr, len) do { \ size_t __len = (len); \ - WT_MEMSIZE_ADD(from_decr, __len); \ - WT_MEMSIZE_ADD(to_incr, __len); \ + from_decr += __len; \ + to_incr += __len; \ } while (0) /* @@ -49,7 +42,8 @@ __split_oldest_gen(WT_SESSION_IMPL *session) * Add a new entry into the session's split stash list. */ static int -__split_stash_add(WT_SESSION_IMPL *session, void *p, size_t len) +__split_stash_add( + WT_SESSION_IMPL *session, uint64_t split_gen, void *p, size_t len) { WT_SPLIT_STASH *stash; @@ -60,7 +54,7 @@ __split_stash_add(WT_SESSION_IMPL *session, void *p, size_t len) session->split_stash_cnt + 1, &session->split_stash)); stash = session->split_stash + session->split_stash_cnt++; - stash->split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1); + stash->split_gen = split_gen; stash->p = p; stash->len = len; @@ -150,14 +144,14 @@ __wt_split_stash_discard_all( * it to be freed otherwise. */ static int -__split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s) +__split_safe_free(WT_SESSION_IMPL *session, + uint64_t split_gen, int exclusive, void *p, size_t s) { /* * We have swapped something in a page: if we don't have exclusive * access, check whether there are other threads in the same tree. */ - if (!exclusive && - __split_oldest_gen(session) == S2C(session)->split_gen + 1) + if (!exclusive && __split_oldest_gen(session) > split_gen) exclusive = 1; if (exclusive) { @@ -165,17 +159,10 @@ __split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s) return (0); } - return (__split_stash_add(session, p, s)); + return (__split_stash_add(session, split_gen, p, s)); } /* - * Tuning; global variables to allow the binary to be patched, we don't yet have - * any real understanding of what might be useful to surface to applications. - */ -static u_int __split_deepen_min_child = 10000; -static u_int __split_deepen_per_child = 100; - -/* * __split_should_deepen -- * Return if we should deepen the tree. */ @@ -183,11 +170,13 @@ static int __split_should_deepen( WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *childrenp) { - WT_PAGE_INDEX *pindex; + WT_BTREE *btree; WT_PAGE *page; + WT_PAGE_INDEX *pindex; *childrenp = 0; + btree = S2BT(session); page = ref->page; pindex = WT_INTL_INDEX_COPY(page); @@ -204,8 +193,8 @@ __split_should_deepen( * we get a significant payback (in the case of a set of large keys, * splitting won't help). */ - if (pindex->entries > __split_deepen_min_child) { - *childrenp = pindex->entries / __split_deepen_per_child; + if (pindex->entries > btree->split_deepen_min_child) { + *childrenp = pindex->entries / btree->split_deepen_per_child; return (1); } @@ -296,10 +285,9 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, ref->key.ikey = ikey; } else { WT_RET(__split_ovfl_key_cleanup(session, parent, ref)); - WT_MEMSIZE_ADD(*parent_decrp, - sizeof(WT_IKEY) + ikey->size); + *parent_decrp += sizeof(WT_IKEY) + ikey->size; } - WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_IKEY) + ikey->size); + *child_incrp += sizeof(WT_IKEY) + ikey->size; } /* @@ -323,7 +311,7 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, } /* And finally, the WT_REF itself. */ - WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF)); + WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF)); return (0); } @@ -393,6 +381,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) WT_REF **alloc_refp; WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref; size_t child_incr, parent_decr, parent_incr, size; + uint64_t split_gen; uint32_t chunk, i, j, remain, slots; int panic; void *p; @@ -432,7 +421,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) size = sizeof(WT_PAGE_INDEX) + (children + SPLIT_CORRECT_2) * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); - WT_MEMSIZE_ADD(parent_incr, size); + parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = children + SPLIT_CORRECT_2; alloc_index->index[0] = pindex->index[0]; @@ -441,7 +430,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1, i = 0; i < children; ++alloc_refp, ++i) { WT_ERR(__wt_calloc_one(session, alloc_refp)); - WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF)); + parent_incr += sizeof(WT_REF); } /* Allocate child pages, and connect them into the new page index. */ @@ -466,7 +455,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) __wt_ref_key(parent, *parent_refp, &p, &size); WT_ERR( __wt_row_ikey(session, 0, p, size, &ref->key.ikey)); - WT_MEMSIZE_ADD(parent_incr, sizeof(WT_IKEY) + size); + parent_incr += sizeof(WT_IKEY) + size; } else ref->key.recno = (*parent_refp)->key.recno; ref->state = WT_REF_MEM; @@ -527,6 +516,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) * needs to be paid. */ WT_INTL_INDEX_SET(parent, alloc_index); + split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1); panic = 1; #ifdef HAVE_DIAGNOSTIC @@ -596,31 +586,14 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) * be using the new index. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_ERR(__split_safe_free(session, 0, pindex, size)); - WT_MEMSIZE_ADD(parent_decr, size); + WT_ERR(__split_safe_free(session, split_gen, 0, pindex, size)); + parent_decr += size; -#if 0 /* - * Adjust the parent's memory footprint. This may look odd, but we - * have already taken the allocation overhead into account, and an - * increment followed by a decrement will cancel out the normal - * adjustment. + * Adjust the parent's memory footprint. */ __wt_cache_page_inmem_incr(session, parent, parent_incr); __wt_cache_page_inmem_decr(session, parent, parent_decr); -#else - /* - * XXX - * The code to track page sizes is fundamentally flawed in the face of - * splits: for example, we don't add in an overhead allocation constant - * when allocating WT_REF structures as pages are created, but the - * calculations during split assume that correction. For now, ignore - * our carefully calculated values and force the internal page size to - * 5% of its current value. - */ - size = parent->memory_footprint - (parent->memory_footprint / 20); - __wt_cache_page_inmem_decr(session, parent, size); -#endif if (0) { err: __wt_free_ref_index(session, parent, alloc_index, 1); @@ -753,11 +726,10 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, incr = 0; /* In some cases, the underlying WT_REF has not yet been allocated. */ - if (*refp == NULL) { + if (*refp == NULL) WT_RET(__wt_calloc_one(session, refp)); - WT_MEMSIZE_ADD(incr, sizeof(WT_REF)); - } ref = *refp; + incr += sizeof(WT_REF); /* * Any parent reference must be filled in by our caller; the primary @@ -790,7 +762,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, ikey = multi->key.ikey; WT_RET(__wt_row_ikey(session, 0, WT_IKEY_DATA(ikey), ikey->size, &ref->key.ikey)); - WT_MEMSIZE_ADD(incr, sizeof(WT_IKEY) + ikey->size); + incr += sizeof(WT_IKEY) + ikey->size; break; default: ref->key.recno = multi->key.recno; @@ -815,7 +787,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, static int __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, uint32_t new_entries, size_t parent_decr, size_t parent_incr, - int exclusive, int ref_discard) + int exclusive, int ref_discard, uint64_t *split_genp) { WT_DECL_RET; WT_IKEY *ikey; @@ -823,6 +795,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_PAGE_INDEX *alloc_index, *pindex; WT_REF **alloc_refp, *next_ref, *parent_ref; size_t size; + uint64_t split_gen; uint32_t children, i, j; uint32_t deleted_entries, parent_entries, result_entries; int complete, hazard, locked; @@ -902,7 +875,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); - WT_MEMSIZE_ADD(parent_incr, size); + parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { @@ -929,6 +902,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * to threads descending the tree. */ WT_INTL_INDEX_SET(parent, alloc_index); + split_gen = *split_genp = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1); alloc_index = NULL; #ifdef HAVE_DIAGNOSTIC @@ -975,8 +949,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, if (ikey != NULL) { size = sizeof(WT_IKEY) + ikey->size; WT_TRET(__split_safe_free( - session, 0, ikey, size)); - WT_MEMSIZE_ADD(parent_decr, size); + session, split_gen, 0, ikey, size)); + parent_decr += size; } /* * The page_del structure can be freed @@ -993,8 +967,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, } WT_TRET(__split_safe_free( - session, 0, next_ref, sizeof(WT_REF))); - WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF)); + session, split_gen, 0, next_ref, sizeof(WT_REF))); + parent_decr += sizeof(WT_REF); } } @@ -1003,8 +977,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Add it to the session discard list, to be freed when it's safe. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, exclusive, pindex, size)); - WT_MEMSIZE_ADD(parent_decr, size); + WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); + parent_decr += size; /* * Row-store trees where the old version of the page is being discarded: @@ -1020,10 +994,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_TRET(__split_ovfl_key_cleanup(session, parent, ref)); /* - * Adjust the parent's memory footprint. This may look odd, but we - * have already taken the allocation overhead into account, and an - * increment followed by a decrement will cancel out the normal - * adjustment. + * Adjust the parent's memory footprint. */ __wt_cache_page_inmem_incr(session, parent, parent_incr); __wt_cache_page_inmem_decr(session, parent, parent_decr); @@ -1061,8 +1032,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ uint64_t __a, __b; __a = parent->memory_footprint; - WT_WITH_PAGE_INDEX(session, - ret = __split_deepen(session, parent, children)); + ret = __split_deepen(session, parent, children); __b = parent->memory_footprint; if (__b * 2 >= __a) F_SET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN); @@ -1110,6 +1080,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) WT_PAGE *page, *right; WT_REF *child, *split_ref[2] = { NULL, NULL }; size_t page_decr, parent_decr, parent_incr, right_incr; + uint64_t split_gen; int i; *splitp = 0; @@ -1198,9 +1169,9 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) } else WT_ERR(__wt_row_leaf_key( session, page, &page->pg_row_d[0], key, 1)); - WT_ERR(__wt_row_ikey( session, 0, key->data, key->size, &child->key.ikey)); + parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + key->size; __wt_scr_free(session, &key); /* @@ -1209,8 +1180,8 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 0, &right)); WT_ERR(__wt_calloc_one(session, &right->pg_row_ins)); WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0])); - WT_MEMSIZE_ADD(right_incr, sizeof(WT_INSERT_HEAD)); - WT_MEMSIZE_ADD(right_incr, sizeof(WT_INSERT_HEAD *)); + right_incr += sizeof(WT_INSERT_HEAD); + right_incr += sizeof(WT_INSERT_HEAD *); WT_ERR(__wt_calloc_one(session, &split_ref[1])); child = split_ref[1]; @@ -1219,19 +1190,18 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) WT_ERR(__wt_row_ikey(session, 0, WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), &child->key.ikey)); + parent_incr += + sizeof(WT_REF) + sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); /* - * We're swapping WT_REFs in the parent, adjust the accounting, and - * row store pages may have instantiated keys. + * After the split, we're going to discard the WT_REF, account for the + * change in memory footprint. Row store pages have keys that may be + * instantiated, check for that. */ - WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF)); - WT_MEMSIZE_ADD( - parent_incr, sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins)); - WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF)); + parent_decr += sizeof(WT_REF); if (page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) if ((ikey = __wt_ref_key_instantiated(ref)) != NULL) - WT_MEMSIZE_ADD( - parent_decr, sizeof(WT_IKEY) + ikey->size); + parent_decr += sizeof(WT_IKEY) + ikey->size; /* The new page is dirty by definition. */ WT_ERR(__wt_page_modify_init(session, right)); @@ -1253,10 +1223,10 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) */ for (i = 0; i < WT_SKIP_MAXDEPTH && ins_head->tail[i] == moved_ins; ++i) ; - WT_MEMSIZE_TRANSFER(page_decr, right_incr, sizeof(WT_INSERT) + + WT_MEM_TRANSFER(page_decr, right_incr, sizeof(WT_INSERT) + (size_t)i * sizeof(WT_INSERT *) + WT_INSERT_KEY_SIZE(moved_ins)); - WT_MEMSIZE_TRANSFER(page_decr, right_incr, - __wt_update_list_memsize(moved_ins->upd)); + WT_MEM_TRANSFER( + page_decr, right_incr, __wt_update_list_memsize(moved_ins->upd)); /* * Allocation operations completed, move the last insert list item from @@ -1349,7 +1319,12 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) */ page->modify->inmem_split_txn = __wt_txn_new_id(session); - /* Update the page accounting. */ + /* + * Update the page accounting. + * + * XXX + * If we fail to split the parent, the page's accounting will be wrong. + */ __wt_cache_page_inmem_decr(session, page, page_decr); __wt_cache_page_inmem_incr(session, right, right_incr); @@ -1358,8 +1333,8 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) * longer locked, so we cannot safely look at it. */ page = NULL; - if ((ret = __split_parent( - session, ref, split_ref, 2, parent_decr, parent_incr, 0, 0)) != 0) { + if ((ret = __split_parent(session, ref, split_ref, 2, + parent_decr, parent_incr, 0, 0, &split_gen)) != 0) { /* * Move the insert list element back to the original page list. * For simplicity, the previous skip list pointers originally @@ -1396,8 +1371,8 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) */ if (ikey != NULL) WT_TRET(__split_safe_free( - session, 0, ikey, sizeof(WT_IKEY) + ikey->size)); - WT_TRET(__split_safe_free(session, 0, ref, sizeof(WT_REF))); + session, split_gen, 0, ikey, sizeof(WT_IKEY) + ikey->size)); + WT_TRET(__split_safe_free(session, split_gen, 0, ref, sizeof(WT_REF))); /* * A note on error handling: if we completed the split, return success, @@ -1480,6 +1455,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) WT_PAGE_MODIFY *mod; WT_REF **ref_new; size_t parent_decr, parent_incr; + uint64_t split_gen; uint32_t i, new_entries; page = ref->page; @@ -1503,15 +1479,14 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) * change in memory footprint. Row store pages have keys that may be * instantiated, check for that. */ - WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF)); + parent_decr += sizeof(WT_REF); if (page->type == WT_PAGE_ROW_LEAF || page->type == WT_PAGE_ROW_INT) if ((ikey = __wt_ref_key_instantiated(ref)) != NULL) - WT_MEMSIZE_ADD( - parent_decr, sizeof(WT_IKEY) + ikey->size); + parent_decr += sizeof(WT_IKEY) + ikey->size; /* Split into the parent. */ - WT_ERR(__split_parent(session, - ref, ref_new, new_entries, parent_decr, parent_incr, exclusive, 1)); + WT_ERR(__split_parent(session, ref, ref_new, new_entries, + parent_decr, parent_incr, exclusive, 1, &split_gen)); __wt_free(session, ref_new); @@ -1534,9 +1509,10 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) * safe. */ if (ikey != NULL) - WT_TRET(__split_safe_free( - session, exclusive, ikey, sizeof(WT_IKEY) + ikey->size)); - WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF))); + WT_TRET(__split_safe_free(session, split_gen, exclusive, + ikey, sizeof(WT_IKEY) + ikey->size)); + WT_TRET(__split_safe_free(session, split_gen, exclusive, + ref, sizeof(WT_REF))); /* * A note on error handling: if we completed the split, return success, diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index d9ff2a6af1e..b7108b52395 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -45,8 +45,11 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) next_walk = NULL; while ((ret = - __wt_tree_walk(session, &next_walk, 0)) == 0 && next_walk != NULL) - WT_RET(__stat_page(session, next_walk->page, stats)); + __wt_tree_walk(session, &next_walk, 0)) == 0 && next_walk != NULL) { + WT_WITH_PAGE_INDEX(session, + ret = __stat_page(session, next_walk->page, stats)); + WT_RET(ret); + } return (ret == WT_NOTFOUND ? 0 : ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index fafb4b58fc4..2957eda3a49 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -366,11 +366,16 @@ recno_chk: if (recno != vs->record_total + 1) goto celltype_err; break; case WT_PAGE_COL_VAR: - case WT_PAGE_ROW_LEAF: if (unpack->raw != WT_CELL_ADDR_LEAF && unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; + case WT_PAGE_ROW_LEAF: + if (unpack->raw != WT_CELL_ADDR_DEL && + unpack->raw != WT_CELL_ADDR_LEAF && + unpack->raw != WT_CELL_ADDR_LEAF_NO) + goto celltype_err; + break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: if (unpack->raw != WT_CELL_ADDR_INT) diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index 4c418f91de0..db1b565b439 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -49,6 +49,7 @@ restart: page = current->page; WT_ASSERT(session, current->key.recno == page->pg_intl_recno); + WT_ASSERT(session, session->split_gen != 0); pindex = WT_INTL_INDEX_COPY(page); base = pindex->entries; descent = pindex->index[base - 1]; diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 036e11bec6d..9967c5ecb0c 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -195,6 +195,7 @@ restart: page = current->page; if (page->type != WT_PAGE_ROW_INT) break; + WT_ASSERT(session, session->split_gen != 0); pindex = WT_INTL_INDEX_COPY(page); /* @@ -487,6 +488,7 @@ restart: if (page->type != WT_PAGE_ROW_INT) break; + WT_ASSERT(session, session->split_gen != 0); pindex = WT_INTL_INDEX_COPY(page); descent = pindex->index[ __wt_random(session->rnd) % pindex->entries]; @@ -521,6 +523,7 @@ restart: */ cbt->ref = current; cbt->compare = 0; + WT_ASSERT(session, session->split_gen != 0); pindex = WT_INTL_INDEX_COPY(btree->root.page); cbt->slot = pindex->entries < 2 ? __wt_random(session->rnd) % page->pg_row_entries : 0; diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 646551cdd38..a7e9419a65c 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -152,6 +152,8 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "os_cache_max", "int", "min=0", NULL }, { "prefix_compression", "boolean", NULL, NULL }, { "prefix_compression_min", "int", "min=0", NULL }, + { "split_deepen_min_child", "int", NULL, NULL }, + { "split_deepen_per_child", "int", NULL, NULL }, { "split_pct", "int", "min=25,max=100", NULL }, { "value_format", "format", NULL, NULL }, { "version", "string", NULL, NULL }, @@ -246,6 +248,8 @@ static const WT_CONFIG_CHECK confchk_session_create[] = { { "prefix_compression", "boolean", NULL, NULL }, { "prefix_compression_min", "int", "min=0", NULL }, { "source", "string", NULL, NULL }, + { "split_deepen_min_child", "int", NULL, NULL }, + { "split_deepen_per_child", "int", NULL, NULL }, { "split_pct", "int", "min=25,max=100", NULL }, { "type", "string", NULL, NULL }, { "value_format", "format", NULL, NULL }, @@ -585,7 +589,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",key_format=u,key_gap=10,leaf_item_max=0,leaf_key_max=0," "leaf_page_max=32KB,leaf_value_max=0,memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0," - "prefix_compression_min=4,split_pct=75,value_format=u," + "prefix_compression_min=4,split_deepen_min_child=0," + "split_deepen_per_child=0,split_pct=75,value_format=u," "version=(major=0,minor=0)", confchk_file_meta }, @@ -626,8 +631,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { "bloom_hash_count=8,bloom_oldest=0,chunk_max=5GB,chunk_size=10MB," "merge_max=15,merge_min=0),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0," - "prefix_compression_min=4,source=,split_pct=75,type=file," - "value_format=u", + "prefix_compression_min=4,source=,split_deepen_min_child=0," + "split_deepen_per_child=0,split_pct=75,type=file,value_format=u", confchk_session_create }, { "session.drop", diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index 861bafed900..f5b78e33b04 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -448,15 +448,15 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) continue; cache = entry->cache; ++entries; - new = cache->bytes_evict; + new = cache->bytes_read; /* Handle wrapping of eviction requests. */ - if (new >= cache->cp_saved_evict) - cache->cp_current_evict = new - cache->cp_saved_evict; + if (new >= cache->cp_saved_read) + cache->cp_current_read = new - cache->cp_saved_read; else - cache->cp_current_evict = new; - cache->cp_saved_evict = new; - if (cache->cp_current_evict > highest) - highest = cache->cp_current_evict; + cache->cp_current_read = new; + cache->cp_saved_read = new; + if (cache->cp_current_read > highest) + highest = cache->cp_current_read; } WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Highest eviction count: %" PRIu64 ", entries: %" PRIu64, @@ -501,7 +501,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, reserved = cache->cp_reserved; adjusted = 0; - read_pressure = cache->cp_current_evict / highest; + read_pressure = cache->cp_current_read / highest; WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32, entry->cache_size, read_pressure, cache->cp_skip_count)); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index 01f08aa5f07..a5bd8e1343c 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -32,14 +32,14 @@ __sweep(WT_SESSION_IMPL *session) dhandle_next = SLIST_NEXT(dhandle, l); if (WT_IS_METADATA(dhandle)) continue; - if (dhandle->session_inuse == 0 && dhandle->timeofdeath == 0) { + if (dhandle->session_inuse != 0 || + now <= dhandle->timeofdeath + WT_DHANDLE_SWEEP_WAIT) + continue; + if (dhandle->timeofdeath == 0) { dhandle->timeofdeath = now; WT_STAT_FAST_CONN_INCR(session, dh_conn_tod); continue; } - if (dhandle->session_inuse != 0 || - now <= dhandle->timeofdeath + WT_DHANDLE_SWEEP_WAIT) - continue; /* * We have a candidate for closing; if it's open, acquire an diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index 553acc988f0..5aa85872a3b 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -90,33 +90,43 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) WT_ERR(__wt_evict(session, ref, 1)); break; case WT_SYNC_DISCARD: - case WT_SYNC_DISCARD_FORCE: /* - * Discard the page, whether clean or dirty. - * - * Clean the page, both to keep statistics correct, and - * to let the page-discard function assert no dirty page - * is ever discarded. + * Ordinary discard of the page, whether clean or dirty. + * If we see a dirty page in an ordinary discard (e.g., + * from sweep), give up: an update must have happened + * since the file was selected for sweeping. */ - if (__wt_page_is_modified(page)) { - page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + if (__wt_page_is_modified(page)) + WT_ERR(EBUSY); + /* * If the page contains an update that is too recent to * evict, stop. This should never happen during - * connection close, and in other paths our caller + * connection close, but in other paths our caller * should be prepared to deal with this case. */ - if (syncop == WT_SYNC_DISCARD && - page->modify != NULL && + if (page->modify != NULL && !__wt_txn_visible_all(session, page->modify->rec_max_txn)) WT_ERR(EBUSY); - if (syncop == WT_SYNC_DISCARD_FORCE) - F_SET(session, WT_SESSION_DISCARD_FORCE); - __wt_rec_page_clean_update(session, ref); + __wt_evict_page_clean_update(session, ref); + break; + case WT_SYNC_DISCARD_FORCE: + /* + * Forced discard of the page, whether clean or dirty. + * If we see a dirty page in a forced discard, clean + * the page, both to keep statistics correct, and to + * let the page-discard function assert no dirty page + * is ever discarded. + */ + if (__wt_page_is_modified(page)) { + page->modify->write_gen = 0; + __wt_cache_dirty_decr(session, page); + } + + F_SET(session, WT_SESSION_DISCARD_FORCE); + __wt_evict_page_clean_update(session, ref); F_CLR(session, WT_SESSION_DISCARD_FORCE); break; WT_ILLEGAL_VALUE_ERR(session); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 384ec9be5b3..6e7d3e9c6cd 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -205,11 +205,10 @@ __evict_server(void *arg) "cache server: exiting with %" PRIu64 " pages in " "memory and %" PRIu64 " pages evicted", cache->pages_inmem, cache->pages_evict); - if (cache->bytes_inmem != cache->bytes_evict) + if (cache->bytes_inmem != 0) __wt_errx(session, - "cache server: exiting with %" PRIu64 " bytes in " - "memory and %" PRIu64 " bytes evicted", - cache->bytes_inmem, cache->bytes_evict); + "cache server: exiting with %" PRIu64 " bytes in memory", + cache->bytes_inmem); if (cache->bytes_dirty != 0 || cache->pages_dirty != 0) __wt_errx(session, "cache server: exiting with %" PRIu64 diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 5bbf3b891f7..0cff584f2ab 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -98,7 +98,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) if (__wt_ref_is_root(ref)) __wt_ref_out(session, ref); else - __wt_rec_page_clean_update(session, ref); + __wt_evict_page_clean_update(session, ref); WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean); WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean); @@ -139,11 +139,11 @@ done: session->excl_next = 0; } /* - * __wt_rec_page_clean_update -- + * __wt_evict_page_clean_update -- * Update a clean page's reference on eviction. */ void -__wt_rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref) { /* * Discard the page and update the reference structure; if the page has @@ -327,6 +327,7 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags; btree = S2BT(session); + flags = WT_EVICTING; /* * Get exclusive access to the page if our caller doesn't have the tree @@ -472,7 +473,6 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, * they are not expected to split). */ if (__wt_page_is_modified(page)) { - flags = WT_EVICTING; if (exclusive) LF_SET(WT_SKIP_UPDATE_ERR); else if (top && !WT_PAGE_IS_INTERNAL(page) && @@ -482,17 +482,18 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, WT_ASSERT(session, !__wt_page_is_modified(page) || LF_ISSET(WT_SKIP_UPDATE_RESTORE)); - } else { - /* - * If the page was ever modified, make sure all of the updates - * on the page are old enough they can be discarded from cache. - */ - if (!exclusive && mod != NULL && - !__wt_txn_visible_all(session, mod->rec_max_txn)) - return (EBUSY); } /* + * If the page was ever modified, make sure all of the updates + * on the page are old enough they can be discarded from cache. + */ + if (!exclusive && mod != NULL && + !__wt_txn_visible_all(session, mod->rec_max_txn) && + !LF_ISSET(WT_SKIP_UPDATE_RESTORE)) + return (EBUSY); + + /* * Repeat the test: fail if any page in the top-level page's subtree * won't be merged into its parent. */ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index aea9ee98742..ef6f9b40414 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -192,7 +192,7 @@ struct __wt_page_modify { uint64_t inmem_split_txn; /* Dirty bytes added to the cache. */ - uint64_t bytes_dirty; + size_t bytes_dirty; /* * When pages are reconciled, the result is one or more replacement @@ -532,7 +532,7 @@ struct __wt_page { #define WT_READGEN_STEP 100 uint64_t read_gen; - uint64_t memory_footprint; /* Memory attached to the page */ + size_t memory_footprint; /* Memory attached to the page */ #define WT_PAGE_IS_INTERNAL(page) \ ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) @@ -759,11 +759,11 @@ struct __wt_col { * with RLE counts greater than 1 when reading the page. We can do a binary * search in this array, then an offset calculation to find the cell. */ -struct __wt_col_rle { +WT_PACKED_STRUCT_BEGIN(__wt_col_rle) uint64_t recno; /* Record number of first repeat. */ uint64_t rle; /* Repeat count. */ uint32_t indx; /* Slot of entry in col_var.d */ -} WT_GCC_ATTRIBUTE((packed)); +WT_PACKED_STRUCT_END /* * WT_COL_PTR, WT_COL_PTR_SET -- @@ -827,7 +827,7 @@ struct __wt_ikey { * is done for an entry, WT_UPDATE structures are formed into a forward-linked * list. */ -struct __wt_update { +WT_PACKED_STRUCT_BEGIN(__wt_update) uint64_t txnid; /* update transaction */ WT_UPDATE *next; /* forward-linked list */ @@ -846,7 +846,7 @@ struct __wt_update { /* The untyped value immediately follows the WT_UPDATE structure. */ #define WT_UPDATE_DATA(upd) \ ((void *)((uint8_t *)(upd) + sizeof(WT_UPDATE))) -} WT_GCC_ATTRIBUTE((packed)); +}; /* * WT_INSERT -- @@ -1004,11 +1004,18 @@ struct __wt_insert_head { * already have a split generation, leave it alone. If our caller is examining * an index, we don't want the oldest split generation to move forward and * potentially free it. + * + * Check that we haven't raced with a split_gen update after publishing: we + * rely on the published value not being missed when scanning for the oldest + * active split_gen. */ #define WT_ENTER_PAGE_INDEX(session) do { \ uint64_t __prev_split_gen = (session)->split_gen; \ if (__prev_split_gen == 0) \ - WT_PUBLISH((session)->split_gen, S2C(session)->split_gen) + do { \ + WT_PUBLISH((session)->split_gen, \ + S2C(session)->split_gen); \ + } while ((session)->split_gen != S2C(session)->split_gen) #define WT_LEAVE_PAGE_INDEX(session) \ if (__prev_split_gen == 0) \ diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index fa01dd5edc2..dd3acf6940d 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -98,14 +98,21 @@ struct __wt_btree { CKSUM_UNCOMPRESSED=3 /* Uncompressed blocks only */ } checksum; /* Checksum configuration */ - u_int dictionary; /* Reconcile: dictionary slots */ - int internal_key_truncate; /* Reconcile: internal key truncate */ - int maximum_depth; /* Reconcile: maximum tree depth */ - int prefix_compression; /* Reconcile: prefix compression */ - u_int prefix_compression_min; /* Reconcile: prefix compression min */ - int split_pct; /* Reconcile: split page percent */ - WT_COMPRESSOR *compressor; /* Reconcile: page compressor */ - WT_RWLOCK *ovfl_lock; /* Reconcile: overflow lock */ + /* + * Reconciliation... + */ + u_int dictionary; /* Dictionary slots */ + int internal_key_truncate; /* Internal key truncate */ + int maximum_depth; /* Maximum tree depth */ + int prefix_compression; /* Prefix compression */ + u_int prefix_compression_min; /* Prefix compression min */ +#define WT_SPLIT_DEEPEN_MIN_CHILD_DEF 10000 + u_int split_deepen_min_child; /* Minimum entries to deepen tree */ +#define WT_SPLIT_DEEPEN_PER_CHILD_DEF 100 + u_int split_deepen_per_child; /* Entries per child when deepened */ + int split_pct; /* Split page percent */ + WT_COMPRESSOR *compressor; /* Page compressor */ + WT_RWLOCK *ovfl_lock; /* Overflow lock */ uint64_t last_recno; /* Column-store last record number */ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 0dffdc798af..a0cbb23f126 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -75,6 +75,52 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) #endif /* + * __wt_cache_page_byte_dirty_decr -- + * Decrement the page's dirty byte count, guarding from underflow. + */ +static inline void +__wt_cache_page_byte_dirty_decr( + WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) +{ + WT_CACHE *cache; + size_t decr, orig; + int i; + + cache = S2C(session)->cache; + + /* + * We don't have exclusive access and there are ways of decrementing the + * page's dirty byte count by a too-large value. For example: + * T1: __wt_cache_page_inmem_incr(page, size) + * page is clean, don't increment dirty byte count + * T2: mark page dirty + * T1: __wt_cache_page_inmem_decr(page, size) + * page is dirty, decrement dirty byte count + * and, of course, the reverse where the page is dirty at the increment + * and clean at the decrement. + * + * The page's dirty-byte value always reflects bytes represented in the + * cache's dirty-byte count, decrement the page/cache as much as we can + * without underflow. If we can't decrement the dirty byte counts after + * few tries, give up: the cache's value will be wrong, but consistent, + * and we'll fix it the next time this page is marked clean, or evicted. + */ + for (i = 0; i < 5; ++i) { + /* + * Take care to read the dirty-byte count only once in case + * we're racing with updates. + */ + orig = page->modify->bytes_dirty; + decr = WT_MIN(size, orig); + if (WT_ATOMIC_CAS8( + page->modify->bytes_dirty, orig, orig - decr)) { + WT_CACHE_DECR(session, cache->bytes_dirty, decr); + break; + } + } +} + +/* * __wt_cache_page_inmem_decr -- * Decrement a page's memory footprint in the cache. */ @@ -87,17 +133,16 @@ __wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) WT_ASSERT(session, size < WT_EXABYTE); - WT_CACHE_DECR(session, page->memory_footprint, size); WT_CACHE_DECR(session, cache->bytes_inmem, size); - if (__wt_page_is_modified(page)) { - WT_CACHE_DECR(session, cache->bytes_dirty, size); - WT_CACHE_DECR(session, page->modify->bytes_dirty, size); - } + WT_CACHE_DECR(session, page->memory_footprint, size); + if (__wt_page_is_modified(page)) + __wt_cache_page_byte_dirty_decr(session, page, size); } /* * __wt_cache_dirty_incr -- - * Increment the cache dirty page/byte counts. + * Page switch from clean to dirty: increment the cache dirty page/byte + * counts. */ static inline void __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) @@ -119,42 +164,29 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) /* * __wt_cache_dirty_decr -- - * Decrement the cache dirty page/byte counts. + * Page switch from dirty to clean: decrement the cache dirty page/byte + * counts. */ static inline void __wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_CACHE *cache; - size_t size; + WT_PAGE_MODIFY *modify; cache = S2C(session)->cache; if (cache->pages_dirty < 1) { - (void)__wt_errx(session, - "cache dirty decrement failed: cache dirty page count went " - "negative"); + __wt_errx(session, + "cache eviction dirty-page decrement failed: dirty page" + "count went negative"); cache->pages_dirty = 0; } else (void)WT_ATOMIC_SUB8(cache->pages_dirty, 1); - /* - * It is possible to decrement the footprint of the page without making - * the page dirty (for example when freeing an obsolete update list), - * so the footprint could change between read and decrement, and we - * might attempt to decrement by a different amount than the bytes held - * by the page. - * - * We catch that by maintaining a per-page dirty size, and fixing the - * cache stats if that is non-zero when the page is discarded. - * - * Also take care that the global size doesn't go negative. This may - * lead to small accounting errors (particularly on the last page of the - * last file in a checkpoint), but that will come out in the wash when - * the page is evicted. - */ - size = WT_MIN(page->memory_footprint, cache->bytes_dirty); - (void)WT_ATOMIC_SUB8(cache->bytes_dirty, size); - (void)WT_ATOMIC_SUB8(page->modify->bytes_dirty, size); + modify = page->modify; + if (modify != NULL && modify->bytes_dirty != 0) + __wt_cache_page_byte_dirty_decr( + session, page, modify->bytes_dirty); } /* @@ -165,23 +197,28 @@ static inline void __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_CACHE *cache; - WT_PAGE_MODIFY *mod; + WT_PAGE_MODIFY *modify; cache = S2C(session)->cache; - mod = page->modify; - - /* - * In rare cases, we may race tracking a page's dirty footprint. - * If so, we will get here with a non-zero dirty_size in the page, and - * we can fix the global stats. - */ - if (mod != NULL && mod->bytes_dirty != 0) - (void)WT_ATOMIC_SUB8(cache->bytes_dirty, mod->bytes_dirty); + modify = page->modify; + + /* Update the bytes in-memory to reflect the eviction. */ + WT_CACHE_DECR(session, cache->bytes_inmem, page->memory_footprint); + + /* Update the cache's dirty-byte count. */ + if (modify != NULL && modify->bytes_dirty != 0) { + if (cache->bytes_dirty < modify->bytes_dirty) { + __wt_errx(session, + "cache eviction dirty-bytes decrement failed: " + "dirty byte count went negative"); + cache->bytes_dirty = 0; + } else + WT_CACHE_DECR( + session, cache->bytes_dirty, modify->bytes_dirty); + } - WT_ASSERT(session, page->memory_footprint != 0); + /* Update pages and bytes evicted. */ (void)WT_ATOMIC_ADD8(cache->bytes_evict, page->memory_footprint); - page->memory_footprint = 0; - (void)WT_ATOMIC_ADD8(cache->pages_evict, 1); } @@ -221,8 +258,7 @@ __wt_page_refp(WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex; uint32_t i; - WT_ASSERT(session, - WT_SESSION_TXN_STATE(session)->snap_min != WT_TXN_NONE); + WT_ASSERT(session, session->split_gen != 0); /* * Copy the parent page's index value: the page can split at any time, @@ -894,11 +930,11 @@ __wt_ref_info(WT_SESSION_IMPL *session, } /* - * __wt_page_release_busy -- - * Release a reference to a page, fail if busy during forced eviction. + * __wt_page_release_evict -- + * Attempt to release and immediately evict a page. */ static inline int -__wt_page_release_busy(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) +__wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_DECL_RET; @@ -906,37 +942,8 @@ __wt_page_release_busy(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) int locked, too_big; btree = S2BT(session); - - /* - * Discard our hazard pointer. Ignore pages we don't have and the root - * page, which sticks in memory, regardless. - */ - if (ref == NULL || __wt_ref_is_root(ref)) - return (0); page = ref->page; - - too_big = (page->memory_footprint < btree->maxmempage) ? 0 : 1; - - /* - * Attempt to evict pages with the special "oldest" read generation. - * - * This is set for pages that grow larger than the configured - * memory_page_max setting, and when we are attempting to scan without - * trashing the cache. - * - * Skip this if eviction is disabled for this operation or this tree, - * or if there is no chance of eviction succeeding for dirty pages due - * to a checkpoint or because we've already tried writing this page and - * it contains an update that isn't stable. Also skip forced eviction - * if we just did an in-memory split. - */ - if (LF_ISSET(WT_READ_NO_EVICT) || - page->read_gen != WT_READGEN_OLDEST || - F_ISSET(btree, WT_BTREE_NO_EVICTION) || - (__wt_page_is_modified(page) && (btree->checkpointing || - !__wt_txn_visible_all(session, page->modify->first_dirty_txn) || - !__wt_txn_visible_all(session, page->modify->inmem_split_txn)))) - return (__wt_hazard_clear(session, page)); + too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0; /* * Take some care with order of operations: if we release the hazard @@ -945,8 +952,10 @@ __wt_page_release_busy(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) */ locked = WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED); WT_TRET(__wt_hazard_clear(session, page)); - if (!locked) + if (!locked) { + WT_TRET(EBUSY); return (ret); + } (void)WT_ATOMIC_ADD4(btree->evict_busy, 1); if ((ret = __wt_evict_page(session, ref)) == 0) { @@ -970,12 +979,46 @@ __wt_page_release_busy(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) /* * __wt_page_release -- - * Release a reference to a page. + * Release a reference to a page, fail if busy during forced eviction. */ static inline int __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { - WT_RET_BUSY_OK(__wt_page_release_busy(session, ref, flags)); + WT_BTREE *btree; + WT_PAGE *page; + + btree = S2BT(session); + + /* + * Discard our hazard pointer. Ignore pages we don't have and the root + * page, which sticks in memory, regardless. + */ + if (ref == NULL || __wt_ref_is_root(ref)) + return (0); + page = ref->page; + + /* + * Attempt to evict pages with the special "oldest" read generation. + * + * This is set for pages that grow larger than the configured + * memory_page_max setting, and when we are attempting to scan without + * trashing the cache. + * + * Skip this if eviction is disabled for this operation or this tree, + * or if there is no chance of eviction succeeding for dirty pages due + * to a checkpoint or because we've already tried writing this page and + * it contains an update that isn't stable. Also skip forced eviction + * if we just did an in-memory split. + */ + if (LF_ISSET(WT_READ_NO_EVICT) || + page->read_gen != WT_READGEN_OLDEST || + F_ISSET(btree, WT_BTREE_NO_EVICTION) || + (__wt_page_is_modified(page) && (btree->checkpointing || + !__wt_txn_visible_all(session, page->modify->first_dirty_txn) || + !__wt_txn_visible_all(session, page->modify->inmem_split_txn)))) + return (__wt_hazard_clear(session, page)); + + WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); return (0); } diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index deccd676e26..9db0729fe3c 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -60,6 +60,7 @@ struct __wt_cache { uint64_t pages_evict; uint64_t bytes_dirty; /* Bytes/pages currently dirty */ uint64_t pages_dirty; + uint64_t bytes_read; /* Bytes read into memory */ uint64_t evict_max_page_size; /* Largest page seen at eviction */ @@ -102,8 +103,8 @@ struct __wt_cache { /* * Cache pool information. */ - uint64_t cp_saved_evict; /* Evict count from last pass */ - uint64_t cp_current_evict; /* Evict count from current pass */ + uint64_t cp_saved_read; /* Read count from last pass */ + uint64_t cp_current_read; /* Read count from current pass */ uint32_t cp_skip_count; /* Post change stabilization */ uint64_t cp_reserved; /* Base size for this cache */ WT_SESSION_IMPL *cp_session; /* May be used for cache management */ diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index b1ace5e6a80..4bceb5c0d6c 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -62,7 +62,7 @@ __wt_cache_pages_inuse(WT_CACHE *cache) static inline uint64_t __wt_cache_bytes_inuse(WT_CACHE *cache) { - return (cache->bytes_inmem - cache->bytes_evict); + return (cache->bytes_inmem); } /* diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index ff34b014ecf..7b94a7ea94b 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -146,7 +146,7 @@ struct __wt_connection_impl { WT_FH *lock_fh; /* Lock file handle */ - uint64_t split_gen; /* Generation number for splits */ + volatile uint64_t split_gen; /* Generation number for splits */ /* * The connection keeps a cache of data handles. The set of handles diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 8fa9790e096..ad42f989bf4 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -164,8 +164,11 @@ __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session) dhandle = session->dhandle; + /* If we close a handle with a time of death set, clear it. */ WT_ASSERT(session, dhandle->session_inuse > 0); - (void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1); + if (WT_ATOMIC_SUB4(dhandle->session_inuse, 1) == 0 && + dhandle->timeofdeath != 0) + dhandle->timeofdeath = 0; } /* diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index ee9c27581c8..e47f4ba09c0 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -302,7 +302,7 @@ extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_server); extern int __wt_cache_wait(WT_SESSION_IMPL *session, int full); extern void __wt_cache_dump(WT_SESSION_IMPL *session); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive); -extern void __wt_rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref); +extern void __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); extern void __wt_log_written_reset(WT_SESSION_IMPL *session); extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, int active_only); @@ -423,7 +423,7 @@ extern int __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created); extern int __wt_turtle_init(WT_SESSION_IMPL *session); extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep); extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value); -extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_ATTRIBUTE((noreturn)); +extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp); extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); @@ -567,13 +567,13 @@ extern uint32_t __wt_cksum(const void *chunk, size_t len); extern void __wt_cksum_init(void); extern void __wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler); extern int __wt_eventv(WT_SESSION_IMPL *session, int msg_event, int error, const char *file_name, int line_number, const char *fmt, va_list ap); -extern void __wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4))); -extern void __wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 2, 3))); -extern int __wt_ext_err_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4))); -extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 2, 3))); -extern int __wt_ext_msg_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4))); +extern void __wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); +extern void __wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))); +extern int __wt_ext_err_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); +extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))); +extern int __wt_ext_msg_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); extern int __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v); -extern void __wt_assert(WT_SESSION_IMPL *session, int error, const char *file_name, int line_number, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 5, 6))); +extern void __wt_assert(WT_SESSION_IMPL *session, int error, const char *file_name, int line_number, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 5, 6))); extern int __wt_panic(WT_SESSION_IMPL *session); extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name); extern int __wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri); @@ -616,8 +616,8 @@ extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern void __wt_random_init(uint32_t *rnd); extern uint32_t __wt_random(uint32_t *rnd); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); -extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4))); -extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4))); +extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); +extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); extern int __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp #ifdef HAVE_DIAGNOSTIC diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h index 7b606ca80b1..805838eb84b 100644 --- a/src/third_party/wiredtiger/src/include/gcc.h +++ b/src/third_party/wiredtiger/src/include/gcc.h @@ -7,7 +7,12 @@ */ /* Add GCC-specific attributes to types and function declarations. */ -#define WT_GCC_ATTRIBUTE(x) __attribute__(x) +#define WT_COMPILER_TYPE_ALIGN(x) __attribute__((aligned(x))) + +#define WT_PACKED_STRUCT_BEGIN(name) \ + struct __attribute__ ((__packed__)) name { +#define WT_PACKED_STRUCT_END \ + }; /* * Attribute are only permitted on function declarations, not definitions. @@ -15,6 +20,7 @@ * dist/s_prototypes to create extern.h. */ #define WT_GCC_FUNC_ATTRIBUTE(x) +#define WT_GCC_FUNC_DECL_ATTRIBUTE(x) __attribute__(x) /* * Atomic writes: diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h index 9c560339e03..5668abc6dab 100644 --- a/src/third_party/wiredtiger/src/include/lint.h +++ b/src/third_party/wiredtiger/src/include/lint.h @@ -6,8 +6,15 @@ * See the file LICENSE for redistribution information. */ -#define WT_GCC_ATTRIBUTE(x) +#define WT_COMPILER_TYPE_ALIGN(x) + +#define WT_PACKED_STRUCT_BEGIN(name) \ + struct name { +#define WT_PACKED_STRUCT_END \ + }; + #define WT_GCC_FUNC_ATTRIBUTE(x) +#define WT_GCC_FUNC_DECL_ATTRIBUTE(x) #define __WT_ATOMIC_ADD(v, val) \ ((v) += (val)) diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index f88a5381227..82d90070609 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -70,7 +70,7 @@ #define WT_LOG_SLOT_FREE 1 #define WT_LOG_SLOT_PENDING 2 #define WT_LOG_SLOT_READY 3 -typedef struct { +typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { int64_t slot_state; /* Slot state */ uint64_t slot_group_size; /* Group size */ int32_t slot_error; /* Error value */ @@ -90,7 +90,7 @@ typedef struct { #define SLOT_SYNC 0x08 /* Needs sync on release */ #define SLOT_SYNC_DIR 0x10 /* Directory sync on release */ uint32_t flags; /* Flags */ -} WT_LOGSLOT WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); +} WT_LOGSLOT; typedef struct { WT_LOGSLOT *slot; diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h index 8038e5a34ab..8d50f3ea73b 100644 --- a/src/third_party/wiredtiger/src/include/lsm.h +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -73,7 +73,7 @@ struct __wt_cursor_lsm { * WT_LSM_CHUNK -- * A single chunk (file) in an LSM tree. */ -struct __wt_lsm_chunk { +struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_lsm_chunk { const char *uri; /* Data source for this chunk */ const char *bloom_uri; /* URI of Bloom filter, if any */ struct timespec create_ts; /* Creation time (for rate limiting) */ @@ -101,7 +101,7 @@ struct __wt_lsm_chunk { #define WT_LSM_CHUNK_ONDISK 0x04 #define WT_LSM_CHUNK_STABLE 0x08 uint32_t flags; -} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); +}; /* * Different types of work units. Used by LSM worker threads to choose which diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h index f176a40f2bf..3ec74b2d629 100644 --- a/src/third_party/wiredtiger/src/include/msvc.h +++ b/src/third_party/wiredtiger/src/include/msvc.h @@ -13,8 +13,21 @@ #define inline __inline -#define WT_GCC_ATTRIBUTE(x) +/* + * Add MSVC-specific attributes and pragmas to types and function declarations. + */ +#define WT_COMPILER_TYPE_ALIGN(x) __declspec(align(x)) + +#define WT_PACKED_STRUCT_BEGIN(name) \ + __pragma(pack(push,1)) \ + struct name { + +#define WT_PACKED_STRUCT_END \ + }; \ + __pragma(pack(pop)) + #define WT_GCC_FUNC_ATTRIBUTE(x) +#define WT_GCC_FUNC_DECL_ATTRIBUTE(x) #define __WT_ATOMIC_ADD(v, val, n, s, t) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), \ diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h index c5b7587303d..07aa740c525 100644 --- a/src/third_party/wiredtiger/src/include/mutex.h +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -68,15 +68,15 @@ struct __wt_rwlock { #if SPINLOCK_TYPE == SPINLOCK_GCC -typedef volatile int - WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); +typedef volatile int WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) + WT_SPINLOCK; #elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\ SPINLOCK_TYPE == SPINLOCK_MSVC ||\ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING -typedef struct { +typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { wt_mutex_t lock; uint64_t counter; /* Statistics: counter */ @@ -85,7 +85,7 @@ typedef struct { int8_t id; /* Statistics: current holder ID */ int8_t initialized; /* Lock initialized, for cleanup */ -} WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); +} WT_SPINLOCK; #else diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index fb610383a75..b42b792f5a7 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -245,6 +245,9 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, */ __wt_cache_page_inmem_incr(session, page, upd_size); + /* Mark the page dirty after updating the footprint. */ + __wt_page_modify_set(session, page); + /* * If there are subsequent WT_UPDATE structures, we're evicting pages * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE @@ -264,8 +267,5 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, __wt_update_obsolete_free(session, page, obsolete); } - /* Mark the page dirty after updating the footprint. */ - __wt_page_modify_set(session, page); - return (0); } diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index c2ed3473dfb..909f1daf5a4 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -42,7 +42,7 @@ struct __wt_hazard { * WT_SESSION_IMPL -- * Implementation of WT_SESSION. */ -struct __wt_session_impl { +struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { WT_SESSION iface; void *lang_private; /* Language specific private storage */ @@ -190,4 +190,4 @@ struct __wt_session_impl { uint32_t hazard_size; /* Allocated slots in hazard array. */ uint32_t nhazard; /* Count of active hazard pointers */ WT_HAZARD *hazard; /* Hazard pointer array */ -} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); +}; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 8380e55effb..c1c4703316b 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -25,10 +25,10 @@ #define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id]) -struct __wt_txn_state { +struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state { volatile uint64_t id; volatile uint64_t snap_min; -} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT))); +}; struct __wt_txn_global { volatile uint64_t current; /* Current transaction ID. */ diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 1b3a9b62626..576827bebcd 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -36,7 +36,9 @@ extern "C" { #include <io.h> #endif #include <limits.h> -#ifndef _WIN32 +#ifdef _WIN32 +#include <process.h> +#else #include <pthread.h> #endif #ifdef HAVE_PTHREAD_NP_H diff --git a/src/third_party/wiredtiger/src/reconcile/rec_track.c b/src/third_party/wiredtiger/src/reconcile/rec_track.c index c5c72391248..2533ad9e201 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_track.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_track.c @@ -335,12 +335,12 @@ __ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) * fixing up skiplist links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) - for (e = &head[i]; *e != NULL;) { - if (F_ISSET(*e, WT_OVFL_REUSE_INUSE)) { - e = &(*e)->next[i]; + for (e = &head[i]; (reuse = *e) != NULL;) { + if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { + e = &reuse->next[i]; continue; } - *e = (*e)->next[i]; + *e = reuse->next[i]; } /* @@ -359,19 +359,20 @@ __ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { F_CLR(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED); - e = &(*e)->next[0]; + e = &reuse->next[0]; continue; } - *e = (*e)->next[0]; + *e = reuse->next[0]; WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)); - decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE); if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_reuse_verbose(session, page, reuse, "free")); + WT_RET(bm->free( bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); + decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE); __wt_free(session, reuse); } @@ -404,12 +405,12 @@ __ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) * fixing up skiplist links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) - for (e = &head[i]; *e != NULL;) { - if (!F_ISSET(*e, WT_OVFL_REUSE_JUST_ADDED)) { - e = &(*e)->next[i]; + for (e = &head[i]; (reuse = *e) != NULL;) { + if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) { + e = &reuse->next[i]; continue; } - *e = (*e)->next[i]; + *e = reuse->next[i]; } /* @@ -420,17 +421,17 @@ __ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) for (e = &head[0]; (reuse = *e) != NULL;) { if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) { F_CLR(reuse, WT_OVFL_REUSE_INUSE); - e = &(*e)->next[0]; + e = &reuse->next[0]; continue; } - *e = (*e)->next[0]; + *e = reuse->next[0]; if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_reuse_verbose(session, page, reuse, "free")); + WT_TRET(bm->free( bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); - decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE); __wt_free(session, reuse); } @@ -722,26 +723,26 @@ __ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; (txnc = *e) != NULL;) { if (TXNID_LE(oldest_txn, txnc->current)) { - e = &(*e)->next[i]; + e = &txnc->next[i]; continue; } - *e = (*e)->next[i]; + *e = txnc->next[i]; } /* Second, discard any no longer needed transaction-cache records. */ decr = 0; for (e = &head[0]; (txnc = *e) != NULL;) { if (TXNID_LE(oldest_txn, txnc->current)) { - e = &(*e)->next[0]; + e = &txnc->next[0]; continue; } - *e = (*e)->next[0]; - - decr += WT_OVFL_SIZE(txnc, WT_OVFL_TXNC); + *e = txnc->next[0]; if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_txnc_verbose(session, page, txnc, "free")); + + decr += WT_OVFL_SIZE(txnc, WT_OVFL_TXNC); __wt_free(session, txnc); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 0300596f90b..be66309c77f 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -440,8 +440,11 @@ __wt_reconcile(WT_SESSION_IMPL *session, * Root pages are special, splits have to be done, we can't put it off * as the parent's problem any more. */ - if (__wt_ref_is_root(ref)) - return (__rec_root_write(session, page, flags)); + if (__wt_ref_is_root(ref)) { + WT_WITH_PAGE_INDEX(session, + ret = __rec_root_write(session, page, flags)); + return (ret); + } /* * Otherwise, mark the page's parent dirty. @@ -504,6 +507,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) WT_ILLEGAL_VALUE(session); } + WT_ASSERT(session, session->split_gen != 0); pindex = WT_INTL_INDEX_COPY(next); for (i = 0; i < mod->mod_multi_entries; ++i) { WT_ERR(__wt_multi_to_ref(session, @@ -2895,7 +2899,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET_MSG(session, EINVAL, "bulk-load is only possible for newly created trees"); - /* Set a reference to the empty leaf page. */ + /* Get a reference to the empty leaf page. */ pindex = WT_INTL_INDEX_COPY(btree->root.page); cbulk->ref = pindex->index[0]; cbulk->leaf = cbulk->ref->page; @@ -4005,7 +4009,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); addr = ref->addr; child = ref->page; - vtype = 0; /* Deleted child we don't have to write. */ if (state == WT_CHILD_IGNORE) { @@ -4023,10 +4026,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) continue; } - /* Deleted child requiring a proxy cell. */ - if (state == WT_CHILD_PROXY) - vtype = WT_CELL_ADDR_DEL; - /* * Modified child. Empty pages are merged into the parent and * discarded. @@ -4076,22 +4075,22 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* * Build the value cell, the child page's address. Addr points - * to an on-page cell or an off-page WT_ADDR structure. The - * cell type has been set in the case of page deletion requiring + * to an on-page cell or an off-page WT_ADDR structure. There's + * a special cell type in the case of page deletion requiring * a proxy cell, otherwise use the information from the addr or * original cell. */ if (__wt_off_page(page, addr)) { p = addr->addr; size = addr->size; - if (vtype == 0) - vtype = __rec_vtype(addr); + vtype = state == WT_CHILD_PROXY ? + WT_CELL_ADDR_DEL : __rec_vtype(addr); } else { __wt_cell_unpack(ref->addr, vpack); p = vpack->data; size = vpack->size; - if (vtype == 0) - vtype = vpack->raw; + vtype = state == WT_CHILD_PROXY ? + WT_CELL_ADDR_DEL : (u_int)vpack->raw; } __rec_cell_build_addr(r, p, size, vtype, 0); CHILD_RELEASE_ERR(session, hazard, ref); |