diff options
86 files changed, 1360 insertions, 1369 deletions
diff --git a/src/third_party/wiredtiger/.gdb_history b/src/third_party/wiredtiger/.gdb_history new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/src/third_party/wiredtiger/.gdb_history diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct index 1d32cf05a6c..ffb59b5e523 100644 --- a/src/third_party/wiredtiger/SConstruct +++ b/src/third_party/wiredtiger/SConstruct @@ -336,9 +336,64 @@ if GetOption("lang-python"): shim = env.Library("window_shim", ["test/windows/windows_shim.c"]) + + +examples = [ + "ex_access", + "ex_all", + "ex_async", + "ex_call_center", + "ex_config", + "ex_config_parse", + "ex_cursor", + "ex_data_source", + "ex_extending", + "ex_hello", + "ex_log", + "ex_pack", + "ex_process", + "ex_schema", + "ex_scope", + "ex_stat", + "ex_thread", + ] + +# WiredTiger Smoke Test support +# Runs each test in a custom temporary directory +def run_smoke_test(x): + print "Running Smoke Test: " + x + + # Make temp dir + temp_dir = tempfile.mkdtemp(prefix="wt_home") + + try: + # Set WT_HOME environment variable for test + os.environ["WIREDTIGER_HOME"] = temp_dir + + # Run the test + ret = subprocess.call(x); + if( ret != 0): + sys.stderr.write("Bad exit code %d\n" % (ret)) + raise Exception() + + finally: + # Clean directory + # + shutil.rmtree(temp_dir) + +def builder_smoke_test(target, source, env): + run_smoke_test(source[0].abspath) + return None + +env.Append(BUILDERS={'SmokeTest' : Builder(action = builder_smoke_test)}) + +#Build the tests and setup the "scons test" target + +#Don't test bloom on Windows, its broken t = env.Program("t_bloom", "test/bloom/test_bloom.c", LIBS=[wtlib] + wtlibs) +#env.Alias("check", env.SmokeTest(t)) Default(t) #env.Program("t_checkpoint", @@ -350,6 +405,7 @@ Default(t) t = env.Program("t_huge", "test/huge/huge.c", LIBS=[wtlib] + wtlibs) +#env.Alias("check", env.SmokeTest(t)) Default(t) t = env.Program("t_fops", @@ -357,6 +413,8 @@ t = env.Program("t_fops", "test/fops/fops.c", "test/fops/t.c"], LIBS=[wtlib, shim] + wtlibs) +env.Append(CPPPATH=["test/utility"]) +env.Alias("check", env.SmokeTest(t)) Default(t) if useBdb: @@ -376,6 +434,7 @@ if useBdb: "test/format/util.c", "test/format/wts.c"], LIBS=[wtlib, shim, "libdb61"] + wtlibs) + env.Alias("test", env.SmokeTest(t)) Default(t) #env.Program("t_thread", @@ -398,56 +457,7 @@ t = env.Program("wtperf", [ LIBS=[wtlib, shim] + wtlibs) Default(t) -examples = [ - "ex_access", - "ex_all", - "ex_async", - "ex_call_center", - "ex_config", - "ex_config_parse", - "ex_cursor", - "ex_data_source", - "ex_extending", - "ex_hello", - "ex_log", - "ex_pack", - "ex_process", - "ex_schema", - "ex_scope", - "ex_stat", - "ex_thread", - ] - -# WiredTiger Smoke Test suppor -# Runs each test in a custom temporary directory -# -def run_smoke_test(x): - print "Running Smoke Test: " + x - - # Make temp dir - temp_dir = tempfile.mkdtemp(prefix="wt_home") - - try: - # Set WT_HOME environment variable for test - os.environ["WIREDTIGER_HOME"] = temp_dir - - # Run the test - ret = subprocess.call(x); - if( ret != 0): - sys.stderr.write("Bad exit code %d\n" % (ret)) - raise Exception() - - finally: - # Clean directory - # - shutil.rmtree(temp_dir) - -def builder_smoke_test(target, source, env): - run_smoke_test(source[0].abspath) - return None - -env.Append(BUILDERS={'SmokeTest' : Builder(action = builder_smoke_test)}) - +#Build the Examples for ex in examples: if(ex in ['ex_all', 'ex_async', 'ex_thread']): exp = env.Program(ex, "examples/c/" + ex + ".c", LIBS=[wtlib, shim] + wtlibs) diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c index 3b3308d6fac..634391a9d27 100644 --- a/src/third_party/wiredtiger/bench/wtperf/config.c +++ b/src/third_party/wiredtiger/bench/wtperf/config.c @@ -600,6 +600,12 @@ config_sanity(CONFIG *cfg) "invalid database count, less than 1 or greater than 99\n"); return (EINVAL); } + + if (cfg->pareto > 100) { + fprintf(stderr, + "Invalid pareto distribution - should be a percentage\n"); + return (EINVAL); + } return (0); } diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/fruit-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/fruit-lsm.wtperf index a34c74d5e08..e5817554201 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/fruit-lsm.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/fruit-lsm.wtperf @@ -12,7 +12,7 @@ icount=25000000 key_sz=40 value_sz=800 #max_latency=2000 -pareto=true +pareto=20 populate_threads=20 report_interval=10 random_value=true diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/fruit-short.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/fruit-short.wtperf index a2ea535b4c5..10cb423a92d 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/fruit-short.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/fruit-short.wtperf @@ -11,7 +11,7 @@ icount=25000000 key_sz=40 value_sz=800 max_latency=2000 -pareto=true +pareto=20 populate_threads=20 report_interval=10 random_value=true diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/index-pareto-btree.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/index-pareto-btree.wtperf new file mode 100644 index 00000000000..8a7481453d0 --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/index-pareto-btree.wtperf @@ -0,0 +1,10 @@ +# wtperf options file: medium btree configuration +conn_config="cache_size=10G,checkpoint=(wait=60),statistics=[all],statistics_log=(wait=5)" +table_config="type=file,leaf_page_max=32k" +icount=50000000 +report_interval=5 +run_time=1200 +pareto=1 +index=true +populate_threads=1 +threads=((count=40,reads=1),(count=40,updates=1)) diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test3-1b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test3-1b-lsm.wtperf index 113b79bc9f9..e7a888fa73b 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/test3-1b-lsm.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/test3-1b-lsm.wtperf @@ -12,7 +12,7 @@ table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom key_sz=40 value_sz=1000 max_latency=2000 -pareto=true +pareto=20 report_interval=10 run_time=14400 sample_interval=10 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test3-2b-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test3-2b-lsm.wtperf index 574cf54b109..c9298756ac8 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/test3-2b-lsm.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/test3-2b-lsm.wtperf @@ -12,7 +12,7 @@ table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom key_sz=40 value_sz=1000 max_latency=2000 -pareto=true +pareto=20 report_interval=10 run_time=14400 sample_interval=10 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test3-500m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test3-500m-lsm.wtperf index 307d92b6db6..cc51f9c0b76 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/test3-500m-lsm.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/test3-500m-lsm.wtperf @@ -12,7 +12,7 @@ table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom key_sz=40 value_sz=1000 max_latency=2000 -pareto=true +pareto=20 report_interval=10 run_time=14400 sample_interval=10 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/test3-50m-lsm.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/test3-50m-lsm.wtperf index 278b1ce3872..89a85394594 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/test3-50m-lsm.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/test3-50m-lsm.wtperf @@ -11,7 +11,7 @@ table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom key_sz=40 value_sz=1000 max_latency=2000 -pareto=true +pareto=20 report_interval=10 run_time=1440 sample_interval=10 diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c index 3c7a867f96f..8c5e6ba8060 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c @@ -90,6 +90,7 @@ static uint64_t wtperf_value_range(CONFIG *); #define HELIUM_PATH \ "../../ext/test/helium/.libs/libwiredtiger_helium.so" #define HELIUM_CONFIG ",type=helium" +#define INDEX_COL_NAMES ",columns=(key,val)" /* Retrieve an ID for the next insert operation. */ static inline uint64_t @@ -798,7 +799,8 @@ populate_thread(void *arg) } /* Do bulk loads if populate is single-threaded. */ - cursor_config = cfg->populate_threads == 1 ? "bulk" : NULL; + cursor_config = + (cfg->populate_threads == 1 && !cfg->index) ? "bulk" : NULL; /* Create the cursors. */ cursors = calloc(cfg->table_count, sizeof(WT_CURSOR *)); if (cursors == NULL) { @@ -1672,13 +1674,24 @@ create_tables(CONFIG *cfg) } } - for (i = 0; i < cfg->table_count; i++) + for (i = 0; i < cfg->table_count; i++) { if ((ret = session->create( session, cfg->uris[i], cfg->table_config)) != 0) { lprintf(cfg, ret, 0, "Error creating table %s", cfg->uris[i]); return (ret); } + if (cfg->index) { + snprintf(buf, 512, "index:%s:val_idx", + cfg->uris[i] + strlen("table:")); + if ((ret = session->create( + session, buf, "columns=(val)")) != 0) { + lprintf(cfg, ret, 0, + "Error creating index %s", buf); + return (ret); + } + } + } if ((ret = session->close(session, NULL)) != 0) { lprintf(cfg, ret, 0, "Error closing session"); @@ -2115,7 +2128,7 @@ main(int argc, char *argv[]) if ((ret = config_opt_str(cfg, "conn_config", cc_buf)) != 0) goto err; } - if (cfg->verbose > 1 || cfg->helium_mount != NULL || + if (cfg->verbose > 1 || cfg->index || cfg->helium_mount != NULL || user_tconfig != NULL || cfg->compress_table != NULL) { req_len = strlen(cfg->table_config) + strlen(HELIUM_CONFIG) + strlen(debug_tconfig) + 3; @@ -2123,6 +2136,8 @@ main(int argc, char *argv[]) req_len += strlen(user_tconfig); if (cfg->compress_table != NULL) req_len += strlen(cfg->compress_table); + if (cfg->index) + req_len += strlen(INDEX_COL_NAMES); if ((tc_buf = calloc(req_len, 1)) == NULL) { ret = enomem(cfg); goto err; @@ -2130,8 +2145,9 @@ main(int argc, char *argv[]) /* * This is getting hard to parse. */ - snprintf(tc_buf, req_len, "%s%s%s%s%s%s%s", + snprintf(tc_buf, req_len, "%s%s%s%s%s%s%s%s", cfg->table_config, + cfg->index ? INDEX_COL_NAMES : "", cfg->compress_table ? cfg->compress_table : "", cfg->verbose > 1 ? ",": "", cfg->verbose > 1 ? debug_tconfig : "", @@ -2351,15 +2367,16 @@ wtperf_rand(CONFIG_THREAD *thread) rval = (uint64_t)__wt_random(thread->rnd); /* Use Pareto distribution to give 80/20 hot/cold values. */ - if (cfg->pareto) { + if (cfg->pareto != 0) { #define PARETO_SHAPE 1.5 S1 = (-1 / PARETO_SHAPE); - S2 = wtperf_value_range(cfg) * 0.2 * (PARETO_SHAPE - 1); + S2 = wtperf_value_range(cfg) * + (cfg->pareto / 100.0) * (PARETO_SHAPE - 1); U = 1 - (double)rval / (double)UINT32_MAX; rval = (pow(U, S1) - 1) * S2; /* * This Pareto calculation chooses out of range values about - * about 2% of the time, from my testing. That will lead to the + * 2% of the time, from my testing. That will lead to the * first item in the table being "hot". */ if (rval > wtperf_value_range(cfg)) diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i index cfc39933517..cc3fd34e227 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i @@ -110,6 +110,8 @@ DEF_OPT_AS_UINT32(drop_tables, 0, DEF_OPT_AS_UINT32(icount, 5000, "number of records to initially populate. If multiple tables are " "configured the count is spread evenly across all tables.") +DEF_OPT_AS_BOOL(index, 0, + "Whether to create an index on the value field.") DEF_OPT_AS_BOOL(insert_rmw, 0, "execute a read prior to each insert in workload phase") DEF_OPT_AS_UINT32(key_sz, 20, "key size") @@ -119,7 +121,9 @@ DEF_OPT_AS_UINT32(min_throughput, 0, DEF_OPT_AS_UINT32(max_latency, 0, "abort if any latency measured exceeds this number of milliseconds." "Requires sample_interval to be configured") -DEF_OPT_AS_BOOL(pareto, 0, "use pareto 80/20 distribution for random numbers") +DEF_OPT_AS_UINT32(pareto, 0, "use pareto distribution for random numbers. Zero " + "to disable, otherwise a percentage indicating how aggressive the " + "distribution should be.") DEF_OPT_AS_UINT32(populate_ops_per_txn, 0, "number of operations to group into each transaction in the populate " "phase, zero for auto-commit") diff --git a/src/third_party/wiredtiger/build_posix/Make.subdirs b/src/third_party/wiredtiger/build_posix/Make.subdirs index 161fcd88fb4..748b932de32 100644 --- a/src/third_party/wiredtiger/build_posix/Make.subdirs +++ b/src/third_party/wiredtiger/build_posix/Make.subdirs @@ -20,6 +20,8 @@ examples/c lang/java JAVA examples/java JAVA lang/python PYTHON + +# Make the tests test/bloom test/checkpoint test/fops diff --git a/src/third_party/wiredtiger/build_win/filelist.win b/src/third_party/wiredtiger/build_win/filelist.win index d1b43bd8348..eb2c44f4bec 100644 --- a/src/third_party/wiredtiger/build_win/filelist.win +++ b/src/third_party/wiredtiger/build_win/filelist.win @@ -84,6 +84,7 @@ src/log/log.c src/log/log_auto.c src/log/log_slot.c src/lsm/lsm_cursor.c +src/lsm/lsm_cursor_bulk.c src/lsm/lsm_manager.c src/lsm/lsm_merge.c src/lsm/lsm_meta.c diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 48bbee6cdde..f9ce33f33a6 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -142,8 +142,8 @@ file_config = format_meta + [ are also available. See @ref compression for more information''', func='__wt_compressor_confchk'), Config('cache_resident', 'false', r''' - do not ever evict the object's pages; see @ref - tuning_cache_resident for more information''', + do not ever evict the object's pages from cache. Not compatible with + LSM tables; see @ref tuning_cache_resident for more information''', type='boolean'), Config('checksum', 'uncompressed', r''' configure block checksums; permitted values are <code>on</code> diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 7d57864a788..b140b422352 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -84,6 +84,7 @@ src/log/log.c src/log/log_auto.c src/log/log_slot.c src/lsm/lsm_cursor.c +src/lsm/lsm_cursor_bulk.c src/lsm/lsm_manager.c src/lsm/lsm_merge.c src/lsm/lsm_meta.c diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 34f3ab3e02f..544e3b5d549 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -44,6 +44,11 @@ flags = { 'READ_TRUNCATE', 'READ_WONT_NEED', ], + 'page_eviction' : [ + 'EVICT_CHECK_SPLITS', + 'EVICT_EXCLUSIVE', + 'EVICT_INMEM_SPLIT', + ], 'rec_write' : [ 'EVICTING', 'SKIP_UPDATE_ERR', @@ -106,7 +111,10 @@ flags = { 'SESSION_CAN_WAIT', 'SESSION_CLEAR_EVICT_WALK', 'SESSION_DISCARD_FORCE', - 'SESSION_HANDLE_LIST_LOCKED', + 'SESSION_LOCKED_CHECKPOINT', + 'SESSION_LOCKED_HANDLE_LIST', + 'SESSION_LOCKED_SCHEMA', + 'SESSION_LOCKED_TABLE', 'SESSION_INTERNAL', 'SESSION_LOGGING_INMEM', 'SESSION_NO_CACHE', @@ -115,9 +123,7 @@ flags = { 'SESSION_NO_LOGGING', 'SESSION_NO_SCHEMA_LOCK', 'SESSION_SALVAGE_CORRUPT_OK', - 'SESSION_SCHEMA_LOCKED', 'SESSION_SERVER_ASYNC', - 'SESSION_TABLE_LOCKED', ], } diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 2a6581cf880..eae95d527a1 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -44,6 +44,7 @@ WT_PACKED_STRUCT_BEGIN WT_PACKED_STRUCT_END WT_READ_BARRIER WT_REF_SIZE +WT_SESSION_LOCKED_CHECKPOINT WT_STAT_ATOMIC_DECR WT_STAT_ATOMIC_DECRV WT_STAT_ATOMIC_INCR diff --git a/src/third_party/wiredtiger/dist/s_docs b/src/third_party/wiredtiger/dist/s_docs index cf5f3962c19..96cacd3fb34 100755 --- a/src/third_party/wiredtiger/dist/s_docs +++ b/src/third_party/wiredtiger/dist/s_docs @@ -166,10 +166,12 @@ EOF (cd ../docs && mkdir -p man/man1 && mv man/man3/command_line.3 man/man1/wt.1 && sed -i~ -e 's/command_line/wt/g' man/man1/wt.1 && + sed -i~ -e 's/Version Version/Version/g' man/man1/wt.1 && rm -f man/man1/wt.1~ && mv man/man3/basic_api.3 man/ && rm -f man/man3/* && mv man/basic_api.3 man/man3/wiredtiger.3 && sed -i~ -e 's/basic_api/WiredTiger/g' man/man3/wiredtiger.3 && + sed -i~ -e 's/Version Version/Version/g' man/man3/wiredtiger.3 && rm -f man/man3/wiredtiger.3~) } diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index a014dc5414f..b197322e969 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -166,6 +166,7 @@ MEM MEMALIGN MERCHANTABILITY MSVC +MULTIBLOCK MUTEX Manos MapViewOfFile @@ -232,6 +233,7 @@ STAILQ STRUCT Scalability Scalable +Sedgewick Seigh SetEndOfFile SetFilePointerEx @@ -808,6 +810,7 @@ strtouq struct structs su +subdirectory subgetraw subgets subinit @@ -820,6 +823,7 @@ sys t's tV tablename +testutil th tid timestamp diff --git a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c index e6b8219aafb..9939a4f8a04 100644 --- a/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c +++ b/src/third_party/wiredtiger/ext/compressors/lz4/lz4_compress.c @@ -43,15 +43,16 @@ /* Local compressor structure. */ typedef struct { WT_COMPRESSOR compressor; /* Must come first */ + WT_EXTENSION_API *wt_api; /* Extension API */ } LZ4_COMPRESSOR; /* - * wt_lz4_error -- + * lz4_error -- * Output an error message, and return a standard error code. */ static int -wt_lz4_error( +lz4_error( WT_COMPRESSOR *compressor, WT_SESSION *session, const char *call, int zret) { WT_EXTENSION_API *wt_api; @@ -64,11 +65,11 @@ wt_lz4_error( } /* - * wt_lz4_compress -- + * lz4_compress -- * WiredTiger LZ4 compression. */ static int -wt_lz4_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, +lz4_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size_t src_len, uint8_t *dst, size_t dst_len, size_t *result_lenp, int *compression_failed) @@ -81,7 +82,7 @@ wt_lz4_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, * call, but be paranoid and error if it isn't. */ if (dst_len < src_len + sizeof(size_t)) - return (wt_lz4_error(compressor, session, + return (lz4_error(compressor, session, "LZ4 compress buffer too small", 0)); /* Store the length of the compressed block in the first 8 bytes. */ @@ -110,11 +111,11 @@ wt_lz4_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, } /* - * wt_lz4_decompress -- + * lz4_decompress -- * WiredTiger LZ4 decompression. */ static int -wt_lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, +lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size_t src_len, uint8_t *dst, size_t dst_len, size_t *result_lenp) @@ -131,7 +132,7 @@ wt_lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, if (src_data_len + sizeof(size_t) > src_len) { (void)wt_api->err_printf(wt_api, session, - "wt_lz4_decompress: stored size exceeds buffer size"); + "lz4_decompress: stored size exceeds buffer size"); return (WT_ERROR); } @@ -148,7 +149,7 @@ wt_lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, compressed_data, (char *)dst, (int)src_data_len, (int)dst_len); if (decoded < 0) - return (wt_lz4_error(compressor, session, + return (lz4_error(compressor, session, "LZ4 decompress error", decoded)); /* return the uncompressed data length */ @@ -158,11 +159,11 @@ wt_lz4_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, } /* - * wt_lz4_pre_size -- + * lz4_pre_size -- * WiredTiger LZ4 destination buffer sizing for compression. */ static int -wt_lz4_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, +lz4_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, uint8_t *src, size_t src_len, size_t *result_lenp) { @@ -179,11 +180,11 @@ wt_lz4_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session, } /* - * wt_lz4_terminate -- + * lz4_terminate -- * WiredTiger LZ4 compression termination. */ static int -wt_lz4_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) +lz4_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session) { (void)session; @@ -221,11 +222,11 @@ lz4_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) * the compressor is terminated. However, this approach is more general * purpose and supports multiple databases per application. */ - lz4_compressor->compressor.compress = wt_lz4_compress; + lz4_compressor->compressor.compress = lz4_compress; lz4_compressor->compressor.compress_raw = NULL; - lz4_compressor->compressor.decompress = wt_lz4_decompress; - lz4_compressor->compressor.pre_size = wt_lz4_pre_size; - lz4_compressor->compressor.terminate = wt_lz4_terminate; + lz4_compressor->compressor.decompress = lz4_decompress; + lz4_compressor->compressor.pre_size = lz4_pre_size; + lz4_compressor->compressor.terminate = lz4_terminate; lz4_compressor->wt_api = connection->get_extension_api(connection); @@ -246,6 +247,6 @@ lz4_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) int wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) { - return lz4_extension_init(connection, config); + return (lz4_extension_init(connection, config)); } #endif diff --git a/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c b/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c index 381bf0d5070..e5a313627a5 100644 --- a/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c +++ b/src/third_party/wiredtiger/ext/compressors/zlib/zlib_compress.c @@ -68,14 +68,14 @@ typedef struct { */ static int zlib_error( - WT_COMPRESSOR *compressor, WT_SESSION *session, const char *call, int zret) + WT_COMPRESSOR *compressor, WT_SESSION *session, const char *call, int error) { WT_EXTENSION_API *wt_api; wt_api = ((ZLIB_COMPRESSOR *)compressor)->wt_api; (void)wt_api->err_printf(wt_api, session, - "zlib error: %s: %s: %d", call, zError(zret), zret); + "zlib error: %s: %s: %d", call, zError(error), error); return (WT_ERROR); } @@ -154,32 +154,6 @@ zlib_compress(WT_COMPRESSOR *compressor, WT_SESSION *session, } /* - * zlib_find_slot -- - * Find the slot containing the target offset (binary search). - */ -static inline uint32_t -zlib_find_slot(uint64_t target, uint32_t *offsets, uint32_t slots) -{ - uint32_t base, indx, limit; - - indx = 1; - - /* Figure out which slot we got to: binary search */ - if (target >= offsets[slots]) - indx = slots; - else if (target > offsets[1]) - for (base = 2, limit = slots - base; limit != 0; limit >>= 1) { - indx = base + (limit >> 1); - if (target < offsets[indx]) - continue; - base = indx + 1; - --limit; - } - - return (indx); -} - -/* * zlib_decompress -- * WiredTiger zlib decompression. */ @@ -222,6 +196,32 @@ zlib_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session, } /* + * zlib_find_slot -- + * Find the slot containing the target offset (binary search). + */ +static inline uint32_t +zlib_find_slot(uint64_t target, uint32_t *offsets, uint32_t slots) +{ + uint32_t base, indx, limit; + + indx = 1; + + /* Figure out which slot we got to: binary search */ + if (target >= offsets[slots]) + indx = slots; + else if (target > offsets[1]) + for (base = 2, limit = slots - base; limit != 0; limit >>= 1) { + indx = base + (limit >> 1); + if (target < offsets[indx]) + continue; + base = indx + 1; + --limit; + } + + return (indx); +} + +/* * zlib_compress_raw -- * Pack records into a specified on-disk page size. */ @@ -267,8 +267,7 @@ zlib_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, /* Save the stream state in case the chosen data doesn't fit. */ if ((ret = deflateCopy(&last_zs, &zs)) != Z_OK) - return (zlib_error( - compressor, session, "deflateCopy", ret)); + return (zlib_error(compressor, session, "deflateCopy", ret)); /* * Strategy: take the available output size and compress that much @@ -410,8 +409,8 @@ zlib_add_compressor(WT_CONNECTION *connection, int raw, const char *name) ZLIB_COMPRESSOR *zlib_compressor; /* - * There are two almost identical zlib compressors: one supporting raw - * compression, and one without. + * There are two almost identical zlib compressors: one using raw + * compression to target a specific block size, and one without. */ if ((zlib_compressor = calloc(1, sizeof(ZLIB_COMPRESSOR))) == NULL) return (errno); @@ -426,13 +425,13 @@ zlib_add_compressor(WT_CONNECTION *connection, int raw, const char *name) zlib_compressor->wt_api = connection->get_extension_api(connection); /* - * between 0-10: level: see zlib manual. + * Between 0-10: level: see zlib manual. */ zlib_compressor->zlib_level = Z_DEFAULT_COMPRESSION; - /* Load the standard compressor. */ + /* Load the compressor. */ return (connection->add_compressor( - connection, name, &zlib_compressor->compressor, NULL)); + connection, name, (WT_COMPRESSOR *)zlib_compressor, NULL)); } int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); @@ -440,12 +439,11 @@ int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *); /* * zlib_extension_init -- * WiredTiger zlib compression extension - called directly when zlib - * support is built in, or via wiredtiger_extension_init when zlib - * support is included via extension loading. + * support is built in, or via wiredtiger_extension_init when zlib support + * is included via extension loading. */ int -zlib_extension_init( - WT_CONNECTION *connection, WT_CONFIG_ARG *config) +zlib_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) { int ret; @@ -468,8 +466,7 @@ zlib_extension_init( * WiredTiger zlib compression extension. */ int -wiredtiger_extension_init( - WT_CONNECTION *connection, WT_CONFIG_ARG *config) +wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config) { return (zlib_extension_init(connection, config)); } diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c index 1528d65b8c8..4709ac3260e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_compact.c +++ b/src/third_party/wiredtiger/src/btree/bt_compact.c @@ -110,7 +110,7 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) * * We're holding the schema lock which serializes with checkpoints. */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); /* * Get the tree handle's flush lock which blocks threads writing leaf diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 7c894effacd..0aed5940533 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -841,8 +841,11 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_ERR(ret); if (__cursor_valid(cbt, &upd)) WT_ERR(__wt_kv_return(session, cbt, upd)); - else - WT_ERR(__wt_btcur_search_near(cbt, NULL)); + else { + if ((ret = __wt_btcur_next(cbt, 0)) == WT_NOTFOUND) + ret = __wt_btcur_prev(cbt, 0); + WT_ERR(ret); + } err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 9cc7cd2a824..b1fa5ce6178 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -640,10 +640,14 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) __dmsg(ds, ", disk-mapped"); if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) __dmsg(ds, ", evict-lru"); + if (F_ISSET_ATOMIC(page, WT_PAGE_REFUSE_DEEPEN)) + __dmsg(ds, ", refuse-deepen"); if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING)) __dmsg(ds, ", scanning"); - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING)) - __dmsg(ds, ", splitting"); + if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) + __dmsg(ds, ", split-insert"); + if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)) + __dmsg(ds, ", split-locked"); if (mod != NULL) switch (F_ISSET(mod, WT_PM_REC_MASK)) { @@ -656,6 +660,9 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) case WT_PM_REC_REPLACE: __dmsg(ds, ", replaced"); break; + case WT_PM_REC_REWRITE: + __dmsg(ds, ", rewrite"); + break; case 0: break; WT_ILLEGAL_VALUE(session); diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index f43e936eeda..a05c6217338 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -56,7 +56,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING)); + WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)); #ifdef HAVE_DIAGNOSTIC { @@ -150,6 +150,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) switch (F_ISSET(mod, WT_PM_REC_MASK)) { case WT_PM_REC_MULTIBLOCK: + case WT_PM_REC_REWRITE: /* Free list of replacement blocks. */ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index e249f997d87..4303ba4cd48 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -133,16 +133,13 @@ __wt_btree_close(WT_SESSION_IMPL *session) { WT_BM *bm; WT_BTREE *btree; - WT_DATA_HANDLE *dhandle; WT_DECL_RET; - dhandle = session->dhandle; btree = S2BT(session); if ((bm = btree->bm) != NULL) { /* Unload the checkpoint, unless it's a special command. */ - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(btree, + if (!F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) WT_TRET(bm->checkpoint_unload(bm, session)); @@ -173,6 +170,8 @@ __wt_btree_close(WT_SESSION_IMPL *session) btree->bulk_load_ok = 0; + F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); + return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 8086806b3a4..17d9442e1a4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -49,8 +49,12 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) /* Trigger eviction on the next page release. */ __wt_page_evict_soon(page); + /* Bump the oldest ID, we're about to do some visibility checks. */ + __wt_txn_update_oldest(session, 0); + /* If eviction cannot succeed, don't try. */ - return (__wt_page_can_evict(session, page, 1)); + return ( + __wt_page_can_evict(session, page, WT_EVICT_CHECK_SPLITS, NULL)); } /* @@ -181,8 +185,11 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags skip_evict: /* * Check if we need an autocommit transaction. + * Starting a transaction can trigger eviction, so skip + * it if eviction isn't permitted. */ - return (__wt_txn_autocommit_check(session)); + return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : + __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 896ab23f1c2..1bfd03f58cb 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -327,7 +327,7 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) */ if (ss->root_ref.page != NULL) { btree->ckpt = ckptbase; - ret = __wt_evict(session, &ss->root_ref, 1); + ret = __wt_evict(session, &ss->root_ref, WT_EVICT_EXCLUSIVE); ss->root_ref.page = NULL; btree->ckpt = NULL; } @@ -1313,7 +1313,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) ret = __wt_page_release(session, ref, 0); if (ret == 0) - ret = __wt_evict(session, ref, 1); + ret = __wt_evict(session, ref, WT_EVICT_EXCLUSIVE); if (0) { err: WT_TRET(__wt_page_release(session, ref, 0)); @@ -2022,7 +2022,7 @@ __slvg_row_build_leaf( */ ret = __wt_page_release(session, ref, 0); if (ret == 0) - ret = __wt_evict(session, ref, 1); + ret = __wt_evict(session, ref, WT_EVICT_EXCLUSIVE); if (0) { err: WT_TRET(__wt_page_release(session, ref, 0)); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 59ad7abb221..d4c8cf1b92d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -670,7 +670,6 @@ __split_multi_inmem( * when discarding the original page, and our caller will discard the * allocated page on error, when discarding the allocated WT_REF. */ - WT_RET(__wt_page_inmem(session, ref, multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size, WT_PAGE_DISK_ALLOC, &page)); @@ -815,7 +814,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, */ static int __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, - WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive) + WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, uint32_t flags) { WT_DECL_RET; WT_IKEY *ikey; @@ -849,13 +848,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, */ for (;;) { parent = ref->home; - F_CAS_ATOMIC(parent, WT_PAGE_SPLITTING, ret); + F_CAS_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED, ret); if (ret == 0) { if (parent == ref->home) break; - F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING); + F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); continue; } + /* + * If we're attempting an in-memory split and we can't lock the + * parent, give up. This avoids an infinite loop where we are + * trying to split a page while its parent is being + * checkpointed. + */ + if (LF_ISSET(WT_EVICT_INMEM_SPLIT)) + return (EBUSY); __wt_yield(); } @@ -865,6 +872,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * update the parent's index, it will no longer refer to the child, and * could conceivably be evicted. Get a hazard pointer on the parent * now, so that we can safely access it after updating the index. + * + * Take care that getting the page doesn't trigger eviction, or we + * could block trying to split a different child of our parent and + * deadlock. */ if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); @@ -1031,7 +1042,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * Add it to the session discard list, to be freed when it's safe. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); + WT_TRET(__split_safe_free(session, + split_gen, LF_ISSET(WT_EVICT_EXCLUSIVE), pindex, size)); parent_decr += size; /* @@ -1056,7 +1068,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * Do the check here because we've just grown the parent page and * are holding it locked. */ - if (ret == 0 && !exclusive && + if (ret == 0 && !LF_ISSET(WT_EVICT_EXCLUSIVE) && !F_ISSET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN) && __split_should_deepen(session, parent_ref, &children)) { /* @@ -1078,7 +1090,7 @@ err: if (!complete) if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING); + F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); if (hazard) WT_TRET(__wt_hazard_clear(session, parent)); @@ -1102,9 +1114,8 @@ err: if (!complete) * list into a separate page. */ int -__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) +__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) { - WT_BTREE *btree; WT_DECL_RET; WT_DECL_ITEM(key); WT_INSERT *ins, **insp, *moved_ins, *prev_ins; @@ -1114,60 +1125,20 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) size_t page_decr, parent_incr, right_incr; int i; - *splitp = 0; - - btree = S2BT(session); page = ref->page; right = NULL; page_decr = parent_incr = right_incr = 0; - /* - * Check for pages with append-only workloads. A common application - * pattern is to have multiple threads frantically appending to the - * tree. We want to reconcile and evict this page, but we'd like to - * do it without making the appending threads wait. If we're not - * discarding the tree, check and see if it's worth doing a split to - * let the threads continue before doing eviction. - * - * Ignore anything other than large, dirty row-store leaf pages. - * - * XXX KEITH - * Need a better test for append-only workloads. - */ - if (page->type != WT_PAGE_ROW_LEAF || - page->memory_footprint < btree->maxmempage || - !__wt_page_is_modified(page)) - return (0); + WT_ASSERT(session, __wt_page_can_split(session, page)); - /* - * There is no point splitting if the list is small, no deep items is - * our heuristic for that. (A 1/4 probability of adding a new skiplist - * level means there will be a new 6th level for roughly each 4KB of - * entries in the list. If we have at least two 6th level entries, the - * list is at least large enough to work with.) - * - * The following code requires at least two items on the insert list, - * this test serves the additional purpose of confirming that. - */ -#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1) + /* Find the last item on the page. */ ins_head = page->pg_row_entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); - if (ins_head == NULL || - ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL || - ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == - ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH]) - return (0); - - /* Find the last item in the insert list. */ moved_ins = WT_SKIP_LAST(ins_head); - /* - * Only split a page once, otherwise workloads that update in the middle - * of the page could continually split without benefit. - */ - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) - return (0); + /* Mark that this page has already been through an in-memory split. */ + WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)); F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); /* @@ -1360,8 +1331,8 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) * longer locked, so we cannot safely look at it. */ page = NULL; - if ((ret = __split_parent( - session, ref, split_ref, 2, parent_incr, 0)) != 0) { + if ((ret = __split_parent(session, + ref, split_ref, 2, parent_incr, WT_EVICT_INMEM_SPLIT)) != 0) { /* * Move the insert list element back to the original page list. * For simplicity, the previous skip list pointers originally @@ -1384,9 +1355,6 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) WT_ERR(ret); } - /* Let our caller know that we split. */ - *splitp = 1; - WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); WT_STAT_FAST_DATA_INCR(session, cache_inmem_split); @@ -1480,8 +1448,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) page, &mod->mod_multi[i], &ref_new[i], &parent_incr)); /* Split into the parent. */ - WT_ERR(__split_parent( - session, ref, ref_new, new_entries, parent_incr, exclusive)); + WT_ERR(__split_parent( session, ref, ref_new, + new_entries, parent_incr, exclusive ? WT_EVICT_EXCLUSIVE : 0)); WT_STAT_FAST_CONN_INCR(session, cache_eviction_split); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split); diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index dae2dd8d480..cc52f63f1f5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -71,7 +71,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) __wt_txn_visible_all( session, page->modify->update_txn)) { if (txn->isolation == TXN_ISO_READ_COMMITTED) - __wt_txn_refresh(session, 1); + __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); @@ -150,7 +150,8 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(txn, TXN_HAS_SNAPSHOT) && - TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { + TXNID_LT(txn->snap_max, mod->first_dirty_txn) && + !F_ISSET(mod, WT_PM_REC_REWRITE)) { __wt_page_modify_set(session, page); continue; } diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 0e351682e9e..d56b44bbd95 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -287,9 +287,11 @@ __wt_update_alloc( * Check for obsolete updates. */ WT_UPDATE * -__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) +__wt_update_obsolete_check( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) { WT_UPDATE *first, *next; + u_int count; /* * This function identifies obsolete updates, and truncates them from @@ -299,7 +301,7 @@ __wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) * * Walk the list of updates, looking for obsolete updates at the end. */ - for (first = NULL; upd != NULL; upd = upd->next) + for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++) if (__wt_txn_visible_all(session, upd->txnid)) { if (first == NULL) first = upd; @@ -317,6 +319,14 @@ __wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) WT_ATOMIC_CAS8(first->next, next, NULL)) return (next); + /* + * If the list is long, don't retry checks on this page until the + * transaction state has moved forwards. + */ + if (count > 20) + page->modify->obsolete_check_txn = + S2C(session)->txn_global.last_running; + return (NULL); } diff --git a/src/third_party/wiredtiger/src/config/config_api.c b/src/third_party/wiredtiger/src/config/config_api.c index deff33a10bd..4e8f0d08400 100644 --- a/src/third_party/wiredtiger/src/config/config_api.c +++ b/src/third_party/wiredtiger/src/config/config_api.c @@ -322,8 +322,7 @@ __wt_configure_method(WT_SESSION_IMPL *session, newcheck = &checks[cnt]; newcheck->name = newcheck_name; WT_ERR(__wt_strdup(session, type, &newcheck->type)); - if (check != NULL) - WT_ERR(__wt_strdup(session, check, &newcheck->checks)); + WT_ERR(__wt_strdup(session, check, &newcheck->checks)); entry->checks = checks; entry->checks_entries = 0; diff --git a/src/third_party/wiredtiger/src/config/config_concat.c b/src/third_party/wiredtiger/src/config/config_concat.c deleted file mode 100644 index e872722a272..00000000000 --- a/src/third_party/wiredtiger/src/config/config_concat.c +++ /dev/null @@ -1,72 +0,0 @@ -/*- - * Copyright (c) 2014-2015 MongoDB, Inc. - * Copyright (c) 2008-2014 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -/* - * __wt_config_concat -- - * Given a NULL-terminated list of configuration strings, concatenate them - * into newly allocated memory. Nothing special is assumed about any of - * the config strings, they are simply combined in order. - * - * This code deals with the case where some of the config strings are - * wrapped in brackets but others aren't: the resulting string does not - * have brackets. - */ -int -__wt_config_concat( - WT_SESSION_IMPL *session, const char **cfg, char **config_ret) -{ - WT_CONFIG cparser; - WT_CONFIG_ITEM k, v; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - const char **cp; - - WT_RET(__wt_scr_alloc(session, 0, &tmp)); - - for (cp = cfg; *cp != NULL; ++cp) { - WT_ERR(__wt_config_init(session, &cparser, *cp)); - while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) { - if (k.type != WT_CONFIG_ITEM_STRING && - k.type != WT_CONFIG_ITEM_ID) - WT_ERR_MSG(session, EINVAL, - "Invalid configuration key found: '%s'\n", - k.str); - /* Include the quotes around string keys/values. */ - if (k.type == WT_CONFIG_ITEM_STRING) { - --k.str; - k.len += 2; - } - if (v.type == WT_CONFIG_ITEM_STRING) { - --v.str; - v.len += 2; - } - WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s%s%.*s,", - (int)k.len, k.str, - (v.len > 0) ? "=" : "", - (int)v.len, v.str)); - } - if (ret != WT_NOTFOUND) - goto err; - } - - /* - * If the caller passes us no valid configuration strings, we get here - * with no bytes to copy -- that's OK, the underlying string copy can - * handle empty strings. - * - * Strip any trailing comma. - */ - if (tmp->size != 0) - --tmp->size; - ret = __wt_strndup(session, tmp->data, tmp->size, config_ret); - -err: __wt_scr_free(session, &tmp); - return (ret); -} diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 3214926bcf1..56f15a89f30 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -919,8 +919,7 @@ __conn_open_session(WT_CONNECTION *wt_conn, CONNECTION_API_CALL(conn, session, open_session, config, cfg); WT_UNUSED(cfg); - WT_ERR(__wt_open_session(conn, event_handler, config, &session_ret)); - + WT_ERR(__wt_open_session(conn, event_handler, config, 1, &session_ret)); *wt_sessionp = &session_ret->iface; err: API_END_RET_NOTFOUND_MAP(session, ret); diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c index 8de8cd3f8bc..1edd9dac7fb 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache.c @@ -176,13 +176,22 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS *stats; + uint64_t inuse, leaf, used; conn = S2C(session); cache = conn->cache; stats = &conn->stats; + inuse = __wt_cache_bytes_inuse(cache); + /* + * There are races updating the different cache tracking values so + * be paranoid calculating the leaf byte usage. + */ + used = cache->bytes_overflow + cache->bytes_internal; + leaf = inuse > used ? inuse - used : 0; + WT_STAT_SET(stats, cache_bytes_max, conn->cache_size); - WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache)); + WT_STAT_SET(stats, cache_bytes_inuse, inuse); WT_STAT_SET(stats, cache_overhead, cache->overhead_pct); WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache)); @@ -191,11 +200,9 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session) cache_eviction_maximum_page_size, cache->evict_max_page_size); WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty); - /* Figure out internal, leaf and overflow stats */ WT_STAT_SET(stats, cache_bytes_internal, cache->bytes_internal); - WT_STAT_SET(stats, cache_bytes_leaf, - conn->cache_size - (cache->bytes_internal + cache->bytes_overflow)); WT_STAT_SET(stats, cache_bytes_overflow, cache->bytes_overflow); + WT_STAT_SET(stats, cache_bytes_leaf, leaf); } /* diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index 488864ce351..de7e9e3486f 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -96,8 +96,8 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) __wt_process.cache_pool = cp; WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Created cache pool %s", cp->name)); - } else if (!updating && !WT_STRING_MATCH( - __wt_process.cache_pool->name, pool_name, strlen(pool_name))) + } else if (!updating && + strcmp(__wt_process.cache_pool->name, pool_name) != 0) /* Only a single cache pool is supported. */ WT_ERR_MSG(session, WT_ERROR, "Attempting to join a cache pool that does not exist: %s", diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 60e7c41f76d..07d4cce40f5 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -9,119 +9,57 @@ #include "wt_internal.h" /* - * __conn_dhandle_open_lock -- - * Spin on the current data handle until either (a) it is open, read - * locked; or (b) it is closed, write locked. If exclusive access is - * requested and cannot be granted immediately because the handle is - * in use, fail with EBUSY. - * - * Here is a brief summary of how different operations synchronize using - * either the schema lock, handle locks or handle flags: - * - * open -- holds the schema lock, one thread gets the handle exclusive, - * reverts to a shared handle lock and drops the schema lock - * once the handle is open; - * bulk load -- sets bulk and exclusive; - * salvage, truncate, update, verify -- hold the schema lock, set a - * "special" flag; - * sweep -- gets a write lock on the handle, doesn't set exclusive - * - * The schema lock prevents a lot of potential conflicts: we should never - * see handles being salvaged or verified because those operation hold the - * schema lock. However, it is possible to see a handle that is being - * bulk loaded, or that the sweep server is closing. - * - * The principle here is that application operations can cause other - * application operations to fail (so attempting to open a cursor on a - * file while it is being bulk-loaded will fail), but internal or - * database-wide operations should not prevent application-initiated - * operations. For example, attempting to verify a file should not fail - * because the sweep server happens to be in the process of closing that - * file. + * __conn_dhandle_destroy -- + * Destroy a data handle. + */ +static int +__conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) +{ + WT_DECL_RET; + + ret = __wt_rwlock_destroy(session, &dhandle->rwlock); + __wt_free(session, dhandle->name); + __wt_free(session, dhandle->checkpoint); + __wt_free(session, dhandle->handle); + __wt_spin_destroy(session, &dhandle->close_lock); + __wt_overwrite_and_free(session, dhandle); + + return (ret); +} + +/* + * __conn_dhandle_alloc -- + * Allocate a new data handle and return it linked into the connection's + * list. */ static int -__conn_dhandle_open_lock( - WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, uint32_t flags) +__conn_dhandle_alloc(WT_SESSION_IMPL *session, + const char *uri, const char *checkpoint, WT_DATA_HANDLE **dhandlep) { WT_BTREE *btree; + WT_DATA_HANDLE *dhandle; WT_DECL_RET; - int is_open, lock_busy, want_exclusive; - btree = dhandle->handle; - lock_busy = 0; - want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE) ? 1 : 0; + WT_RET(__wt_calloc_one(session, &dhandle)); - /* - * Check that the handle is open. We've already incremented - * the reference count, so once the handle is open it won't be - * closed by another thread. - * - * If we can see the WT_DHANDLE_OPEN flag set while holding a - * lock on the handle, then it's really open and we can start - * using it. Alternatively, if we can get an exclusive lock - * and WT_DHANDLE_OPEN is still not set, we need to do the open. - */ - for (;;) { - /* - * If the handle is already open for a special operation, - * give up. - */ - if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) - return (EBUSY); + WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle")); + dhandle->name_hash = __wt_hash_city64(uri, strlen(uri)); + WT_ERR(__wt_strdup(session, uri, &dhandle->name)); + WT_ERR(__wt_strdup(session, checkpoint, &dhandle->checkpoint)); - /* - * If the handle is open, get a read lock and recheck. - * - * Wait for a read lock if we want exclusive access and failed - * to get it: the sweep server may be closing this handle, and - * we need to wait for it to release its lock. If we want - * exclusive access and find the handle open once we get the - * read lock, give up: some other thread has it locked for real. - */ - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - (!want_exclusive || lock_busy)) { - WT_RET(__wt_readlock(session, dhandle->rwlock)); - is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN) ? 1 : 0; - if (is_open && !want_exclusive) { - WT_ASSERT(session, - !F_ISSET(dhandle, WT_DHANDLE_DEAD)); - return (0); - } - WT_RET(__wt_readunlock(session, dhandle->rwlock)); - } else - is_open = 0; + /* TODO: abstract this out for other data handle types */ + WT_ERR(__wt_calloc_one(session, &btree)); + dhandle->handle = btree; + btree->dhandle = dhandle; - /* - * It isn't open or we want it exclusive: try to get an - * exclusive lock. There is some subtlety here: if we race - * with another thread that successfully opens the file, we - * don't want to block waiting to get exclusive access. - */ - if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) { - /* - * If it was opened while we waited, drop the write - * lock and get a read lock instead. - */ - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !want_exclusive) { - lock_busy = 0; - WT_RET( - __wt_writeunlock(session, dhandle->rwlock)); - continue; - } + WT_ERR(__wt_spin_init( + session, &dhandle->close_lock, "data handle close")); - /* We have an exclusive lock, we're done. */ - F_SET(dhandle, WT_DHANDLE_EXCLUSIVE); - WT_ASSERT(session, !F_ISSET(dhandle, WT_DHANDLE_DEAD)); - return (0); - } else if (ret != EBUSY || (is_open && want_exclusive)) - return (ret); - else - lock_busy = 1; + *dhandlep = dhandle; + return (0); - /* Give other threads a chance to make progress. */ - __wt_yield(); - } +err: WT_TRET(__conn_dhandle_destroy(session, dhandle)); + return (ret); } /* @@ -129,8 +67,8 @@ __conn_dhandle_open_lock( * Find a previously opened data handle. */ int -__wt_conn_dhandle_find(WT_SESSION_IMPL *session, - const char *name, const char *ckpt, uint32_t flags) +__wt_conn_dhandle_find( + WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; @@ -138,20 +76,16 @@ __wt_conn_dhandle_find(WT_SESSION_IMPL *session, conn = S2C(session); - /* - * We must be holding the handle list lock at a higher level, and not - * have a reference. - */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED) && - !LF_ISSET(WT_DHANDLE_HAVE_REF)); + /* We must be holding the handle list lock at a higher level. */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - bucket = __wt_hash_city64(name, strlen(name)) % WT_HASH_ARRAY_SIZE; - if (ckpt == NULL) { + bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; + if (checkpoint == NULL) { SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) { if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; if (dhandle->checkpoint == NULL && - strcmp(name, dhandle->name) == 0) { + strcmp(uri, dhandle->name) == 0) { session->dhandle = dhandle; return (0); } @@ -161,90 +95,25 @@ __wt_conn_dhandle_find(WT_SESSION_IMPL *session, if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; if (dhandle->checkpoint != NULL && - strcmp(name, dhandle->name) == 0 && - strcmp(ckpt, dhandle->checkpoint) == 0) { + strcmp(uri, dhandle->name) == 0 && + strcmp(checkpoint, dhandle->checkpoint) == 0) { session->dhandle = dhandle; return (0); } } - return (WT_NOTFOUND); -} - -/* - * __conn_dhandle_get -- - * Allocate a new data handle, lock it exclusively, and return it linked - * into the connection's list. - */ -static int -__conn_dhandle_get(WT_SESSION_IMPL *session, - const char *name, const char *ckpt, uint32_t flags) -{ - WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - uint32_t bucket; - - conn = S2C(session); - - /* - * We have the handle lock, check whether we can find the handle we - * are looking for. If we do, and we can lock it in the state we - * want, this session will take ownership and we are done. - */ - ret = __wt_conn_dhandle_find(session, name, ckpt, flags); - if (ret == 0) { - dhandle = session->dhandle; - WT_RET(__conn_dhandle_open_lock(session, dhandle, flags)); - return (0); - } - WT_RET_NOTFOUND_OK(ret); - - /* - * If no handle was found, allocate the data handle and a btree handle, - * then initialize the data handle. Exclusively lock the data handle - * before inserting it in the list. - */ - WT_RET(__wt_calloc_one(session, &dhandle)); - - WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle")); - - dhandle->name_hash = __wt_hash_city64(name, strlen(name)); - WT_ERR(__wt_strdup(session, name, &dhandle->name)); - if (ckpt != NULL) - WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint)); - - WT_ERR(__wt_calloc_one(session, &btree)); - dhandle->handle = btree; - btree->dhandle = dhandle; - - WT_ERR(__wt_spin_init( - session, &dhandle->close_lock, "data handle close")); - - F_SET(dhandle, WT_DHANDLE_EXCLUSIVE); - WT_ERR(__wt_writelock(session, dhandle->rwlock)); + WT_RET(__conn_dhandle_alloc(session, uri, checkpoint, &dhandle)); /* * Prepend the handle to the connection list, assuming we're likely to * need new files again soon, until they are cached by all sessions. * Find the right hash bucket to insert into as well. */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket); session->dhandle = dhandle; return (0); - -err: WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock)); - __wt_free(session, dhandle->name); - __wt_free(session, dhandle->checkpoint); - __wt_free(session, dhandle->handle); /* btree free */ - __wt_spin_destroy(session, &dhandle->close_lock); - __wt_overwrite_and_free(session, dhandle); - - return (ret); } /* @@ -256,8 +125,6 @@ __conn_dhandle_mark_dead(WT_SESSION_IMPL *session) { int evict_reset; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); - /* * Handle forced discard (e.g., when dropping a file). * @@ -281,10 +148,11 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int final, int force) WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - int no_schema_lock; + int marked_dead, no_schema_lock; - dhandle = session->dhandle; btree = S2BT(session); + dhandle = session->dhandle; + marked_dead = 0; if (!F_ISSET(dhandle, WT_DHANDLE_OPEN)) return (0); @@ -297,7 +165,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int final, int force) * a handle lock (specifically, checkpoint). */ no_schema_lock = 0; - if (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { + if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { no_schema_lock = 1; F_SET(session, WT_SESSION_NO_SCHEMA_LOCK); } @@ -320,18 +188,27 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int final, int force) * invalid if the mapping is closed. */ if (!F_ISSET(btree, - WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) - WT_ERR(force && (btree->bm == NULL || btree->bm->map == NULL) ? - __conn_dhandle_mark_dead(session) : - __wt_checkpoint_close(session, final)); + WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { + if (force && (btree->bm == NULL || btree->bm->map == NULL)) { + WT_ERR(__conn_dhandle_mark_dead(session)); + marked_dead = 1; + } else + WT_ERR(__wt_checkpoint_close(session, final)); + } WT_TRET(__wt_btree_close(session)); - if (!force || final) { + /* + * If we marked a handle as dead it will be closed by sweep, via + * another call to sync and close. + */ + if (!marked_dead) { F_CLR(dhandle, WT_DHANDLE_OPEN); if (dhandle->checkpoint == NULL) --S2C(session)->open_btree_count; } - F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); + WT_ASSERT(session, + F_ISSET(dhandle, WT_DHANDLE_DEAD) || + !F_ISSET(dhandle, WT_DHANDLE_OPEN)); err: __wt_spin_unlock(session, &dhandle->close_lock); @@ -408,11 +285,12 @@ err: __wt_free(session, metaconf); } /* - * __conn_btree_open -- + * __wt_conn_btree_open -- * Open the current btree handle. */ -static int -__conn_btree_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) +int +__wt_conn_btree_open( + WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; @@ -421,24 +299,23 @@ __conn_btree_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) dhandle = session->dhandle; btree = S2BT(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) && + WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && !LF_ISSET(WT_DHANDLE_LOCK_ONLY)); WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING)); /* - * If the handle is already open, it has to be closed so it can be - * reopened with a new configuration. We don't need to check again: - * this function isn't called if the handle is already open in the - * required mode. + * If the handle is already open, it has to be closed so it can + * be reopened with a new configuration. * - * This call can return EBUSY if there's an update in the object that's - * not yet globally visible. That's not a problem because it can only - * happen when we're switching from a normal handle to a "special" one, - * so we're returning EBUSY to an attempt to verify or do other special - * operations. The reverse won't happen because when the handle from a - * verify or other special operation is closed, there won't be updates + * This call can return EBUSY if there's an update in the + * object that's not yet globally visible. That's not a + * problem because it can only happen when we're switching from + * a normal handle to a "special" one, so we're returning EBUSY + * to an attempt to verify or do other special operations. The + * reverse won't happen because when the handle from a verify + * or other special operation is closed, there won't be updates * in the tree that can block the close. */ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) @@ -451,72 +328,24 @@ __conn_btree_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) /* Set any special flags on the handle. */ F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS)); - do { - WT_ERR(__wt_btree_open(session, cfg)); - F_SET(dhandle, WT_DHANDLE_OPEN); - /* - * Checkpoint handles are read only, so eviction calculations - * based on the number of btrees are better to ignore them. - */ - if (dhandle->checkpoint == NULL) - ++S2C(session)->open_btree_count; - - /* Drop back to a readlock if that is all that was needed. */ - if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE)) { - F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - WT_ERR(__wt_writeunlock(session, dhandle->rwlock)); - WT_ERR( - __conn_dhandle_open_lock(session, dhandle, flags)); - } - } while (!F_ISSET(dhandle, WT_DHANDLE_OPEN)); + WT_ERR(__wt_btree_open(session, cfg)); + F_SET(dhandle, WT_DHANDLE_OPEN); + + /* + * Checkpoint handles are read only, so eviction calculations + * based on the number of btrees are better to ignore them. + */ + if (dhandle->checkpoint == NULL) + ++S2C(session)->open_btree_count; if (0) { err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); - /* If the open failed, close the handle. */ - if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) - WT_TRET(__wt_conn_btree_sync_and_close(session, 0, 0)); } return (ret); } /* - * __wt_conn_btree_get -- - * Get an open btree file handle, otherwise open a new one. - */ -int -__wt_conn_btree_get(WT_SESSION_IMPL *session, - const char *name, const char *ckpt, const char *cfg[], uint32_t flags) -{ - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - - if (LF_ISSET(WT_DHANDLE_HAVE_REF)) - WT_RET( - __conn_dhandle_open_lock(session, session->dhandle, flags)); - else { - WT_WITH_DHANDLE_LOCK(session, - ret = __conn_dhandle_get(session, name, ckpt, flags)); - WT_RET(ret); - } - dhandle = session->dhandle; - - if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) && - (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || - LF_ISSET(WT_BTREE_SPECIAL_FLAGS))) - if ((ret = __conn_btree_open(session, cfg, flags)) != 0) { - F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); - } - - WT_ASSERT(session, ret != 0 || - LF_ISSET(WT_DHANDLE_EXCLUSIVE) == - F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE)); - - return (ret); -} - -/* * __conn_btree_apply_internal -- * Apply a function to the open btree handles. */ @@ -561,7 +390,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); /* * If we're given a URI, then we walk only the hash list for that @@ -651,7 +480,7 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); hash = __wt_hash_city64(uri, strlen(uri)); bucket = hash % WT_HASH_ARRAY_SIZE; @@ -689,7 +518,7 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, */ int __wt_conn_dhandle_close_all( - WT_SESSION_IMPL *session, const char *name, int force) + WT_SESSION_IMPL *session, const char *uri, int force) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; @@ -698,12 +527,12 @@ __wt_conn_dhandle_close_all( conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); WT_ASSERT(session, session->dhandle == NULL); - bucket = __wt_hash_city64(name, strlen(name)) % WT_HASH_ARRAY_SIZE; + bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) { - if (strcmp(dhandle->name, name) != 0 || + if (strcmp(dhandle->name, uri) != 0 || F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; @@ -759,7 +588,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, int final) dhandle = session->dhandle; bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); /* Check if the handle was reacquired by a session while we waited. */ if (!final && @@ -799,25 +628,19 @@ __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final, int force) * Kludge: interrupt the eviction server in case it is holding the * handle list lock. */ - if (!F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)) + if (!F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) F_SET(S2C(session)->cache, WT_CACHE_CLEAR_WALKS); /* Try to remove the handle, protected by the data handle lock. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, WT_TRET(__conn_dhandle_remove(session, final))); /* * After successfully removing the handle, clean it up. */ if (ret == 0 || final) { - WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock)); - __wt_free(session, dhandle->name); - __wt_free(session, dhandle->checkpoint); __conn_btree_config_clear(session); - __wt_free(session, dhandle->handle); - __wt_spin_destroy(session, &dhandle->close_lock); - __wt_overwrite_and_free(session, dhandle); - + WT_TRET(__conn_dhandle_destroy(session, dhandle)); session->dhandle = NULL; } diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index f2d50e09561..75fdd7a9aa1 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -408,7 +408,7 @@ __log_wrlsn_server(void *arg) * as soon as one is not in order. */ for (i = 0; i < written_i; i++) { - if (LOG_CMP(&log->write_lsn, + if (WT_LOG_CMP(&log->write_lsn, &written[i].lsn) != 0) break; /* @@ -416,7 +416,7 @@ __log_wrlsn_server(void *arg) * Advance the LSN and process the slot. */ slot = &log->slot_pool[written[i].slot_index]; - WT_ASSERT(session, LOG_CMP(&written[i].lsn, + WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn, &slot->slot_release_lsn) == 0); log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index bf363e81215..ff3ad7a67f7 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -92,7 +92,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * transaction ID will catch up with the current ID. */ for (;;) { - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); if (txn_global->oldest_id == txn_global->current) break; __wt_yield(); diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c index 647e4b02abb..8acbd84ccba 100644 --- a/src/third_party/wiredtiger/src/conn/conn_stat.c +++ b/src/third_party/wiredtiger/src/conn/conn_stat.c @@ -323,7 +323,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) * any that match the list of object sources. */ if (conn->stat_sources != NULL) { - WT_WITH_DHANDLE_LOCK(session, ret = + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_conn_btree_apply( session, 0, NULL, __statlog_apply, NULL)); WT_RET(ret); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index fc29e0b2e15..3a07f2afe17 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -249,14 +249,14 @@ __sweep_server(void *arg) /* Close handles if we have reached the configured limit */ if (conn->open_file_count >= conn->sweep_handles_min) { - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __sweep_expire(session)); WT_ERR(ret); } WT_ERR(__sweep_flush(session)); - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __sweep_remove_handles(session)); WT_ERR(ret); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index a201477abe3..e366a3673b8 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -376,7 +376,7 @@ __backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) WT_ERR_NOTFOUND_OK(ret); /* Build a list of the file objects that need to be copied. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_meta_btree_apply( session, __backup_list_all_append, NULL)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index 44a00d4d192..92d4d583300 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -513,12 +513,12 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, * open failing with EBUSY due to a database-wide checkpoint. */ if (bulk) - __wt_spin_lock( - session, &S2C(session)->checkpoint_lock); - ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags); - if (bulk) - __wt_spin_unlock( - session, &S2C(session)->checkpoint_lock); + WT_WITH_CHECKPOINT_LOCK(session, ret = + __wt_session_get_btree_ckpt( + session, uri, cfg, flags)); + else + ret = __wt_session_get_btree_ckpt( + session, uri, cfg, flags); WT_RET(ret); } else WT_RET(__wt_bad_object_type(session, uri)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c index b7f11576425..4b72a472cb7 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_log.c +++ b/src/third_party/wiredtiger/src/cursor/cur_log.c @@ -74,7 +74,7 @@ __curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) acl = (WT_CURSOR_LOG *)a; bcl = (WT_CURSOR_LOG *)b; WT_ASSERT(session, cmpp != NULL); - *cmpp = LOG_CMP(acl->cur_lsn, bcl->cur_lsn); + *cmpp = WT_LOG_CMP(acl->cur_lsn, bcl->cur_lsn); /* * If both are on the same LSN, compare step counter. */ diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index 864c116a380..21f6a1f016a 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -27,7 +27,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); /* Make sure the oldest transaction ID is up-to-date. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); /* Walk the tree, discarding pages. */ next_ref = NULL; @@ -76,11 +76,11 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) /* * Evict the page. */ - WT_ERR(__wt_evict(session, ref, 1)); + WT_ERR(__wt_evict(session, ref, WT_EVICT_EXCLUSIVE)); break; case WT_SYNC_DISCARD: WT_ASSERT(session, - __wt_page_can_evict(session, page, 0)); + __wt_page_can_evict(session, page, 0, NULL)); __wt_evict_page_clean_update(session, ref); break; case WT_SYNC_DISCARD_FORCE: diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index a22531277dd..3ad7e8a2723 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -681,7 +681,7 @@ __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref) * before evicting, using a special "eviction" isolation level, where * only globally visible updates can be evicted. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); txn = &session->txn; saved_iso = txn->isolation; txn->isolation = TXN_ISO_EVICTION; @@ -941,7 +941,7 @@ __evict_walk(WT_SESSION_IMPL *session, uint32_t flags) * after a long-running transaction (such as a checkpoint) completes, * we may never start evicting again. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); if (cache->evict_current == NULL) WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty); @@ -1232,7 +1232,8 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) } fast: /* If the page can't be evicted, give up. */ - if (!__wt_page_can_evict(session, page, 1)) + if (!__wt_page_can_evict( + session, page, WT_EVICT_CHECK_SPLITS, NULL)) continue; /* @@ -1522,7 +1523,7 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full) * are not busy. */ if (busy) { - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 0); if (txn_state->id == txn_global->oldest_id || txn_state->snap_min == txn_global->oldest_id) return (0); diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index e276f72fe3f..e54ed0ff8e7 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -9,7 +9,7 @@ #include "wt_internal.h" static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, int); -static int __evict_review(WT_SESSION_IMPL *, WT_REF *, int, int *); +static int __evict_review(WT_SESSION_IMPL *, WT_REF *, int *, uint32_t); /* * __evict_exclusive_clear -- @@ -49,7 +49,7 @@ __evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref) * Evict a page. */ int -__wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) +__wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; @@ -73,7 +73,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) * to make this check for clean pages, too: while unlikely eviction * would choose an internal page with children, it's not disallowed. */ - WT_ERR(__evict_review(session, ref, exclusive, &inmem_split)); + WT_ERR(__evict_review(session, ref, &inmem_split, flags)); /* * If there was an in-memory split, the tree has been left in the state @@ -89,7 +89,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) mod = page->modify; /* Count evictions of internal pages during normal operation. */ - if (!exclusive && WT_PAGE_IS_INTERNAL(page)) { + if (!LF_ISSET(WT_EVICT_EXCLUSIVE) && WT_PAGE_IS_INTERNAL(page)) { WT_STAT_FAST_CONN_INCR(session, cache_eviction_internal); WT_STAT_FAST_DATA_INCR(session, cache_eviction_internal); } @@ -115,22 +115,22 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) if (__wt_ref_is_root(ref)) __wt_ref_out(session, ref); else - WT_ERR( - __evict_page_dirty_update(session, ref, exclusive)); + WT_ERR(__evict_page_dirty_update( + session, ref, LF_ISSET(WT_EVICT_EXCLUSIVE))); WT_STAT_FAST_CONN_INCR(session, cache_eviction_dirty); WT_STAT_FAST_DATA_INCR(session, cache_eviction_dirty); } if (0) { -err: if (!exclusive) +err: if (!LF_ISSET(WT_EVICT_EXCLUSIVE)) __evict_exclusive_clear(session, ref); WT_STAT_FAST_CONN_INCR(session, cache_eviction_fail); WT_STAT_FAST_DATA_INCR(session, cache_eviction_fail); } -done: if ((inmem_split || (forced_eviction && ret == EBUSY)) && +done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) && !F_ISSET(conn->cache, WT_CACHE_WOULD_BLOCK)) { F_SET(conn->cache, WT_CACHE_WOULD_BLOCK); WT_TRET(__wt_evict_server_wake(session)); @@ -195,25 +195,10 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) break; case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ /* - * There are two cases in this code. - * - * First, an in-memory page that got too large, we forcibly - * evicted it, and there wasn't anything to write. (Imagine two - * threads updating a small set keys on a leaf page. The page is - * too large so we try to evict it, but after reconciliation - * there's only a small amount of data (so it's a single page we - * can't split), and because there are two threads, there's some - * data we can't write (so we can't evict it). In that case, we - * take advantage of the fact we have exclusive access to the - * page and rewrite it in memory.) - * - * Second, a real split where we reconciled a page and it turned - * into a lot of pages. + * A real split where we reconciled a page and it turned into a + * lot of pages. */ - if (mod->mod_multi_entries == 1) - WT_RET(__wt_split_rewrite(session, ref)); - else - WT_RET(__wt_split_multi(session, ref, exclusive)); + WT_RET(__wt_split_multi(session, ref, exclusive)); break; case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) { @@ -236,6 +221,20 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) ref->addr = addr; WT_PUBLISH(ref->state, WT_REF_DISK); break; + case WT_PM_REC_REWRITE: + /* + * An in-memory page that got too large, we forcibly evicted + * it, and there wasn't anything to write. (Imagine two threads + * updating a small set keys on a leaf page. The page is too + * large so we try to evict it, but after reconciliation + * there's only a small amount of data (so it's a single page + * we can't split), and because there are two threads, there's + * some data we can't write (so we can't evict it). In that + * case, we take advantage of the fact we have exclusive access + * to the page and rewrite it in memory.) + */ + WT_RET(__wt_split_rewrite(session, ref)); + break; WT_ILLEGAL_VALUE(session); } @@ -271,18 +270,20 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) */ static int __evict_review( - WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int *inmem_splitp) + WT_SESSION_IMPL *session, WT_REF *ref, int *inmem_splitp, uint32_t flags) { WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; - uint32_t flags; + uint32_t reconcile_flags; + + reconcile_flags = WT_EVICTING; /* * Get exclusive access to the page if our caller doesn't have the tree * locked down. */ - if (!exclusive) { + if (!LF_ISSET(WT_EVICT_EXCLUSIVE)) { WT_RET(__evict_exclusive(session, ref)); /* @@ -312,19 +313,19 @@ __evict_review( } /* Check if the page can be evicted. */ - if (!exclusive && !__wt_page_can_evict(session, page, 0)) - return (EBUSY); + if (!LF_ISSET(WT_EVICT_EXCLUSIVE)) { + if (!__wt_page_can_evict(session, page, flags, inmem_splitp)) + return (EBUSY); - /* - * Check for an append-only workload needing an in-memory split; we - * can't do this earlier because in-memory splits require exclusive - * access. If an in-memory split completes, the page stays in memory - * and the tree is left in the desired state: avoid the usual cleanup. - */ - if (!exclusive) { - WT_RET(__wt_split_insert(session, ref, inmem_splitp)); + /* + * Check for an append-only workload needing an in-memory + * split; we can't do this earlier because in-memory splits + * require exclusive access. If an in-memory split completes, + * the page stays in memory and the tree is left in the desired + * state: avoid the usual cleanup. + */ if (*inmem_splitp) - return (0); + return (__wt_split_insert(session, ref)); } /* @@ -346,24 +347,23 @@ __evict_review( * Don't set the update-restore flag for internal pages, they don't have * updates that can be saved and restored. */ - flags = WT_EVICTING; if (__wt_page_is_modified(page)) { - if (exclusive) - LF_SET(WT_SKIP_UPDATE_ERR); + if (LF_ISSET(WT_EVICT_EXCLUSIVE)) + FLD_SET(reconcile_flags, WT_SKIP_UPDATE_ERR); else if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_OLDEST) - LF_SET(WT_SKIP_UPDATE_RESTORE); - WT_RET(__wt_reconcile(session, ref, NULL, flags)); + FLD_SET(reconcile_flags, WT_SKIP_UPDATE_RESTORE); + WT_RET(__wt_reconcile(session, ref, NULL, reconcile_flags)); WT_ASSERT(session, !__wt_page_is_modified(page) || - LF_ISSET(WT_SKIP_UPDATE_RESTORE)); + FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE)); } /* * If the page was ever modified, make sure all of the updates * on the page are old enough they can be discarded from cache. */ - if (!exclusive && mod != NULL && + if (!LF_ISSET(WT_EVICT_EXCLUSIVE) && mod != NULL && !__wt_txn_visible_all(session, mod->rec_max_txn) && !LF_ISSET(WT_SKIP_UPDATE_RESTORE)) return (EBUSY); diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index e9b6b5a1d6e..cde01e4e1ac 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -179,18 +179,21 @@ struct __wt_page_modify { */ uint64_t disk_snap_min; - /* The largest transaction ID seen on the page by reconciliation. */ - uint64_t rec_max_txn; - /* The first unwritten transaction ID (approximate). */ uint64_t first_dirty_txn; - /* The largest update transaction ID (approximate). */ - uint64_t update_txn; - /* In-memory split transaction ID. */ uint64_t inmem_split_txn; + /* Avoid checking for obsolete updates during checkpoints. */ + uint64_t obsolete_check_txn; + + /* The largest transaction ID seen on the page by reconciliation. */ + uint64_t rec_max_txn; + + /* The largest update transaction ID (approximate). */ + uint64_t update_txn; + /* Dirty bytes added to the cache. */ size_t bytes_dirty; @@ -353,8 +356,10 @@ struct __wt_page_modify { #define WT_PM_REC_EMPTY 0x01 /* Reconciliation: no replacement */ #define WT_PM_REC_MULTIBLOCK 0x02 /* Reconciliation: multiple blocks */ #define WT_PM_REC_REPLACE 0x04 /* Reconciliation: single block */ +#define WT_PM_REC_REWRITE 0x08 /* Reconciliation: rewrite in place */ #define WT_PM_REC_MASK \ - (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | WT_PM_REC_REPLACE) + (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | \ + WT_PM_REC_REPLACE | WT_PM_REC_REWRITE) uint8_t flags; /* Page flags */ }; @@ -535,7 +540,7 @@ struct __wt_page { #define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */ #define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */ #define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ -#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */ +#define WT_PAGE_SPLIT_LOCKED 0x80 /* An internal page is growing */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 9038dab2b34..5a2253f6078 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -226,6 +226,11 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) /* Update the bytes in-memory to reflect the eviction. */ WT_CACHE_DECR(session, cache->bytes_inmem, page->memory_footprint); + /* Update the bytes_internal value to reflect the eviction */ + if (WT_PAGE_IS_INTERNAL(page)) + WT_CACHE_DECR(session, + cache->bytes_internal, page->memory_footprint); + /* Update the cache's dirty-byte count. */ if (modify != NULL && modify->bytes_dirty != 0) { if (cache->bytes_dirty < modify->bytes_dirty) { @@ -949,17 +954,86 @@ __wt_ref_info(WT_SESSION_IMPL *session, } /* + * __wt_page_can_split -- + * Check whether a page can be split in memory. + */ +static inline int +__wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_INSERT_HEAD *ins_head; + + btree = S2BT(session); + + /* + * Only split a page once, otherwise workloads that update in the middle + * of the page could continually split without benefit. + */ + if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) + return (0); + + /* + * Check for pages with append-only workloads. A common application + * pattern is to have multiple threads frantically appending to the + * tree. We want to reconcile and evict this page, but we'd like to + * do it without making the appending threads wait. If we're not + * discarding the tree, check and see if it's worth doing a split to + * let the threads continue before doing eviction. + * + * Ignore anything other than large, dirty row-store leaf pages. + * + * XXX KEITH + * Need a better test for append-only workloads. + */ + if (page->type != WT_PAGE_ROW_LEAF || + page->memory_footprint < btree->maxmempage || + !__wt_page_is_modified(page)) + return (0); + + /* Don't split a page that is pending a multi-block split. */ + if (F_ISSET(page->modify, WT_PM_REC_MULTIBLOCK)) + return (0); + + /* + * There is no point splitting if the list is small, no deep items is + * our heuristic for that. (A 1/4 probability of adding a new skiplist + * level means there will be a new 6th level for roughly each 4KB of + * entries in the list. If we have at least two 6th level entries, the + * list is at least large enough to work with.) + * + * The following code requires at least two items on the insert list, + * this test serves the additional purpose of confirming that. + */ +#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1) + ins_head = page->pg_row_entries == 0 ? + WT_ROW_INSERT_SMALLEST(page) : + WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + if (ins_head == NULL || + ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL || + ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == + ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH]) + return (0); + + return (1); +} + +/* * __wt_page_can_evict -- * Check whether a page can be evicted. */ static inline int -__wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits) +__wt_page_can_evict( + WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags, int *inmem_splitp) { WT_BTREE *btree; WT_PAGE_MODIFY *mod; + WT_TXN_GLOBAL *txn_global; btree = S2BT(session); mod = page->modify; + txn_global = &S2C(session)->txn_global; + if (inmem_splitp != NULL) + *inmem_splitp = 0; /* Pages that have never been modified can always be evicted. */ if (mod == NULL) @@ -974,11 +1048,24 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits) * a transaction value, once that's globally visible, we know we can * evict the created page. */ - if (check_splits && WT_PAGE_IS_INTERNAL(page) && + if (LF_ISSET(WT_EVICT_CHECK_SPLITS) && WT_PAGE_IS_INTERNAL(page) && !__wt_txn_visible_all(session, mod->mod_split_txn)) return (0); /* + * Allow for the splitting of pages when a checkpoint is underway only + * if the allow_splits flag has been passed, we know we are performing + * a checkpoint, the page is larger than the stated maximum and there + * has not already been a split on this page as the WT_PM_REC_MULTIBLOCK + * flag is unset. + */ + if (__wt_page_can_split(session, page)) { + if (inmem_splitp != NULL) + *inmem_splitp = 1; + return (1); + } + + /* * If the file is being checkpointed, we can't evict dirty pages: * if we write a page and free the previous version of the page, that * previous version might be referenced by an internal page already @@ -1017,10 +1104,12 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits) /* * If the page was recently split in-memory, don't force it out: we - * hope an eviction thread will find it first. + * hope an eviction thread will find it first. The check here is + * similar to __wt_txn_visible_all, but ignores the checkpoints + * transaction. */ - if (check_splits && - !__wt_txn_visible_all(session, mod->inmem_split_txn)) + if (LF_ISSET(WT_EVICT_CHECK_SPLITS) && + TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn)) return (0); return (1); @@ -1040,7 +1129,6 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) btree = S2BT(session); page = ref->page; - too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0; /* * Take some care with order of operations: if we release the hazard @@ -1055,6 +1143,8 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) } (void)WT_ATOMIC_ADD4(btree->evict_busy, 1); + + too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0; if ((ret = __wt_evict_page(session, ref)) == 0) { if (too_big) WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); @@ -1115,8 +1205,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) page = ref->page; if (F_ISSET(btree, WT_BTREE_NO_EVICTION) || LF_ISSET(WT_READ_NO_EVICT) || - page->read_gen != WT_READGEN_OLDEST || - !__wt_page_can_evict(session, page, 1)) + page->read_gen != WT_READGEN_OLDEST || !__wt_page_can_evict( + session, page, WT_EVICT_CHECK_SPLITS, NULL)) return (__wt_hazard_clear(session, page)); WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index f952f1bf698..0c976800b38 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -148,7 +148,7 @@ __wt_session_can_wait(WT_SESSION_IMPL *session) * highjack the thread for eviction. */ if (F_ISSET(session, - WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED)) + WT_SESSION_NO_CACHE_CHECK | WT_SESSION_LOCKED_SCHEMA)) return (0); return (1); @@ -170,7 +170,7 @@ __wt_cache_full_check(WT_SESSION_IMPL *session) * block eviction), we don't want to highjack the thread for eviction. */ if (F_ISSET(session, WT_SESSION_NO_CACHE_CHECK | - WT_SESSION_SCHEMA_LOCKED | WT_SESSION_HANDLE_LIST_LOCKED)) + WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA)) return (0); /* diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 0121a1625c5..7a19a35c83c 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -124,6 +124,12 @@ struct __wt_named_extractor { } while (0) /* + * Default hash table size; use a prime number of buckets rather than assuming + * a good hash (Reference Sedgewick, Algorithms in C, "Hash Functions"). + */ +#define WT_HASH_ARRAY_SIZE 509 + +/* * WT_CONNECTION_IMPL -- * Implementation of WT_CONNECTION */ @@ -184,7 +190,6 @@ struct __wt_connection_impl { * URI. */ /* Locked: data handle hash array */ -#define WT_HASH_ARRAY_SIZE 512 SLIST_HEAD(__wt_dhhash, __wt_data_handle) dhhash[WT_HASH_ARRAY_SIZE]; /* Locked: data handle list */ SLIST_HEAD(__wt_dhandle_lh, __wt_data_handle) dhlh; diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h index 034db30a0a2..22a0a2c1dd4 100644 --- a/src/third_party/wiredtiger/src/include/dhandle.h +++ b/src/third_party/wiredtiger/src/include/dhandle.h @@ -69,8 +69,7 @@ struct __wt_data_handle { #define WT_DHANDLE_DISCARD 0x02 /* Discard on release */ #define WT_DHANDLE_DISCARD_FORCE 0x04 /* Force discard on release */ #define WT_DHANDLE_EXCLUSIVE 0x08 /* Need exclusive access */ -#define WT_DHANDLE_HAVE_REF 0x10 /* Already have ref */ -#define WT_DHANDLE_LOCK_ONLY 0x20 /* Handle only used as a lock */ -#define WT_DHANDLE_OPEN 0x40 /* Handle is open */ +#define WT_DHANDLE_LOCK_ONLY 0x10 /* Handle only used as a lock */ +#define WT_DHANDLE_OPEN 0x20 /* Handle is open */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 48bf792bcf5..a4810720d55 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -152,7 +152,7 @@ extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const ch extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp); -extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp); +extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive); extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst); @@ -174,7 +174,7 @@ extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove); extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep); extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep); -extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd); +extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert); @@ -226,13 +226,13 @@ extern WT_THREAD_RET __wt_cache_pool_server(void *arg); extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session); extern int __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize); -extern int __wt_conn_dhandle_find(WT_SESSION_IMPL *session, const char *name, const char *ckpt, uint32_t flags); +extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint); extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int final, int force); -extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, const char *cfg[], uint32_t flags); +extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags); extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, int apply_checkpoints, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); extern int __wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); -extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *name, int force); +extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *uri, int force); extern int __wt_conn_dhandle_discard_single(WT_SESSION_IMPL *session, int final, int force); extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn); @@ -314,7 +314,7 @@ extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_server); extern int __wt_cache_wait(WT_SESSION_IMPL *session, int full); extern void __wt_cache_dump(WT_SESSION_IMPL *session); -extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive); +extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags); extern void __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec); @@ -363,8 +363,12 @@ extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize); +extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm); +extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks); +extern int __wt_clsm_close(WT_CURSOR *cursor); extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]); extern int __wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg); extern int __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg); extern int __wt_lsm_manager_start(WT_SESSION_IMPL *session); @@ -444,7 +448,6 @@ extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, voi extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp); -extern int __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp); extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg); extern int __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp); extern int __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp); @@ -574,10 +577,10 @@ extern int __wt_session_copy_values(WT_SESSION_IMPL *session); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_session_create_strip(WT_SESSION *wt_session, const char *v1, const char *v2, char **value_ret); extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp); -extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, WT_SESSION_IMPL **sessionp); +extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip); extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config); -extern int __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, int *deadp); +extern int __wt_session_lock_dhandle( WT_SESSION_IMPL *session, uint32_t flags, int *is_deadp); extern int __wt_session_release_btree(WT_SESSION_IMPL *session); extern int __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags); extern void __wt_session_close_cache(WT_SESSION_IMPL *session); @@ -655,8 +658,8 @@ extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats); extern void __wt_stat_refresh_connection_stats(void *stats_arg); extern int WT_CDECL __wt_txnid_cmp(const void *v1, const void *v2); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); -extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session); -extern void __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot); +extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session); +extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force); extern int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]); extern void __wt_txn_release(WT_SESSION_IMPL *session); extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 99c77c94f49..95aa6f9809d 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -18,6 +18,9 @@ #define WT_CONN_SERVER_SWEEP 0x00002000 #define WT_CONN_WAS_BACKUP 0x00004000 #define WT_EVICTING 0x00000001 +#define WT_EVICT_CHECK_SPLITS 0x00000001 +#define WT_EVICT_EXCLUSIVE 0x00000002 +#define WT_EVICT_INMEM_SPLIT 0x00000004 #define WT_FILE_TYPE_CHECKPOINT 0x00000001 #define WT_FILE_TYPE_DATA 0x00000002 #define WT_FILE_TYPE_DIRECTORY 0x00000004 @@ -42,18 +45,19 @@ #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_CLEAR_EVICT_WALK 0x00000002 #define WT_SESSION_DISCARD_FORCE 0x00000004 -#define WT_SESSION_HANDLE_LIST_LOCKED 0x00000008 -#define WT_SESSION_INTERNAL 0x00000010 -#define WT_SESSION_LOGGING_INMEM 0x00000020 -#define WT_SESSION_NO_CACHE 0x00000040 -#define WT_SESSION_NO_CACHE_CHECK 0x00000080 -#define WT_SESSION_NO_DATA_HANDLES 0x00000100 -#define WT_SESSION_NO_LOGGING 0x00000200 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00000400 -#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00000800 -#define WT_SESSION_SCHEMA_LOCKED 0x00001000 -#define WT_SESSION_SERVER_ASYNC 0x00002000 -#define WT_SESSION_TABLE_LOCKED 0x00004000 +#define WT_SESSION_INTERNAL 0x00000008 +#define WT_SESSION_LOCKED_CHECKPOINT 0x00000010 +#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000020 +#define WT_SESSION_LOCKED_SCHEMA 0x00000040 +#define WT_SESSION_LOCKED_TABLE 0x00000080 +#define WT_SESSION_LOGGING_INMEM 0x00000100 +#define WT_SESSION_NO_CACHE 0x00000200 +#define WT_SESSION_NO_CACHE_CHECK 0x00000400 +#define WT_SESSION_NO_DATA_HANDLES 0x00000800 +#define WT_SESSION_NO_LOGGING 0x00001000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00002000 +#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00004000 +#define WT_SESSION_SERVER_ASYNC 0x00008000 #define WT_SKIP_UPDATE_ERR 0x00000002 #define WT_SKIP_UPDATE_RESTORE 0x00000004 #define WT_SYNC_CHECKPOINT 0x00000001 diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h index 1c3fb287e86..e3c098826d0 100644 --- a/src/third_party/wiredtiger/src/include/hardware.h +++ b/src/third_party/wiredtiger/src/include/hardware.h @@ -38,7 +38,7 @@ } while (0) #define F_CAS_ATOMIC(p, mask, ret) do { \ - uint8_t __orig; \ + uint8_t __orig; \ ret = 0; \ do { \ __orig = (p)->flags_atomic; \ diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index ebe3a00b19f..f4f7361b53f 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -51,7 +51,7 @@ * Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1 * and 1 if lsn0 > lsn1. */ -#define LOG_CMP(lsn1, lsn2) \ +#define WT_LOG_CMP(lsn1, lsn2) \ ((lsn1)->file != (lsn2)->file ? \ ((lsn1)->file < (lsn2)->file ? -1 : 1) : \ ((lsn1)->offset != (lsn2)->offset ? \ diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h index aa1d797e3b5..dc6a0d7e027 100644 --- a/src/third_party/wiredtiger/src/include/lsm.h +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -57,15 +57,16 @@ struct __wt_cursor_lsm { u_int update_count; /* Updates performed. */ -#define WT_CLSM_ACTIVE 0x01 /* Incremented the session count */ -#define WT_CLSM_ITERATE_NEXT 0x02 /* Forward iteration */ -#define WT_CLSM_ITERATE_PREV 0x04 /* Backward iteration */ -#define WT_CLSM_MERGE 0x08 /* Merge cursor, don't update */ -#define WT_CLSM_MINOR_MERGE 0x10 /* Minor merge, include tombstones */ -#define WT_CLSM_MULTIPLE 0x20 /* Multiple cursors have values for the +#define WT_CLSM_ACTIVE 0x001 /* Incremented the session count */ +#define WT_CLSM_BULK 0x002 /* Open for snapshot isolation */ +#define WT_CLSM_ITERATE_NEXT 0x004 /* Forward iteration */ +#define WT_CLSM_ITERATE_PREV 0x008 /* Backward iteration */ +#define WT_CLSM_MERGE 0x010 /* Merge cursor, don't update */ +#define WT_CLSM_MINOR_MERGE 0x020 /* Minor merge, include tombstones */ +#define WT_CLSM_MULTIPLE 0x040 /* Multiple cursors have values for the current key */ -#define WT_CLSM_OPEN_READ 0x40 /* Open for reads */ -#define WT_CLSM_OPEN_SNAPSHOT 0x80 /* Open for snapshot isolation */ +#define WT_CLSM_OPEN_READ 0x080 /* Open for reads */ +#define WT_CLSM_OPEN_SNAPSHOT 0x100 /* Open for snapshot isolation */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index 53f08b3cbeb..98facff02b9 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -7,6 +7,17 @@ */ /* + * __wt_strdup -- + * ANSI strdup function. + */ +static inline int +__wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp) +{ + return (__wt_strndup( + session, str, (str == NULL) ? 0 : strlen(str), retp)); +} + +/* * __wt_verbose -- * Verbose message. */ diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h index 5d524534b39..8f4884281cd 100644 --- a/src/third_party/wiredtiger/src/include/schema.h +++ b/src/third_party/wiredtiger/src/include/schema.h @@ -95,12 +95,20 @@ struct __wt_table { } while (0) /* - * WT_WITH_DHANDLE_LOCK -- + * WT_WITH_CHECKPOINT_LOCK -- + * Acquire the checkpoint lock, perform an operation, drop the lock. + */ +#define WT_WITH_CHECKPOINT_LOCK(session, op) \ + WT_WITH_LOCK(session, \ + &S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op) + +/* + * WT_WITH_HANDLE_LIST_LOCK -- * Acquire the data handle list lock, perform an operation, drop the lock. */ -#define WT_WITH_DHANDLE_LOCK(session, op) \ +#define WT_WITH_HANDLE_LIST_LOCK(session, op) \ WT_WITH_LOCK(session, \ - &S2C(session)->dhandle_lock, WT_SESSION_HANDLE_LIST_LOCKED, op) + &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op) /* * WT_WITH_SCHEMA_LOCK -- * Acquire the schema lock, perform an operation, drop the lock. @@ -109,61 +117,61 @@ struct __wt_table { */ #define WT_WITH_SCHEMA_LOCK(session, op) do { \ WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) || \ - !F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED | \ - WT_SESSION_NO_SCHEMA_LOCK | WT_SESSION_TABLE_LOCKED)); \ + F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) || \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST | \ + WT_SESSION_NO_SCHEMA_LOCK | WT_SESSION_LOCKED_TABLE)); \ + WT_WITH_LOCK(session, \ + &S2C(session)->schema_lock, WT_SESSION_LOCKED_SCHEMA, op); \ +} while (0) + +/* + * WT_WITH_TABLE_LOCK -- + * Acquire the table lock, perform an operation, drop the lock. + */ +#define WT_WITH_TABLE_LOCK(session, op) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ WT_WITH_LOCK(session, \ - &S2C(session)->schema_lock, WT_SESSION_SCHEMA_LOCKED, op); \ + &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ } while (0) /* * WT_WITHOUT_LOCKS -- - * Drop the schema lock and/or the handle list lock, perform an operation, + * Drop the handle, table and/or schema locks, perform an operation, * re-acquire the lock(s). */ #define WT_WITHOUT_LOCKS(session, op) do { \ WT_CONNECTION_IMPL *__conn = S2C(session); \ int __handle_locked = \ - F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED);\ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST);\ int __table_locked = \ - F_ISSET(session, WT_SESSION_TABLE_LOCKED); \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE); \ int __schema_locked = \ - F_ISSET(session, WT_SESSION_SCHEMA_LOCKED); \ + F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \ if (__handle_locked) { \ - F_CLR(session, WT_SESSION_HANDLE_LIST_LOCKED); \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); \ __wt_spin_unlock(session, &__conn->dhandle_lock);\ } \ if (__table_locked) { \ - F_CLR(session, WT_SESSION_TABLE_LOCKED); \ + F_CLR(session, WT_SESSION_LOCKED_TABLE); \ __wt_spin_unlock(session, &__conn->table_lock);\ } \ if (__schema_locked) { \ - F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \ + F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \ __wt_spin_unlock(session, &__conn->schema_lock);\ } \ op; \ if (__schema_locked) { \ __wt_spin_lock(session, &__conn->schema_lock); \ - F_SET(session, WT_SESSION_SCHEMA_LOCKED); \ + F_SET(session, WT_SESSION_LOCKED_SCHEMA); \ } \ if (__table_locked) { \ __wt_spin_lock(session, &__conn->table_lock); \ - F_SET(session, WT_SESSION_TABLE_LOCKED); \ + F_SET(session, WT_SESSION_LOCKED_TABLE); \ } \ if (__handle_locked) { \ __wt_spin_lock(session, &__conn->dhandle_lock); \ - F_SET(session, WT_SESSION_HANDLE_LIST_LOCKED); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ } \ } while (0) - -/* - * WT_WITH_TABLE_LOCK -- - * Acquire the table lock, perform an operation, drop the lock. - */ -#define WT_WITH_TABLE_LOCK(session, op) do { \ - WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_TABLE_LOCKED) || \ - !F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); \ - WT_WITH_LOCK(session, \ - &S2C(session)->table_lock, WT_SESSION_TABLE_LOCKED, op); \ -} while (0) diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index e3581ae1c39..9e6b0f7916c 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -255,15 +255,18 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, * obsolete check at a time, and to protect updates from disappearing * under reconciliation. */ - if (upd->next != NULL) { + if (upd->next != NULL && + __wt_txn_visible_all(session, page->modify->obsolete_check_txn)) { F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); /* If we can't lock it, don't scan, that's okay. */ if (ret != 0) return (0); - obsolete = __wt_update_obsolete_check(session, upd->next); + obsolete = __wt_update_obsolete_check(session, page, upd->next); F_CLR_ATOMIC(page, WT_PAGE_SCANNING); - if (obsolete != NULL) + if (obsolete != NULL) { + page->modify->obsolete_check_txn = WT_TXN_NONE; __wt_update_obsolete_free(session, page, obsolete); + } } return (0); diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 927ab09d5f9..62f565c0535 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -42,9 +42,6 @@ struct __wt_txn_global { */ volatile uint64_t oldest_id; - /* The oldest session found in the last scan. */ - uint32_t oldest_session; - /* Count of scanning threads, or -1 for exclusive access. */ volatile int32_t scan_count; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 4141d829f1d..b1cfba4257d 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -139,20 +139,20 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) txn = &session->txn; - /* - * Eviction only sees globally visible updates, or if there is a - * checkpoint transaction running, use its transaction. - */ - if (txn->isolation == TXN_ISO_EVICTION) - return (__wt_txn_visible_all(session, id)); + /* Changes with no associated transaction are always visible. */ + if (id == WT_TXN_NONE) + return (1); /* Nobody sees the results of aborted transactions. */ if (id == WT_TXN_ABORTED) return (0); - /* Changes with no associated transaction are always visible. */ - if (id == WT_TXN_NONE) - return (1); + /* + * Eviction only sees globally visible updates, or if there is a + * checkpoint transaction running, use its transaction. + */ + if (txn->isolation == TXN_ISO_EVICTION) + return (__wt_txn_visible_all(session, id)); /* * Read-uncommitted transactions see all other changes. @@ -222,7 +222,14 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) if (txn->isolation == TXN_ISO_SNAPSHOT) { if (session->ncursors > 0) WT_RET(__wt_session_copy_values(session)); - __wt_txn_refresh(session, 1); + + /* + * We're about to allocate a snapshot: if we need to block for + * eviction, it's better to do it beforehand. + */ + WT_RET(__wt_cache_full_check(session)); + + __wt_txn_get_snapshot(session); } F_SET(txn, TXN_RUNNING); @@ -429,7 +436,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) if (txn->isolation != TXN_ISO_READ_UNCOMMITTED && !F_ISSET(txn, TXN_HAS_SNAPSHOT)) - __wt_txn_refresh(session, 1); + __wt_txn_get_snapshot(session); } /* diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 05e92d313f2..6037cdeee96 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -996,9 +996,10 @@ struct __wt_session { * builtin support for \c "bzip2"\, \c "snappy"\, \c "lz4" or \c "zlib" * compression\, these names are also available. See @ref compression * for more information., a string; default \c none.} - * @config{cache_resident, do not ever evict the object's pages; see - * @ref tuning_cache_resident for more information., a boolean flag; - * default \c false.} + * @config{cache_resident, do not ever evict the object's pages from + * cache. Not compatible with LSM tables; see @ref + * tuning_cache_resident for more information., a boolean flag; default + * \c false.} * @config{checksum, configure block checksums; permitted values are * <code>on</code> (checksum all blocks)\, <code>off</code> (checksum no * blocks) and <code>uncompresssed</code> (checksum only blocks which diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 27be3dfb07c..6d64cd00c2a 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -973,7 +973,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * be holes in the log file. */ WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn); - while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) { + while (WT_LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) { if (++yield_count < 1000) __wt_yield(); else @@ -1036,7 +1036,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * Sync the log file if needed. */ if (F_ISSET(slot, SLOT_SYNC) && - LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) { + WT_LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_release: sync log %s", log->log_fh->name)); WT_STAT_FAST_CONN_INCR(session, log_sync); @@ -1485,7 +1485,7 @@ advance: /* Truncate if we're in recovery. */ if (LF_ISSET(WT_LOGSCAN_RECOVER) && - LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0) + WT_LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0) WT_ERR(__log_truncate(session, &rd_lsn, WT_LOG_FILENAME, 0)); @@ -1758,13 +1758,13 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_ERR(__wt_log_slot_free(session, myslot.slot)); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ - while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 && + while (WT_LOG_CMP(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) (void)__wt_cond_wait( session, log->log_sync_cond, 10000); } else if (LF_ISSET(WT_LOG_FLUSH)) { /* Wait for our writes to reach the OS */ - while (LOG_CMP(&log->write_lsn, &lsn) <= 0 && + while (WT_LOG_CMP(&log->write_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) (void)__wt_cond_wait( session, log->log_write_cond, 10000); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 7c9ac35d489..20d776bcfb9 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -20,11 +20,11 @@ static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t); static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *); /* - * __clsm_request_switch -- + * __wt_clsm_request_switch -- * Request an LSM tree switch for a cursor operation. */ -static inline int -__clsm_request_switch(WT_CURSOR_LSM *clsm) +int +__wt_clsm_request_switch(WT_CURSOR_LSM *clsm) { WT_DECL_RET; WT_LSM_TREE *lsm_tree; @@ -44,9 +44,9 @@ __clsm_request_switch(WT_CURSOR_LSM *clsm) if (lsm_tree->nchunks == 0 || (clsm->dsk_gen == lsm_tree->dsk_gen && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))) { + F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); ret = __wt_lsm_manager_push_entry( session, WT_LSM_WORK_SWITCH, 0, lsm_tree); - F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); } WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); } @@ -55,6 +55,41 @@ __clsm_request_switch(WT_CURSOR_LSM *clsm) } /* + * __wt_clsm_await_switch -- + * Wait for a switch to have completed in the LSM tree + */ +int +__wt_clsm_await_switch(WT_CURSOR_LSM *clsm) +{ + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + int waited; + + lsm_tree = clsm->lsm_tree; + session = (WT_SESSION_IMPL *)clsm->iface.session; + + /* + * If there is no primary chunk, or a chunk has overflowed the hard + * limit, which either means a worker thread has fallen behind or there + * has just been a user-level checkpoint, wait until the tree changes. + * + * We used to switch chunks in the application thread here, but that is + * problematic because there is a transaction in progress and it could + * roll back, leaving the metadata inconsistent. + */ + for (waited = 0; + lsm_tree->nchunks == 0 || + clsm->dsk_gen == lsm_tree->dsk_gen; + ++waited) { + if (waited % 1000 == 0) + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); + __wt_sleep(0, 10); + } + return (0); +} + +/* * __clsm_enter_update -- * Make sure an LSM cursor is ready to perform an update. */ @@ -65,7 +100,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) WT_LSM_CHUNK *primary_chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; - int hard_limit, have_primary, ovfl, waited; + int hard_limit, have_primary, ovfl; lsm_tree = clsm->lsm_tree; ovfl = 0; @@ -109,30 +144,13 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) } /* Request a switch. */ - WT_RET(__clsm_request_switch(clsm)); + WT_RET(__wt_clsm_request_switch(clsm)); /* If we only overflowed the soft limit, we're done. */ if (have_primary && !hard_limit) return (0); - /* - * If there is no primary chunk, or it has overflowed the hard limit, - * which either means a worker thread has fallen behind or there has - * just been a user-level checkpoint, wait until the tree changes. - * - * We used to switch chunks in the application thread if we got to - * here, but that is problematic because there is a transaction in - * progress and it could roll back, leaving the metadata inconsistent. - */ - for (waited = 0; - lsm_tree->nchunks == 0 || - clsm->dsk_gen == lsm_tree->dsk_gen; - ++waited) { - if (waited % 1000 == 0) - WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); - __wt_sleep(0, 10); - } + WT_RET(__wt_clsm_await_switch(clsm)); return (0); } @@ -1424,11 +1442,11 @@ err: __clsm_leave(clsm); } /* - * __clsm_close -- + * __wt_clsm_close -- * WT_CURSOR->close method for the LSM cursor type. */ -static int -__clsm_close(WT_CURSOR *cursor) +int +__wt_clsm_close(WT_CURSOR *cursor) { WT_CURSOR_LSM *clsm; WT_DECL_RET; @@ -1482,14 +1500,17 @@ __wt_clsm_open(WT_SESSION_IMPL *session, __clsm_update, /* update */ __clsm_remove, /* remove */ __wt_cursor_reconfigure, /* reconfigure */ - __clsm_close); /* close */ + __wt_clsm_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_LSM *clsm; WT_DECL_RET; WT_LSM_TREE *lsm_tree; + int bulk; + bulk = 0; clsm = NULL; cursor = NULL; + lsm_tree = NULL; if (!WT_PREFIX_MATCH(uri, "lsm:")) return (EINVAL); @@ -1499,9 +1520,21 @@ __wt_clsm_open(WT_SESSION_IMPL *session, WT_RET_MSG(session, EINVAL, "LSM does not support opening by checkpoint"); + WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval)); + if (cval.val != 0) + bulk = 1; + /* Get the LSM tree. */ - WT_WITH_DHANDLE_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)); + WT_WITH_HANDLE_LIST_LOCK(session, + ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree)); + /* + * Check whether the exclusive open for a bulk load succeeded, and + * if it did ensure that it's safe to bulk load into the tree. + */ + if (bulk && (ret == EBUSY || (ret == 0 && lsm_tree->nchunks > 1))) + WT_ERR_MSG(session, EINVAL, + "bulk-load is only supported on newly created LSM trees"); + /* Flag any errors from the tree get. */ WT_RET(ret); WT_ERR(__wt_calloc_one(session, &clsm)); @@ -1524,9 +1557,20 @@ __wt_clsm_open(WT_SESSION_IMPL *session, WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0); WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp)); + if (bulk) + WT_ERR(__wt_clsm_open_bulk(clsm, cfg)); + if (0) { err: if (clsm != NULL) - WT_TRET(__clsm_close(cursor)); + WT_TRET(__wt_clsm_close(cursor)); + else if (lsm_tree != NULL) + __wt_lsm_tree_release(session, lsm_tree); + + /* + * We open bulk cursors after setting the returned cursor. + * Fix that here. + */ + *cursorp = NULL; } return (ret); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c new file mode 100644 index 00000000000..6b51a070e47 --- /dev/null +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __clsm_close_bulk -- + * WT_CURSOR->close method for LSM bulk cursors. + */ +static int +__clsm_close_bulk(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_LSM_TREE *lsm_tree; + + clsm = (WT_CURSOR_LSM *)cursor; + lsm_tree = clsm->lsm_tree; + F_SET(lsm_tree->chunk[0], WT_LSM_CHUNK_ONDISK); + + WT_RET(__wt_clsm_close(cursor)); + return (0); +} +/* + * __clsm_insert_bulk -- + * WT_CURSOR->insert method for LSM bulk cursors. + */ +static int +__clsm_insert_bulk(WT_CURSOR *cursor) +{ + WT_CURSOR *bulk_cursor; + WT_CURSOR_LSM *clsm; + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + + clsm = (WT_CURSOR_LSM *)cursor; + lsm_tree = clsm->lsm_tree; + session = (WT_SESSION_IMPL *)clsm->iface.session; + + WT_ASSERT(session, lsm_tree->nchunks == 1 && clsm->nchunks == 1); + ++lsm_tree->chunk[0]->count; + bulk_cursor = *clsm->cursors; + bulk_cursor->set_key(bulk_cursor, &cursor->key); + bulk_cursor->set_value(bulk_cursor, &cursor->value); + WT_RET(bulk_cursor->insert(bulk_cursor)); + + return (0); +} + +/* + * __wt_clsm_open_bulk -- + * WT_SESSION->open_cursor method for LSM bulk cursors. + */ +int +__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]) +{ + WT_CURSOR *cursor, *bulk_cursor; + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + + bulk_cursor = NULL; + cursor = &clsm->iface; + lsm_tree = clsm->lsm_tree; + session = (WT_SESSION_IMPL *)clsm->iface.session; + + F_SET(clsm, WT_CLSM_BULK); + + /* Bulk cursors are limited to insert and close. */ + __wt_cursor_set_notsup(cursor); + cursor->insert = __clsm_insert_bulk; + cursor->close = __clsm_close_bulk; + + /* Setup the first chunk in the tree. */ + WT_RET(__wt_clsm_request_switch(clsm)); + WT_RET(__wt_clsm_await_switch(clsm)); + + /* + * Grab and release the LSM tree lock to ensure that the first chunk + * has been fully created before proceeding. We have the LSM tree + * open exclusive, so that saves us from needing the lock generally. + */ + WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); + WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree)); + + /* + * Open a bulk cursor on the first chunk, it's not a regular LSM chunk + * cursor, but use the standard storage locations. Allocate the space + * for a bloom filter - it makes cleanup simpler. Cleaned up by + * cursor close on error. + */ + WT_RET(__wt_calloc_one(session, &clsm->blooms)); + clsm->bloom_alloc = 1; + WT_RET(__wt_calloc_one(session, &clsm->cursors)); + clsm->cursor_alloc = 1; + clsm->nchunks = 1; + + /* + * Open a bulk cursor on the first chunk in the tree - take a read + * lock on the LSM tree while we are opening the chunk, to ensure + * that the first chunk has been fully created before we succeed. + * Pass through the application config to ensure the tree is open + * for bulk access. + */ + WT_RET(__wt_open_cursor(session, + lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor)); + clsm->cursors[0] = bulk_cursor; + /* LSM cursors are always raw */ + F_SET(bulk_cursor, WT_CURSTD_RAW); + + return (0); +} + diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 3d9fc27d1d2..0533e628601 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -423,7 +423,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) if (TAILQ_EMPTY(&conn->lsmqh)) continue; __wt_spin_lock(session, &conn->dhandle_lock); - F_SET(session, WT_SESSION_HANDLE_LIST_LOCKED); + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); dhandle_locked = 1; TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) @@ -483,13 +483,13 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) } } __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_HANDLE_LIST_LOCKED); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); dhandle_locked = 0; } err: if (dhandle_locked) { __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_HANDLE_LIST_LOCKED); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); } return (ret); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c index 5398982aef4..bc694000900 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c @@ -32,7 +32,7 @@ __curstat_lsm_init( "checkpoint=" WT_CHECKPOINT, NULL, NULL }; locked = 0; - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)); WT_RET(ret); WT_ERR(__wt_scr_alloc(session, 0, &uribuf)); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index cce49984f43..439837e96be 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -29,7 +29,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int final) /* We may be destroying an lsm_tree before it was added. */ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) { WT_ASSERT(session, final || - F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q); } @@ -248,7 +248,7 @@ int __wt_lsm_tree_setup_chunk( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { - WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); WT_RET(__wt_epoch(session, &chunk->create_ts)); WT_RET(__wt_lsm_tree_chunk_name( @@ -307,7 +307,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, char *tmpconfig; /* If the tree is open, it already exists. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)); if (ret == 0) { __wt_lsm_tree_release(session, lsm_tree); @@ -348,6 +348,11 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, WT_ERR(__wt_strndup( session, cval.str, cval.len, &lsm_tree->collator_name)); + WT_ERR(__wt_config_gets(session, cfg, "cache_resident", &cval)); + if (cval.val != 0) + WT_ERR_MSG(session, EINVAL, + "The cache_resident flag is not compatible with LSM"); + WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval)); if (cval.val) F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); @@ -429,7 +434,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, * tracking macros handle cleaning up on failure. */ if (ret == 0) - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __lsm_tree_open(session, uri, &lsm_tree)); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); @@ -454,7 +459,7 @@ __lsm_tree_find(WT_SESSION_IMPL *session, { WT_LSM_TREE *lsm_tree; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); /* See if the tree is already open. */ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) @@ -548,7 +553,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep) conn = S2C(session); lsm_tree = NULL; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); /* Start the LSM manager thread if it isn't running. */ if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1)) @@ -608,7 +613,7 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session, { WT_DECL_RET; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); ret = __lsm_tree_find(session, uri, exclusive, treep); if (ret == WT_NOTFOUND) @@ -934,7 +939,7 @@ __wt_lsm_tree_drop( locked = 0; /* Get the LSM tree. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, name, 1, &lsm_tree)); WT_RET(ret); @@ -970,7 +975,7 @@ __wt_lsm_tree_drop( err: if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, WT_TRET(__lsm_tree_discard(session, lsm_tree, 0))); return (ret); } @@ -994,7 +999,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, locked = 0; /* Get the LSM tree. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, olduri, 1, &lsm_tree)); WT_RET(ret); @@ -1044,7 +1049,7 @@ err: if (locked) * Discard this LSM tree structure. The first operation on the renamed * tree will create a new one. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, WT_TRET(__lsm_tree_discard(session, lsm_tree, 0))); return (ret); } @@ -1067,7 +1072,7 @@ __wt_lsm_tree_truncate( locked = 0; /* Get the LSM tree. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, name, 1, &lsm_tree)); WT_RET(ret); @@ -1106,7 +1111,7 @@ err: if (locked) * the last good version of the metadata will be used, resulting * in a valid (not truncated) tree. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, WT_TRET(__lsm_tree_discard(session, lsm_tree, 0))); } return (ret); @@ -1204,7 +1209,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) /* Tell __wt_schema_worker not to look inside the LSM tree. */ *skip = 1; - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, name, 0, &lsm_tree)); WT_RET(ret); @@ -1390,7 +1395,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session, locked = 0; exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0; - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); WT_RET(ret); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 74a52ad7402..1145c329639 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -281,7 +281,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, } /* Stop if a running transaction needs the chunk. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); if (chunk->switch_txn == WT_TXN_NONE || !__wt_txn_visible_all(session, chunk->switch_txn)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c index a2e4a2f8e9f..227d0fa9a6c 100644 --- a/src/third_party/wiredtiger/src/meta/meta_table.c +++ b/src/third_party/wiredtiger/src/meta/meta_table.c @@ -61,6 +61,7 @@ __wt_metadata_cursor( WT_DECL_RET; const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL }; + int is_dead; saved_dhandle = session->dhandle; WT_ERR(__wt_metadata_open(session)); @@ -71,7 +72,11 @@ __wt_metadata_cursor( * We use the metadata a lot, so we have a handle cached; lock it and * increment the in-use counter once the cursor is open. */ - WT_ERR(__wt_session_lock_dhandle(session, 0, NULL)); + WT_ERR(__wt_session_lock_dhandle(session, 0, &is_dead)); + + /* The metadata should never be closed. */ + WT_ASSERT(session, !is_dead); + WT_ERR(__wt_curfile_create(session, NULL, cfg, 0, 0, cursorp)); __wt_cursor_dhandle_incr_use(session); diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index 62d4df47ff6..5e083d6df5e 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -386,10 +386,8 @@ __wt_meta_track_fileop( WT_RET(__meta_track_next(session, &trk)); trk->op = WT_ST_FILEOP; - if (olduri != NULL) - WT_RET(__wt_strdup(session, olduri, &trk->a)); - if (newuri != NULL) - WT_RET(__wt_strdup(session, newuri, &trk->b)); + WT_RET(__wt_strdup(session, olduri, &trk->a)); + WT_RET(__wt_strdup(session, newuri, &trk->b)); return (0); } diff --git a/src/third_party/wiredtiger/src/os_posix/os_alloc.c b/src/third_party/wiredtiger/src/os_posix/os_alloc.c index e0613197642..4d04f9ac579 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_alloc.c +++ b/src/third_party/wiredtiger/src/os_posix/os_alloc.c @@ -43,6 +43,12 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp) void *p; /* + * Defensive: if our caller doesn't handle errors correctly, ensure a + * free won't fail. + */ + *(void **)retp = NULL; + + /* * !!! * This function MUST handle a NULL WT_SESSION_IMPL handle. */ @@ -222,17 +228,6 @@ __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp) } /* - * __wt_strdup -- - * ANSI strdup function. - */ -int -__wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp) -{ - return (__wt_strndup( - session, str, (str == NULL) ? 0 : strlen(str), retp)); -} - -/* * __wt_free_int -- * ANSI free function. */ diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 5bef5cd2d2d..76d61642bfd 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -483,6 +483,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) switch (F_ISSET(mod, WT_PM_REC_MASK)) { case WT_PM_REC_EMPTY: /* Page is empty */ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ + case WT_PM_REC_REWRITE: /* Rewrite */ return (0); case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ break; @@ -3229,6 +3230,18 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__rec_split_init( session, r, page, page->pg_intl_recno, btree->maxintlpage)); + /* + * We need to mark this page as splitting, as this may be an in-memory + * split during a checkpoint. + */ + for (;;) { + F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret); + if (ret == 0) { + break; + } + __wt_yield(); + } + /* For each entry in the in-memory page... */ WT_INTL_FOREACH_BEGIN(session, page, ref) { /* Update the starting record number in case we split. */ @@ -3271,6 +3284,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) case WT_PM_REC_REPLACE: addr = &child->modify->mod_replace; break; + case WT_PM_REC_REWRITE: + break; WT_ILLEGAL_VALUE_ERR(session); } } else @@ -3309,6 +3324,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __rec_copy_incr(session, r, val); } WT_INTL_FOREACH_END; + F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED); + /* Write the remnant page. */ return (__rec_split_finish(session, r)); @@ -4041,6 +4058,18 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ r->cell_zero = 1; + /* + * We need to mark this page as splitting in order to ensure we don't + * deadlock when performing an in-memory split during a checkpoint. + */ + for (;;) { + F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret); + if (ret == 0) { + break; + } + __wt_yield(); + } + /* For each entry in the in-memory page... */ WT_INTL_FOREACH_BEGIN(session, page, ref) { /* @@ -4199,6 +4228,8 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __rec_key_state_update(r, ovfl_key); } WT_INTL_FOREACH_END; + F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED); + /* Write the remnant page. */ return (__rec_split_finish(session, r)); @@ -4836,6 +4867,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) case WT_PM_REC_EMPTY: /* Page deleted */ break; case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ + case WT_PM_REC_REWRITE: /* Rewrite */ /* * Discard the multiple replacement blocks. */ @@ -4914,7 +4946,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) bnd->dsk = NULL; mod->mod_multi_entries = 1; - F_SET(mod, WT_PM_REC_MULTIBLOCK); + F_SET(mod, WT_PM_REC_REWRITE); break; } @@ -5064,10 +5096,14 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * information (otherwise we might think the backing block is being * reused on a subsequent reconciliation where we want to free it). */ - if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK) + switch (F_ISSET(mod, WT_PM_REC_MASK)) { + case WT_PM_REC_MULTIBLOCK: + case WT_PM_REC_REWRITE: for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) multi->addr.reuse = 0; + break; + } /* * On error, discard blocks we've written, they're unreferenced by the diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c index 03097128ec2..56c6f7b0551 100644 --- a/src/third_party/wiredtiger/src/schema/schema_drop.c +++ b/src/third_party/wiredtiger/src/schema/schema_drop.c @@ -29,7 +29,7 @@ __drop_file( return (EINVAL); /* Close all btree handles associated with this file. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, force)); WT_RET(ret); @@ -59,7 +59,7 @@ __drop_colgroup( WT_DECL_RET; WT_TABLE *table; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_TABLE_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE)); /* If we can get the colgroup, detach it from the table. */ if ((ret = __wt_schema_get_colgroup( diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c index 414722652a0..2b645d5c666 100644 --- a/src/third_party/wiredtiger/src/schema/schema_open.c +++ b/src/third_party/wiredtiger/src/schema/schema_open.c @@ -44,7 +44,7 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table) char *cgconfig; u_int i; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_TABLE_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE)); if (table->cg_complete) return (0); @@ -407,7 +407,7 @@ __wt_schema_open_table(WT_SESSION_IMPL *session, table = NULL; tablename = NULL; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_TABLE_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE)); WT_ERR(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name)); diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c index 51281eccec5..3e619fe9cff 100644 --- a/src/third_party/wiredtiger/src/schema/schema_rename.c +++ b/src/third_party/wiredtiger/src/schema/schema_rename.c @@ -30,7 +30,7 @@ __rename_file( return (EINVAL); /* Close any btree handles in the file. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, 0)); WT_ERR(ret); diff --git a/src/third_party/wiredtiger/src/schema/schema_truncate.c b/src/third_party/wiredtiger/src/schema/schema_truncate.c index be9f6bcfb57..0124ec70ca2 100644 --- a/src/third_party/wiredtiger/src/schema/schema_truncate.c +++ b/src/third_party/wiredtiger/src/schema/schema_truncate.c @@ -33,7 +33,7 @@ __truncate_file(WT_SESSION_IMPL *session, const char *name) WT_RET(__wt_session_release_btree(session)); /* Close any btree handles in the file. */ - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_conn_dhandle_close_all(session, name, 0)); WT_RET(ret); diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c index e913fcfe69d..76b47a2ccff 100644 --- a/src/third_party/wiredtiger/src/schema/schema_worker.c +++ b/src/third_party/wiredtiger/src/schema/schema_worker.c @@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, * any open file handles, including checkpoints. */ if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) { - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_conn_dhandle_close_all( session, uri, 0)); WT_ERR(ret); @@ -60,11 +60,13 @@ __wt_schema_worker(WT_SESSION_IMPL *session, WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) - /* TODO: Decode checkpoint from cfg. */ - WT_WITH_DHANDLE_LOCK(session, + } else if (ret == EBUSY) { + WT_ASSERT(session, !FLD_ISSET( + open_flags, WT_DHANDLE_EXCLUSIVE)); + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_conn_btree_apply_single_ckpt( session, uri, file_func, cfg)); + } WT_ERR(ret); } } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index ac24ae18c1d..2aa8e924302 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -546,12 +546,12 @@ __session_salvage(WT_SESSION *wt_session, const char *uri, const char *config) session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, salvage, config, cfg); + /* Block out checkpoints to avoid spurious EBUSY errors. */ - __wt_spin_lock(session, &S2C(session)->checkpoint_lock); - WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_worker(session, uri, __wt_salvage, - NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE)); - __wt_spin_unlock(session, &S2C(session)->checkpoint_lock); + WT_WITH_CHECKPOINT_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret = + __wt_schema_worker(session, uri, __wt_salvage, + NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE))); err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -605,15 +605,11 @@ __session_truncate(WT_SESSION *wt_session, "the truncate method should not specify any" "target after the log: URI prefix."); ret = __wt_log_truncate_files(session, start, cfg); - } else { + } else /* Wait for checkpoints to avoid EBUSY errors. */ - __wt_spin_lock(session, - &S2C(session)->checkpoint_lock); - WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_truncate(session, uri, cfg)); - __wt_spin_unlock(session, - &S2C(session)->checkpoint_lock); - } + WT_WITH_CHECKPOINT_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_truncate(session, uri, cfg))); goto done; } @@ -717,11 +713,10 @@ __session_upgrade(WT_SESSION *wt_session, const char *uri, const char *config) SESSION_API_CALL(session, upgrade, config, cfg); /* Block out checkpoints to avoid spurious EBUSY errors. */ - __wt_spin_lock(session, &S2C(session)->checkpoint_lock); - WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_worker(session, uri, __wt_upgrade, - NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE)); - __wt_spin_unlock(session, &S2C(session)->checkpoint_lock); + WT_WITH_CHECKPOINT_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_worker(session, uri, __wt_upgrade, + NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE))); err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -740,11 +735,10 @@ __session_verify(WT_SESSION *wt_session, const char *uri, const char *config) SESSION_API_CALL(session, verify, config, cfg); /* Block out checkpoints to avoid spurious EBUSY errors. */ - __wt_spin_lock(session, &S2C(session)->checkpoint_lock); - WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_worker(session, uri, __wt_verify, - NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY)); - __wt_spin_unlock(session, &S2C(session)->checkpoint_lock); + WT_WITH_CHECKPOINT_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, + ret = __wt_schema_worker(session, uri, __wt_verify, + NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY))); err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -914,14 +908,12 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) * here to ensure we don't get into trouble. */ WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1); - __wt_spin_lock(session, &S2C(session)->checkpoint_lock); - ret = __wt_txn_checkpoint(session, cfg); + WT_WITH_CHECKPOINT_LOCK(session, + ret = __wt_txn_checkpoint(session, cfg)); WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0); - __wt_spin_unlock(session, &S2C(session)->checkpoint_lock); - err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK); API_END_RET_NOTFOUND_MAP(session, ret); @@ -953,7 +945,7 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, *sessionp = NULL; - WT_RET(__wt_open_session(conn, NULL, NULL, &session)); + WT_RET(__wt_open_session(conn, NULL, NULL, open_metadata, &session)); session->name = name; /* @@ -971,19 +963,6 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, if (!uses_dhandles) F_SET(session, WT_SESSION_NO_DATA_HANDLES); - /* - * Acquiring the metadata handle requires the schema lock; we've seen - * problems in the past where a worker thread has acquired the schema - * lock unexpectedly, relatively late in the run, and deadlocked. Be - * defensive, get it now. The metadata file may not exist when the - * connection first creates its default session or the shared cache - * pool creates its sessions, let our caller decline this work. - */ - if (open_metadata) { - WT_ASSERT(session, !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); - WT_RET(__wt_metadata_open(session)); - } - *sessionp = session; return (0); } @@ -995,7 +974,7 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, */ int __wt_open_session(WT_CONNECTION_IMPL *conn, - WT_EVENT_HANDLER *event_handler, const char *config, + WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp) { static const WT_SESSION stds = { @@ -1131,5 +1110,20 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, WT_STAT_FAST_CONN_INCR(session, session_open); err: __wt_spin_unlock(session, &conn->api_lock); - return (ret); + WT_RET(ret); + + /* + * Acquiring the metadata handle requires the schema lock; we've seen + * problems in the past where a session has acquired the schema lock + * unexpectedly, relatively late in the run, and deadlocked. Be + * defensive, get it now. The metadata file may not exist when the + * connection first creates its default session or the shared cache + * pool creates its sessions, let our caller decline this work. + */ + if (open_metadata) { + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); + WT_RET(__wt_metadata_open(session_ret)); + } + + return (0); } diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c index ce5f95a40d0..720f40e8d11 100644 --- a/src/third_party/wiredtiger/src/session/session_dhandle.c +++ b/src/third_party/wiredtiger/src/session/session_dhandle.c @@ -31,97 +31,139 @@ __session_add_dhandle( if (dhandle_cachep != NULL) *dhandle_cachep = dhandle_cache; - (void)WT_ATOMIC_ADD4(session->dhandle->session_ref, 1); - /* Sweep the handle list to remove any dead handles. */ return (__session_dhandle_sweep(session)); } /* * __wt_session_lock_dhandle -- - * Try to lock a handle that is cached in this session. This is the fast - * path that tries to lock a handle without the need for the schema lock. + * Return when the current data handle is either (a) open with the + * requested lock mode; or (b) closed and write locked. If exclusive + * access is requested and cannot be granted immediately because the + * handle is in use, fail with EBUSY. + * + * Here is a brief summary of how different operations synchronize using + * either the schema lock, handle locks or handle flags: * - * If the handle can't be locked in the required state, release it and - * fail with WT_NOTFOUND: we have to take the slow path after acquiring - * the schema lock. + * open -- one thread gets the handle exclusive, reverts to a shared + * handle lock once the handle is open; + * bulk load -- sets bulk and exclusive; + * salvage, truncate, update, verify -- hold the schema lock, + * get the handle exclusive, set a "special" flag; + * sweep -- gets a write lock on the handle, doesn't set exclusive + * + * The principle is that some application operations can cause other + * application operations to fail (so attempting to open a cursor on a + * file while it is being bulk-loaded will fail), but internal or + * database-wide operations should not prevent application-initiated + * operations. For example, attempting to verify a file should not fail + * because the sweep server happens to be in the process of closing that + * file. */ int -__wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, int *deadp) +__wt_session_lock_dhandle( + WT_SESSION_IMPL *session, uint32_t flags, int *is_deadp) { - enum { NOLOCK, READLOCK, WRITELOCK } locked; WT_BTREE *btree; WT_DATA_HANDLE *dhandle; - uint32_t special_flags; + WT_DECL_RET; + int is_open, lock_busy, want_exclusive; + + *is_deadp = 0; - btree = S2BT(session); dhandle = session->dhandle; - locked = NOLOCK; - if (deadp != NULL) - *deadp = 0; + btree = dhandle->handle; + lock_busy = 0; + want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE) ? 1 : 0; /* - * Special operation flags will cause the handle to be reopened. - * For example, a handle opened with WT_BTREE_BULK cannot use the same - * internal data structures as a handle opened for ordinary access. + * Check that the handle is open. We've already incremented + * the reference count, so once the handle is open it won't be + * closed by another thread. + * + * If we can see the WT_DHANDLE_OPEN flag set while holding a + * lock on the handle, then it's really open and we can start + * using it. Alternatively, if we can get an exclusive lock + * and WT_DHANDLE_OPEN is still not set, we need to do the open. */ - special_flags = LF_ISSET(WT_BTREE_SPECIAL_FLAGS); - WT_ASSERT(session, - special_flags == 0 || LF_ISSET(WT_DHANDLE_EXCLUSIVE)); + for (;;) { + /* If the handle is dead, give up. */ + if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { + *is_deadp = 1; + return (0); + } + + /* + * If the handle is already open for a special operation, + * give up. + */ + if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) + return (EBUSY); - if (LF_ISSET(WT_DHANDLE_EXCLUSIVE)) { /* - * Try to get an exclusive handle lock and fail immediately if - * it's unavailable. We don't expect exclusive operations on - * trees to be mixed with ordinary cursor access, but if there - * is a use case in the future, we could make blocking here - * configurable. + * If the handle is open, get a read lock and recheck. * - * Special flags will cause the handle to be reopened, which - * will get the necessary lock, so don't bother here. + * Wait for a read lock if we want exclusive access and failed + * to get it: the sweep server may be closing this handle, and + * we need to wait for it to release its lock. If we want + * exclusive access and find the handle open once we get the + * read lock, give up: some other thread has it locked for real. */ - if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) || special_flags == 0) { - WT_RET(__wt_try_writelock(session, dhandle->rwlock)); + if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && + (!want_exclusive || lock_busy)) { + WT_RET(__wt_readlock(session, dhandle->rwlock)); + if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { + *is_deadp = 1; + WT_RET( + __wt_readunlock(session, dhandle->rwlock)); + return (0); + } + + is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN) ? 1 : 0; + if (is_open && !want_exclusive) + return (0); + WT_RET(__wt_readunlock(session, dhandle->rwlock)); + } else + is_open = 0; + + /* + * It isn't open or we want it exclusive: try to get an + * exclusive lock. There is some subtlety here: if we race + * with another thread that successfully opens the file, we + * don't want to block waiting to get exclusive access. + */ + if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) { + if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { + *is_deadp = 1; + WT_RET( + __wt_writeunlock(session, dhandle->rwlock)); + return (0); + } + + /* + * If it was opened while we waited, drop the write + * lock and get a read lock instead. + */ + if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && + !want_exclusive) { + lock_busy = 0; + WT_RET( + __wt_writeunlock(session, dhandle->rwlock)); + continue; + } + + /* We have an exclusive lock, we're done. */ F_SET(dhandle, WT_DHANDLE_EXCLUSIVE); - locked = WRITELOCK; + WT_ASSERT(session, !F_ISSET(dhandle, WT_DHANDLE_DEAD)); + return (0); } - } else if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) - return (EBUSY); - else { - WT_RET(__wt_readlock(session, dhandle->rwlock)); - locked = READLOCK; - } - - /* - * At this point, we have the requested lock -- if that is all that was - * required, we're done. Otherwise, check that the handle is open and - * that no special flags are required. - */ - if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { - WT_ASSERT(session, deadp != NULL); - *deadp = 1; - } else if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) || - (F_ISSET(dhandle, WT_DHANDLE_OPEN) && special_flags == 0)) - return (0); + if (ret != EBUSY || (is_open && want_exclusive)) + return (ret); + lock_busy = 1; - /* - * The handle needs to be opened. If we locked the handle above, - * unlock it before returning. - */ - switch (locked) { - case NOLOCK: - break; - case READLOCK: - WT_RET(__wt_readunlock(session, dhandle->rwlock)); - break; - case WRITELOCK: - F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - WT_RET(__wt_writeunlock(session, dhandle->rwlock)); - break; + /* Give other threads a chance to make progress. */ + __wt_yield(); } - - /* Treat an unopened handle just like a non-existent handle. */ - return (WT_NOTFOUND); } /* @@ -131,22 +173,21 @@ __wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, int *deadp) int __wt_session_release_btree(WT_SESSION_IMPL *session) { - enum { NOLOCK, READLOCK, WRITELOCK } locked; WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; + int write_locked; btree = S2BT(session); dhandle = session->dhandle; + write_locked = F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ? 1 : 0; - locked = F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ? WRITELOCK : READLOCK; /* * If we had special flags set, close the handle so that future access * can get a handle without special flags. */ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_FORCE)) { - WT_WITH_DHANDLE_LOCK(session, - ret = __wt_conn_btree_sync_and_close(session, 0, 1)); + ret = __wt_conn_btree_sync_and_close(session, 0, 1); F_CLR(dhandle, WT_DHANDLE_DISCARD_FORCE); } else if (F_ISSET(dhandle, WT_DHANDLE_DISCARD) || F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) { @@ -155,19 +196,12 @@ __wt_session_release_btree(WT_SESSION_IMPL *session) F_CLR(dhandle, WT_DHANDLE_DISCARD); } - if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE)) + if (write_locked) F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - switch (locked) { - case NOLOCK: - break; - case READLOCK: - WT_TRET(__wt_readunlock(session, dhandle->rwlock)); - break; - case WRITELOCK: - WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); - break; - } + WT_TRET(write_locked ? + __wt_writeunlock(session, dhandle->rwlock): + __wt_readunlock(session, dhandle->rwlock)); session->dhandle = NULL; return (ret); @@ -211,7 +245,6 @@ retry: WT_RET(__wt_meta_checkpoint_last_name( } ret = __wt_session_get_btree(session, uri, checkpoint, cfg, flags); - __wt_free(session, checkpoint); /* @@ -248,7 +281,6 @@ __session_discard_btree( dhandle_cache, __wt_data_handle_cache, hashl); (void)WT_ATOMIC_SUB4(dhandle_cache->dhandle->session_ref, 1); - __wt_overwrite_and_free(session, dhandle_cache); } @@ -297,8 +329,10 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) if (dhandle != session->dhandle && dhandle->session_inuse == 0 && (F_ISSET(dhandle, WT_DHANDLE_DEAD) || - now - dhandle->timeofdeath > conn->sweep_idle_time)) { + (dhandle->timeofdeath != 0 && + now - dhandle->timeofdeath > conn->sweep_idle_time))) { WT_STAT_FAST_CONN_INCR(session, dh_session_handles); + WT_ASSERT(session, !WT_IS_METADATA(dhandle)); __session_discard_btree(session, dhandle_cache); } dhandle_cache = dhandle_cache_next; @@ -307,42 +341,43 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) } /* - * __session_dhandle_find -- + * __session_dhandle_find_shared -- * Search for a data handle in the connection and add it to a session's * cache. Since the data handle isn't locked, this must be called holding * the handle list lock, and we must increment the handle's reference * count before releasing it. */ static int -__session_dhandle_find(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, uint32_t flags) +__session_dhandle_find_shared( + WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { - WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint, flags)); - return (__session_add_dhandle(session, NULL)); + WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint)); + (void)WT_ATOMIC_ADD4(session->dhandle->session_ref, 1); + return (0); } - /* - * __wt_session_get_btree -- - * Get a btree handle for the given name, set session->dhandle. + * __session_dhandle_find -- + * Search for a data handle, first in the session cache, then in the + * connection. */ -int -__wt_session_get_btree(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags) +static int +__session_dhandle_find( + WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { WT_DATA_HANDLE *dhandle; WT_DATA_HANDLE_CACHE *dhandle_cache; WT_DECL_RET; uint64_t bucket; - int is_dead; - - WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES)); - WT_ASSERT(session, !LF_ISSET(WT_DHANDLE_HAVE_REF)); - - dhandle = NULL; bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; - SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) { +retry: SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) { dhandle = dhandle_cache->dhandle; + if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { + WT_ASSERT(session, !WT_IS_METADATA(dhandle)); + __session_discard_btree(session, dhandle_cache); + /* We deleted our entry, retry from the start. */ + goto retry; + } if (strcmp(uri, dhandle->name) != 0) continue; if (checkpoint == NULL && dhandle->checkpoint == NULL) @@ -352,71 +387,97 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, break; } - if (dhandle_cache != NULL) + if (dhandle_cache != NULL) { session->dhandle = dhandle; - else { - /* - * We didn't find a match in the session cache, now search the - * shared handle list and cache any handle we find. - */ - WT_WITH_DHANDLE_LOCK(session, ret = - __session_dhandle_find(session, uri, checkpoint, flags)); - dhandle = (ret == 0) ? session->dhandle : NULL; - WT_RET_NOTFOUND_OK(ret); + return (0); } - if (dhandle != NULL) { - /* Try to lock the handle; if this succeeds, we're done. */ - if ((ret = - __wt_session_lock_dhandle(session, flags, &is_dead)) == 0) - goto done; + /* + * We didn't find a match in the session cache, search the shared + * handle list and cache the handle we find. + */ + WT_WITH_HANDLE_LIST_LOCK(session, ret = + __session_dhandle_find_shared(session, uri, checkpoint)); + if (ret == 0) + ret = __session_add_dhandle(session, NULL); - /* Propagate errors we don't expect. */ - if (ret != WT_NOTFOUND && ret != EBUSY) - return (ret); + return (ret); +} + +/* + * __wt_session_get_btree -- + * Get a btree handle for the given name, set session->dhandle. + */ +int +__wt_session_get_btree(WT_SESSION_IMPL *session, + const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags) +{ + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + int is_dead; + + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES)); + + for (;;) { + WT_RET(__session_dhandle_find(session, uri, checkpoint)); + dhandle = session->dhandle; + + /* Try to lock the handle. */ + WT_RET(__wt_session_lock_dhandle(session, flags, &is_dead)); + if (is_dead) + continue; + + /* If the handle is open in the mode we want, we're done. */ + if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) || + (F_ISSET(dhandle, WT_DHANDLE_OPEN) && + !LF_ISSET(WT_BTREE_SPECIAL_FLAGS))) + break; + + WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE)); /* - * Don't try harder to get the handle if we're only checking - * for locks or our caller hasn't allowed us to take the schema - * lock - they do so on purpose and will handle error returns. + * For now, we need the schema lock and handle list locks to + * open a file for real. + * + * Code needing exclusive access (such as drop or verify) + * assumes that it can close all open handles, then open an + * exclusive handle on the active tree and no other threads can + * reopen handles in the meantime. A combination of the schema + * and handle list locks are used to enforce this. */ - if ((LF_ISSET(WT_DHANDLE_LOCK_ONLY) && ret == EBUSY) || - (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) && - F_ISSET(session, - WT_SESSION_HANDLE_LIST_LOCKED | WT_SESSION_TABLE_LOCKED))) - return (ret); + if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) || + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { + F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); + WT_RET(__wt_writeunlock(session, dhandle->rwlock)); - /* If we found the handle and it isn't dead, reopen it. */ - if (is_dead) { - __session_discard_btree(session, dhandle_cache); - dhandle_cache = NULL; - session->dhandle = dhandle = NULL; - } else - LF_SET(WT_DHANDLE_HAVE_REF); - } + WT_WITH_SCHEMA_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = + __wt_session_get_btree( + session, uri, checkpoint, cfg, flags))); - /* - * Acquire the schema lock and the data handle lock, find and/or - * open the handle. - * - * We need the schema lock for this call so that if we lock a handle in - * order to open it, that doesn't race with a schema-changing operation - * such as drop. - */ - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_DHANDLE_LOCK(session, ret = - __wt_conn_btree_get(session, uri, checkpoint, cfg, flags))); - WT_RET(ret); + return (ret); + } + + /* Open the handle. */ + if ((ret = __wt_conn_btree_open(session, cfg, flags)) == 0 && + LF_ISSET(WT_DHANDLE_EXCLUSIVE)) + break; - if (!LF_ISSET(WT_DHANDLE_HAVE_REF)) - WT_RET(__session_add_dhandle(session, NULL)); + /* + * If we got the handle exclusive to open it but only want + * ordinary access, drop our lock and retry the open. + */ + F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); + WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); + WT_RET(ret); + } + WT_ASSERT(session, !F_ISSET(dhandle, WT_DHANDLE_DEAD)); WT_ASSERT(session, LF_ISSET(WT_DHANDLE_LOCK_ONLY) || - (F_ISSET(session->dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(session->dhandle, WT_DHANDLE_DEAD))); + F_ISSET(dhandle, WT_DHANDLE_OPEN)); -done: WT_ASSERT(session, LF_ISSET(WT_DHANDLE_EXCLUSIVE) == - F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE)); + WT_ASSERT(session, LF_ISSET(WT_DHANDLE_EXCLUSIVE) == + F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE)); return (0); } diff --git a/src/third_party/wiredtiger/src/support/mutex.c b/src/third_party/wiredtiger/src/support/mutex.c deleted file mode 100644 index fa85cfc33d5..00000000000 --- a/src/third_party/wiredtiger/src/support/mutex.c +++ /dev/null @@ -1,255 +0,0 @@ -/*- - * Copyright (c) 2014-2015 MongoDB, Inc. - * Copyright (c) 2008-2014 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING - -/* - * __wt_spin_lock_register_lock -- - * Add a lock to the connection's list. - */ -int -__wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) -{ - WT_CONNECTION_IMPL *conn; - u_int i; - - /* - * There is a spinlock we initialize before we have a connection, the - * global library lock. In that case, the session will be NULL and - * we can't track the lock. - */ - if (session == NULL) - return (0); - - conn = S2C(session); - - for (i = 0; i < WT_SPINLOCK_MAX; i++) - if (conn->spinlock_list[i] == NULL && - WT_ATOMIC_CAS(conn->spinlock_list[i], NULL, t)) - return (0); - - WT_RET_MSG(session, ENOMEM, - "spinlock connection registry failed, increase the connection's " - "spinlock list size"); -} - -/* - * __wt_spin_lock_unregister_lock -- - * Remove a lock from the connection's list. - */ -void -__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) -{ - WT_CONNECTION_IMPL *conn; - u_int i; - - conn = S2C(session); - - for (i = 0; i < WT_SPINLOCK_MAX; i++) - if (conn->spinlock_list[i] == t) - conn->spinlock_list[i] = NULL; - - /* - * XXX - * The statistics thread reads through this array, there's a possible - * race: if that thread reads the pointer then goes to sleep, then we - * free the spinlock, then the statistics thread wakes up, it can read - * free'd memory. - * - * This is performance debugging code, so we're not fixing the race for - * now, minimize the window. - */ - WT_FULL_BARRIER(); -} - -/* - * __spin_lock_next_id -- - * Return the next spinlock caller ID. - */ -static int -__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp) -{ - static int lock_id = 0, next_id = 0; - WT_DECL_RET; - - /* If we've ever registered this location, we already have an ID. */ - if (*idp != WT_SPINLOCK_REGISTER) - return (0); - - /* - * We can't use the global spinlock to lock the ID allocation (duh!), - * use a CAS instruction to serialize access to a local variable. - * This work only gets done once per library instantiation, there - * isn't a performance concern. - */ - while (!WT_ATOMIC_CAS(lock_id, 0, 1)) - __wt_yield(); - - /* Allocate a blocking ID for this location. */ - if (*idp == WT_SPINLOCK_REGISTER) { - if (next_id < WT_SPINLOCK_MAX_LOCATION_ID) - *idp = next_id++; - else - WT_ERR_MSG(session, ENOMEM, - "spinlock caller location registry failed, " - "increase the connection's blocking matrix size"); - } - -err: WT_PUBLISH(lock_id, 0); - return (ret); -} - -/* - * __wt_spin_lock_register_caller -- - * Register a spin-lock caller's location information in the blocking - * matrix. - */ -int -__wt_spin_lock_register_caller(WT_SESSION_IMPL *session, - const char *name, const char *file, int line, int *idp) -{ - WT_CONNECTION_IMPL *conn; - WT_CONNECTION_STATS_SPINLOCK *p; - - conn = S2C(session); - - /* - * The caller's location ID is a static offset into a per-connection - * structure, and that has problems: first, if there are multiple - * connections, we'll need to hold some kind of lock to avoid racing - * when setting that value, and second, if/when there are multiple - * connections and/or a single connection is closed and re-opened, the - * variable may be initialized and underlying connection information - * may not. - * - * First, allocate a location ID if needed. - */ - WT_RET(__spin_lock_next_id(session, idp)); - - /* - * Add the caller's information to the blocking matrix. We could race - * here (if two threads of control register the same lock at the same - * time), but we don't care as both threads are setting the identical - * information. - */ - p = &conn->spinlock_block[*idp]; - p->name = name; - if ((p->file = strrchr(file, '/')) == NULL) - p->file = file; - else - ++p->file; - p->line = line; - return (0); -} - -/* - * __wt_statlog_dump_spinlock -- - * Log the spin-lock statistics. - */ -int -__wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag) -{ - WT_SPINLOCK *spin; - WT_CONNECTION_STATS_SPINLOCK *p, *t; - uint64_t block_manager, btree_page, ignore; - u_int i, j; - - /* - * Ignore rare acquisition of a spinlock using a base value of 10 per - * second so we don't create graphs we don't care about. - */ - ignore = (uint64_t)(conn->stat_usecs / 1000000) * 10; - - /* Output the number of times each spinlock was acquired. */ - block_manager = btree_page = 0; - for (i = 0; i < WT_ELEMENTS(conn->spinlock_list); ++i) { - if ((spin = conn->spinlock_list[i]) == NULL) - continue; - - /* - * There are two sets of spinlocks we aggregate, the btree page - * locks and the block manager per-file locks. The reason is - * the block manager locks grow with the number of files open - * (and LSM and bloom filters can open a lot of files), and - * there are 16 btree page locks and splitting them out has not - * historically been that informative. - */ - if (strcmp(spin->name, "block manager") == 0) { - block_manager += spin->counter; - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) - spin->counter = 0; - continue; - } - if (strcmp(spin->name, "btree page") == 0) { - btree_page += spin->counter; - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) - spin->counter = 0; - continue; - } - - WT_RET(__wt_fprintf(session, conn->stat_fp, - "%s %" PRIu64 " %s spinlock %s: acquisitions\n", - conn->stat_stamp, - spin->counter <= ignore ? 0 : spin->counter, - tag, spin->name)); - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) - spin->counter = 0; - } - WT_RET(__wt_fprintf(session, conn->stat_fp, - "%s %" PRIu64 " %s spinlock %s: acquisitions\n", - conn->stat_stamp, - block_manager <= ignore ? 0 : block_manager, - tag, "block manager")); - WT_RET(__wt_fprintf(session, conn->stat_fp, - "%s %" PRIu64 " %s spinlock %s: acquisitions\n", - conn->stat_stamp, - btree_page <= ignore ? 0 : btree_page, - tag, "btree page")); - - /* - * Output the number of times each location acquires its spinlock and - * the blocking matrix. - */ - for (i = 0; i < WT_ELEMENTS(conn->spinlock_block); ++i) { - p = &conn->spinlock_block[i]; - if (p->name == NULL) - continue; - - WT_RET(__wt_fprintf(session, conn->stat_fp, - "%s %d %s spinlock %s acquired by %s(%d)\n", - conn->stat_stamp, - p->total <= ignore ? 0 : p->total, - tag, - p->name, p->file, p->line)); - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) - p->total = 0; - - for (j = 0; j < WT_ELEMENTS(conn->spinlock_block); ++j) { - t = &conn->spinlock_block[j]; - if (t->name == NULL) - continue; - - WT_RET(__wt_fprintf(session, conn->stat_fp, - "%s %d %s spinlock %s: %s(%d) blocked by %s(%d)\n", - conn->stat_stamp, - p->blocked[j] <= ignore ? 0 : p->blocked[j], - tag, - p->name, p->file, p->line, - t->file, t->line)); - if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR)) - p->blocked[j] = 0; - } - } - - WT_FULL_BARRIER(); /* Minimize the window. */ - return (0); -} - -#endif /* SPINLOCK_PTHREAD_MUTEX_LOGGING */ diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index fb0a4b7fa6d..05b27cd9a56 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -60,50 +60,29 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) txn = &session->txn; txn_state = &S2C(session)->txn_global.states[session->id]; - if (txn_state->snap_min != WT_TXN_NONE) { - WT_ASSERT(session, - session->txn.isolation == TXN_ISO_READ_UNCOMMITTED || - !__wt_txn_visible_all(session, txn_state->snap_min)); - txn_state->snap_min = WT_TXN_NONE; - } - F_CLR(txn, TXN_HAS_SNAPSHOT); -} + WT_ASSERT(session, + txn_state->snap_min == WT_TXN_NONE || + session->txn.isolation == TXN_ISO_READ_UNCOMMITTED || + !__wt_txn_visible_all(session, txn_state->snap_min)); -/* - * __wt_txn_update_oldest -- - * Sweep the running transactions to update the oldest ID required. - */ -void -__wt_txn_update_oldest(WT_SESSION_IMPL *session) -{ - /* - * !!! - * If a data-source is calling the WT_EXTENSION_API.transaction_oldest - * method (for the oldest transaction ID not yet visible to a running - * transaction), and then comparing that oldest ID against committed - * transactions to see if updates for a committed transaction are still - * visible to running transactions, the oldest transaction ID may be - * the same as the last committed transaction ID, if the transaction - * state wasn't refreshed after the last transaction committed. Push - * past the last committed transaction. - */ - __wt_txn_refresh(session, 0); + txn_state->snap_min = WT_TXN_NONE; + F_CLR(txn, TXN_HAS_SNAPSHOT); } /* - * __wt_txn_refresh -- - * Allocate a transaction ID and/or a snapshot. + * __wt_txn_get_snapshot -- + * Allocate a snapshot. */ void -__wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) +__wt_txn_get_snapshot(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s, *txn_state; - uint64_t current_id, id, oldest_id; + uint64_t ckpt_id, current_id, id; uint64_t prev_oldest_id, snap_min; - uint32_t i, n, oldest_session, session_cnt; + uint32_t i, n, session_cnt; int32_t count; conn = S2C(session); @@ -116,10 +95,9 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { - if (get_snapshot) { - txn_state->snap_min = current_id; - __txn_sort_snapshot(session, 0, current_id); - } + txn_state->snap_min = current_id; + __txn_sort_snapshot(session, 0, current_id); + /* Check that the oldest ID has not moved in the meantime. */ if (prev_oldest_id == txn_global->oldest_id && txn_global->scan_count == 0) @@ -139,15 +117,14 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) /* The oldest ID cannot change until the scan count goes to zero. */ prev_oldest_id = txn_global->oldest_id; - current_id = oldest_id = snap_min = txn_global->current; - oldest_session = 0; + current_id = snap_min = txn_global->current; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); + ckpt_id = txn_global->checkpoint_id; for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* Skip the checkpoint transaction; it is never read from. */ - if (txn_global->checkpoint_id != WT_TXN_NONE && - s->id == txn_global->checkpoint_id) + if (ckpt_id != WT_TXN_NONE && ckpt_id == s->id) continue; /* @@ -163,18 +140,104 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) if (s != txn_state && (id = s->id) != WT_TXN_NONE && TXNID_LE(prev_oldest_id, id)) { - if (get_snapshot) - txn->snapshot[n++] = id; + txn->snapshot[n++] = id; if (TXNID_LT(id, snap_min)) snap_min = id; } + } + + /* + * If we got a new snapshot, update the published snap_min for this + * session. + */ + WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min)); + WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); + txn_state->snap_min = snap_min; + + /* Update the last running ID if we have a much newer value. */ + if (snap_min > txn_global->last_running + 100) + txn_global->last_running = snap_min; + + WT_ASSERT(session, txn_global->scan_count > 0); + (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); + + __txn_sort_snapshot(session, n, current_id); +} + +/* + * __wt_txn_update_oldest -- + * Sweep the running transactions to update the oldest ID required. + * !!! + * If a data-source is calling the WT_EXTENSION_API.transaction_oldest + * method (for the oldest transaction ID not yet visible to a running + * transaction), and then comparing that oldest ID against committed + * transactions to see if updates for a committed transaction are still + * visible to running transactions, the oldest transaction ID may be + * the same as the last committed transaction ID, if the transaction + * state wasn't refreshed after the last transaction committed. Push + * past the last committed transaction. +*/ +void +__wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *oldest_session; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *s; + uint64_t ckpt_id, current_id, id, oldest_id, prev_oldest_id, snap_min; + uint32_t i, session_cnt; + int32_t count; + int last_running_moved; + + conn = S2C(session); + txn_global = &conn->txn_global; + + current_id = snap_min = txn_global->current; + oldest_session = NULL; + prev_oldest_id = txn_global->oldest_id; + + /* + * For pure read-only workloads, or if the update isn't forced and the + * oldest ID isn't too far behind, avoid scanning. + */ + if (prev_oldest_id == current_id || + (!force && TXNID_LT(current_id, prev_oldest_id + 100))) + return; + + /* + * We're going to scan. Increment the count of scanners to prevent the + * oldest ID from moving forwards. Spin if the count is negative, + * which indicates that some thread is moving the oldest ID forwards. + */ + do { + if ((count = txn_global->scan_count) < 0) + WT_PAUSE(); + } while (count < 0 || + !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1)); + + /* The oldest ID cannot change until the scan count goes to zero. */ + prev_oldest_id = txn_global->oldest_id; + current_id = oldest_id = snap_min = txn_global->current; + + /* Walk the array of concurrent transactions. */ + WT_ORDERED_READ(session_cnt, conn->session_cnt); + ckpt_id = txn_global->checkpoint_id; + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + /* Skip the checkpoint transaction; it is never read from. */ + if (ckpt_id != WT_TXN_NONE && ckpt_id == s->id) + continue; /* - * Ignore the session's own snap_min: we are about to update - * it. + * Update the oldest ID. + * + * Ignore: IDs older than the oldest ID we saw. This can happen + * if we race with a thread that is allocating an ID -- the ID + * will not be used because the thread will keep spinning until + * it gets a valid one. */ - if (get_snapshot && s == txn_state) - continue; + if ((id = s->id) != WT_TXN_NONE && + TXNID_LE(prev_oldest_id, id) && TXNID_LT(id, snap_min)) + snap_min = id; /* * !!! @@ -187,49 +250,31 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) if ((id = s->snap_min) != WT_TXN_NONE && TXNID_LT(id, oldest_id)) { oldest_id = id; - oldest_session = i; + oldest_session = &conn->sessions[i]; } } if (TXNID_LT(snap_min, oldest_id)) oldest_id = snap_min; - if (txn->id != WT_TXN_NONE && TXNID_LT(txn->id, oldest_id)) - oldest_id = txn->id; - - /* - * If we got a new snapshot, update the published snap_min for this - * session. - */ - if (get_snapshot) { - WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min)); - WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); - txn_state->snap_min = snap_min; - } - /* - * Update the last running ID if we have a much newer value or we are - * forcing an update. - */ - if (!get_snapshot || snap_min > txn_global->last_running + 100) + /* Update the last running ID. */ + if (TXNID_LT(txn_global->last_running, snap_min)) { txn_global->last_running = snap_min; + last_running_moved = 1; + } else + last_running_moved = 0; - /* - * Update the oldest ID if we have a newer ID and we can get exclusive - * access. During normal snapshot refresh, only do this if we have a - * much newer value. Once we get exclusive access, do another pass to - * make sure nobody else is using an earlier ID. - */ + /* Update the oldest ID. */ if (TXNID_LT(prev_oldest_id, oldest_id) && - (!get_snapshot || oldest_id - prev_oldest_id > 100) && WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) { WT_ORDERED_READ(session_cnt, conn->session_cnt); + ckpt_id = txn_global->checkpoint_id; for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* * Skip the checkpoint transaction; it is never read * from. */ - if (txn_global->checkpoint_id != WT_TXN_NONE && - s->id == txn_global->checkpoint_id) + if (ckpt_id != WT_TXN_NONE && ckpt_id == s->id) continue; if ((id = s->id) != WT_TXN_NONE && @@ -244,23 +289,19 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) txn_global->scan_count = 0; } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && - current_id - oldest_id > 10000 && - txn_global->oldest_session != oldest_session) { + current_id - oldest_id > 10000 && last_running_moved && + oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %d [%s]" " with snap_min %" PRIu64 "\n", - oldest_id, oldest_session, - conn->sessions[oldest_session].lastop, - conn->sessions[oldest_session].txn.snap_min); - txn_global->oldest_session = oldest_session; + oldest_id, oldest_session->id, + oldest_session->lastop, + oldest_session->txn.snap_min); } WT_ASSERT(session, txn_global->scan_count > 0); (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); } - - if (get_snapshot) - __txn_sort_snapshot(session, n, current_id); } /* diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 45560ff897a..1ae593fd6be 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -185,7 +185,7 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], session->ckpt_handle[i].dhandle, ret = (*op)(session, cfg)); else - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_conn_btree_apply_single(session, session->ckpt_handle[i].name, NULL, op, cfg)); WT_RET(ret); @@ -376,7 +376,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_WITH_SCHEMA_LOCK(session, WT_WITH_TABLE_LOCK(session, - WT_WITH_DHANDLE_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret = __checkpoint_apply_all( session, cfg, __wt_checkpoint_list, NULL)))); WT_ERR(ret); @@ -387,7 +387,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * This is particularly important for compact, so that all dirty pages * can be fully written. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); /* Flush data-sources before we start the checkpoint. */ WT_ERR(__checkpoint_data_source(session, cfg)); @@ -411,7 +411,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); /* Acquire the schema lock. */ - F_SET(session, WT_SESSION_SCHEMA_LOCKED); + F_SET(session, WT_SESSION_LOCKED_SCHEMA); __wt_spin_lock(session, &conn->schema_lock); WT_ERR(__wt_meta_track_on(session)); @@ -568,8 +568,8 @@ err: /* __wt_free(session, session->ckpt_handle); session->ckpt_handle_allocated = session->ckpt_handle_next = 0; - if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { - F_CLR(session, WT_SESSION_SCHEMA_LOCKED); + if (F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { + F_CLR(session, WT_SESSION_LOCKED_SCHEMA); __wt_spin_unlock(session, &conn->schema_lock); } @@ -1057,7 +1057,7 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, session->dhandle->checkpoint == NULL); /* Should be holding the schema lock. */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); return (__checkpoint_worker(session, cfg, 1)); } @@ -1107,7 +1107,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final) * for active readers. */ if (!btree->modified && !bulk) { - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); return (__wt_txn_visible_all(session, btree->rec_max_txn) ? __wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY); } @@ -1123,7 +1123,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final) */ if (!final) WT_ASSERT(session, - bulk || F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + bulk || F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); need_tracking = !bulk && !final && !WT_META_TRACKING(session); if (need_tracking) diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 571754bf5bf..540b0528995 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -65,7 +65,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, "No file found with ID %u (max %u)", id, r->nfiles)); r->missing = 1; - } else if (LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) { + } else if (WT_LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) { /* * We're going to apply the operation. Get the cursor, opening * one if none is cached. @@ -423,7 +423,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0; /* We need a real session for recovery. */ - WT_RET(__wt_open_session(conn, NULL, NULL, &session)); + WT_RET(__wt_open_session(conn, NULL, NULL, 1, &session)); F_SET(session, WT_SESSION_NO_LOGGING); r.session = session; |