diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2015-08-12 20:56:25 +1000 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2015-08-12 20:56:25 +1000 |
commit | 7bb09c0377f5160857617c38ab07955f8f4b03f6 (patch) | |
tree | 2d7041f6e1cc121c743c368406485e39280b551c /src/third_party | |
parent | 6d9cbeb53eb9e6d39a24a8ac54b71e105483730b (diff) | |
download | mongo-7bb09c0377f5160857617c38ab07955f8f4b03f6.tar.gz |
Import wiredtiger-wiredtiger-2.6.1-500-g26d1ad2.tar.gz from wiredtiger branch mongodb-3.2
Diffstat (limited to 'src/third_party')
51 files changed, 1182 insertions, 551 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/Makefile.am b/src/third_party/wiredtiger/bench/wtperf/Makefile.am index 0630a27f640..15f151d84b2 100644 --- a/src/third_party/wiredtiger/bench/wtperf/Makefile.am +++ b/src/third_party/wiredtiger/bench/wtperf/Makefile.am @@ -5,7 +5,7 @@ LDADD = $(top_builddir)/libwiredtiger.la -lm noinst_PROGRAMS = wtperf wtperf_LDFLAGS = -static wtperf_SOURCES =\ - config.c misc.c track.c wtperf.c wtperf.h wtperf_opt.i + config.c misc.c track.c wtperf.c wtperf_truncate.c wtperf.h wtperf_opt.i TESTS = smoke.sh AM_TESTS_ENVIRONMENT = rm -rf WT_TEST ; mkdir WT_TEST ; diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c index 47e052d6055..721b41432cb 100644 --- a/src/third_party/wiredtiger/bench/wtperf/config.c +++ b/src/third_party/wiredtiger/bench/wtperf/config.c @@ -95,6 +95,8 @@ config_assign(CONFIG *dest, const CONFIG *src) *pstr = newstr; } } + + STAILQ_INIT(&dest->stone_head); return (0); } @@ -122,6 +124,7 @@ config_free(CONFIG *cfg) free(cfg->uris); } + cleanup_truncate_config(cfg); free(cfg->ckptthreads); free(cfg->popthreads); free(cfg->base_uri); @@ -243,6 +246,28 @@ config_threads(CONFIG *cfg, const char *config, size_t len) goto err; continue; } + if (STRING_MATCH("truncate", k.str, k.len)) { + if ((workp->truncate = v.val) != 1) + goto err; + /* There can only be one Truncate thread. */ + if (cfg->has_truncate != 0) { + goto err; + } + cfg->has_truncate = 1; + continue; + } + if (STRING_MATCH("truncate_pct", k.str, k.len)) { + if (v.val <= 0) + goto err; + workp->truncate_pct = (uint64_t)v.val; + continue; + } + if (STRING_MATCH("truncate_count", k.str, k.len)) { + if (v.val <= 0) + goto err; + workp->truncate_count = (uint64_t)v.val; + continue; + } goto err; } if (ret == WT_NOTFOUND) @@ -253,9 +278,21 @@ config_threads(CONFIG *cfg, const char *config, size_t len) scan = NULL; if (ret != 0) goto err; - - if (workp->insert == 0 && - workp->read == 0 && workp->update == 0) + if (workp->insert == 0 && workp->read == 0 && + workp->update == 0 && workp->truncate == 0) + goto err; + /* Why run with truncate if we don't want any truncation. */ + if (workp->truncate != 0 && + workp->truncate_pct == 0 && workp->truncate_count == 0) + goto err; + if (workp->truncate != 0 && + (workp->truncate_pct < 1 || workp->truncate_pct > 99)) + goto err; + /* Truncate should have its own exclusive thread. */ + if (workp->truncate != 0 && workp->threads > 1) + goto err; + if (workp->truncate != 0 && + (workp->insert > 0 || workp->read > 0 || workp->update > 0)) goto err; cfg->workers_cnt += (u_int)workp->threads; } @@ -640,9 +677,11 @@ config_print(CONFIG *cfg) for (i = 0, workp = cfg->workload; i < cfg->workload_cnt; ++i, ++workp) printf("\t\t%" PRId64 " threads (inserts=%" PRId64 - ", reads=%" PRId64 ", updates=%" PRId64 ")\n", + ", reads=%" PRId64 ", updates=%" PRId64 + ", truncates=% " PRId64 ")\n", workp->threads, - workp->insert, workp->read, workp->update); + workp->insert, workp->read, + workp->update, workp->truncate); } printf("\t" "Checkpoint threads, interval: %" PRIu32 ", %" PRIu32 "\n", diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-oplog.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-oplog.wtperf new file mode 100644 index 00000000000..34235f04518 --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/mongodb-oplog.wtperf @@ -0,0 +1,11 @@ +# wtperf options file to simulate populating a MongoDB oplog +conn_config="cache_size=2GB,checkpoint=(wait=60)" +table_config="type=file" +# Start with a small set of inserts in the populate phase. +icount=50000 +report_interval=5 +run_time=500 +populate_threads=1 +# Setup three threads to insert into the oplog +# Setup one thread to be doing truncates from the oplog +threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=50000)) diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-populate.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-populate.wtperf new file mode 100644 index 00000000000..4e4ae7500f0 --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-populate.wtperf @@ -0,0 +1,7 @@ +# Truncate workload population phase +conn_config="cache_size=2GB,checkpoint=(wait=60)" +table_config="type=file" +# Start with a small set of inserts in the populate phase. +icount=50000 +report_interval=5 +populate_threads=1 diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-workload.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-workload.wtperf new file mode 100644 index 00000000000..55e01dcd0dc --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/truncate-btree-workload.wtperf @@ -0,0 +1,9 @@ +# truncate workload. work phase +conn_config="cache_size=2GB,checkpoint=(wait=60)" +table_config="type=file" +create=false +report_interval=5 +run_time=500 +# Setup three threads to insert into the oplog +# Setup one thread to be doing truncates from the oplog +threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=50000)) diff --git a/src/third_party/wiredtiger/bench/wtperf/track.c b/src/third_party/wiredtiger/bench/wtperf/track.c index 8ea4201246a..75f5a012a94 100644 --- a/src/third_party/wiredtiger/bench/wtperf/track.c +++ b/src/third_party/wiredtiger/bench/wtperf/track.c @@ -98,6 +98,11 @@ sum_read_ops(CONFIG *cfg) return (sum_ops(cfg, offsetof(CONFIG_THREAD, read))); } uint64_t +sum_truncate_ops(CONFIG *cfg) +{ + return (sum_ops(cfg, offsetof(CONFIG_THREAD, truncate))); +} +uint64_t sum_update_ops(CONFIG *cfg) { return (sum_ops(cfg, offsetof(CONFIG_THREAD, update))); diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c index 1c9ce963c9a..f079d6272d7 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c @@ -50,6 +50,7 @@ static const CONFIG default_cfg = { 0, /* checkpoint operations */ 0, /* insert operations */ 0, /* read operations */ + 0, /* truncate operations */ 0, /* update operations */ 0, /* insert key */ 0, /* checkpoint in progress */ @@ -57,6 +58,8 @@ static const CONFIG default_cfg = { 0, /* notify threads to stop */ 0, /* in warmup phase */ 0, /* total seconds running */ + 0, /* has truncate */ + {NULL, NULL}, /* the truncate queue */ #define OPT_DEFINE_DEFAULT #include "wtperf_opt.i" @@ -100,15 +103,6 @@ get_next_incr(CONFIG *cfg) return (WT_ATOMIC_ADD8(cfg->insert_key, 1)); } -static inline void -generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno) -{ - /* - * Don't change to snprintf, sprintf is faster in some tests. - */ - sprintf(key_buf, "%0*" PRIu64, cfg->key_sz - 1, keyno); -} - static void randomize_value(CONFIG_THREAD *thread, char *value_buf) { @@ -258,6 +252,8 @@ op_name(uint8_t *op) return ("insert_rmw"); case WORKER_READ: return ("read"); + case WORKER_TRUNCATE: + return ("truncate"); case WORKER_UPDATE: return ("update"); default: @@ -389,7 +385,7 @@ worker(void *arg) size_t i; uint64_t next_val, usecs; uint8_t *op, *op_end; - int measure_latency, ret; + int measure_latency, ret, truncated; char *value_buf, *key_buf, *value; char buf[512]; @@ -444,6 +440,11 @@ worker(void *arg) goto err; } + /* Setup for truncate */ + if (thread->workload->truncate != 0) + if ((ret = setup_truncate(cfg, thread, session)) != 0) + goto err; + key_buf = thread->key_buf; value_buf = thread->value_buf; @@ -486,6 +487,10 @@ worker(void *arg) if (wtperf_value_range(cfg) < next_val) continue; break; + case WORKER_TRUNCATE: + /* Required but not used. */ + next_val = wtperf_rand(thread); + break; default: goto err; /* can't happen */ } @@ -502,10 +507,9 @@ worker(void *arg) * is 0, to avoid first time latency spikes. */ measure_latency = - cfg->sample_interval != 0 && trk->ops != 0 && ( - trk->ops % cfg->sample_rate == 0); - if (measure_latency && - (ret = __wt_epoch(NULL, &start)) != 0) { + cfg->sample_interval != 0 && trk != NULL && + trk->ops != 0 && (trk->ops % cfg->sample_rate == 0); + if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) { lprintf(cfg, ret, 0, "Get time call failed"); goto err; } @@ -548,6 +552,18 @@ worker(void *arg) if ((ret = cursor->insert(cursor)) == 0) break; goto op_err; + case WORKER_TRUNCATE: + if ((ret = run_truncate( + cfg, thread, cursor, session, &truncated)) == 0) { + if (truncated) + trk = &thread->truncate; + else + trk = &thread->truncate_sleep; + /* Pause between truncate attempts */ + (void)usleep(1000); + break; + } + goto op_err; case WORKER_UPDATE: if ((ret = cursor->search(cursor)) == 0) { if ((ret = cursor->get_value( @@ -711,16 +727,33 @@ run_mix_schedule(CONFIG *cfg, WORKLOAD *workp) { int64_t pct; - /* Confirm reads, inserts and updates cannot all be zero. */ - if (workp->insert == 0 && workp->read == 0 && workp->update == 0) { + /* Confirm reads, inserts, truncates and updates cannot all be zero. */ + if (workp->insert == 0 && workp->read == 0 && + workp->truncate == 0 && workp->update == 0) { lprintf(cfg, EINVAL, 0, "no operations scheduled"); return (EINVAL); } /* + * Handle truncate first - it's a special case that can't be used in + * a mixed workload. + */ + if (workp->truncate != 0) { + if (workp->insert != 0 || + workp->read != 0 || workp->update != 0) { + lprintf(cfg, EINVAL, 0, + "Can't configure truncate in a mixed workload"); + return (EINVAL); + } + memset(workp->ops, WORKER_TRUNCATE, sizeof(workp->ops)); + return (0); + } + + /* * Check for a simple case where the thread is only doing insert or - * update operations (because the default operation for a job-mix is - * read, the subsequent code works fine if only reads are specified). + * update operations (because the default operation for a + * job-mix is read, the subsequent code works fine if only reads are + * specified). */ if (workp->insert != 0 && workp->read == 0 && workp->update == 0) { memset(workp->ops, @@ -840,10 +873,9 @@ populate_thread(void *arg) cursor = cursors[op % cfg->table_count]; generate_key(cfg, key_buf, op); measure_latency = - cfg->sample_interval != 0 && trk->ops != 0 && ( - trk->ops % cfg->sample_rate == 0); - if (measure_latency && - (ret = __wt_epoch(NULL, &start)) != 0) { + cfg->sample_interval != 0 && + trk->ops != 0 && (trk->ops % cfg->sample_rate == 0); + if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) { lprintf(cfg, ret, 0, "Get time call failed"); goto err; } @@ -961,10 +993,9 @@ populate_async(void *arg) * the time to process by workers. */ measure_latency = - cfg->sample_interval != 0 && trk->ops != 0 && ( - trk->ops % cfg->sample_rate == 0); - if (measure_latency && - (ret = __wt_epoch(NULL, &start)) != 0) { + cfg->sample_interval != 0 && + trk->ops != 0 && (trk->ops % cfg->sample_rate == 0); + if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) { lprintf(cfg, ret, 0, "Get time call failed"); goto err; } @@ -1006,8 +1037,7 @@ populate_async(void *arg) goto err; if (measure_latency) { if ((ret = __wt_epoch(NULL, &stop)) != 0) { - lprintf(cfg, ret, 0, - "Get time call failed"); + lprintf(cfg, ret, 0, "Get time call failed"); goto err; } ++trk->latency_ops; @@ -1246,8 +1276,9 @@ execute_populate(CONFIG *cfg) CONFIG_THREAD *popth; WT_ASYNC_OP *asyncop; size_t i; - uint64_t last_ops, msecs; + uint64_t last_ops, msecs, print_ops_sec; uint32_t interval, tables; + double print_secs; int elapsed, ret; void *(*pfunc)(void *); @@ -1325,10 +1356,22 @@ execute_populate(CONFIG *cfg) lprintf(cfg, 0, 1, "Finished load of %" PRIu32 " items", cfg->icount); msecs = ns_to_ms(WT_TIMEDIFF(stop, start)); + + /* + * This is needed as the divisions will fail if the insert takes no time + * which will only be the case when there is no data to insert. + */ + if (msecs == 0) { + print_secs = 0; + print_ops_sec = 0; + } else { + print_secs = (double)msecs / (double)MSEC_PER_SEC; + print_ops_sec = + (uint64_t)((cfg->icount / msecs) / MSEC_PER_SEC); + } lprintf(cfg, 0, 1, "Load time: %.2f\n" "load ops/sec: %" PRIu64, - (double)msecs / (double)MSEC_PER_SEC, - (uint64_t)((cfg->icount / msecs) / MSEC_PER_SEC)); + print_secs, print_ops_sec); /* * If configured, compact to allow LSM merging to complete. We @@ -1420,16 +1463,19 @@ execute_workload(CONFIG *cfg) { CONFIG_THREAD *threads; WORKLOAD *workp; - uint64_t last_ckpts, last_inserts, last_reads, last_updates; + uint64_t last_ckpts, last_inserts, last_reads, last_truncates; + uint64_t last_updates; uint32_t interval, run_ops, run_time; u_int i; int ret, t_ret; void *(*pfunc)(void *); cfg->insert_key = 0; - cfg->insert_ops = cfg->read_ops = cfg->update_ops = 0; + cfg->insert_ops = cfg->read_ops = cfg->truncate_ops = 0; + cfg->update_ops = 0; - last_ckpts = last_inserts = last_reads = last_updates = 0; + last_ckpts = last_inserts = last_reads = last_truncates = 0; + last_updates = 0; ret = 0; if (cfg->warmup != 0) @@ -1454,9 +1500,9 @@ execute_workload(CONFIG *cfg) workp = cfg->workload; i < cfg->workload_cnt; ++i, ++workp) { lprintf(cfg, 0, 1, "Starting workload #%d: %" PRId64 " threads, inserts=%" - PRId64 ", reads=%" PRId64 ", updates=%" PRId64, - i + 1, - workp->threads, workp->insert, workp->read, workp->update); + PRId64 ", reads=%" PRId64 ", updates=%" PRId64 + ", truncate=%" PRId64, i + 1, workp->threads, workp->insert, + workp->read, workp->update, workp->truncate); /* Figure out the workload's schedule. */ if ((ret = run_mix_schedule(cfg, workp)) != 0) @@ -1496,6 +1542,7 @@ execute_workload(CONFIG *cfg) cfg->insert_ops = sum_insert_ops(cfg); cfg->read_ops = sum_read_ops(cfg); cfg->update_ops = sum_update_ops(cfg); + cfg->truncate_ops = sum_truncate_ops(cfg); /* If we're checking total operations, see if we're done. */ if (run_ops != 0 && run_ops <= @@ -1510,16 +1557,18 @@ execute_workload(CONFIG *cfg) lprintf(cfg, 0, 1, "%" PRIu64 " reads, %" PRIu64 " inserts, %" PRIu64 - " updates, %" PRIu64 " checkpoints in %" PRIu32 - " secs (%" PRIu32 " total secs)", + " updates, %" PRIu64 " truncates, %" PRIu64 + " checkpoints in %" PRIu32 " secs (%" PRIu32 " total secs)", cfg->read_ops - last_reads, cfg->insert_ops - last_inserts, cfg->update_ops - last_updates, + cfg->truncate_ops - last_truncates, cfg->ckpt_ops - last_ckpts, cfg->report_interval, cfg->totalsec); last_reads = cfg->read_ops; last_inserts = cfg->insert_ops; last_updates = cfg->update_ops; + last_truncates = cfg->truncate_ops; last_ckpts = cfg->ckpt_ops; } @@ -1902,6 +1951,7 @@ start_run(CONFIG *cfg) /* One final summation of the operations we've completed. */ cfg->read_ops = sum_read_ops(cfg); cfg->insert_ops = sum_insert_ops(cfg); + cfg->truncate_ops = sum_truncate_ops(cfg); cfg->update_ops = sum_update_ops(cfg); cfg->ckpt_ops = sum_ckpt_ops(cfg); total_ops = cfg->read_ops + cfg->insert_ops + cfg->update_ops; @@ -1917,6 +1967,11 @@ start_run(CONFIG *cfg) cfg->insert_ops, (cfg->insert_ops * 100) / total_ops, cfg->insert_ops / cfg->run_time); lprintf(cfg, 0, 1, + "Executed %" PRIu64 " truncate operations (%" PRIu64 + "%%) %" PRIu64 " ops/sec", + cfg->truncate_ops, (cfg->truncate_ops * 100) / total_ops, + cfg->truncate_ops / cfg->run_time); + lprintf(cfg, 0, 1, "Executed %" PRIu64 " update operations (%" PRIu64 "%%) %" PRIu64 " ops/sec", cfg->update_ops, (cfg->update_ops * 100) / total_ops, @@ -2062,14 +2117,25 @@ main(int argc, char *argv[]) break; } + if (cfg->populate_threads == 0 && cfg->icount != 0) { + lprintf(cfg, 1, 0, + "Cannot have 0 populate threads when icount is set\n"); + goto err; + } + cfg->async_config = NULL; /* * If the user specified async_threads we use async for all ops. * If the user wants compaction, then we also enable async for * the compact operation, but not for the workloads. */ - if (cfg->async_threads > 0) + if (cfg->async_threads > 0) { + if (cfg->has_truncate > 0) { + lprintf(cfg, 1, 0, "Cannot run truncate and async\n"); + goto err; + } cfg->use_asyncops = 1; + } if (cfg->compact && cfg->async_threads == 0) cfg->async_threads = 2; if (cfg->async_threads > 0) { @@ -2091,6 +2157,18 @@ main(int argc, char *argv[]) if ((ret = config_compress(cfg)) != 0) goto err; + /* You can't have truncate on a random collection. */ + if (cfg->has_truncate && cfg->random_range) { + lprintf(cfg, 1, 0, "Cannot run truncate and random_range\n"); + goto err; + } + + /* We can't run truncate with more than one table. */ + if (cfg->has_truncate && cfg->table_count > 1) { + lprintf(cfg, 1, 0, "Cannot truncate more than 1 table\n"); + goto err; + } + /* Build the URI from the table name. */ req_len = strlen("table:") + strlen(HELIUM_NAME) + strlen(cfg->table_name) + 2; @@ -2361,7 +2439,12 @@ wtperf_value_range(CONFIG *cfg) { if (cfg->random_range) return (cfg->icount + cfg->random_range); - + /* + * It is legal to configure a zero size populate phase, hide that + * from other code by pretending the range is 1 in that case. + */ + if (cfg->icount + cfg->insert_key == 0) + return (1); return (cfg->icount + cfg->insert_key - (u_int)(cfg->workers_cnt + 1)); } diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h index 874cdc499b1..58dc65388ae 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.h +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h @@ -26,6 +26,9 @@ * OTHER DEALINGS IN THE SOFTWARE. */ +#ifndef HAVE_WTPERF_H +#define HAVE_WTPERF_H + #ifndef _WIN32 #include <sys/time.h> #endif @@ -90,14 +93,39 @@ typedef struct { int64_t throttle; /* Maximum operations/second */ /* Number of operations per transaction. Zero for autocommit */ int64_t ops_per_txn; + int64_t truncate; /* Truncate ratio */ + uint64_t truncate_pct; /* Truncate Percent */ + uint64_t truncate_count; /* Truncate Count */ #define WORKER_INSERT 1 /* Insert */ #define WORKER_INSERT_RMW 2 /* Insert with read-modify-write */ #define WORKER_READ 3 /* Read */ -#define WORKER_UPDATE 4 /* Update */ +#define WORKER_TRUNCATE 4 /* Truncate */ +#define WORKER_UPDATE 5 /* Update */ uint8_t ops[100]; /* Operation schedule */ } WORKLOAD; +/* Steering items for the truncate workload */ +typedef struct __truncate_struct TRUNCATE_CONFIG; +struct __truncate_struct { + uint64_t stone_gap; + uint64_t needed_stones; + uint64_t final_stone_gap; + uint64_t expected_total; + uint64_t total_inserts; + uint64_t last_total_inserts; + uint64_t num_stones; + uint64_t last_key; +}; + +/* Queue entry for use with the Truncate Logic */ +struct __truncate_queue_entry { + char *key; /* Truncation point */ + uint64_t diff; /* Number of items to be truncated*/ + STAILQ_ENTRY(__truncate_queue_entry) q; +}; +typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY; + #define LOG_PARTIAL_CONFIG ",log=(enabled=false)" /* * NOTE: If you add any fields to this structure here, you must also add @@ -135,6 +163,7 @@ struct __config { /* Configuration structure */ uint64_t ckpt_ops; /* checkpoint operations */ uint64_t insert_ops; /* insert operations */ uint64_t read_ops; /* read operations */ + uint64_t truncate_ops; /* truncate operations */ uint64_t update_ops; /* update operations */ uint64_t insert_key; /* insert key */ @@ -146,6 +175,11 @@ struct __config { /* Configuration structure */ volatile uint32_t totalsec; /* total seconds running */ + u_int has_truncate; /* if there is a truncate workload */ + + /* Queue head for use with the Truncate Logic */ + STAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head; + /* Fields changeable on command line are listed in wtperf_opt.i */ #define OPT_DECLARE_STRUCT #include "wtperf_opt.i" @@ -211,7 +245,7 @@ typedef struct { struct __config_thread { /* Per-thread structure */ CONFIG *cfg; /* Enclosing configuration */ - uint64_t rnd; /* Random number generation state */ + WT_RAND_STATE rnd; /* Random number generation state */ pthread_t handle; /* Handle */ @@ -223,8 +257,13 @@ struct __config_thread { /* Per-thread structure */ TRACK insert; /* Insert operations */ TRACK read; /* Read operations */ TRACK update; /* Update operations */ + TRACK truncate; /* Truncate operations */ + TRACK truncate_sleep; /* Truncate sleep operations */ + TRUNCATE_CONFIG trunc_cfg; /* Truncate configuration */ + }; +void cleanup_truncate_config(CONFIG *); int config_assign(CONFIG *, const CONFIG *); int config_compress(CONFIG *); void config_free(CONFIG *); @@ -238,11 +277,15 @@ void latency_read(CONFIG *, uint32_t *, uint32_t *, uint32_t *); void latency_update(CONFIG *, uint32_t *, uint32_t *, uint32_t *); void latency_print(CONFIG *); int enomem(const CONFIG *); +int run_truncate( + CONFIG *, CONFIG_THREAD *, WT_CURSOR *, WT_SESSION *, int *); int setup_log_file(CONFIG *); +int setup_truncate(CONFIG *, CONFIG_THREAD *, WT_SESSION *); uint64_t sum_ckpt_ops(CONFIG *); uint64_t sum_insert_ops(CONFIG *); uint64_t sum_pop_ops(CONFIG *); uint64_t sum_read_ops(CONFIG *); +uint64_t sum_truncate_ops(CONFIG *); uint64_t sum_update_ops(CONFIG *); void usage(void); @@ -251,3 +294,14 @@ void lprintf(const CONFIG *, int err, uint32_t, const char *, ...) __attribute__((format (printf, 4, 5))) #endif ; + +static inline void +generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno) +{ + /* + * Don't change to snprintf, sprintf is faster in some tests. + */ + sprintf(key_buf, "%0*" PRIu64, cfg->key_sz - 1, keyno); +} + +#endif diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i index 6cb39ac3cc4..7e29aa0f3c2 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i @@ -167,7 +167,8 @@ DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' " "'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' " "which would create 2 threads doing nothing but reads and 8 threads " "each doing 50% inserts and 25% reads and updates. Allowed configuration " - "values are 'count', 'throttle', 'reads', 'inserts', 'updates'. There are " + "values are 'count', 'throttle', 'reads', 'inserts', 'updates', 'truncate'," + " 'truncate_pct' and 'truncate_count'. There are " "also behavior modifiers, supported modifiers are 'ops_per_txn'") DEF_OPT_AS_CONFIG_STRING(transaction_config, "", "transaction configuration string, relevant when populate_opts_per_txn " diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c b/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c new file mode 100644 index 00000000000..0d5d1045e1e --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_truncate.c @@ -0,0 +1,216 @@ +/*- + * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "wtperf.h" + +static inline uint64_t +decode_key(char *key_buf) +{ + return (strtoull(key_buf, NULL, 10)); +} + +int +setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) { + + TRUNCATE_CONFIG *trunc_cfg; + TRUNCATE_QUEUE_ENTRY *truncate_item; + WORKLOAD *workload; + WT_CURSOR *cursor; + char *key, *truncate_key; + int ret; + uint64_t end_point, final_stone_gap, i, start_point; + + end_point = final_stone_gap = start_point = 0; + trunc_cfg = &thread->trunc_cfg; + workload = thread->workload; + + /* We are limited to only one table when running truncate. */ + if ((ret = session->open_cursor( + session, cfg->uris[0], NULL, NULL, &cursor)) != 0) + goto err; + + /* How many entries between each stone. */ + trunc_cfg->stone_gap = + (workload->truncate_count * workload->truncate_pct) / 100; + /* How many stones we need. */ + trunc_cfg->needed_stones = + workload->truncate_count / trunc_cfg->stone_gap; + + final_stone_gap = trunc_cfg->stone_gap; + + /* Reset this value for use again. */ + trunc_cfg->stone_gap = 0; + + /* + * Here we check if there is data in the collection. If there is + * data available, then we need to setup some initial truncation + * stones. + */ + if ((ret = cursor->next(cursor)) != 0 || + (ret = cursor->get_key(cursor, &key)) != 0) { + lprintf(cfg, ret, 0, "truncate setup start: failed"); + goto err; + } + + start_point = decode_key(key); + if ((cursor->reset(cursor)) != 0 || (ret = cursor->prev(cursor)) != 0 || + (ret = cursor->get_key(cursor, &key)) != 0) { + lprintf(cfg, ret, 0, "truncate setup end: failed"); + goto err; + } + end_point = decode_key(key); + + /* Assign stones if there are enough documents. */ + if (start_point + trunc_cfg->needed_stones > end_point) + trunc_cfg->stone_gap = 0; + else + trunc_cfg->stone_gap = + (end_point - start_point) / trunc_cfg->needed_stones; + + /* If we have enough data allocate some stones. */ + if (trunc_cfg->stone_gap != 0) { + trunc_cfg->expected_total = (end_point - start_point); + for (i = 1; i <= trunc_cfg->needed_stones; i++) { + truncate_key = calloc(cfg->key_sz, 1); + if (truncate_key == NULL) { + ret = enomem(cfg); + goto err; + } + truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1); + if (truncate_item == NULL) { + free(truncate_key); + ret = enomem(cfg); + goto err; + } + generate_key( + cfg, truncate_key, trunc_cfg->stone_gap * i); + truncate_item->key = truncate_key; + truncate_item->diff = + (trunc_cfg->stone_gap * i) - trunc_cfg->last_key; + STAILQ_INSERT_TAIL( &cfg->stone_head, truncate_item, q); + trunc_cfg->last_key = trunc_cfg->stone_gap * i; + trunc_cfg->num_stones++; + } + } + trunc_cfg->stone_gap = final_stone_gap; + +err: if ((ret = cursor->close(cursor)) != 0) { + lprintf(cfg, ret, 0, "truncate setup: cursor close failed"); + } + return (ret); +} + +int +run_truncate(CONFIG *cfg, CONFIG_THREAD *thread, + WT_CURSOR *cursor, WT_SESSION *session, int *truncatedp) { + + TRUNCATE_CONFIG *trunc_cfg; + TRUNCATE_QUEUE_ENTRY *truncate_item; + char *truncate_key; + int ret, t_ret; + + ret = 0; + trunc_cfg = &thread->trunc_cfg; + + *truncatedp = 0; + /* Update the total inserts */ + trunc_cfg->total_inserts = sum_insert_ops(cfg); + trunc_cfg->expected_total += + (trunc_cfg->total_inserts - trunc_cfg->last_total_inserts); + trunc_cfg->last_total_inserts = trunc_cfg->total_inserts; + + /* We are done if there isn't enough data to trigger a new milestone. */ + if (trunc_cfg->expected_total <= trunc_cfg->needed_stones) + return (0); + + while (trunc_cfg->num_stones < trunc_cfg->needed_stones) { + trunc_cfg->last_key += trunc_cfg->stone_gap; + truncate_key = calloc(cfg->key_sz, 1); + if (truncate_key == NULL) { + lprintf(cfg, ENOMEM, 0, + "truncate: couldn't allocate key array"); + return (ENOMEM); + } + truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1); + if (truncate_item == NULL) { + free(truncate_key); + lprintf(cfg, ENOMEM, 0, + "truncate: couldn't allocate item"); + return (ENOMEM); + } + generate_key(cfg, truncate_key, trunc_cfg->last_key); + truncate_item->key = truncate_key; + truncate_item->diff = trunc_cfg->stone_gap; + STAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q); + trunc_cfg->num_stones++; + } + + /* We are done if there isn't enough data to trigger a truncate. */ + if (trunc_cfg->num_stones == 0 || + trunc_cfg->expected_total <= thread->workload->truncate_count) + return (0); + + truncate_item = STAILQ_FIRST(&cfg->stone_head); + trunc_cfg->num_stones--; + STAILQ_REMOVE_HEAD(&cfg->stone_head, q); + cursor->set_key(cursor,truncate_item->key); + if ((ret = cursor->search(cursor)) != 0) { + lprintf(cfg, ret, 0, "Truncate search: failed"); + goto err; + } + + if ((ret = session->truncate(session, NULL, NULL, cursor, NULL)) != 0) { + lprintf(cfg, ret, 0, "Truncate: failed"); + goto err; + } + + + *truncatedp = 1; + trunc_cfg->expected_total -= truncate_item->diff; + +err: free(truncate_item->key); + free(truncate_item); + t_ret = cursor->reset(cursor); + if (t_ret != 0) + lprintf(cfg, t_ret, 0, "Cursor reset failed"); + if (ret == 0 && t_ret != 0) + ret = t_ret; + return (ret); +} + +void +cleanup_truncate_config(CONFIG *cfg) { + TRUNCATE_QUEUE_ENTRY *truncate_item; + + while (!STAILQ_EMPTY(&cfg->stone_head)) { + truncate_item = STAILQ_FIRST(&cfg->stone_head); + STAILQ_REMOVE_HEAD(&cfg->stone_head, q); + free(truncate_item->key); + free(truncate_item); + } +} diff --git a/src/third_party/wiredtiger/dist/api_err.py b/src/third_party/wiredtiger/dist/api_err.py index d39f076656f..936c7bb11a7 100644 --- a/src/third_party/wiredtiger/dist/api_err.py +++ b/src/third_party/wiredtiger/dist/api_err.py @@ -100,7 +100,7 @@ tfile.write('''/* DO NOT EDIT: automatically built by dist/api_err.py. */ /* * __wt_wiredtiger_error -- - *\tReturn a constant string for WiredTiger POSIX-standard and errors. + *\tReturn a constant string for POSIX-standard and WiredTiger errors. */ const char * __wt_wiredtiger_error(int error) @@ -119,8 +119,8 @@ for err in errors: tfile.write('''\t} \t/* -\t * POSIX errors are non-negative integers; check for 0 explicitly -\t * in-case the underlying strerror doesn't handle 0, some don't. +\t * POSIX errors are non-negative integers; check for 0 explicitly incase +\t * the underlying strerror doesn't handle 0, some historically didn't. \t */ \tif (error == 0) \t\treturn ("Successful return: 0"); diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index 623a34447a8..a9ae2a10006 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -21,8 +21,8 @@ WT_ATOMIC_ADD2 WT_ATOMIC_CAS1 WT_ATOMIC_CAS2 WT_ATOMIC_FETCH_ADD1 -WT_ATOMIC_FETCH_ADD2 WT_ATOMIC_FETCH_ADD4 +WT_ATOMIC_FETCH_ADD8 WT_ATOMIC_STORE1 WT_ATOMIC_STORE2 WT_ATOMIC_SUB1 diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 1ed92b79ba8..a104bb011da 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -609,6 +609,7 @@ idx ifdef's ikey impl +incase incr incrementing indices @@ -743,6 +744,7 @@ nop noraw notfound notsup +notused nset nsnap nul diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index caf68364696..77061b36dcb 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -208,10 +208,12 @@ connection_stats = [ ########################################## # Dhandle statistics ########################################## - DhandleStat('dh_conn_handles', 'connection dhandles swept'), - DhandleStat('dh_conn_ref', 'connection candidate referenced'), - DhandleStat('dh_conn_sweeps', 'connection sweeps'), - DhandleStat('dh_conn_tod', 'connection time-of-death sets'), + DhandleStat('dh_sweep_close', 'connection sweep dhandles closed'), + DhandleStat('dh_sweep_remove', + 'connection sweep dhandles removed from hash list'), + DhandleStat('dh_sweep_ref', 'connection sweep candidate became referenced'), + DhandleStat('dh_sweep_tod', 'connection sweep time-of-death sets'), + DhandleStat('dh_sweeps', 'connection sweeps'), DhandleStat('dh_session_handles', 'session dhandles swept'), DhandleStat('dh_session_sweeps', 'session sweep attempts'), diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index c88c44fb9c3..abcad392e33 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -403,7 +403,8 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) */ if (block->ckpt_inprogress) { __wt_errx(session, - "%s: checkpointed without the checkpoint being resolved", + "%s: checkpointed without first resolving the previous " + "checkpoint", block->name); WT_RET(__wt_block_checkpoint_resolve(session, block)); diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index df42a14816f..fd00e0c7deb 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -132,8 +132,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block) bucket = block->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_BLOCK_REMOVE(conn, block, bucket); - if (block->name != NULL) - __wt_free(session, block->name); + __wt_free(session, block->name); if (block->fh != NULL) WT_TRET(__wt_close(session, &block->fh)); @@ -195,14 +194,20 @@ __wt_block_open(WT_SESSION_IMPL *session, } } - /* Basic structure allocation, initialization. */ + /* + * Basic structure allocation, initialization. + * + * Note: set the block's name-hash value before any work that can fail + * because cleanup calls the block destroy code which uses that hash + * value to remove the block from the underlying linked lists. + */ WT_ERR(__wt_calloc_one(session, &block)); block->ref = 1; + block->name_hash = hash; + block->allocsize = allocsize; WT_CONN_BLOCK_INSERT(conn, block, bucket); WT_ERR(__wt_strdup(session, filename, &block->name)); - block->name_hash = hash; - block->allocsize = allocsize; WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval)); block->allocfirst = diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c index ef944fcb152..0d631396b41 100644 --- a/src/third_party/wiredtiger/src/block/block_read.c +++ b/src/third_party/wiredtiger/src/block/block_read.c @@ -192,21 +192,29 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, buf->size = size; blk = WT_BLOCK_HEADER_REF(buf->mem); - page_cksum = blk->cksum; - if (page_cksum == cksum) { + if (blk->cksum == cksum) { blk->cksum = 0; page_cksum = __wt_cksum(buf->mem, F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP); if (page_cksum == cksum) return (0); - } - if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) - __wt_errx(session, - "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %" - PRIu32 " != %" PRIu32 "]", - size, (uintmax_t)offset, cksum, page_cksum); + if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) + __wt_errx(session, + "read checksum error for %" PRIu32 "B block at " + "offset %" PRIuMAX ": calculated block checksum " + "of %" PRIu32 " doesn't match expected checksum " + "of %" PRIu32, + size, (uintmax_t)offset, page_cksum, cksum); + } else + if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) + __wt_errx(session, + "read checksum error for %" PRIu32 "B block at " + "offset %" PRIuMAX ": block header checksum " + "of %" PRIu32 " doesn't match expected checksum " + "of %" PRIu32, + size, (uintmax_t)offset, blk->cksum, cksum); /* Panic if a checksum fails during an ordinary read. */ return (block->verify || diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index f257a955801..2705f371fb5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -81,10 +81,11 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; - int prev, skip; + int empty_internal, prev, skip; uint32_t slot; btree = S2BT(session); + empty_internal = 0; /* * Tree walks are special: they look inside page structures that splits @@ -171,6 +172,15 @@ ascend: /* (!prev && slot == pindex->entries - 1)) { ref = ref->home->pg_intl_parent_ref; + /* + * If we got all the way through an internal page and + * all of the child pages were deleted, evict it. + */ + if (empty_internal) { + __wt_page_evict_soon(ref->page); + empty_internal = 0; + } + /* Optionally skip internal pages. */ if (LF_ISSET(WT_READ_SKIP_INTL)) goto ascend; @@ -226,6 +236,13 @@ ascend: /* if (ref->pindex_hint != slot) ref->pindex_hint = slot; + /* + * If we see any child states other than deleted, the + * page isn't empty. + */ + if (ref->state != WT_REF_DELETED) + empty_internal = 0; + if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: @@ -338,10 +355,10 @@ ascend: /* */ descend: couple = ref; page = ref->page; - if (page->type == WT_PAGE_ROW_INT || - page->type == WT_PAGE_COL_INT) { + if (WT_PAGE_IS_INTERNAL(page)) { WT_INTL_INDEX_GET(session, page, pindex); slot = prev ? pindex->entries - 1 : 0; + empty_internal = 1; } else { *refp = ref; goto done; diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index 2fe09681090..fb7c9a1ce90 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -160,7 +160,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ - if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0) + if (cbt->ins_stack[0] == NULL || recno == 0) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index f0a10cdf528..3331632b725 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -192,7 +192,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ - if (WT_SKIP_FIRST(ins_head) == NULL) + if (cbt->ins_stack[0] == NULL) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; @@ -263,7 +263,6 @@ int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep) { - WT_UPDATE *upd; size_t size; /* @@ -271,16 +270,15 @@ __wt_update_alloc( * the value into place. */ size = value == NULL ? 0 : value->size; - WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd)); + WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, updp)); if (value == NULL) - WT_UPDATE_DELETED_SET(upd); + WT_UPDATE_DELETED_SET(*updp); else { - upd->size = WT_STORE_SIZE(size); - memcpy(WT_UPDATE_DATA(upd), value->data, size); + (*updp)->size = WT_STORE_SIZE(size); + memcpy(WT_UPDATE_DATA(*updp), value->data, size); } - *updp = upd; - *sizep = WT_UPDATE_MEMSIZE(upd); + *sizep = WT_UPDATE_MEMSIZE(*updp); return (0); } diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 9803b924355..d83d3253c44 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -471,6 +471,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *current, *descent; + uint32_t cnt; btree = S2BT(session); @@ -528,18 +529,22 @@ restart: /* * If the tree is new (and not empty), it might have a large insert - * list, pick the key in the middle of that insert list. + * list. Count how many records are in the list. */ F_SET(cbt, WT_CBT_SEARCH_SMALLEST); if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) WT_ERR(WT_NOTFOUND); - for (p = t = WT_SKIP_FIRST(cbt->ins_head);;) { + for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt) if ((p = WT_SKIP_NEXT(p)) == NULL) break; - if ((p = WT_SKIP_NEXT(p)) == NULL) + + /* + * Select a random number from 0 to (N - 1), return that record. + */ + cnt = __wt_random(&session->rnd) % cnt; + for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p) + if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL) break; - t = WT_SKIP_NEXT(t); - } cbt->ref = current; cbt->compare = 0; cbt->ins = t; diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c index e41e402a1fd..92f12402537 100644 --- a/src/third_party/wiredtiger/src/conn/api_strerror.c +++ b/src/third_party/wiredtiger/src/conn/api_strerror.c @@ -13,7 +13,7 @@ /* * __wt_wiredtiger_error -- - * Return a constant string for WiredTiger POSIX-standard and errors. + * Return a constant string for POSIX-standard and WiredTiger errors. */ const char * __wt_wiredtiger_error(int error) @@ -41,8 +41,8 @@ __wt_wiredtiger_error(int error) } /* - * POSIX errors are non-negative integers; check for 0 explicitly - * in-case the underlying strerror doesn't handle 0, some don't. + * POSIX errors are non-negative integers; check for 0 explicitly incase + * the underlying strerror doesn't handle 0, some historically didn't. */ if (error == 0) return ("Successful return: 0"); diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 067ad00560e..bc96ddd117a 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1904,7 +1904,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, conn->hazard_max = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval)); - conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS; + conn->session_size = (uint32_t)cval.val + WT_EXTRA_INTERNAL_SESSIONS; WT_ERR(__wt_config_gets(session, cfg, "session_scratch_max", &cval)); conn->session_scratch_max = (size_t)cval.val; diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 8ff54ec2a6d..1ea609f6578 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -591,6 +591,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, int final) bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, dhandle != conn->cache->evict_file_next); /* Check if the handle was reacquired by a session while we waited. */ if (!final && diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index de4bf7268ed..dae0293d790 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -500,7 +500,7 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) WT_RET(__wt_log_slot_free(session, slot)); if (free_i != NULL && *free_i == WT_SLOT_POOL && slot->slot_state == WT_LOG_SLOT_FREE) - *free_i = save_i; + *free_i = written[i].slot_index; } } return (0); diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index c4350d90adb..3f3808579a9 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -128,7 +128,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * conditional because we allocate the log path so that printlog can * run without running logging or recovery. */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && + FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE)) WT_TRET(__wt_txn_checkpoint_log( session, 1, WT_TXN_LOG_CKPT_STOP, NULL)); F_CLR(conn, WT_CONN_LOG_SERVER_RUN); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index ec6f628a02e..492b89bb8a8 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -14,49 +14,44 @@ * handles. */ static int -__sweep_mark(WT_SESSION_IMPL *session, int *dead_handlesp) +__sweep_mark(WT_SESSION_IMPL *session, time_t now) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; - time_t now; conn = S2C(session); - *dead_handlesp = 0; - /* Don't discard handles that have been open recently. */ - WT_RET(__wt_seconds(session, &now)); - - WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps); SLIST_FOREACH(dhandle, &conn->dhlh, l) { if (WT_IS_METADATA(dhandle)) continue; - if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { - ++*dead_handlesp; - continue; - } - if (dhandle->session_inuse != 0 || - now <= dhandle->timeofdeath + conn->sweep_idle_time || - conn->sweep_idle_time == 0) - continue; - if (dhandle->timeofdeath == 0) { - dhandle->timeofdeath = now; - WT_STAT_FAST_CONN_INCR(session, dh_conn_tod); + + /* + * There are some internal increments of the in-use count such + * as eviction. Don't keep handles alive because of those + * cases, but if we see multiple cursors open, clear the time + * of death. + */ + if (dhandle->session_inuse > 1) + dhandle->timeofdeath = 0; + + if (F_ISSET(dhandle, WT_DHANDLE_DEAD) || + dhandle->session_inuse != 0 || + dhandle->timeofdeath != 0) continue; - } - /* We now have a candidate to close. */ - ++*dead_handlesp; + dhandle->timeofdeath = now; + WT_STAT_FAST_CONN_INCR(session, dh_sweep_tod); } return (0); } /* - * __sweep_expire_handle -- + * __sweep_expire_one -- * Mark a single handle dead. */ static int -__sweep_expire_handle(WT_SESSION_IMPL *session) +__sweep_expire_one(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; @@ -113,42 +108,31 @@ err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); * until we have reached the configured minimum number of handles. */ static int -__sweep_expire(WT_SESSION_IMPL *session) +__sweep_expire(WT_SESSION_IMPL *session, time_t now) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - time_t now; conn = S2C(session); - /* If sweep_idle_time is 0, then we won't expire any cursors */ - if (conn->sweep_idle_time == 0) - return (0); - - /* Don't discard handles that have been open recently. */ - WT_RET(__wt_seconds(session, &now)); - - WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps); SLIST_FOREACH(dhandle, &conn->dhlh, l) { /* - * Ignore open files once the open file count reaches the + * Ignore open files once the btree file count is below the * minimum number of handles. */ - if (conn->open_file_count < conn->sweep_handles_min) + if (conn->open_btree_count < conn->sweep_handles_min) break; - if (WT_IS_METADATA(dhandle)) - continue; - if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || - F_ISSET(dhandle, WT_DHANDLE_DEAD)) - continue; - if (dhandle->session_inuse != 0 || + if (WT_IS_METADATA(dhandle) || + F_ISSET(dhandle, WT_DHANDLE_DEAD) || + dhandle->session_inuse != 0 || + dhandle->timeofdeath == 0 || now <= dhandle->timeofdeath + conn->sweep_idle_time) continue; WT_WITH_DHANDLE(session, dhandle, - ret = __sweep_expire_handle(session)); + ret = __sweep_expire_one(session)); WT_RET_BUSY_OK(ret); } @@ -156,11 +140,12 @@ __sweep_expire(WT_SESSION_IMPL *session) } /* - * __sweep_flush -- - * Flush pages from dead trees. + * __sweep_discard_trees -- + * Discard pages from dead trees. */ static int -__sweep_flush(WT_SESSION_IMPL *session) +__sweep_discard_trees( + WT_SESSION_IMPL *session, time_t now, u_int *dead_handlesp) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; @@ -168,8 +153,14 @@ __sweep_flush(WT_SESSION_IMPL *session) conn = S2C(session); - WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps); + *dead_handlesp = 0; + SLIST_FOREACH(dhandle, &conn->dhlh, l) { + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN | WT_DHANDLE_EXCLUSIVE) && + (dhandle->timeofdiscard == 0 || + now <= dhandle->timeofdiscard + conn->sweep_idle_time)) + ++*dead_handlesp; + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || !F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; @@ -178,9 +169,12 @@ __sweep_flush(WT_SESSION_IMPL *session) WT_WITH_DHANDLE(session, dhandle, ret = __wt_conn_btree_sync_and_close(session, 0, 0)); - /* We closed the btree handle, bump the statistic. */ - if (ret == 0) - WT_STAT_FAST_CONN_INCR(session, dh_conn_handles); + /* We closed the btree handle. */ + if (ret == 0) { + WT_STAT_FAST_CONN_INCR(session, dh_sweep_close); + ++*dead_handlesp; + } else + WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref); WT_RET_BUSY_OK(ret); } @@ -189,52 +183,75 @@ __sweep_flush(WT_SESSION_IMPL *session) } /* + * __sweep_remove_one -- + * Remove a closed handle from the connection list. + */ +static int +__sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) +{ + WT_DECL_RET; + + /* Try to get exclusive access. */ + WT_RET(__wt_try_writelock(session, dhandle->rwlock)); + + /* + * If there are no longer any references to the handle in any + * sessions, attempt to discard it. + */ + if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN) || + dhandle->session_inuse != 0 || dhandle->session_ref != 0) + WT_ERR(EBUSY); + + WT_WITH_DHANDLE(session, dhandle, + ret = __wt_conn_dhandle_discard_single(session, 0, 1)); + + /* + * If the handle was not successfully discarded, unlock it and + * don't retry the discard until it times out again. + */ + if (ret != 0) { +err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); + } + + return (ret); +} + +/* * __sweep_remove_handles -- - * Remove closed dhandles from the connection list. + * Remove closed handles from the connection list. */ static int -__sweep_remove_handles(WT_SESSION_IMPL *session) +__sweep_remove_handles(WT_SESSION_IMPL *session, time_t now) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle, *dhandle_next; WT_DECL_RET; conn = S2C(session); - dhandle = SLIST_FIRST(&conn->dhlh); - for (; dhandle != NULL; dhandle = dhandle_next) { + for (dhandle = SLIST_FIRST(&conn->dhlh); + dhandle != NULL; + dhandle = dhandle_next) { dhandle_next = SLIST_NEXT(dhandle, l); if (WT_IS_METADATA(dhandle)) continue; - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) || - dhandle->session_inuse != 0 || - dhandle->session_ref != 0) + if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN) || + dhandle->session_inuse != 0 || dhandle->session_ref != 0) continue; - - /* Make sure we get exclusive access. */ - if ((ret = - __wt_try_writelock(session, dhandle->rwlock)) == EBUSY) + if (dhandle->timeofdiscard != 0 && + now <= dhandle->timeofdiscard + conn->sweep_idle_time) continue; - WT_RET(ret); - /* - * If there are no longer any references to the handle in any - * sessions, attempt to discard it. - */ - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) || - dhandle->session_inuse != 0 || dhandle->session_ref != 0) { - WT_RET(__wt_writeunlock(session, dhandle->rwlock)); - continue; + WT_WITH_HANDLE_LIST_LOCK(session, + ret = __sweep_remove_one(session, dhandle)); + if (ret == 0) + WT_STAT_FAST_CONN_INCR( + session, dh_sweep_remove); + else { + WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref); + dhandle->timeofdiscard = now; } - - WT_WITH_DHANDLE(session, dhandle, - ret = __wt_conn_dhandle_discard_single(session, 0, 1)); - - /* If the handle was not successfully discarded, unlock it. */ - if (ret != 0) - WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); WT_RET_BUSY_OK(ret); - WT_STAT_FAST_CONN_INCR(session, dh_conn_ref); } return (ret == EBUSY ? 0 : ret); @@ -250,7 +267,8 @@ __sweep_server(void *arg) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; - int dead_handles; + time_t now; + u_int dead_handles; session = arg; conn = S2C(session); @@ -263,35 +281,30 @@ __sweep_server(void *arg) /* Wait until the next event. */ WT_ERR(__wt_cond_wait(session, conn->sweep_cond, (uint64_t)conn->sweep_interval * WT_MILLION)); + WT_ERR(__wt_seconds(session, &now)); + + WT_STAT_FAST_CONN_INCR(session, dh_sweeps); /* * Mark handles with a time of death, and report whether any - * handles are marked dead. + * handles are marked dead. If sweep_idle_time is 0, handles + * never become idle. */ - WT_ERR(__sweep_mark(session, &dead_handles)); + if (conn->sweep_idle_time != 0) + WT_ERR(__sweep_mark(session, now)); /* - * We only want to flush and expire if there are no dead handles - * and if either the sweep_idle_time is not 0, or if we have - * reached the configured limit of handles. + * Close handles if we have reached the configured limit. + * If sweep_idle_time is 0, handles never become idle. */ - if (dead_handles == 0 && - (conn->open_file_count < conn->sweep_handles_min || - conn->sweep_idle_time != 0)) - continue; + if (conn->sweep_idle_time != 0 && + conn->open_btree_count >= conn->sweep_handles_min) + WT_ERR(__sweep_expire(session, now)); - /* Close handles if we have reached the configured limit */ - if (conn->open_file_count >= conn->sweep_handles_min) { - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __sweep_expire(session)); - WT_ERR(ret); - } - - WT_ERR(__sweep_flush(session)); + WT_ERR(__sweep_discard_trees(session, now, &dead_handles)); - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __sweep_remove_handles(session)); - WT_ERR(ret); + if (dead_handles > 0) + WT_ERR(__sweep_remove_handles(session, now)); } if (0) { diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c index 6f4d5e85f5a..7dad85e9d38 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_index.c +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -427,7 +427,11 @@ __wt_curindex_open(WT_SESSION_IMPL *session, else namesize = (size_t)(columns - idxname); - WT_RET(__wt_schema_open_index(session, table, idxname, namesize, &idx)); + if ((ret = __wt_schema_open_index( + session, table, idxname, namesize, &idx)) != 0) { + __wt_schema_release_table(session, table); + return (ret); + } WT_RET(__wt_calloc_one(session, &cindex)); cursor = &cindex->iface; diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 513da401ae6..a03d8b9147d 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -158,7 +158,6 @@ __evict_server(void *arg) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_EVICT_WORKER *worker; WT_SESSION_IMPL *session; session = arg; @@ -173,30 +172,6 @@ __evict_server(void *arg) break; /* - * If we have caught up and there are more than the minimum - * number of eviction workers running, shut one down. - */ - if (conn->evict_workers > conn->evict_workers_min) { - WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER, - "Stopping evict worker: %"PRIu32"\n", - conn->evict_workers)); - worker = &conn->evict_workctx[--conn->evict_workers]; - F_CLR(worker, WT_EVICT_WORKER_RUN); - WT_TRET(__wt_cond_signal( - session, cache->evict_waiter_cond)); - WT_TRET(__wt_thread_join(session, worker->tid)); - /* - * Flag errors here with a message, but don't shut down - * the eviction server - that's fatal. - */ - WT_ASSERT(session, ret == 0); - if (ret != 0) { - (void)__wt_msg(session, - "Error stopping eviction worker: %d", ret); - ret = 0; - } - } - /* * Clear the walks so we don't pin pages while asleep, * otherwise we can block applications evicting large pages. */ @@ -571,9 +546,14 @@ static int __evict_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; + WT_CACHE *cache; WT_REF *ref; btree = S2BT(session); + cache = S2C(session)->cache; + + if (session->dhandle == cache->evict_file_next) + cache->evict_file_next = NULL; if ((ref = btree->evict_ref) == NULL) return (0); @@ -593,21 +573,17 @@ __evict_clear_walk(WT_SESSION_IMPL *session) static int __evict_clear_walks(WT_SESSION_IMPL *session) { - WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *s; u_int i, session_cnt; conn = S2C(session); - cache = conn->cache; WT_ORDERED_READ(session_cnt, conn->session_cnt); for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) { if (!s->active || !F_ISSET(s, WT_SESSION_CLEAR_EVICT_WALK)) continue; - if (s->dhandle == cache->evict_file_next) - cache->evict_file_next = NULL; WT_WITH_DHANDLE( session, s->dhandle, WT_TRET(__evict_clear_walk(session))); } @@ -631,7 +607,8 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session) F_SET(session, WT_SESSION_CLEAR_EVICT_WALK); - while (btree->evict_ref != NULL && ret == 0) { + while (ret == 0 && (btree->evict_ref != NULL || + cache->evict_file_next == session->dhandle)) { F_SET(cache, WT_CACHE_CLEAR_WALKS); ret = __wt_cond_wait( session, cache->evict_waiter_cond, 100000); @@ -982,9 +959,17 @@ retry: while (slot < max_entries && ret == 0) { dhandle_locked = 1; } - if (dhandle == NULL) - dhandle = SLIST_FIRST(&conn->dhlh); - else { + if (dhandle == NULL) { + /* + * On entry, continue from wherever we got to in the + * scan last time through. If we don't have a saved + * handle, start from the beginning of the list. + */ + if ((dhandle = cache->evict_file_next) != NULL) + cache->evict_file_next = NULL; + else + dhandle = SLIST_FIRST(&conn->dhlh); + } else { if (incr) { WT_ASSERT(session, dhandle->session_inuse > 0); (void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1); @@ -1002,15 +987,6 @@ retry: while (slot < max_entries && ret == 0) { !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - /* - * Each time we reenter this function, start at the next handle - * on the list. - */ - if (cache->evict_file_next != NULL && - cache->evict_file_next != dhandle) - continue; - cache->evict_file_next = NULL; - /* Skip files that don't allow eviction. */ btree = dhandle->handle; if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) @@ -1071,6 +1047,9 @@ retry: while (slot < max_entries && ret == 0) { } if (incr) { + /* Remember the file we should visit first, next loop. */ + cache->evict_file_next = dhandle; + WT_ASSERT(session, dhandle->session_inuse > 0); (void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1); incr = 0; @@ -1084,21 +1063,17 @@ retry: while (slot < max_entries && ret == 0) { /* * Walk the list of files a few times if we don't find enough pages. * Try two passes through all the files, give up when we have some - * candidates and we aren't finding more. Take care not to skip files - * on subsequent passes. + * candidates and we aren't finding more. */ if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 && slot < max_entries && (retries < 2 || (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 && (slot == cache->evict_entries || slot > start_slot)))) { - cache->evict_file_next = NULL; start_slot = slot; ++retries; goto retry; } - /* Remember the file we should visit first, next loop. */ - cache->evict_file_next = dhandle; cache->evict_entries = slot; return (ret); } @@ -1270,6 +1245,9 @@ fast: /* If the page can't be evicted, give up. */ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" PRIu64, page, page->memory_footprint)); } + WT_RET_NOTFOUND_OK(ret); + + *slotp += (u_int)(evict - start); /* * If we happen to end up on the root page, clear it. We have to track @@ -1282,16 +1260,12 @@ fast: /* If the page can't be evicted, give up. */ if ((ref = btree->evict_ref) != NULL && (__wt_ref_is_root(ref) || ref->page->read_gen == WT_READGEN_OLDEST)) { btree->evict_ref = NULL; - __wt_page_release(session, ref, WT_READ_NO_EVICT); + WT_RET(__wt_page_release(session, ref, WT_READ_NO_EVICT)); } - /* If the walk was interrupted by a locked page, that's okay. */ - if (ret == WT_NOTFOUND) - ret = 0; - - *slotp += (u_int)(evict - start); WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked); - return (ret); + + return (0); } /* diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index d13ec1972fb..da014a14e35 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1027,29 +1027,6 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, } /* - * If we aren't (potentially) doing eviction that can restore updates - * and the updates on this page are too recent, give up. - * - * Don't rely on new updates being skipped by the transaction used - * for transaction reads: (1) there are paths that dirty pages for - * artificial reasons; (2) internal pages aren't transactional; and - * (3) if an update was skipped during the checkpoint (leaving the page - * dirty), then rolled back, we could still successfully overwrite a - * page and corrupt the checkpoint. - * - * Further, we can't race with the checkpoint's reconciliation of - * an internal page as we evict a clean child from the page's subtree. - * This works in the usual way: eviction locks the page and then checks - * for existing hazard pointers, the checkpoint thread reconciling an - * internal page acquires hazard pointers on child pages it reads, and - * is blocked by the exclusive lock. - */ - if (page->read_gen != WT_READGEN_OLDEST && - !__wt_txn_visible_all(session, __wt_page_is_modified(page) ? - mod->update_txn : mod->rec_max_txn)) - return (0); - - /* * If the page was recently split in-memory, don't force it out: we * hope an eviction thread will find it first. The check here is * similar to __wt_txn_visible_all, but ignores the checkpoint's diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index cd55aadfc07..06a020b80e8 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -100,10 +100,10 @@ struct __wt_named_extractor { }; /* - * Allocate some additional slots for internal sessions. There is a default - * session for each connection, plus a session for each server thread. + * Allocate some additional slots for internal sessions so the user cannot + * configure too few sessions for us to run. */ -#define WT_NUM_INTERNAL_SESSIONS 10 +#define WT_EXTRA_INTERNAL_SESSIONS 10 /* * WT_CONN_CHECK_PANIC -- @@ -325,7 +325,8 @@ struct __wt_connection_impl { #define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */ #define WT_CONN_LOG_EXISTED 0x04 /* Log files found */ #define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */ -#define WT_CONN_LOG_RECOVER_ERR 0x10 /* Error if recovery required */ +#define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */ +#define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */ uint32_t log_flags; /* Global logging configuration */ WT_CONDVAR *log_cond; /* Log server wait mutex */ WT_SESSION_IMPL *log_session; /* Log server session */ diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 9e592ede450..143a8e87449 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -187,6 +187,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter) if (reenter) WT_RET(__curfile_leave(cbt)); + /* + * Any old insert position is now invalid. We rely on this being + * cleared to detect if a new skiplist is installed after a search. + */ + cbt->ins_stack[0] = NULL; + /* If the transaction is idle, check that the cache isn't full. */ WT_RET(__wt_txn_idle_cache_check(session)); diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h index 22a0a2c1dd4..8bb649513c7 100644 --- a/src/third_party/wiredtiger/src/include/dhandle.h +++ b/src/third_party/wiredtiger/src/include/dhandle.h @@ -45,6 +45,7 @@ struct __wt_data_handle { uint32_t session_ref; /* Sessions referencing this handle */ int32_t session_inuse; /* Sessions using this handle */ time_t timeofdeath; /* Use count went to 0 */ + time_t timeofdiscard; /* Time of last failed discard */ uint64_t name_hash; /* Hash of name */ const char *name; /* Object name as a URI */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 0826fa7b10b..e98545c3466 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -640,8 +640,8 @@ extern uint32_t __wt_nlpo2(uint32_t v); extern uint32_t __wt_log2_int(uint32_t n); extern int __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); -extern void __wt_random_init(uint64_t volatile *rnd_state); -extern uint32_t __wt_random(uint64_t volatile *rnd_state); +extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state); +extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index 95c43f6772d..7fb6ae13d38 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -255,3 +255,11 @@ #define __wt_page_swap(session, held, want, flags) \ __wt_page_swap_func(session, held, want, flags) #endif + +/* Random number generator state. */ +union __wt_rand_state { + uint64_t v; + struct { + uint32_t w, z; + } x; +}; diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h index 7a5028d6a28..7d901a38d0d 100644 --- a/src/third_party/wiredtiger/src/include/mutex.h +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -24,24 +24,20 @@ struct __wt_condvar { /* * !!! - * Don't touch this structure without understanding the read/write - * locking functions. + * Don't modify this structure without understanding the read/write locking + * functions. */ -typedef union { /* Read/write lock */ -#ifdef WORDS_BIGENDIAN - WiredTiger read/write locks require modification for big-endian systems. -#else +typedef union { /* Read/write lock */ uint64_t u; struct { - uint32_t us; + uint32_t wr; /* Writers and readers */ } i; struct { - uint16_t writers; - uint16_t readers; - uint16_t users; - uint16_t pad; + uint16_t writers; /* Now serving for writers */ + uint16_t readers; /* Now serving for readers */ + uint16_t users; /* Next available ticket number */ + uint16_t __notused; /* Padding */ } s; -#endif } wt_rwlock_t; /* diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h index ba5d95657d5..edb59b0f521 100644 --- a/src/third_party/wiredtiger/src/include/os.h +++ b/src/third_party/wiredtiger/src/include/os.h @@ -56,7 +56,7 @@ typedef enum { case EMFILE: \ case ENFILE: \ case ENOSPC: \ - __wt_sleep(0L, 500000L); \ + __wt_sleep(0L, 50000L); \ continue; \ default: \ break; \ diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index 9e6b0f7916c..0fc23348800 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -30,11 +30,11 @@ __page_write_gen_wrapped_check(WT_PAGE *page) } /* - * __insert_serial_func -- - * Worker function to add a WT_INSERT entry to a skiplist. + * __insert_simple_func -- + * Worker function to add a WT_INSERT entry to the middle of a skiplist. */ static inline int -__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, +__insert_simple_func(WT_SESSION_IMPL *session, WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth) { u_int i; @@ -42,31 +42,62 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_UNUSED(session); /* - * Confirm we are still in the expected position, and no item has been - * added where our insert belongs. Take extra care at the beginning - * and end of the list (at each level): retry if we race there. + * Update the skiplist elements referencing the new WT_INSERT item. + * If we fail connecting one of the upper levels in the skiplist, + * return success: the levels we updated are correct and sufficient. + * Even though we don't get the benefit of the memory we allocated, + * we can't roll back. * - * !!! - * Note the test for ins_stack[0] == NULL: that's the test for an - * uninitialized cursor, ins_stack[0] is cleared as part of - * initializing a cursor for a search. + * All structure setup must be flushed before the structure is entered + * into the list. We need a write barrier here, our callers depend on + * it. Don't pass complex arguments to the macro, some implementations + * read the old value multiple times. */ for (i = 0; i < skipdepth; i++) { - if (ins_stack[i] == NULL || - *ins_stack[i] != new_ins->next[i]) - return (WT_RESTART); - if (new_ins->next[i] == NULL && - ins_head->tail[i] != NULL && - ins_stack[i] != &ins_head->tail[i]->next[i]) - return (WT_RESTART); + WT_INSERT *old_ins = *ins_stack[i]; + if (old_ins != new_ins->next[i] || + !WT_ATOMIC_CAS8(*ins_stack[i], old_ins, new_ins)) + return (i == 0 ? WT_RESTART : 0); } - /* Update the skiplist elements referencing the new WT_INSERT item. */ + return (0); +} + +/* + * __insert_serial_func -- + * Worker function to add a WT_INSERT entry to a skiplist. + */ +static inline int +__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, + WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth) +{ + u_int i; + + /* The cursor should be positioned. */ + WT_ASSERT(session, ins_stack[0] != NULL); + + /* + * Update the skiplist elements referencing the new WT_INSERT item. + * + * Confirm we are still in the expected position, and no item has been + * added where our insert belongs. If we fail connecting one of the + * upper levels in the skiplist, return success: the levels we updated + * are correct and sufficient. Even though we don't get the benefit of + * the memory we allocated, we can't roll back. + * + * All structure setup must be flushed before the structure is entered + * into the list. We need a write barrier here, our callers depend on + * it. Don't pass complex arguments to the macro, some implementations + * read the old value multiple times. + */ for (i = 0; i < skipdepth; i++) { + WT_INSERT *old_ins = *ins_stack[i]; + if (old_ins != new_ins->next[i] || + !WT_ATOMIC_CAS8(*ins_stack[i], old_ins, new_ins)) + return (i == 0 ? WT_RESTART : 0); if (ins_head->tail[i] == NULL || ins_stack[i] == &ins_head->tail[i]->next[i]) ins_head->tail[i] = new_ins; - *ins_stack[i] = new_ins; } return (0); @@ -128,20 +159,20 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT *new_ins = *new_insp; WT_DECL_RET; - /* Clear references to memory we now own. */ - *new_insp = NULL; - /* Check for page write generation wrap. */ WT_RET(__page_write_gen_wrapped_check(page)); + /* Clear references to memory we now own and must free on error. */ + *new_insp = NULL; + /* Acquire the page's spinlock, call the worker function. */ WT_PAGE_LOCK(session, page); ret = __col_append_serial_func( session, ins_head, ins_stack, new_ins, recnop, skipdepth); WT_PAGE_UNLOCK(session, page); - /* Free unused memory on error. */ if (ret != 0) { + /* Free unused memory on error. */ __wt_free(session, new_ins); return (ret); } @@ -171,21 +202,32 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, { WT_INSERT *new_ins = *new_insp; WT_DECL_RET; - - /* Clear references to memory we now own. */ - *new_insp = NULL; + int simple; + u_int i; /* Check for page write generation wrap. */ WT_RET(__page_write_gen_wrapped_check(page)); - /* Acquire the page's spinlock, call the worker function. */ - WT_PAGE_LOCK(session, page); - ret = __insert_serial_func( - session, ins_head, ins_stack, new_ins, skipdepth); - WT_PAGE_UNLOCK(session, page); + /* Clear references to memory we now own and must free on error. */ + *new_insp = NULL; + + simple = 1; + for (i = 0; i < skipdepth; i++) + if (new_ins->next[i] == NULL) + simple = 0; + + if (simple) + ret = __insert_simple_func( + session, ins_stack, new_ins, skipdepth); + else { + WT_PAGE_LOCK(session, page); + ret = __insert_serial_func( + session, ins_head, ins_stack, new_ins, skipdepth); + WT_PAGE_UNLOCK(session, page); + } - /* Free unused memory on error. */ if (ret != 0) { + /* Free unused memory on error. */ __wt_free(session, new_ins); return (ret); } @@ -215,17 +257,19 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DECL_RET; WT_UPDATE *obsolete, *upd = *updp; - /* Clear references to memory we now own. */ - *updp = NULL; - /* Check for page write generation wrap. */ WT_RET(__page_write_gen_wrapped_check(page)); + /* Clear references to memory we now own and must free on error. */ + *updp = NULL; + /* + * All structure setup must be flushed before the structure is entered + * into the list. We need a write barrier here, our callers depend on + * it. + * * Swap the update into place. If that fails, a new update was added - * after our search, we raced. Check if our update is still permitted, - * and if it is, do a full-barrier to ensure the update's next pointer - * is set before we update the linked list and try again. + * after our search, we raced. Check if our update is still permitted. */ while (!WT_ATOMIC_CAS8(*srch_upd, upd->next, upd)) { if ((ret = __wt_txn_update_check( @@ -234,7 +278,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, __wt_free(session, upd); return (ret); } - WT_WRITE_BARRIER(); } /* diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index bf1aa98d8d3..f32da177bf9 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -148,7 +148,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { #define WT_SESSION_CLEAR_SIZE(s) \ (WT_PTRDIFF(&(s)->rnd, s)) - uint64_t rnd; /* Random number generation state */ + WT_RAND_STATE rnd; /* Random number generation state */ /* Hashed handle reference list array */ SLIST_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash; diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 6dc9282a613..d99d70b6d23 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -196,12 +196,13 @@ struct __wt_connection_stats { WT_STATS cursor_search; WT_STATS cursor_search_near; WT_STATS cursor_update; - WT_STATS dh_conn_handles; - WT_STATS dh_conn_ref; - WT_STATS dh_conn_sweeps; - WT_STATS dh_conn_tod; WT_STATS dh_session_handles; WT_STATS dh_session_sweeps; + WT_STATS dh_sweep_close; + WT_STATS dh_sweep_ref; + WT_STATS dh_sweep_remove; + WT_STATS dh_sweep_tod; + WT_STATS dh_sweeps; WT_STATS file_open; WT_STATS log_buffer_size; WT_STATS log_bytes_payload; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index a9b54d26e47..a8e052ec5eb 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -377,7 +377,8 @@ __wt_txn_id_check(WT_SESSION_IMPL *session) do { txn_state->id = txn->id = txn_global->current; } while (!WT_ATOMIC_CAS8( - txn_global->current, txn->id, txn->id + 1)); + txn_global->current, txn->id, txn->id + 1) || + WT_TXNID_LT(txn->id, txn_global->last_running)); /* * If we have used 64-bits of transaction IDs, there is nothing diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index e8f3b9958ce..80e7d0fcacc 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -3670,162 +3670,164 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1062 /*! cursor: cursor update calls */ #define WT_STAT_CONN_CURSOR_UPDATE 1063 -/*! data-handle: connection dhandles swept */ -#define WT_STAT_CONN_DH_CONN_HANDLES 1064 -/*! data-handle: connection candidate referenced */ -#define WT_STAT_CONN_DH_CONN_REF 1065 -/*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_CONN_SWEEPS 1066 -/*! data-handle: connection time-of-death sets */ -#define WT_STAT_CONN_DH_CONN_TOD 1067 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1068 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1064 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1069 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1065 +/*! data-handle: connection sweep dhandles closed */ +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1066 +/*! data-handle: connection sweep candidate became referenced */ +#define WT_STAT_CONN_DH_SWEEP_REF 1067 +/*! data-handle: connection sweep dhandles removed from hash list */ +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1068 +/*! data-handle: connection sweep time-of-death sets */ +#define WT_STAT_CONN_DH_SWEEP_TOD 1069 +/*! data-handle: connection sweeps */ +#define WT_STAT_CONN_DH_SWEEPS 1070 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1070 +#define WT_STAT_CONN_FILE_OPEN 1071 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1071 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1072 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1072 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1073 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1073 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1074 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1074 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1075 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1075 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1076 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1076 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1077 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1077 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1078 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1078 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1079 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1079 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1080 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1080 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1081 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1081 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1082 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1082 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1083 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1083 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1084 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1084 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1085 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1085 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1086 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1086 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1087 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1087 +#define WT_STAT_CONN_LOG_SCANS 1088 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1088 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1089 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1089 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1090 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1090 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1091 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1091 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1092 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1092 +#define WT_STAT_CONN_LOG_SLOT_RACES 1093 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1093 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1094 /*! log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1094 +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1095 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1095 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1096 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1096 +#define WT_STAT_CONN_LOG_SYNC 1097 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1097 +#define WT_STAT_CONN_LOG_SYNC_DIR 1098 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1098 +#define WT_STAT_CONN_LOG_WRITE_LSN 1099 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1099 +#define WT_STAT_CONN_LOG_WRITES 1100 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1100 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1101 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1101 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1102 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1102 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1103 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1103 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1104 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1104 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1105 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1105 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1106 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1106 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1107 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1107 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1108 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1108 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1109 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1109 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1110 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1110 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1111 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1111 +#define WT_STAT_CONN_MEMORY_FREE 1112 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1112 +#define WT_STAT_CONN_MEMORY_GROW 1113 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1113 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1114 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1114 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1115 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1115 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1116 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1116 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1117 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1117 +#define WT_STAT_CONN_PAGE_SLEEP 1118 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1118 +#define WT_STAT_CONN_READ_IO 1119 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1119 +#define WT_STAT_CONN_REC_PAGES 1120 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1120 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1121 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1121 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1122 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1122 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1123 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1123 +#define WT_STAT_CONN_RWLOCK_READ 1124 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1124 +#define WT_STAT_CONN_RWLOCK_WRITE 1125 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1125 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1126 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1126 +#define WT_STAT_CONN_SESSION_OPEN 1127 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1127 +#define WT_STAT_CONN_TXN_BEGIN 1128 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1128 +#define WT_STAT_CONN_TXN_CHECKPOINT 1129 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1129 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1130 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1130 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1131 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1131 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1132 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1132 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1133 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1133 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1134 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1134 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1135 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1135 +#define WT_STAT_CONN_TXN_COMMIT 1136 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1136 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1137 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1137 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1138 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1138 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1139 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1139 +#define WT_STAT_CONN_TXN_ROLLBACK 1140 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1140 +#define WT_STAT_CONN_TXN_SYNC 1141 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1141 +#define WT_STAT_CONN_WRITE_IO 1142 /*! * @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index b876a2d032d..64e29e104bc 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -266,6 +266,8 @@ struct __wt_upd_skipped; typedef struct __wt_upd_skipped WT_UPD_SKIPPED; struct __wt_update; typedef struct __wt_update WT_UPDATE; +union __wt_rand_state; + typedef union __wt_rand_state WT_RAND_STATE; /* * Forward type declarations for internal types: END * DO NOT EDIT: automatically built by dist/s_typedef. diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c index cdd4f8a24e1..df558b12bef 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c @@ -38,6 +38,78 @@ * Joseph Seigh. Note that a similar (but not identical) algorithm was published * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable * Reader-Writer Synchronization for Shared-Memory Multiprocessors". + * + * The following is an explanation of this code. First, the underlying lock + * structure. + * + * struct { + * uint16_t writers; Now serving for writers + * uint16_t readers; Now serving for readers + * uint16_t users; Next available ticket number + * uint16_t __notused; Padding + * } + * + * First, imagine a store's 'take a number' ticket algorithm. A customer takes + * a unique ticket number and customers are served in ticket order. In the data + * structure, 'writers' is the next writer to be served, 'readers' is the next + * reader to be served, and 'users' is the next available ticket number. + * + * Next, consider exclusive (write) locks. The 'now serving' number for writers + * is 'writers'. To lock, 'take a number' and wait until that number is being + * served; more specifically, atomically copy and increment the current value of + * 'users', and then wait until 'writers' equals that copied number. + * + * Shared (read) locks are similar. Like writers, readers atomically get the + * next number available. However, instead of waiting for 'writers' to equal + * their number, they wait for 'readers' to equal their number. + * + * This has the effect of queuing lock requests in the order they arrive + * (incidentally avoiding starvation). + * + * Each lock/unlock pair requires incrementing both 'readers' and 'writers'. + * In the case of a reader, the 'readers' increment happens when the reader + * acquires the lock (to allow read-lock sharing), and the 'writers' increment + * happens when the reader releases the lock. In the case of a writer, both + * 'readers' and 'writers' are incremented when the writer releases the lock. + * + * For example, consider the following read (R) and write (W) lock requests: + * + * writers readers users + * 0 0 0 + * R: ticket 0, readers match OK 0 1 1 + * R: ticket 1, readers match OK 0 2 2 + * R: ticket 2, readers match OK 0 3 3 + * W: ticket 3, writers no match block 0 3 4 + * R: ticket 2, unlock 1 3 4 + * R: ticket 0, unlock 2 3 4 + * R: ticket 1, unlock 3 3 4 + * W: ticket 3, writers match OK 3 3 4 + * + * Note the writer blocks until 'writers' equals its ticket number and it does + * not matter if readers unlock in order or not. + * + * Readers or writers entering the system after the write lock is queued block, + * and the next ticket holder (reader or writer) will unblock when the writer + * unlocks. An example, continuing from the last line of the above example: + * + * writers readers users + * W: ticket 3, writers match OK 3 3 4 + * R: ticket 4, readers no match block 3 3 5 + * R: ticket 5, readers no match block 3 3 6 + * W: ticket 6, writers no match block 3 3 7 + * W: ticket 3, unlock 4 4 7 + * R: ticket 4, readers match OK 4 5 7 + * R: ticket 5, readers match OK 4 6 7 + * + * The 'users' field is a 2-byte value so the available ticket number wraps at + * 64K requests. If a thread's lock request is not granted until the 'users' + * field cycles and the same ticket is taken by another thread, we could grant + * a lock to two separate threads at the same time, and bad things happen: two + * writer threads or a reader thread and a writer thread would run in parallel, + * and lock waiters could be skipped if the unlocks race. This is unlikely, it + * only happens if a lock request is blocked by 64K other requests. The fix is + * to grow the lock structure fields, but the largest atomic instruction we have + * is 8 bytes, the structure has no room to grow. */ #include "wt_internal.h" @@ -69,20 +141,31 @@ __wt_rwlock_alloc( int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { - wt_rwlock_t *l; - uint64_t old, new, pad, users, writers; + wt_rwlock_t *l, new, old; WT_RET(__wt_verbose( session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name)); WT_STAT_FAST_CONN_INCR(session, rwlock_read); l = &rwlock->rwlock; - pad = l->s.pad; - users = l->s.users; - writers = l->s.writers; - old = (pad << 48) + (users << 32) + (users << 16) + writers; - new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers; - return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY); + new = old = *l; + + /* + * This read lock can only be granted if the lock was last granted to + * a reader and there are no readers or writers blocked on the lock, + * that is, if this thread's ticket would be the next ticket granted. + * Do the cheap test to see if this can possibly succeed (and confirm + * the lock is in the correct state to grant this read lock). + */ + if (old.s.readers != old.s.users) + return (EBUSY); + + /* + * The replacement lock value is a result of allocating a new ticket and + * incrementing the reader value to match it. + */ + new.s.readers = new.s.users = old.s.users + 1; + return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 0 : EBUSY); } /* @@ -93,8 +176,7 @@ int __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { wt_rwlock_t *l; - uint64_t me; - uint16_t val; + uint16_t ticket; int pause_cnt; WT_RET(__wt_verbose( @@ -102,17 +184,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_STAT_FAST_CONN_INCR(session, rwlock_read); l = &rwlock->rwlock; - me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); - val = (uint16_t)(me >> 32); - for (pause_cnt = 0; val != l->s.readers;) { + + /* + * Possibly wrap: if we have more than 64K lockers waiting, the ticket + * value will wrap and two lockers will simultaneously be granted the + * lock. + */ + ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1); + for (pause_cnt = 0; ticket != l->s.readers;) { /* * We failed to get the lock; pause before retrying and if we've * paused enough, sleep so we don't burn CPU to no purpose. This * situation happens if there are more threads than cores in the - * system and we're thrashing on shared resources. Regardless, - * don't sleep long, all we need is to schedule the other reader - * threads to complete a few more instructions and increment the - * reader count. + * system and we're thrashing on shared resources. + * + * Don't sleep long when waiting on a read lock, hopefully we're + * waiting on another read thread to increment the reader count. */ if (++pause_cnt < 1000) WT_PAUSE(); @@ -120,6 +207,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) __wt_sleep(0, 10); } + /* + * We're the only writer of the readers field, so the update does not + * need to be atomic. + */ ++l->s.readers; return (0); @@ -138,6 +229,11 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name)); l = &rwlock->rwlock; + + /* + * Increment the writers value (other readers are doing the same, make + * sure we don't race). + */ WT_ATOMIC_ADD2(l->s.writers, 1); return (0); @@ -150,20 +246,28 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { - wt_rwlock_t *l; - uint64_t old, new, pad, readers, users; + wt_rwlock_t *l, new, old; WT_RET(__wt_verbose( session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name)); WT_STAT_FAST_CONN_INCR(session, rwlock_write); l = &rwlock->rwlock; - pad = l->s.pad; - readers = l->s.readers; - users = l->s.users; - old = (pad << 48) + (users << 32) + (readers << 16) + users; - new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users; - return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY); + old = new = *l; + + /* + * This write lock can only be granted if the lock was last granted to + * a writer and there are no readers or writers blocked on the lock, + * that is, if this thread's ticket would be the next ticket granted. + * Do the cheap test to see if this can possibly succeed (and confirm + * the lock is in the correct state to grant this write lock). + */ + if (old.s.writers != old.s.users) + return (EBUSY); + + /* The replacement lock value is a result of allocating a new ticket. */ + ++new.s.users; + return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 0 : EBUSY); } /* @@ -174,23 +278,33 @@ int __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { wt_rwlock_t *l; - uint64_t me; - uint16_t val; + uint16_t ticket; + int pause_cnt; WT_RET(__wt_verbose( session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name)); WT_STAT_FAST_CONN_INCR(session, rwlock_write); + l = &rwlock->rwlock; + /* - * Possibly wrap: if we have more than 64K lockers waiting, the count - * of writers will wrap and two lockers will simultaneously be granted - * the write lock. + * Possibly wrap: if we have more than 64K lockers waiting, the ticket + * value will wrap and two lockers will simultaneously be granted the + * lock. */ - l = &rwlock->rwlock; - me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); - val = (uint16_t)(me >> 32); - while (val != l->s.writers) - WT_PAUSE(); + ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1); + for (pause_cnt = 0; ticket != l->s.writers;) { + /* + * We failed to get the lock; pause before retrying and if we've + * paused enough, sleep so we don't burn CPU to no purpose. This + * situation happens if there are more threads than cores in the + * system and we're thrashing on shared resources. + */ + if (++pause_cnt < 1000) + WT_PAUSE(); + else + __wt_sleep(0, 10); + } return (0); } @@ -211,12 +325,23 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) copy = *l; + /* + * We're the only writer of the writers/readers fields, so the update + * does not need to be atomic; we have to update both values at the + * same time though, otherwise we'd potentially race with the thread + * next granted the lock. + * + * Use a memory barrier to ensure the compiler doesn't mess with these + * instructions and rework the code in a way that avoids the update as + * a unit. + */ WT_BARRIER(); ++copy.s.writers; ++copy.s.readers; - l->i.us = copy.i.us; + l->i.wr = copy.i.wr; + return (0); } diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c index e4f24cdb44e..c7222aac6c4 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_thread.c +++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c @@ -19,7 +19,8 @@ __wt_thread_create(WT_SESSION_IMPL *session, WT_DECL_RET; /* Spawn a new thread of control. */ - if ((ret = pthread_create(tidret, NULL, func, arg)) == 0) + WT_SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret); + if (ret == 0) return (0); WT_RET_MSG(session, ret, "pthread_create"); } @@ -33,7 +34,8 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) { WT_DECL_RET; - if ((ret = pthread_join(tid, NULL)) == 0) + WT_SYSCALL_RETRY(pthread_join(tid, NULL), ret); + if (ret == 0) return (0); WT_RET_MSG(session, ret, "pthread_join"); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 53a73b44feb..37acb28a00b 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -343,11 +343,12 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_RECONCILE *r; - int locked; + int page_lock, scan_lock, split_lock; conn = S2C(session); page = ref->page; mod = page->modify; + page_lock = scan_lock = split_lock = 0; /* We're shouldn't get called with a clean page, that's an error. */ if (!__wt_page_is_modified(page)) @@ -386,22 +387,38 @@ __wt_reconcile(WT_SESSION_IMPL *session, /* * The compaction process looks at the page's modification information; - * if compaction is running, lock the page down. - * - * Otherwise, flip on the scanning flag: obsolete updates cannot be - * freed while reconciliation is in progress. + * if compaction is running, acquire the page's lock. */ - locked = 0; if (conn->compact_in_memory_pass) { - locked = 1; WT_PAGE_LOCK(session, page); - } else + page_lock = 1; + } + + /* + * Reconciliation reads the lists of updates, so obsolete updates cannot + * be discarded while reconciliation is in progress. + */ + for (;;) { + F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); + if (ret == 0) + break; + __wt_yield(); + } + scan_lock = 1; + + /* + * Mark internal pages as splitting to ensure we don't deadlock when + * performing an in-memory split during a checkpoint. + */ + if (WT_PAGE_IS_INTERNAL(page)) { for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); + F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret); if (ret == 0) break; __wt_yield(); } + split_lock = 1; + } /* Reconcile the page. */ switch (page->type) { @@ -434,11 +451,13 @@ __wt_reconcile(WT_SESSION_IMPL *session, else WT_TRET(__rec_write_wrapup_err(session, r, page)); - /* Release the page lock if we're holding one. */ - if (locked) - WT_PAGE_UNLOCK(session, page); - else + /* Release the locks we're holding. */ + if (split_lock) + F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED); + if (scan_lock) F_CLR_ATOMIC(page, WT_PAGE_SCANNING); + if (page_lock) + WT_PAGE_UNLOCK(session, page); /* * Clean up the boundary structures: some workloads result in millions @@ -3266,18 +3285,6 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__rec_split_init( session, r, page, page->pg_intl_recno, btree->maxintlpage)); - /* - * We need to mark this page as splitting, as this may be an in-memory - * split during a checkpoint. - */ - for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret); - if (ret == 0) { - break; - } - __wt_yield(); - } - /* For each entry in the in-memory page... */ WT_INTL_FOREACH_BEGIN(session, page, ref) { /* Update the starting record number in case we split. */ @@ -3360,8 +3367,6 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __rec_copy_incr(session, r, val); } WT_INTL_FOREACH_END; - F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED); - /* Write the remnant page. */ return (__rec_split_finish(session, r)); @@ -4094,18 +4099,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ r->cell_zero = 1; - /* - * We need to mark this page as splitting in order to ensure we don't - * deadlock when performing an in-memory split during a checkpoint. - */ - for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret); - if (ret == 0) { - break; - } - __wt_yield(); - } - /* For each entry in the in-memory page... */ WT_INTL_FOREACH_BEGIN(session, page, ref) { /* @@ -4264,8 +4257,6 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __rec_key_state_update(r, ovfl_key); } WT_INTL_FOREACH_END; - F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED); - /* Write the remnant page. */ return (__rec_split_finish(session, r)); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index ef9735a8b98..1103dba7409 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -800,7 +800,7 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config) WT_STAT_FAST_CONN_INCR(session, txn_commit); txn = &session->txn; - if (F_ISSET(txn, WT_TXN_ERROR)) { + if (F_ISSET(txn, WT_TXN_ERROR) && txn->mod_count != 0) { __wt_errx(session, "failed transaction requires rollback"); ret = EINVAL; } @@ -1166,8 +1166,8 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, if (i == conn->session_size) WT_ERR_MSG(session, ENOMEM, "only configured to support %" PRIu32 " sessions" - " (including %" PRIu32 " internal)", - conn->session_size, WT_NUM_INTERNAL_SESSIONS); + " (including %d additional internal sessions)", + conn->session_size, WT_EXTRA_INTERNAL_SESSIONS); /* * If the active session count is increasing, update it. We don't worry diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c index 7dfb98c5ca4..caac04d3529 100644 --- a/src/third_party/wiredtiger/src/support/rand.c +++ b/src/third_party/wiredtiger/src/support/rand.c @@ -41,18 +41,18 @@ * of the values to avoid that, and read/write in atomic, 8B chunks. */ #undef M_W -#define M_W(p) ((uint32_t *)&(p))[0] +#define M_W(r) r.x.w #undef M_Z -#define M_Z(p) ((uint32_t *)&(p))[1] +#define M_Z(r) r.x.z /* * __wt_random_init -- * Initialize return of a 32-bit pseudo-random number. */ void -__wt_random_init(uint64_t volatile * rnd_state) +__wt_random_init(WT_RAND_STATE volatile * rnd_state) { - uint64_t rnd; + WT_RAND_STATE rnd; M_W(rnd) = 521288629; M_Z(rnd) = 362436069; @@ -64,9 +64,9 @@ __wt_random_init(uint64_t volatile * rnd_state) * Return a 32-bit pseudo-random number. */ uint32_t -__wt_random(uint64_t volatile * rnd_state) +__wt_random(WT_RAND_STATE volatile * rnd_state) { - uint64_t rnd; + WT_RAND_STATE rnd; uint32_t w, z; /* diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index b0e7d660587..b706263d1ce 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -444,11 +444,15 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) stats->cursor_search.desc = "cursor: cursor search calls"; stats->cursor_search_near.desc = "cursor: cursor search near calls"; stats->cursor_update.desc = "cursor: cursor update calls"; - stats->dh_conn_ref.desc = - "data-handle: connection candidate referenced"; - stats->dh_conn_handles.desc = "data-handle: connection dhandles swept"; - stats->dh_conn_sweeps.desc = "data-handle: connection sweeps"; - stats->dh_conn_tod.desc = "data-handle: connection time-of-death sets"; + stats->dh_sweep_ref.desc = + "data-handle: connection sweep candidate became referenced"; + stats->dh_sweep_close.desc = + "data-handle: connection sweep dhandles closed"; + stats->dh_sweep_remove.desc = + "data-handle: connection sweep dhandles removed from hash list"; + stats->dh_sweep_tod.desc = + "data-handle: connection sweep time-of-death sets"; + stats->dh_sweeps.desc = "data-handle: connection sweeps"; stats->dh_session_handles.desc = "data-handle: session dhandles swept"; stats->dh_session_sweeps.desc = "data-handle: session sweep attempts"; stats->log_slot_closes.desc = "log: consolidated slot closures"; @@ -618,10 +622,11 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->cursor_search.v = 0; stats->cursor_search_near.v = 0; stats->cursor_update.v = 0; - stats->dh_conn_ref.v = 0; - stats->dh_conn_handles.v = 0; - stats->dh_conn_sweeps.v = 0; - stats->dh_conn_tod.v = 0; + stats->dh_sweep_ref.v = 0; + stats->dh_sweep_close.v = 0; + stats->dh_sweep_remove.v = 0; + stats->dh_sweep_tod.v = 0; + stats->dh_sweeps.v = 0; stats->dh_session_handles.v = 0; stats->dh_session_sweeps.v = 0; stats->log_slot_closes.v = 0; diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index c9924056e91..210c5dde5d0 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -125,20 +125,6 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); - current_id = snap_min = txn_global->current; - prev_oldest_id = txn_global->oldest_id; - - /* For pure read-only workloads, avoid scanning. */ - if (prev_oldest_id == current_id) { - txn_state->snap_min = current_id; - __txn_sort_snapshot(session, 0, current_id); - - /* Check that the oldest ID has not moved in the meantime. */ - if (prev_oldest_id == txn_global->oldest_id && - txn_global->scan_count == 0) - return; - } - /* * We're going to scan. Increment the count of scanners to prevent the * oldest ID from moving forwards. Spin if the count is negative, @@ -150,9 +136,21 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) } while (count < 0 || !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1)); - /* The oldest ID cannot change until the scan count goes to zero. */ - prev_oldest_id = txn_global->oldest_id; current_id = snap_min = txn_global->current; + prev_oldest_id = txn_global->oldest_id; + + /* For pure read-only workloads, avoid scanning. */ + if (prev_oldest_id == current_id) { + txn_state->snap_min = current_id; + __txn_sort_snapshot(session, 0, current_id); + + /* Check that the oldest ID has not moved in the meantime. */ + if (prev_oldest_id == txn_global->oldest_id) { + WT_ASSERT(session, txn_global->scan_count > 0); + (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); + return; + } + } /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); @@ -184,10 +182,6 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->snap_min = snap_min; - /* Update the last running ID if we have a much newer value. */ - if (snap_min > txn_global->last_running + 100) - txn_global->last_running = snap_min; - WT_ASSERT(session, txn_global->scan_count > 0); (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); @@ -214,7 +208,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; - uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min; + uint64_t current_id, id, last_running, oldest_id, prev_oldest_id; uint32_t i, session_cnt; int32_t count; int last_running_moved; @@ -222,7 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) conn = S2C(session); txn_global = &conn->txn_global; - current_id = snap_min = txn_global->current; + current_id = last_running = txn_global->current; oldest_session = NULL; prev_oldest_id = txn_global->oldest_id; @@ -247,7 +241,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) /* The oldest ID cannot change until the scan count goes to zero. */ prev_oldest_id = txn_global->oldest_id; - current_id = oldest_id = snap_min = txn_global->current; + current_id = oldest_id = last_running = txn_global->current; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); @@ -262,8 +256,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) */ if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && - WT_TXNID_LT(id, snap_min)) - snap_min = id; + WT_TXNID_LT(id, last_running)) + last_running = id; /* * !!! @@ -280,8 +274,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) } } - if (WT_TXNID_LT(snap_min, oldest_id)) - oldest_id = snap_min; + if (WT_TXNID_LT(last_running, oldest_id)) + oldest_id = last_running; /* The oldest ID can't move past any named snapshots. */ if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && @@ -289,26 +283,42 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) oldest_id = id; /* Update the last running ID. */ - if (WT_TXNID_LT(txn_global->last_running, snap_min)) { - txn_global->last_running = snap_min; - last_running_moved = 1; - } else - last_running_moved = 0; + last_running_moved = + WT_TXNID_LT(txn_global->last_running, last_running); /* Update the oldest ID. */ - if (WT_TXNID_LT(prev_oldest_id, oldest_id) && + if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) && WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) { WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { if ((id = s->id) != WT_TXN_NONE && - WT_TXNID_LT(id, oldest_id)) - oldest_id = id; + WT_TXNID_LT(id, last_running)) + last_running = id; if ((id = s->snap_min) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; } + + if (WT_TXNID_LT(last_running, oldest_id)) + oldest_id = last_running; + +#ifdef HAVE_DIAGNOSTIC + /* + * Make sure the ID doesn't move past any named snapshots. + * + * Don't include the read/assignment in the assert statement. + * Coverity complains if there are assignments only done in + * diagnostic builds, and when the read is from a volatile. + */ + id = txn_global->nsnap_oldest_id; + WT_ASSERT(session, + id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); +#endif + if (WT_TXNID_LT(txn_global->last_running, last_running)) + txn_global->last_running = last_running; if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) txn_global->oldest_id = oldest_id; + WT_ASSERT(session, txn_global->scan_count == -1); txn_global->scan_count = 0; } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && @@ -408,6 +418,9 @@ __wt_txn_release(WT_SESSION_IMPL *session) txn_global->checkpoint_id = 0; txn_global->checkpoint_pinned = WT_TXN_NONE; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { + WT_ASSERT(session, + !WT_TXNID_LT(txn->id, txn_global->last_running)); + WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); @@ -458,7 +471,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) txn = &session->txn; conn = S2C(session); - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR)); + WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); if (!F_ISSET(txn, WT_TXN_RUNNING)) WT_RET_MSG(session, EINVAL, "No transaction is active"); @@ -582,6 +595,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) switch (op->type) { case WT_TXN_OP_BASIC: case WT_TXN_OP_INMEM: + WT_ASSERT(session, op->u.upd->txnid == txn->id); op->u.upd->txnid = WT_TXN_ABORTED; break; case WT_TXN_OP_REF: diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 0eadcbf3b01..f321da303d7 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -522,7 +522,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); -done: +done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); WT_TRET(session->iface.close(&session->iface, NULL)); |