diff options
author | Luke Chen <luke.chen@mongodb.com> | 2019-07-02 14:01:12 +1000 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2019-07-02 14:02:00 +1000 |
commit | 11a4b5016f134896e088d8c1c96d1c17225a3c86 (patch) | |
tree | 3385861ec5bc7f5ce74049b8bc6bba3fc91720cf /src/third_party/wiredtiger | |
parent | cdf7c88be60b287c316beda42b4ff9f197617942 (diff) | |
download | mongo-11a4b5016f134896e088d8c1c96d1c17225a3c86.tar.gz |
Import wiredtiger: d86b3a8a331a1ec478c4ea75ef1b15856b429790 from branch mongodb-4.2
ref: ee1bae2623..d86b3a8a33
for: 4.2.0-rc3
WT-4758 Create a workload that bottlenecks on the eviction server filling eviction queues
WT-4821 Update evergreen config to pull correct source for largescale test
WT-4875 Fix commit timestamp assert function to consider non transactional tombstones
WT-4877 Uninitialized memory being written during checkpoints
WT-4878 Disable random dhandle selection and fine tune eviction target calculations
WT-4881 Soften the restrictions on re-entering reconciliation
Diffstat (limited to 'src/third_party/wiredtiger')
-rw-r--r-- | src/third_party/wiredtiger/bench/wtperf/config.c | 27 | ||||
-rw-r--r-- | src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf | 24 | ||||
-rw-r--r-- | src/third_party/wiredtiger/bench/wtperf/track.c | 18 | ||||
-rw-r--r-- | src/third_party/wiredtiger/bench/wtperf/wtperf.c | 277 | ||||
-rw-r--r-- | src/third_party/wiredtiger/bench/wtperf/wtperf.h | 5 | ||||
-rw-r--r-- | src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i | 10 | ||||
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_ckpt_scan.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/docs/wtperf.dox | 8 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/evict/evict_lru.c | 92 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_write.c | 104 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn.c | 7 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c | 4 | ||||
-rwxr-xr-x[-rw-r--r--] | src/third_party/wiredtiger/test/evergreen.yml | 1 |
14 files changed, 449 insertions, 132 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/config.c b/src/third_party/wiredtiger/bench/wtperf/config.c index 549257b5089..18522e3f7e7 100644 --- a/src/third_party/wiredtiger/bench/wtperf/config.c +++ b/src/third_party/wiredtiger/bench/wtperf/config.c @@ -735,7 +735,8 @@ config_sanity(WTPERF *wtperf) ((opts->checkpoint_threads != 0 && opts->checkpoint_interval > opts->run_time) || opts->report_interval > opts->run_time || - opts->sample_interval > opts->run_time)) { + opts->sample_interval > opts->run_time || + opts->scan_interval > opts->run_time)) { fprintf(stderr, "interval value longer than the run-time\n"); return (EINVAL); } @@ -757,6 +758,29 @@ config_sanity(WTPERF *wtperf) return (EINVAL); } + if (opts->scan_pct > 100) { + fprintf(stderr, + "Invalid scan_pct - should be a percentage\n"); + return (EINVAL); + } + + /* If we have separate tables for scanning, we need a separate count. */ + if ((opts->scan_icount > 0 && opts->scan_table_count == 0) || + (opts->scan_icount == 0 && opts->scan_table_count > 0)) { + fprintf(stderr, + "scan_icount %" PRIu32 + " and scan_table_count %" PRIu32 + " must both be zero or nonzero.\n", + opts->scan_icount, opts->scan_table_count); + return (EINVAL); + } + if (opts->scan_interval > 0 && opts->icount == 0 && + opts->scan_icount == 0) { + fprintf(stderr, + "Invalid scan_interval - requires icount to be non-zero\n"); + return (EINVAL); + } + if (opts->value_sz_max < opts->value_sz) { if (F_ISSET(wtperf, CFG_GROW)) { fprintf(stderr, "value_sz_max %" PRIu32 @@ -948,6 +972,7 @@ config_opt_print(WTPERF *wtperf) opts->checkpoint_threads, opts->checkpoint_interval); printf("\t" "Reporting interval: %" PRIu32 "\n", opts->report_interval); printf("\t" "Sampling interval: %" PRIu32 "\n", opts->sample_interval); + printf("\t" "Scan interval: %" PRIu32 "\n", opts->scan_interval); printf("\t" "Verbosity: %" PRIu32 "\n", opts->verbose); } diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf new file mode 100644 index 00000000000..9c3dfa10d84 --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/evict-btree-scan.wtperf @@ -0,0 +1,24 @@ +# wtperf options file: evict btree configuration +conn_config="cache_size=40G,checkpoint=(wait=60,log_size=2GB),eviction=(threads_min=12,threads_max=12),log=(enabled=true),session_max=600,eviction_target=60,statistics=(fast),statistics_log=(wait=1,json)" +# 1B records * (key=12 + value=138) is about 150G total data size +key_sz=12 +value_sz=138 +log_like_table=true +table_config="type=file" +icount=1000000000 +report_interval=5 +run_time=3600 +# Scans every 10 minutes for all the scan specific tables. +# .4B records * (key=12 + value=138) is about 60G total data size for scan +# Running on a machine with 64G physical memory, this exhausts both the +# WT cache and the system cache. +scan_interval=600 +scan_pct=100 +scan_table_count=20 +scan_icount=400000000 +populate_threads=5 +table_count=100 +threads=((count=400,reads=1),(count=20,inserts=1,throttle=500),(count=10,updates=1,throttle=500)) +# Add throughput/latency monitoring +max_latency=50000 +sample_interval=5 diff --git a/src/third_party/wiredtiger/bench/wtperf/track.c b/src/third_party/wiredtiger/bench/wtperf/track.c index ca380703764..3b8832dc6bf 100644 --- a/src/third_party/wiredtiger/bench/wtperf/track.c +++ b/src/third_party/wiredtiger/bench/wtperf/track.c @@ -69,6 +69,24 @@ sum_ckpt_ops(WTPERF *wtperf) } /* + * Return total scan operations. + */ +uint64_t +sum_scan_ops(WTPERF *wtperf) +{ + CONFIG_OPTS *opts; + uint64_t total; + + opts = wtperf->opts; + + if (opts->scan_interval > 0) + total = wtperf->scanthreads->scan.ops; + else + total = 0; + return (total); +} + +/* * Return total operations count for the worker threads. */ static uint64_t diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c index ecbd91fe8cc..dc3bd4f891f 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c @@ -41,6 +41,7 @@ static WT_THREAD_RET monitor(void *); static WT_THREAD_RET populate_thread(void *); static void randomize_value(WTPERF_THREAD *, char *); static void recreate_dir(const char *); +static WT_THREAD_RET scan_worker(void *); static int start_all_runs(WTPERF *); static int start_run(WTPERF *); static void start_threads(WTPERF *, WORKLOAD *, @@ -119,6 +120,15 @@ randomize_value(WTPERF_THREAD *thread, char *value_buf) static uint32_t map_key_to_table(CONFIG_OPTS *opts, uint64_t k) { + /* + * The first part of the key range is reserved for dedicated + * scan tables, if any. The scan tables do not grow, but the + * rest of the key space may. + */ + if (k < opts->scan_icount) + return ((uint32_t) + (opts->table_count + k % opts->scan_table_count)); + k -= opts->scan_icount; if (opts->range_partition) { /* Take care to return a result in [0..table_count-1]. */ if (k > opts->icount + opts->random_range) @@ -362,6 +372,7 @@ worker_async(void *arg) continue; break; default: + lprintf(wtperf, 0, 0, "invalid op!"); goto err; /* can't happen */ } @@ -376,8 +387,10 @@ worker_async(void *arg) wtperf->uris[map_key_to_table(wtperf->opts, next_val)], NULL, &cb, &asyncop)) == EBUSY) (void)usleep(10000); - if (ret != 0) + if (ret != 0) { + lprintf(wtperf, ret, 0, "failed async_new_op"); goto err; + } asyncop->app_private = thread; asyncop->set_key(asyncop, key_buf); @@ -513,6 +526,7 @@ worker(void *arg) WT_CURSOR **cursors, *cursor, *log_table_cursor, *tmp_cursor; WT_SESSION *session; size_t i; + uint32_t total_table_count; int64_t ops, ops_per_txn; uint64_t log_id, next_val, usecs; uint8_t *op, *op_end; @@ -570,8 +584,9 @@ worker(void *arg) goto err; } } else { - cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); - for (i = 0; i < opts->table_count; i++) { + total_table_count = opts->table_count + opts->scan_table_count; + cursors = dcalloc(total_table_count, sizeof(WT_CURSOR *)); + for (i = 0; i < total_table_count; i++) { if ((ret = session->open_cursor(session, wtperf->uris[i], NULL, NULL, &cursors[i])) != 0) { lprintf(wtperf, ret, 0, @@ -669,7 +684,6 @@ worker(void *arg) __wt_epoch(NULL, &start); cursor->set_key(cursor, key_buf); - switch (*op) { case WORKER_READ: /* @@ -764,7 +778,8 @@ worker(void *arg) if (ret == WT_NOTFOUND) break; -op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { +op_err: if (ret == WT_ROLLBACK && + (ops_per_txn != 0 || opts->log_like_table)) { /* * If we are running with explicit transactions * configured and we hit a WT_ROLLBACK, then we @@ -1031,7 +1046,7 @@ populate_thread(void *arg) WT_SESSION *session; size_t i; uint64_t op, usecs; - uint32_t opcount; + uint32_t opcount, total_table_count; int intxn, measure_latency, ret, stress_checkpoint_due; char *value_buf, *key_buf; const char *cursor_config; @@ -1044,6 +1059,7 @@ populate_thread(void *arg) cursors = NULL; ret = stress_checkpoint_due = 0; trk = &thread->insert; + total_table_count = opts->table_count + opts->scan_table_count; key_buf = thread->key_buf; value_buf = thread->value_buf; @@ -1058,8 +1074,8 @@ populate_thread(void *arg) cursor_config = (opts->populate_threads == 1 && !opts->index) ? "bulk" : NULL; /* Create the cursors. */ - cursors = dcalloc(opts->table_count, sizeof(WT_CURSOR *)); - for (i = 0; i < opts->table_count; i++) { + cursors = dcalloc(total_table_count, sizeof(WT_CURSOR *)); + for (i = 0; i < total_table_count; i++) { if ((ret = session->open_cursor( session, wtperf->uris[i], NULL, cursor_config, &cursors[i])) != 0) { @@ -1073,7 +1089,7 @@ populate_thread(void *arg) /* Populate the databases. */ for (intxn = 0, opcount = 0;;) { op = get_next_incr(wtperf); - if (op > opts->icount) + if (op > (uint64_t)opts->icount + (uint64_t)opts->scan_icount) break; if (opts->populate_ops_per_txn != 0 && !intxn) { @@ -1166,7 +1182,6 @@ populate_thread(void *arg) err: wtperf->error = wtperf->stop = true; } free(cursors); - return (WT_THREAD_RET_VALUE); } @@ -1216,7 +1231,7 @@ populate_async(void *arg) /* Populate the databases. */ for (;;) { op = get_next_incr(wtperf); - if (op > opts->icount) + if (op > (uint64_t)opts->icount + (uint64_t)opts->scan_icount) break; /* * Allocate an async op for whichever table. @@ -1225,8 +1240,10 @@ populate_async(void *arg) conn, wtperf->uris[map_key_to_table(wtperf->opts, op)], NULL, &cb, &asyncop)) == EBUSY) (void)usleep(10000); - if (ret != 0) + if (ret != 0) { + lprintf(wtperf, ret, 0, "Failed async_new_op"); goto err; + } asyncop->app_private = thread; generate_key(opts, key_buf, op); @@ -1248,8 +1265,10 @@ populate_async(void *arg) * async_flush and those calls will convoy. That is not the * most efficient way, but we want to flush before measuring latency. */ - if (conn->async_flush(conn) != 0) + if (conn->async_flush(conn) != 0) { + lprintf(wtperf, ret, 0, "Failed async flush"); goto err; + } if (measure_latency) { __wt_epoch(NULL, &stop); ++trk->latency_ops; @@ -1327,6 +1346,7 @@ monitor(void *arg) "insert ops per second," "update ops per second," "checkpoints," + "scans," "read average latency(uS)," "read minimum latency(uS)," "read maximum latency(uS)," @@ -1378,7 +1398,7 @@ monitor(void *arg) (void)fprintf(fp, "%s,%" PRIu32 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 - ",%c" + ",%c,%c" ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 ",%" PRIu32 @@ -1386,6 +1406,7 @@ monitor(void *arg) buf, wtperf->totalsec, cur_reads, cur_inserts, cur_updates, wtperf->ckpt ? 'Y' : 'N', + wtperf->scan ? 'Y' : 'N', read_avg, read_min, read_max, insert_avg, insert_min, insert_max, update_avg, update_min, update_max); @@ -1547,6 +1568,141 @@ err: wtperf->error = wtperf->stop = true; return (WT_THREAD_RET_VALUE); } +static WT_THREAD_RET +scan_worker(void *arg) +{ + CONFIG_OPTS *opts; + WTPERF *wtperf; + WTPERF_THREAD *thread; + WT_CONNECTION *conn; + WT_CURSOR *cursor, **cursors; + WT_SESSION *session; + const char *uri; + char *key_buf; + struct timespec e, s; + uint32_t i, ntables, pct, table_start; + uint64_t cur_id, end_id, incr, items, start_id, tot_items; + int ret; + + thread = (WTPERF_THREAD *)arg; + key_buf = thread->key_buf; + wtperf = thread->wtperf; + opts = wtperf->opts; + conn = wtperf->conn; + session = NULL; + cursors = NULL; + items = 0; + uri = NULL; + + /* + * Figure out how many items we should scan. + * We base the percentage on the icount. + */ + pct = opts->scan_pct == 0 ? 100 : opts->scan_pct; + start_id = cur_id = 1; + + /* + * When we scan the tables, we will increment the key by an amount + * that causes us to visit each table in order, and jump ahead in + * the key space when returning to a table. By doing this, we don't + * repeat keys until we visit them all, but we don't visit keys in + * sequential order. This might better emulate the access pattern + * to a main table when an index is scanned, or a more complex query + * is performed. + */ + if (opts->scan_icount != 0) { + end_id = opts->scan_icount; + tot_items = (opts->scan_icount * pct) / 100; + incr = opts->scan_table_count * 1000 + 1; + table_start = opts->table_count; + ntables = opts->scan_table_count; + } else { + end_id = opts->icount; + tot_items = (opts->icount * pct) / 100; + incr = opts->table_count * 1000 + 1; + table_start = 0; + ntables = opts->table_count; + } + if ((ret = conn->open_session( + conn, NULL, opts->sess_config, &session)) != 0) { + lprintf(wtperf, ret, 0, + "open_session failed in scan thread."); + goto err; + } + cursors = dmalloc(ntables * sizeof(WT_CURSOR *)); + for (i = 0; i < ntables; i++) + if ((ret = session->open_cursor( + session, wtperf->uris[i + table_start], NULL, NULL, + &cursors[i])) != 0) { + lprintf(wtperf, ret, 0, + "open_cursor failed in scan thread."); + goto err; + } + + while (!wtperf->stop) { + /* Break the sleep up, so we notice interrupts faster. */ + for (i = 0; i < opts->scan_interval; i++) { + sleep(1); + if (wtperf->stop) + break; + } + /* If the workers are done, don't bother with a final call. */ + if (wtperf->stop) + break; + + __wt_epoch(NULL, &s); + + wtperf->scan = true; + items = 0; + while (items < tot_items && !wtperf->stop) { + cursor = cursors[map_key_to_table(opts, cur_id) - + table_start]; + generate_key(opts, key_buf, cur_id); + cursor->set_key(cursor, key_buf); + if ((ret = cursor->search(cursor)) != 0) { + lprintf(wtperf, ret, 0, "Failed scan search " + "key %s, items %d", key_buf, (int)items); + goto err; + } + + items++; + cur_id += incr; + if (cur_id >= end_id) { + /* + * Continue with the next slice of the key + * space. + */ + cur_id = ++start_id; + if (cur_id >= end_id) + cur_id = start_id = 1; + } + } + wtperf->scan = false; + if (ret == 0) + ++thread->scan.ops; + else { + lprintf(wtperf, ret, 0, "Scan operation failed for %s.", + uri); + goto err; + } + __wt_epoch(NULL, &e); + } + + if (session != NULL && + ((ret = session->close(session, NULL)) != 0)) { + lprintf(wtperf, ret, 0, + "Error closing session in scan worker."); + goto err; + } + + /* Notify our caller we failed and shut the system down. */ + if (0) { +err: wtperf->error = wtperf->stop = true; + } + free(cursors); + return (WT_THREAD_RET_VALUE); +} + static int execute_populate(WTPERF *wtperf) { @@ -1556,18 +1712,19 @@ execute_populate(WTPERF *wtperf) WTPERF_THREAD *popth; WT_THREAD_CALLBACK(*pfunc)(void *); size_t i; - uint64_t last_ops, msecs, print_ops_sec; + uint64_t last_ops, msecs, print_ops_sec, max_key; uint32_t interval, tables; wt_thread_t idle_table_cycle_thread; double print_secs; int elapsed, ret; opts = wtperf->opts; + max_key = (uint64_t)opts->icount + (uint64_t)opts->scan_icount; lprintf(wtperf, 0, 1, "Starting %" PRIu32 - " populate thread(s) for %" PRIu32 " items", - opts->populate_threads, opts->icount); + " populate thread(s) for %" PRIu64 " items", + opts->populate_threads, max_key); /* Start cycling idle tables if configured. */ start_idle_table_cycle(wtperf, &idle_table_cycle_thread); @@ -1587,7 +1744,7 @@ execute_populate(WTPERF *wtperf) __wt_epoch(NULL, &start); for (elapsed = 0, interval = 0, last_ops = 0; - wtperf->insert_key < opts->icount && !wtperf->error;) { + wtperf->insert_key < max_key && !wtperf->error;) { /* * Sleep for 100th of a second, report_interval is in second * granularity, each 100th increment of elapsed is a single @@ -1751,8 +1908,8 @@ execute_workload(WTPERF *wtperf) WT_SESSION **sessions; WT_THREAD_CALLBACK(*pfunc)(void *); wt_thread_t idle_table_cycle_thread; - uint64_t last_ckpts, last_inserts, last_reads, last_truncates; - uint64_t last_updates; + uint64_t last_ckpts, last_scans; + uint64_t last_inserts, last_reads, last_truncates, last_updates; uint32_t interval, run_ops, run_time; u_int i; int ret; @@ -1763,8 +1920,8 @@ execute_workload(WTPERF *wtperf) wtperf->insert_ops = wtperf->read_ops = wtperf->truncate_ops = 0; wtperf->update_ops = 0; - last_ckpts = last_inserts = last_reads = last_truncates = 0; - last_updates = 0; + last_ckpts = last_scans = 0; + last_inserts = last_reads = last_truncates = last_updates = 0; ret = 0; sessions = NULL; @@ -1844,6 +2001,7 @@ execute_workload(WTPERF *wtperf) /* Sum the operations we've done. */ wtperf->ckpt_ops = sum_ckpt_ops(wtperf); + wtperf->scan_ops = sum_scan_ops(wtperf); wtperf->insert_ops = sum_insert_ops(wtperf); wtperf->read_ops = sum_read_ops(wtperf); wtperf->update_ops = sum_update_ops(wtperf); @@ -1863,18 +2021,21 @@ execute_workload(WTPERF *wtperf) lprintf(wtperf, 0, 1, "%" PRIu64 " reads, %" PRIu64 " inserts, %" PRIu64 " updates, %" PRIu64 " truncates, %" PRIu64 - " checkpoints in %" PRIu32 " secs (%" PRIu32 " total secs)", + " checkpoints, %" PRIu64 " scans in %" PRIu32 + " secs (%" PRIu32 " total secs)", wtperf->read_ops - last_reads, wtperf->insert_ops - last_inserts, wtperf->update_ops - last_updates, wtperf->truncate_ops - last_truncates, wtperf->ckpt_ops - last_ckpts, + wtperf->scan_ops - last_scans, opts->report_interval, wtperf->totalsec); last_reads = wtperf->read_ops; last_inserts = wtperf->insert_ops; last_updates = wtperf->update_ops; last_truncates = wtperf->truncate_ops; last_ckpts = wtperf->ckpt_ops; + last_scans = wtperf->scan_ops; } /* Notify the worker threads they are done. */ @@ -1971,16 +2132,17 @@ create_uris(WTPERF *wtperf) { CONFIG_OPTS *opts; size_t len; - uint32_t i; + uint32_t i, total_table_count; opts = wtperf->opts; - wtperf->uris = dcalloc(opts->table_count, sizeof(char *)); + total_table_count = opts->table_count + opts->scan_table_count; + wtperf->uris = dcalloc(total_table_count, sizeof(char *)); len = strlen("table:") + strlen(opts->table_name) + 20; - for (i = 0; i < opts->table_count; i++) { + for (i = 0; i < total_table_count; i++) { /* If there is only one table, just use the base name. */ wtperf->uris[i] = dmalloc(len); - if (opts->table_count == 1) + if (total_table_count == 1) testutil_check(__wt_snprintf(wtperf->uris[i], len, "table:%s", opts->table_name)); else @@ -2003,6 +2165,7 @@ create_tables(WTPERF *wtperf) WT_SESSION *session; size_t i; int ret; + uint32_t total_table_count; char buf[512]; opts = wtperf->opts; @@ -2030,7 +2193,8 @@ create_tables(WTPERF *wtperf) return (ret); } - for (i = 0; i < opts->table_count; i++) { + total_table_count = opts->table_count + opts->scan_table_count; + for (i = 0; i < total_table_count; i++) { if (opts->log_partial && i > 0) { if (((ret = session->create(session, wtperf->uris[i], wtperf->partial_config)) != 0)) { @@ -2075,8 +2239,10 @@ wtperf_copy(const WTPERF *src, WTPERF **retp) CONFIG_OPTS *opts; WTPERF *dest; size_t i; + uint32_t total_table_count; opts = src->opts; + total_table_count = opts->table_count + opts->scan_table_count; dest = dcalloc(1, sizeof(WTPERF)); @@ -2091,8 +2257,8 @@ wtperf_copy(const WTPERF *src, WTPERF **retp) dest->reopen_config = dstrdup(src->reopen_config); if (src->uris != NULL) { - dest->uris = dcalloc(opts->table_count, sizeof(char *)); - for (i = 0; i < opts->table_count; i++) + dest->uris = dcalloc(total_table_count, sizeof(char *)); + for (i = 0; i < total_table_count; i++) dest->uris[i] = dstrdup(src->uris[i]); } @@ -2100,6 +2266,7 @@ wtperf_copy(const WTPERF *src, WTPERF **retp) dest->async_config = dstrdup(src->async_config); dest->ckptthreads = NULL; + dest->scanthreads = NULL; dest->popthreads = NULL; dest->workers = NULL; @@ -2137,7 +2304,7 @@ wtperf_free(WTPERF *wtperf) free(wtperf->log_table_uri); if (wtperf->uris != NULL) { - for (i = 0; i < opts->table_count; i++) + for (i = 0; i < opts->table_count + opts->scan_table_count; i++) free(wtperf->uris[i]); free(wtperf->uris); } @@ -2145,6 +2312,7 @@ wtperf_free(WTPERF *wtperf) free(wtperf->async_config); free(wtperf->ckptthreads); + free(wtperf->scanthreads); free(wtperf->popthreads); free(wtperf->workers); @@ -2336,10 +2504,19 @@ start_run(WTPERF *wtperf) "Starting %" PRIu32 " checkpoint thread(s)", opts->checkpoint_threads); wtperf->ckptthreads = dcalloc( - opts->checkpoint_threads, sizeof(WTPERF_THREAD)); + opts->checkpoint_threads, sizeof(WTPERF_THREAD)); start_threads(wtperf, NULL, wtperf->ckptthreads, opts->checkpoint_threads, checkpoint_worker); } + /* Start the scan thread. */ + if (opts->scan_interval != 0) { + lprintf(wtperf, 0, 1, + "Starting 1 scan thread"); + wtperf->scanthreads = dcalloc( + 1, sizeof(WTPERF_THREAD)); + start_threads(wtperf, NULL, wtperf->scanthreads, + 1, scan_worker); + } if (opts->pre_load_data) pre_load_data(wtperf); @@ -2353,6 +2530,7 @@ start_run(WTPERF *wtperf) wtperf->truncate_ops = sum_truncate_ops(wtperf); wtperf->update_ops = sum_update_ops(wtperf); wtperf->ckpt_ops = sum_ckpt_ops(wtperf); + wtperf->scan_ops = sum_scan_ops(wtperf); total_ops = wtperf->read_ops + wtperf->insert_ops + wtperf->update_ops; @@ -2381,6 +2559,9 @@ start_run(WTPERF *wtperf) lprintf(wtperf, 0, 1, "Executed %" PRIu64 " checkpoint operations", wtperf->ckpt_ops); + lprintf(wtperf, 0, 1, + "Executed %" PRIu64 " scan operations", + wtperf->scan_ops); latency_print(wtperf); } @@ -2394,6 +2575,7 @@ err: if (ret == 0) wtperf->stop = true; stop_threads(1, wtperf->ckptthreads); + stop_threads(1, wtperf->scanthreads); if (monitor_created != 0) testutil_check(__wt_thread_join(NULL, &monitor_thread)); @@ -2794,9 +2976,11 @@ start_threads(WTPERF *wtperf, WORKLOAD *workp, * for latency measurements, for the same reason. */ thread->ckpt.min_latency = + thread->scan.min_latency = thread->insert.min_latency = thread->read.min_latency = thread->update.min_latency = UINT32_MAX; - thread->ckpt.max_latency = thread->insert.max_latency = + thread->ckpt.max_latency = thread->scan.max_latency = + thread->insert.max_latency = thread->read.max_latency = thread->update.max_latency = 0; } @@ -2852,10 +3036,12 @@ drop_all_tables(WTPERF *wtperf) CONFIG_OPTS *opts; WT_SESSION *session; size_t i; + uint32_t total_table_count; uint64_t msecs; int ret, t_ret; opts = wtperf->opts; + total_table_count = opts->table_count + opts->scan_table_count; /* Drop any tables. */ if ((ret = wtperf->conn->open_session( @@ -2865,7 +3051,7 @@ drop_all_tables(WTPERF *wtperf) return (ret); } __wt_epoch(NULL, &start); - for (i = 0; i < opts->table_count; i++) { + for (i = 0; i < total_table_count; i++) { if ((ret = session->drop(session, wtperf->uris[i], NULL)) != 0) { lprintf(wtperf, ret, 0, @@ -2877,7 +3063,7 @@ drop_all_tables(WTPERF *wtperf) msecs = WT_TIMEDIFF_MS(stop, start); lprintf(wtperf, 0, 1, "Executed %" PRIu32 " drop operations average time %" PRIu64 "ms", - opts->table_count, msecs / opts->table_count); + total_table_count, msecs / total_table_count); err: if ((t_ret = session->close(session, NULL)) != 0 && ret == 0) ret = t_ret; @@ -2888,18 +3074,20 @@ static uint64_t wtperf_value_range(WTPERF *wtperf) { CONFIG_OPTS *opts; + uint64_t total_icount; opts = wtperf->opts; + total_icount = (uint64_t)opts->scan_icount + (uint64_t)opts->icount; if (opts->random_range) - return (opts->icount + opts->random_range); + return (total_icount + opts->random_range); /* * It is legal to configure a zero size populate phase, hide that * from other code by pretending the range is 1 in that case. */ - if (opts->icount + wtperf->insert_key == 0) + if (total_icount + wtperf->insert_key == 0) return (1); - return (opts->icount + + return (total_icount + wtperf->insert_key - (u_int)(wtperf->workers_cnt + 1)); } @@ -2910,12 +3098,15 @@ wtperf_rand(WTPERF_THREAD *thread) WT_CURSOR *rnd_cursor; WTPERF *wtperf; double S1, S2, U; - uint64_t rval; + uint64_t end_range, range, rval, start_range; int ret; char *key_buf; wtperf = thread->wtperf; opts = wtperf->opts; + end_range = wtperf_value_range(wtperf); + start_range = opts->scan_icount; + range = end_range - start_range; /* * If we have a random cursor set up then use it. @@ -2953,7 +3144,7 @@ wtperf_rand(WTPERF_THREAD *thread) if (opts->pareto != 0) { #define PARETO_SHAPE 1.5 S1 = (-1 / PARETO_SHAPE); - S2 = wtperf_value_range(wtperf) * + S2 = range * (opts->pareto / 100.0) * (PARETO_SHAPE - 1); U = 1 - (double)rval / (double)UINT32_MAX; rval = (uint64_t)((pow(U, S1) - 1) * S2); @@ -2962,13 +3153,13 @@ wtperf_rand(WTPERF_THREAD *thread) * 2% of the time, from my testing. That will lead to the * first item in the table being "hot". */ - if (rval > wtperf_value_range(wtperf)) + if (rval > end_range) rval = 0; } /* * Wrap the key to within the expected range and avoid zero: we never * insert that key. */ - rval = (rval % wtperf_value_range(wtperf)) + 1; - return (rval); + rval = (rval % range) + 1; + return (start_range + rval); } diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h index 7e43a62459a..e5163409b4e 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.h +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h @@ -131,6 +131,7 @@ struct __wtperf { /* Per-database structure */ WTPERF_THREAD *ckptthreads; /* Checkpoint threads */ WTPERF_THREAD *popthreads; /* Populate threads */ + WTPERF_THREAD *scanthreads; /* Scan threads */ #define WORKLOAD_MAX 50 WTPERF_THREAD *workers; /* Worker threads */ @@ -141,6 +142,7 @@ struct __wtperf { /* Per-database structure */ /* State tracking variables. */ uint64_t ckpt_ops; /* checkpoint operations */ + uint64_t scan_ops; /* scan operations */ uint64_t insert_ops; /* insert operations */ uint64_t read_ops; /* read operations */ uint64_t truncate_ops; /* truncate operations */ @@ -150,6 +152,7 @@ struct __wtperf { /* Per-database structure */ uint64_t log_like_table_key; /* used to allocate IDs for log table */ volatile bool ckpt; /* checkpoint in progress */ + volatile bool scan; /* scan in progress */ volatile bool error; /* thread error */ volatile bool stop; /* notify threads to stop */ volatile bool in_warmup; /* running warmup phase */ @@ -245,6 +248,7 @@ struct __wtperf_thread { /* Per-thread structure */ TRACK ckpt; /* Checkpoint operations */ TRACK insert; /* Insert operations */ TRACK read; /* Read operations */ + TRACK scan; /* Scan operations */ TRACK update; /* Update operations */ TRACK truncate; /* Truncate operations */ TRACK truncate_sleep; /* Truncate sleep operations */ @@ -273,6 +277,7 @@ void start_idle_table_cycle(WTPERF *, wt_thread_t *); void stop_idle_table_cycle(WTPERF *, wt_thread_t); void worker_throttle(WTPERF_THREAD *); uint64_t sum_ckpt_ops(WTPERF *); +uint64_t sum_scan_ops(WTPERF *); uint64_t sum_insert_ops(WTPERF *); uint64_t sum_pop_ops(WTPERF *); uint64_t sum_read_ops(WTPERF *); diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i index d312ee8526d..079c419908f 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf_opt.i @@ -172,6 +172,16 @@ DEF_OPT_AS_UINT32(sample_interval, 0, DEF_OPT_AS_UINT32(sample_rate, 50, "how often the latency of operations is measured. One for every operation," "two for every second operation, three for every third operation etc.") +DEF_OPT_AS_UINT32(scan_icount, 0, + "number of records in scan tables to populate") +DEF_OPT_AS_UINT32(scan_interval, 0, + "scan tables every interval seconds during the workload phase," + " 0 to disable") +DEF_OPT_AS_UINT32(scan_pct, 10, + "percentage of entire data set scanned, if scan_interval is enabled") +DEF_OPT_AS_UINT32(scan_table_count, 0, + "number of separate tables to be used for scanning. Zero indicates " + "that tables are shared with other operations") DEF_OPT_AS_CONFIG_STRING(sess_config, "", "session configuration string") DEF_OPT_AS_UINT32(session_count_idle, 0, "number of idle sessions to create. Default 0.") diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 9809bf4e591..786a70c3e62 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "ee1bae262347285f46b5c56cc0490d20b9ee9c98", + "commit": "d86b3a8a331a1ec478c4ea75ef1b15856b429790", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.2" diff --git a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c index 57934dd0422..0ab4ea72318 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt_scan.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt_scan.c @@ -91,6 +91,8 @@ __wt_block_checkpoint_final(WT_SESSION_IMPL *session, */ size = buf->size + WT_INTPACK64_MAXSIZE; WT_RET(__wt_buf_extend(session, buf, size)); + p = (uint8_t *)buf->mem + buf->size; + memset(p, 0, WT_INTPACK64_MAXSIZE); file_size_offset = buf->size; buf->size = size; diff --git a/src/third_party/wiredtiger/src/docs/wtperf.dox b/src/third_party/wiredtiger/src/docs/wtperf.dox index 8aa95c5b635..2525a77d62b 100644 --- a/src/third_party/wiredtiger/src/docs/wtperf.dox +++ b/src/third_party/wiredtiger/src/docs/wtperf.dox @@ -202,6 +202,14 @@ total workload seconds performance logging every interval seconds, 0 to disable @par sample_rate (unsigned int, default=50) how often the latency of operations is measured. One for every operation, two for every second operation, three for every third operation etc. +@par scan_icount (unsigned int, default=0) +number of records in scan tables to populate +@par scan_interval (unsigned int, default=0) +scan tables every interval seconds during the workload phase, 0 to disable +@par scan_pct (unsigned int, default=10) +percentage of entire data set scanned, if scan_interval is enabled +@par scan_table_count (unsigned int, default=0) +number of separate tables to be used for scanning. Zero indicates that tables are shared with other operations @par sess_config (string, default="") session configuration string @par session_count_idle (unsigned int, default=0) diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 397306e6aa6..931216376b9 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -18,7 +18,7 @@ static int __evict_server(WT_SESSION_IMPL *, bool *); static void __evict_tune_workers(WT_SESSION_IMPL *session); static int __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *); static int __evict_walk_tree( - WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *, uint64_t *); + WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *); #define WT_EVICT_HAS_WORKERS(s) \ (S2C(s)->evict_threads.current_threads > 1) @@ -1399,19 +1399,19 @@ __evict_walk_choose_dhandle( u_int dh_bucket_count, rnd_bucket, rnd_dh; conn = S2C(session); - *dhandle_p = NULL; WT_ASSERT(session, __wt_rwlock_islocked(session, &conn->dhandle_lock)); - /* Nothing to do if the dhandle list is empty. */ - if (TAILQ_EMPTY(&conn->dhqh)) - return; +#undef RANDOM_DH_SELECTION_ENABLED + +#ifdef RANDOM_DH_SELECTION_ENABLED + *dhandle_p = NULL; /* - * If we do not have a lot of dhandles, most hash buckets will be empty. + * If we don't have many dhandles, most hash buckets will be empty. * Just pick a random dhandle from the list in that case. */ - if (conn->dhandle_count < 10 * WT_HASH_ARRAY_SIZE) { + if (conn->dhandle_count < WT_HASH_ARRAY_SIZE / 4) { rnd_dh = __wt_random(&session->rnd) % conn->dhandle_count; dhandle = TAILQ_FIRST(&conn->dhqh); for (; rnd_dh > 0; rnd_dh--) @@ -1435,6 +1435,18 @@ __evict_walk_choose_dhandle( dhandle = TAILQ_FIRST(&conn->dhhash[rnd_bucket]); for (; rnd_dh > 0; rnd_dh--) dhandle = TAILQ_NEXT(dhandle, hashq); +#else + /* Just step through dhandles. */ + dhandle = *dhandle_p; + if (dhandle != NULL) + dhandle = TAILQ_NEXT(dhandle, q); + if (dhandle == NULL) + dhandle = TAILQ_FIRST(&conn->dhqh); + + WT_UNUSED(dh_bucket_count); + WT_UNUSED(rnd_bucket); + WT_UNUSED(rnd_dh); +#endif *dhandle_p = dhandle; } @@ -1452,9 +1464,8 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_TRACK_OP_DECL; - uint64_t loop_count; - uint64_t pages_seen_file, pages_seen_interim, pages_seen_total; - u_int max_entries, retries, slot, start_slot, total_candidates; + u_int loop_count, max_entries, retries, slot, start_slot; + u_int total_candidates; bool dhandle_locked, incr; WT_TRACK_OP_INIT(session); @@ -1480,31 +1491,14 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) total_candidates = (u_int)(F_ISSET(cache, WT_CACHE_EVICT_CLEAN) ? __wt_cache_pages_inuse(cache) : cache->pages_dirty_leaf); max_entries = WT_MIN(max_entries, 1 + total_candidates / 2); - pages_seen_interim = pages_seen_total = 0; retry: loop_count = 0; - while (slot < max_entries) { - loop_count++; - + while (slot < max_entries && loop_count++ < conn->dhandle_count) { /* We're done if shutting down or reconfiguring. */ if (F_ISSET(conn, WT_CONN_CLOSING) || F_ISSET(conn, WT_CONN_RECONFIGURING)) break; - /* If we have seen enough pages in this walk, we're done. */ - if (pages_seen_total > WT_EVICT_WALK_INCR * 100) - break; - - /* - * If we are not finding pages at all, we're done. - * Every 100th iteration, check if we made progress. - */ - if (loop_count % 100 == 0) { - if (pages_seen_interim == pages_seen_total) - break; - pages_seen_interim = pages_seen_total; - } - /* * If another thread is waiting on the eviction server to clear * the walk point in a tree, give up. @@ -1620,9 +1614,8 @@ retry: loop_count = 0; */ cache->walk_tree = dhandle; WT_WITH_DHANDLE(session, dhandle, - ret = __evict_walk_tree(session, queue, - max_entries, &slot, &pages_seen_file)); - pages_seen_total += pages_seen_file; + ret = __evict_walk_tree( + session, queue, max_entries, &slot)); WT_ASSERT(session, __wt_session_gen( session, WT_GEN_SPLIT) == 0); @@ -1713,22 +1706,14 @@ __evict_push_candidate(WT_SESSION_IMPL *session, * Calculate how many pages to queue for a given tree. */ static uint32_t -__evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries) +__evict_walk_target(WT_SESSION_IMPL *session) { WT_CACHE *cache; uint64_t btree_inuse, bytes_per_slot, cache_inuse; uint32_t target_pages_clean, target_pages_dirty, target_pages; - uint32_t total_slots; cache = S2C(session)->cache; target_pages_clean = target_pages_dirty = 0; - total_slots = max_entries; - - /* - * The number of times we should fill the queue by the end of - * considering all trees. - */ -#define QUEUE_FILLS_PER_PASS 10 /* * The minimum number of pages we should consider per tree. @@ -1744,7 +1729,7 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries) if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { btree_inuse = __wt_btree_bytes_evictable(session); cache_inuse = __wt_cache_bytes_inuse(cache); - bytes_per_slot = 1 + cache_inuse / total_slots; + bytes_per_slot = 1 + cache_inuse / cache->evict_slots; target_pages_clean = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); } @@ -1752,20 +1737,12 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries) if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) { btree_inuse = __wt_btree_dirty_leaf_inuse(session); cache_inuse = __wt_cache_dirty_leaf_inuse(cache); - bytes_per_slot = 1 + cache_inuse / total_slots; + bytes_per_slot = 1 + cache_inuse / cache->evict_slots; target_pages_dirty = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); } - /* - * Weight the number of target pages by the number of times we want to - * fill the cache per pass through all the trees. Note that we don't - * build this into the calculation above because we don't want to favor - * small trees, so round to a whole number of slots (zero for small - * trees) before multiplying. - */ - target_pages = WT_MAX(target_pages_clean, target_pages_dirty) * - QUEUE_FILLS_PER_PASS; + target_pages = WT_MAX(target_pages_clean, target_pages_dirty); /* * Walk trees with a small fraction of the cache in case there are so @@ -1800,8 +1777,8 @@ __evict_walk_target(WT_SESSION_IMPL *session, u_int max_entries) * Get a few page eviction candidates from a single underlying file. */ static int -__evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, - u_int max_entries, u_int *slotp, uint64_t *pages_seen_p) +__evict_walk_tree(WT_SESSION_IMPL *session, + WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; @@ -1821,7 +1798,6 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, last_parent = NULL; restarts = 0; give_up = urgent_queued = false; - *pages_seen_p = 0; /* * Figure out how many slots to fill from this tree. @@ -1830,12 +1806,10 @@ __evict_walk_tree(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, start = queue->evict_queue + *slotp; remaining_slots = max_entries - *slotp; if (btree->evict_walk_progress >= btree->evict_walk_target) { - btree->evict_walk_target = - __evict_walk_target(session, max_entries); + btree->evict_walk_target = __evict_walk_target(session); btree->evict_walk_progress = 0; } - target_pages = WT_MIN(btree->evict_walk_target / QUEUE_FILLS_PER_PASS, - btree->evict_walk_target - btree->evict_walk_progress); + target_pages = btree->evict_walk_target - btree->evict_walk_progress; if (target_pages > remaining_slots) target_pages = remaining_slots; @@ -2194,8 +2168,6 @@ fast: /* If the page can't be evicted, give up. */ btree->evict_ref = ref; } - *pages_seen_p = pages_seen; - WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked); WT_STAT_CONN_INCRV(session, cache_eviction_pages_seen, pages_seen); WT_STAT_DATA_INCRV(session, cache_eviction_pages_seen, pages_seen); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 51cabeda029..7193e6f2b2c 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -27,6 +27,8 @@ static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup_err( WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); +static int __reconcile(WT_SESSION_IMPL *, + WT_REF *, WT_SALVAGE_COOKIE *, uint32_t, bool *, bool *); /* * __wt_reconcile -- @@ -36,19 +38,15 @@ int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp) { - WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; - WT_RECONCILE *r; - uint64_t oldest_id; + bool no_reconcile_set, page_locked; - btree = S2BT(session); - page = ref->page; - mod = page->modify; if (lookaside_retryp != NULL) *lookaside_retryp = false; + page = ref->page; + __wt_verbose(session, WT_VERB_RECONCILE, "%p reconcile %s (%s%s%s)", (void *)ref, __wt_page_type_string(page->type), @@ -77,10 +75,19 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, LF_ISSET(WT_REC_VISIBLE_ALL) || F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)); - /* We shouldn't get called with a clean page, that's an error. */ + /* It's an error to be called with a clean page. */ WT_ASSERT(session, __wt_page_is_modified(page)); /* + * Reconciliation acquires and releases pages, and in rare cases that + * page release triggers eviction. If the page is dirty, eviction can + * trigger reconciliation, and we re-enter this code. Reconciliation + * isn't re-entrant, so we need to ensure that doesn't happen. + */ + no_reconcile_set = F_ISSET(session, WT_SESSION_NO_RECONCILE); + F_SET(session, WT_SESSION_NO_RECONCILE); + + /* * Reconciliation locks the page for three reasons: * Reconciliation reads the lists of page updates, obsolete updates * cannot be discarded while reconciliation is in progress; @@ -90,6 +97,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * a child page splitting during the reconciliation. */ WT_PAGE_LOCK(session, page); + page_locked = true; /* * Now that the page is locked, if attempting to evict it, check again @@ -97,20 +105,37 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * while we were waiting to acquire the lock (e.g., the page could have * split). */ - if (LF_ISSET(WT_REC_EVICT) && - !__wt_page_can_evict(session, ref, NULL)) { - WT_PAGE_UNLOCK(session, page); - return (__wt_set_return(session, EBUSY)); - } + if (LF_ISSET(WT_REC_EVICT) && !__wt_page_can_evict(session, ref, NULL)) + WT_ERR(__wt_set_return(session, EBUSY)); - /* Initialize the reconciliation structure for each new run. */ - if ((ret = __rec_init( - session, ref, flags, salvage, &session->reconcile)) != 0) { + /* + * Reconcile the page. The reconciliation code unlocks the page as soon + * as possible, and returns that information. + */ + ret = __reconcile(session, ref, + salvage, flags, lookaside_retryp, &page_locked); + +err: + if (page_locked) WT_PAGE_UNLOCK(session, page); - return (ret); - } - r = session->reconcile; + if (!no_reconcile_set) + F_CLR(session, WT_SESSION_NO_RECONCILE); + return (ret); +} +/* + * __reconcile_save_evict_state -- + * Save the transaction state that causes history to be pinned, whether + * reconciliation succeeds or fails. + */ +static void +__reconcile_save_evict_state( + WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) +{ + WT_PAGE_MODIFY *mod; + uint64_t oldest_id; + + mod = ref->page->modify; oldest_id = __wt_txn_oldest_id(session); /* @@ -136,6 +161,32 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id)); mod->last_oldest_id = oldest_id; #endif +} + +/* + * __reconcile -- + * Reconcile an in-memory page into its on-disk format, and write it. + */ +static int +__reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, + uint32_t flags, bool *lookaside_retryp, bool *page_lockedp) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_MODIFY *mod; + WT_RECONCILE *r; + + btree = S2BT(session); + page = ref->page; + mod = page->modify; + + /* Save the eviction state. */ + __reconcile_save_evict_state(session, ref, flags); + + /* Initialize the reconciliation structure for each new run. */ + WT_RET(__rec_init(session, ref, flags, salvage, &session->reconcile)); + r = session->reconcile; /* Reconcile the page. */ switch (page->type) { @@ -190,6 +241,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, S2C(session)->txn_global.stable_timestamp; /* Release the reconciliation lock. */ + *page_lockedp = false; WT_PAGE_UNLOCK(session, page); /* Update statistics. */ @@ -522,7 +574,16 @@ __rec_init(WT_SESSION_IMPL *session, btree = S2BT(session); page = ref->page; - if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { + /* + * Reconciliation is not re-entrant, make sure that doesn't happen. Our + * caller sets WT_SESSION_IMPL.WT_SESSION_NO_RECONCILE to prevent it, + * but it's been a problem in the past, check to be sure. + */ + r = *(WT_RECONCILE **)reconcilep; + if (r != NULL && r->ref != NULL) + WT_RET_MSG(session, WT_ERROR, "reconciliation re-entered"); + + if (r == NULL) { WT_RET(__wt_calloc_one(session, &r)); session->reconcile_cleanup = __rec_destroy_session; @@ -535,9 +596,6 @@ __rec_init(WT_SESSION_IMPL *session, F_SET(&r->chunkB.image, WT_ITEM_ALIGNED); } - /* Reconciliation is not re-entrant, make sure that doesn't happen. */ - WT_ASSERT(session, r->ref == NULL); - /* Remember the configuration. */ r->ref = ref; r->page = page; diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 0469882c08e..53bde4a499b 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -731,11 +731,12 @@ __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) } else upd = op->u.op_upd->next; /* - * Skip over any aborted update structures or ones - * from our own transaction. + * Skip over any aborted update structures, internally + * created update structures or ones from our own + * transaction. */ while (upd != NULL && (upd->txnid == WT_TXN_ABORTED || - upd->txnid == txn->id)) + upd->txnid == WT_TXN_NONE || upd->txnid == txn->id)) upd = upd->next; /* diff --git a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c index fea9ad0bfe3..40b4c543500 100644 --- a/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c +++ b/src/third_party/wiredtiger/test/csuite/wt4333_handle_locks/main.c @@ -139,8 +139,10 @@ op(WT_SESSION *session, WT_RAND_STATE *rnd, WT_CURSOR **cpp) /* Close the cursor half the time, otherwise cache it. */ if (__wt_random(rnd) % 2 == 0) testutil_check(cursor->close(cursor)); - else + else { + testutil_check(cursor->reset(cursor)); *cpp = cursor; + } (void)__wt_atomic_add64(&worker, 1); } diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index e9ae4d1e37b..5883b8dc42e 100644..100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -1021,6 +1021,7 @@ tasks: run_on: - rhel62-large commands: + - func: "fetch source" - func: "fetch mongo-tests repo" - command: shell.exec params: |