diff options
author | Ramon Fernandez <ramon.fernandez@mongodb.com> | 2015-08-06 16:55:10 -0400 |
---|---|---|
committer | Ramon Fernandez <ramon.fernandez@mongodb.com> | 2015-08-06 16:55:10 -0400 |
commit | 4b03c1c71f9657d0b84617ef32ea5b96faec576b (patch) | |
tree | a957278b79c29733ef0fe6aa87c09784aa7af890 | |
parent | da5744a4dba5bdba10e5e5fb63eb27fbf769de0d (diff) | |
download | mongo-4b03c1c71f9657d0b84617ef32ea5b96faec576b.tar.gz |
Import wiredtiger-wiredtiger-mongodb-3.0.4-49-g48648de.tar.gz from wiredtiger branch mongodb-3.0
58 files changed, 815 insertions, 664 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c index 8780d270664..a4f679ae736 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c @@ -117,13 +117,13 @@ randomize_value(CONFIG_THREAD *thread, char *value_buf) * randomly chosen byte (other than the trailing NUL). * Make sure we don't write a NUL: keep the value the same length. */ - i = __wt_random(thread->rnd) % (thread->cfg->value_sz - 1); + i = __wt_random(&thread->rnd) % (thread->cfg->value_sz - 1); while (value_buf[i] == '\0' && i > 0) --i; if (i > 0) { vb = (uint8_t *)value_buf; - vb[0] = (__wt_random(thread->rnd) % 255) + 1; - vb[i] = (__wt_random(thread->rnd) % 255) + 1; + vb[0] = (__wt_random(&thread->rnd) % 255) + 1; + vb[i] = (__wt_random(&thread->rnd) % 255) + 1; } } @@ -2155,13 +2155,11 @@ start_threads(CONFIG *cfg, * new RNG state further along in the sequence. */ if (i == 0) - __wt_random_init(thread->rnd); - else { - thread->rnd[0] = (thread - 1)->rnd[0]; - thread->rnd[1] = (thread - 1)->rnd[1]; - } + __wt_random_init(&thread->rnd); + else + thread->rnd = (thread - 1)->rnd; for (j = 0; j < 1000; ++j) - (void)__wt_random(thread->rnd); + (void)__wt_random(&thread->rnd); /* * Every thread gets a key/data buffer because we don't bother @@ -2283,7 +2281,7 @@ wtperf_rand(CONFIG_THREAD *thread) * Use WiredTiger's random number routine: it's lock-free and fairly * good. */ - rval = (uint64_t)__wt_random(thread->rnd); + rval = (uint64_t)__wt_random(&thread->rnd); /* Use Pareto distribution to give 80/20 hot/cold values. */ if (cfg->pareto) { diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.h b/src/third_party/wiredtiger/bench/wtperf/wtperf.h index 201623c7859..7ae55c5ca19 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.h +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.h @@ -209,7 +209,7 @@ typedef struct { struct __config_thread { /* Per-thread structure */ CONFIG *cfg; /* Enclosing configuration */ - uint32_t rnd[2]; /* Random number generation state */ + WT_RAND_STATE rnd; /* Random number generation state */ pthread_t handle; /* Handle */ diff --git a/src/third_party/wiredtiger/dist/log.py b/src/third_party/wiredtiger/dist/log.py index 57b8fdc0f23..abe72cea5c4 100644 --- a/src/third_party/wiredtiger/dist/log.py +++ b/src/third_party/wiredtiger/dist/log.py @@ -114,7 +114,8 @@ __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp) { \tWT_ITEM *logrec; -\tWT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec)); +\tWT_RET( +\t __wt_scr_alloc(session, WT_ALIGN(size + 1, WT_LOG_ALIGN), &logrec)); \tWT_CLEAR(*(WT_LOG_RECORD *)logrec->data); \tlogrec->size = offsetof(WT_LOG_RECORD, record); diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index a966134e5ce..2bdd1d88a54 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -809,6 +809,7 @@ minorp minprefix mkdir mmap +mmrand mnt msecs msg diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index f133ab899ea..6e2efb66eb6 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -215,7 +215,6 @@ connection_stats = [ ########################################## # Logging statistics ########################################## - LogStat('log_buffer_grow', 'log buffer size increases'), LogStat('log_buffer_size', 'total log buffer size', 'no_clear,no_scale'), LogStat('log_bytes_payload', 'log bytes of payload data'), LogStat('log_bytes_written', 'log bytes written'), diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index 18c3978c90f..40bba8184a1 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -315,7 +315,7 @@ __ckpt_extlist_fblocks( * file that contains a previous checkpoint's extents. */ return (__wt_block_insert_ext( - session, &block->live.ckpt_avail, el->offset, el->size)); + session, block, &block->live.ckpt_avail, el->offset, el->size)); } #ifdef HAVE_DIAGNOSTIC @@ -537,7 +537,7 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) * must be paired in the checkpoint. */ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) - WT_ERR(__wt_block_insert_ext(session, + WT_ERR(__wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size)); /* @@ -554,10 +554,10 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( - session, &a->alloc, &b->alloc)); + session, block, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( - session, &a->discard, &b->discard)); + session, block, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with @@ -775,7 +775,8 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block) block->ckpt_inprogress = 0; __wt_spin_lock(session, &block->live_lock); - ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail); + ret = __wt_block_extlist_merge( + session, block, &ci->ckpt_avail, &ci->avail); __wt_spin_unlock(session, &block->live_lock); /* Discard the lists remaining after the checkpoint call. */ diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c index e89c70060f3..d593537446b 100644 --- a/src/third_party/wiredtiger/src/block/block_ext.c +++ b/src/third_party/wiredtiger/src/block/block_ext.c @@ -8,12 +8,25 @@ #include "wt_internal.h" -static int __block_append(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t); +/* + * WT_BLOCK_RET -- + * Handle extension list errors that would normally panic the system but + * which should fail gracefully when verifying. + */ +#define WT_BLOCK_RET(session, block, v, ...) do { \ + int __ret = (v); \ + __wt_err(session, __ret, __VA_ARGS__); \ + return ((block)->verify ? __ret : __wt_panic(session)); \ +} while (0) + +static int __block_append(WT_SESSION_IMPL *, + WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t); static int __block_ext_overlap(WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **); static int __block_extlist_dump( WT_SESSION_IMPL *, const char *, WT_EXTLIST *, int); -static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t); +static int __block_merge(WT_SESSION_IMPL *, + WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t); /* * __block_off_srch_last -- @@ -308,8 +321,8 @@ __wt_block_misplaced(WT_SESSION_IMPL *session, * Remove a record from an extent list. */ static int -__block_off_remove( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, WT_EXT **extp) +__block_off_remove(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, WT_EXT **extp) { WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH]; WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH]; @@ -370,7 +383,7 @@ __block_off_remove( return (0); corrupt: - WT_PANIC_RET(session, EINVAL, + WT_BLOCK_RET(session, block, EINVAL, "attempt to remove non-existent offset from an extent list"); } @@ -380,8 +393,8 @@ corrupt: * overlapping entry. */ int -__wt_block_off_remove_overlap( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +__wt_block_off_remove_overlap(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, wt_off_t size) { WT_EXT *before, *after, *ext; wt_off_t a_off, a_size, b_off, b_size; @@ -393,7 +406,8 @@ __wt_block_off_remove_overlap( /* If "before" or "after" overlaps, retrieve the overlapping entry. */ if (before != NULL && before->off + before->size > off) { - WT_RET(__block_off_remove(session, el, before->off, &ext)); + WT_RET(__block_off_remove( + session, block, el, before->off, &ext)); /* Calculate overlapping extents. */ a_off = ext->off; @@ -401,7 +415,8 @@ __wt_block_off_remove_overlap( b_off = off + size; b_size = ext->size - (a_size + size); } else if (after != NULL && off + size > after->off) { - WT_RET(__block_off_remove(session, el, after->off, &ext)); + WT_RET(__block_off_remove( + session, block, el, after->off, &ext)); /* * Calculate overlapping extents. There's no initial overlap @@ -525,7 +540,7 @@ __wt_block_alloc( __block_size_srch(block->live.avail.sz, size, sstack); if ((szp = *sstack[0]) == NULL) { append: WT_RET(__block_extend(session, block, offp, size)); - WT_RET(__block_append(session, + WT_RET(__block_append(session, block, &block->live.alloc, *offp, (wt_off_t)size)); return (0); } @@ -535,7 +550,8 @@ append: WT_RET(__block_extend(session, block, offp, size)); } /* Remove the record, and set the returned offset. */ - WT_RET(__block_off_remove(session, &block->live.avail, ext->off, &ext)); + WT_RET(__block_off_remove( + session, block, &block->live.avail, ext->off, &ext)); *offp = ext->off; /* If doing a partial allocation, adjust the record and put it back. */ @@ -561,7 +577,7 @@ append: WT_RET(__block_extend(session, block, offp, size)); /* Add the newly allocated extent to the list of allocations. */ WT_RET(__block_merge( - session, &block->live.alloc, *offp, (wt_off_t)size)); + session, block, &block->live.alloc, *offp, (wt_off_t)size)); return (0); } @@ -618,12 +634,12 @@ __wt_block_off_free( * list. */ if ((ret = __wt_block_off_remove_overlap( - session, &block->live.alloc, offset, size)) == 0) - ret = __block_merge( - session, &block->live.avail, offset, (wt_off_t)size); + session, block, &block->live.alloc, offset, size)) == 0) + ret = __block_merge(session, block, + &block->live.avail, offset, (wt_off_t)size); else if (ret == WT_NOTFOUND) - ret = __block_merge( - session, &block->live.discard, offset, (wt_off_t)size); + ret = __block_merge(session, block, + &block->live.discard, offset, (wt_off_t)size); return (ret); } @@ -770,9 +786,12 @@ __block_ext_overlap(WT_SESSION_IMPL *session, */ *ap = (*ap)->next[0]; *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, ael, a->off, NULL)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge( + session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove( + session, block, ael, a->off, NULL)); + WT_RET(__block_off_remove( + session, block, bel, b->off, NULL)); } else if (a->size > b->size) { /* Case #4 */ /* @@ -780,7 +799,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Increment/Decrement A's offset/size by the size of B * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove( + session, block, ael, a->off, &a)); a->off += b->size; a->size -= b->size; WT_RET(__block_ext_insert(session, ael, a)); @@ -791,15 +811,18 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete B */ *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge( + session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove( + session, block, bel, b->off, NULL)); } else { /* Case #9 */ /* * Remove B from its list * Increment/Decrement B's offset/size by the size of A * Insert B on its list */ - WT_RET(__block_off_remove(session, bel, b->off, &b)); + WT_RET(__block_off_remove( + session, block, bel, b->off, &b)); b->off += a->size; b->size -= a->size; WT_RET(__block_ext_insert(session, bel, b)); @@ -810,8 +833,10 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete A */ *ap = (*ap)->next[0]; - WT_RET(__block_merge(session, avail, a->off, a->size)); - WT_RET(__block_off_remove(session, ael, a->off, NULL)); + WT_RET(__block_merge( + session, block, avail, a->off, a->size)); + WT_RET(__block_off_remove( + session, block, ael, a->off, NULL)); } /* Case #6 */ } else if (a->off + a->size == b->off + b->size) { /* @@ -819,7 +844,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Decrement A's size by the size of B * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove(session, block, ael, a->off, &a)); a->size -= b->size; WT_RET(__block_ext_insert(session, ael, a)); @@ -829,8 +854,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete B */ *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge(session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove(session, block, bel, b->off, NULL)); } else if /* Case #3, #7 */ (a->off + a->size < b->off + b->size) { /* @@ -838,14 +863,14 @@ __block_ext_overlap(WT_SESSION_IMPL *session, */ off = b->off; size = (a->off + a->size) - b->off; - WT_RET(__block_merge(session, avail, off, size)); + WT_RET(__block_merge(session, block, avail, off, size)); /* * Remove A from its list * Decrement A's size by the overlap * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove(session, block, ael, a->off, &a)); a->size -= size; WT_RET(__block_ext_insert(session, ael, a)); @@ -854,7 +879,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Increment/Decrement B's offset/size by the overlap * Insert B on its list */ - WT_RET(__block_off_remove(session, bel, b->off, &b)); + WT_RET(__block_off_remove(session, block, bel, b->off, &b)); b->off += size; b->size -= size; WT_RET(__block_ext_insert(session, bel, b)); @@ -868,12 +893,12 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Decrement A's size by trailing part of A plus B's size * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove(session, block, ael, a->off, &a)); a->size = b->off - a->off; WT_RET(__block_ext_insert(session, ael, a)); /* Add trailing part of A to A's list as a new element. */ - WT_RET(__block_merge(session, ael, off, size)); + WT_RET(__block_merge(session, block, ael, off, size)); /* * Move caller's B to the next element @@ -881,8 +906,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete B */ *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge(session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove(session, block, bel, b->off, NULL)); } return (0); @@ -893,7 +918,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Merge one extent list into another. */ int -__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b) +__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *a, WT_EXTLIST *b) { WT_EXT *ext; WT_EXTLIST tmp; @@ -923,7 +949,7 @@ __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b) } WT_EXT_FOREACH(ext, a->off) - WT_RET(__block_merge(session, b, ext->off, ext->size)); + WT_RET(__block_merge(session, block, b, ext->off, ext->size)); return (0); } @@ -933,12 +959,13 @@ __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b) * Append a new entry to the allocation list. */ static int -__block_append( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +__block_append(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, wt_off_t size) { WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH]; u_int i; + WT_UNUSED(block); WT_ASSERT(session, el->track_size == 0); /* @@ -979,8 +1006,8 @@ __block_append( * Insert an extent into an extent list, merging if possible. */ int -__wt_block_insert_ext( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +__wt_block_insert_ext(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, wt_off_t size) { /* * There are currently two copies of this function (this code is a one- @@ -993,7 +1020,7 @@ __wt_block_insert_ext( * Callers of this function are expected to have already acquired any * locks required to manipulate the extent list. */ - return (__block_merge(session, el, off, size)); + return (__block_merge(session, block, el, off, size)); } /* @@ -1002,8 +1029,8 @@ __wt_block_insert_ext( * version). */ static int -__block_merge( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +__block_merge(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, wt_off_t size) { WT_EXT *ext, *after, *before; @@ -1014,7 +1041,7 @@ __block_merge( __block_off_srch_pair(el, off, &before, &after); if (before != NULL) { if (before->off + before->size > off) - WT_PANIC_RET(session, EINVAL, + WT_BLOCK_RET(session, block, EINVAL, "%s: existing range %" PRIdMAX "-%" PRIdMAX " overlaps with merge range %" PRIdMAX "-%" PRIdMAX, el->name, @@ -1025,8 +1052,8 @@ __block_merge( before = NULL; } if (after != NULL) { - if (off + size > after->off) - WT_PANIC_RET(session, EINVAL, + if (off + size > after->off) { + WT_BLOCK_RET(session, block, EINVAL, "%s: merge range %" PRIdMAX "-%" PRIdMAX " overlaps with existing range %" PRIdMAX "-%" PRIdMAX, @@ -1034,6 +1061,7 @@ __block_merge( (intmax_t)off, (intmax_t)(off + size), (intmax_t)after->off, (intmax_t)(after->off + after->size)); + } if (off + size != after->off) after = NULL; } @@ -1053,7 +1081,8 @@ __block_merge( * the record we're going to use, adjust it and re-insert it. */ if (before == NULL) { - WT_RET(__block_off_remove(session, el, after->off, &ext)); + WT_RET(__block_off_remove( + session, block, el, after->off, &ext)); WT_RET(__wt_verbose(session, WT_VERB_BLOCK, "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" @@ -1067,10 +1096,11 @@ __block_merge( } else { if (after != NULL) { size += after->size; - WT_RET( - __block_off_remove(session, el, after->off, NULL)); + WT_RET(__block_off_remove( + session, block, el, after->off, NULL)); } - WT_RET(__block_off_remove(session, el, before->off, &ext)); + WT_RET(__block_off_remove( + session, block, el, before->off, &ext)); WT_RET(__wt_verbose(session, WT_VERB_BLOCK, "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" @@ -1115,8 +1145,8 @@ __wt_block_extlist_read_avail(WT_SESSION_IMPL *session, * Extent blocks are allocated from the available list: if reading the * avail list, the extent blocks might be included, remove them. */ - WT_ERR_NOTFOUND_OK( - __wt_block_off_remove_overlap(session, el, el->offset, el->size)); + WT_ERR_NOTFOUND_OK(__wt_block_off_remove_overlap( + session, block, el, el->offset, el->size)); err: #ifdef HAVE_DIAGNOSTIC @@ -1137,7 +1167,8 @@ __wt_block_extlist_read(WT_SESSION_IMPL *session, WT_DECL_ITEM(tmp); WT_DECL_RET; wt_off_t off, size; - int (*func)(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t); + int (*func)( + WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t); const uint8_t *p; /* If there isn't a list, we're done. */ @@ -1187,14 +1218,16 @@ __wt_block_extlist_read(WT_SESSION_IMPL *session, if (off < block->allocsize || off % block->allocsize != 0 || size % block->allocsize != 0 || - off + size > ckpt_size) -corrupted: WT_PANIC_RET(session, WT_ERROR, + off + size > ckpt_size) { +corrupted: __wt_scr_free(session, &tmp); + WT_BLOCK_RET(session, block, WT_ERROR, "file contains a corrupted %s extent list, range %" PRIdMAX "-%" PRIdMAX " past end-of-file", el->name, (intmax_t)off, (intmax_t)(off + size)); + } - WT_ERR(func(session, el, off, size)); + WT_ERR(func(session, block, el, off, size)); } if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK)) @@ -1290,7 +1323,7 @@ __wt_block_extlist_write(WT_SESSION_IMPL *session, * blocks never appear on any allocation list. */ WT_TRET(__wt_block_off_remove_overlap( - session, &block->live.alloc, el->offset, el->size)); + session, block, &block->live.alloc, el->offset, el->size)); WT_ERR(__wt_verbose(session, WT_VERB_BLOCK, "%s written %" PRIdMAX "/%" PRIu32, @@ -1331,7 +1364,7 @@ __wt_block_extlist_truncate( */ orig = fh->size; size = ext->off; - WT_RET(__block_off_remove(session, el, size, NULL)); + WT_RET(__block_off_remove(session, block, el, size, NULL)); fh->size = size; /* diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 8e45ec85a97..4728066b487 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -133,8 +133,7 @@ __block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block) bucket = block->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_BLOCK_REMOVE(conn, block, bucket); - if (block->name != NULL) - __wt_free(session, block->name); + __wt_free(session, block->name); if (block->fh != NULL) WT_TRET(__wt_close(session, &block->fh)); @@ -196,14 +195,20 @@ __wt_block_open(WT_SESSION_IMPL *session, } } - /* Basic structure allocation, initialization. */ + /* + * Basic structure allocation, initialization. + * + * Note: set the block's name-hash value before any work that can fail + * because cleanup calls the block destroy code which uses that hash + * value to remove the block from the underlying linked lists. + */ WT_ERR(__wt_calloc_one(session, &block)); block->ref = 1; + block->name_hash = hash; + block->allocsize = allocsize; WT_CONN_BLOCK_INSERT(conn, block, bucket); WT_ERR(__wt_strdup(session, filename, &block->name)); - block->name_hash = hash; - block->allocsize = allocsize; WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval)); block->allocfirst = diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c index ef944fcb152..0d631396b41 100644 --- a/src/third_party/wiredtiger/src/block/block_read.c +++ b/src/third_party/wiredtiger/src/block/block_read.c @@ -192,21 +192,29 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, buf->size = size; blk = WT_BLOCK_HEADER_REF(buf->mem); - page_cksum = blk->cksum; - if (page_cksum == cksum) { + if (blk->cksum == cksum) { blk->cksum = 0; page_cksum = __wt_cksum(buf->mem, F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP); if (page_cksum == cksum) return (0); - } - if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) - __wt_errx(session, - "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %" - PRIu32 " != %" PRIu32 "]", - size, (uintmax_t)offset, cksum, page_cksum); + if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) + __wt_errx(session, + "read checksum error for %" PRIu32 "B block at " + "offset %" PRIuMAX ": calculated block checksum " + "of %" PRIu32 " doesn't match expected checksum " + "of %" PRIu32, + size, (uintmax_t)offset, page_cksum, cksum); + } else + if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) + __wt_errx(session, + "read checksum error for %" PRIu32 "B block at " + "offset %" PRIuMAX ": block header checksum " + "of %" PRIu32 " doesn't match expected checksum " + "of %" PRIu32, + size, (uintmax_t)offset, blk->cksum, cksum); /* Panic if a checksum fails during an ordinary read. */ return (block->verify || diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c index 517fb92491e..c78a6c39942 100644 --- a/src/third_party/wiredtiger/src/block/block_slvg.c +++ b/src/third_party/wiredtiger/src/block/block_slvg.c @@ -53,7 +53,7 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block) * any blocks we don't want as we process the file. */ WT_RET(__wt_block_insert_ext( - session, &block->live.alloc, allocsize, len - allocsize)); + session, block, &block->live.alloc, allocsize, len - allocsize)); return (0); } diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c index 29a9e4950b4..c9df768a624 100644 --- a/src/third_party/wiredtiger/src/block/block_vrfy.c +++ b/src/third_party/wiredtiger/src/block/block_vrfy.c @@ -87,6 +87,12 @@ __wt_block_verify_start(WT_SESSION_IMPL *session, WT_RET(__bit_alloc(session, block->frags, &block->fragfile)); /* + * Set this before reading any extent lists: don't panic if we see + * corruption. + */ + block->verify = 1; + + /* * We maintain an allocation list that is rolled forward through the * set of checkpoints. */ @@ -102,8 +108,6 @@ __wt_block_verify_start(WT_SESSION_IMPL *session, /* Configuration: strict behavior on any error. */ WT_RET(__wt_config_gets(session, cfg, "strict", &cval)); block->verify_strict = cval.val ? 1 : 0; - - block->verify = 1; return (0); } @@ -228,7 +232,7 @@ __wt_verify_ckpt_load( WT_RET(__wt_block_extlist_read( session, block, el, ci->file_size)); WT_RET(__wt_block_extlist_merge( - session, el, &block->verify_alloc)); + session, block, el, &block->verify_alloc)); __wt_block_extlist_free(session, el); } el = &ci->discard; @@ -236,7 +240,7 @@ __wt_verify_ckpt_load( WT_RET(__wt_block_extlist_read( session, block, el, ci->file_size)); WT_EXT_FOREACH(ext, el->off) - WT_RET(__wt_block_off_remove_overlap(session, + WT_RET(__wt_block_off_remove_overlap(session, block, &block->verify_alloc, ext->off, ext->size)); __wt_block_extlist_free(session, el); } @@ -265,7 +269,7 @@ __wt_verify_ckpt_load( * checkpoints. */ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_block_off_remove_overlap(session, + WT_RET(__wt_block_off_remove_overlap(session, block, &block->verify_alloc, ci->root_offset, ci->root_size)); /* diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index dba2da223bd..041398d4e43 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -600,7 +600,7 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) switch (page->type) { case WT_PAGE_COL_INT: __dmsg(ds, " recno %" PRIu64, page->pg_intl_recno); - pindex = WT_INTL_INDEX_COPY(page); + pindex = WT_INTL_INDEX_GET_SAFE(page); entries = pindex->entries; break; case WT_PAGE_COL_FIX: @@ -612,7 +612,7 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) entries = page->pg_var_entries; break; case WT_PAGE_ROW_INT: - pindex = WT_INTL_INDEX_COPY(page); + pindex = WT_INTL_INDEX_GET_SAFE(page); entries = pindex->entries; break; case WT_PAGE_ROW_LEAF: @@ -634,8 +634,8 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) __dmsg(ds, ", evict-lru"); if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING)) __dmsg(ds, ", scanning"); - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING)) - __dmsg(ds, ", splitting"); + if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)) + __dmsg(ds, ", split locked"); if (mod != NULL) switch (F_ISSET(mod, WT_PM_REC_MASK)) { diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 2a0a5e37f98..a05c6217338 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -56,7 +56,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING)); + WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)); #ifdef HAVE_DIAGNOSTIC { @@ -210,7 +210,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) static void __free_page_int(WT_SESSION_IMPL *session, WT_PAGE *page) { - __wt_free_ref_index(session, page, WT_INTL_INDEX_COPY(page), 0); + __wt_free_ref_index(session, page, WT_INTL_INDEX_GET_SAFE(page), 0); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 9d9ab66e0f7..e249f997d87 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -422,7 +422,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) __wt_page_alloc(session, WT_PAGE_COL_INT, 1, 1, 1, &root)); root->pg_intl_parent_ref = &btree->root; - pindex = WT_INTL_INDEX_COPY(root); + pindex = WT_INTL_INDEX_GET_SAFE(root); ref = pindex->index[0]; ref->home = root; ref->page = NULL; @@ -435,7 +435,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation) __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, 1, 1, &root)); root->pg_intl_parent_ref = &btree->root; - pindex = WT_INTL_INDEX_COPY(root); + pindex = WT_INTL_INDEX_GET_SAFE(root); ref = pindex->index[0]; ref->home = root; ref->page = NULL; diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 120220223f8..b8b67720fce 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -272,7 +272,7 @@ __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, size += sizeof(WT_REF); } if (0) { -err: if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) { +err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { for (i = 0; i < pindex->entries; ++i) __wt_free(session, pindex->index[i]); __wt_free(session, pindex); @@ -459,7 +459,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) * Walk the page, building references: the page contains value items. * The value items are on-page items (WT_CELL_VALUE). */ - pindex = WT_INTL_INDEX_COPY(page); + pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp++; @@ -594,7 +594,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) * location cookie pairs. Keys are on-page/overflow items and location * cookies are WT_CELL_ADDR_XXX items. */ - pindex = WT_INTL_INDEX_COPY(page); + pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp; diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index ba1802116d0..e493a84679a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -1175,7 +1175,7 @@ __slvg_col_build_internal( __wt_page_alloc(session, WT_PAGE_COL_INT, 1, leaf_cnt, 1, &page)); WT_ERR(__slvg_modify_init(session, page)); - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) { if ((trk = ss->pages[i]) == NULL) continue; @@ -1820,7 +1820,7 @@ __slvg_row_build_internal( __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, leaf_cnt, 1, &page)); WT_ERR(__slvg_modify_init(session, page)); - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) { if ((trk = ss->pages[i]) == NULL) continue; diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index eb2382cd610..acef71f1d94 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -178,7 +178,7 @@ __split_should_deepen( btree = S2BT(session); page = ref->page; - pindex = WT_INTL_INDEX_COPY(page); + pindex = WT_INTL_INDEX_GET_SAFE(page); /* * Deepen the tree if the page's memory footprint is larger than the @@ -393,7 +393,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) parent_incr = parent_decr = 0; panic = 0; - pindex = WT_INTL_INDEX_COPY(parent); + pindex = WT_INTL_INDEX_GET_SAFE(parent); WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); @@ -491,7 +491,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) * to change. */ child_incr = 0; - child_pindex = WT_INTL_INDEX_COPY(child); + child_pindex = WT_INTL_INDEX_GET_SAFE(child); for (child_refp = child_pindex->index, j = 0; j < slots; ++j) { WT_ERR(__split_ref_deepen_move(session, parent, *parent_refp, &parent_decr, &child_incr)); @@ -518,7 +518,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) * footprint. From now on we've modified the parent page, attention * needs to be paid. */ - WT_ASSERT(session, WT_INTL_INDEX_COPY(parent) == pindex); + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1); panic = 1; @@ -567,7 +567,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) */ if (child_ref->home == parent) { child_ref->home = child; - child_ref->ref_hint = 0; + child_ref->pindex_hint = 0; } } WT_INTL_FOREACH_END; } @@ -825,11 +825,11 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, */ for (;;) { parent = ref->home; - F_CAS_ATOMIC(parent, WT_PAGE_SPLITTING, ret); + F_CAS_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED, ret); if (ret == 0) { if (parent == ref->home) break; - F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING); + F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); continue; } __wt_yield(); @@ -847,7 +847,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, hazard = 1; } - pindex = WT_INTL_INDEX_COPY(parent); + pindex = WT_INTL_INDEX_GET_SAFE(parent); parent_entries = pindex->entries; /* @@ -906,7 +906,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * Update the parent page's index: this update makes the split visible * to threads descending the tree. */ - WT_ASSERT(session, WT_INTL_INDEX_COPY(parent) == pindex); + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1); alloc_index = NULL; @@ -1037,7 +1037,7 @@ err: if (!complete) if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING); + F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); if (hazard) WT_TRET(__wt_hazard_clear(session, parent)); diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index ca3b8f327b3..0650f26e5e9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -70,7 +70,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) if (__wt_page_is_modified(page) && __wt_txn_visible_all( session, page->modify->update_txn)) { - if (txn->isolation == TXN_ISO_READ_COMMITTED) + if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; @@ -185,7 +185,7 @@ err: /* On error, clear any left-over tree walk. */ if (walk != NULL) WT_TRET(__wt_page_release(session, walk, flags)); - if (txn->isolation == TXN_ISO_READ_COMMITTED && session->ncursors == 0) + if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0) __wt_txn_release_snapshot(session); if (btree->checkpointing) { diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index d146850b505..2705f371fb5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -9,6 +9,66 @@ #include "wt_internal.h" /* + * __page_refp -- + * Return the page's index and slot for a reference. + */ +static inline void +__page_refp(WT_SESSION_IMPL *session, + WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp) +{ + WT_PAGE_INDEX *pindex; + uint32_t i; + + /* + * Copy the parent page's index value: the page can split at any time, + * but the index's value is always valid, even if it's not up-to-date. + */ +retry: WT_INTL_INDEX_GET(session, ref->home, pindex); + + /* + * Use the page's reference hint: it should be correct unless the page + * split before our slot. If the page splits after our slot, the hint + * will point earlier in the array than our actual slot, so the first + * loop is from the hint to the end of the list, and the second loop + * is from the start of the list to the end of the list. (The second + * loop overlaps the first, but that only happen in cases where we've + * deepened the tree and aren't going to find our slot at all, that's + * not worth optimizing.) + * + * It's not an error for the reference hint to be wrong, it just means + * the first retrieval (which sets the hint for subsequent retrievals), + * is slower. + */ + i = ref->pindex_hint; + if (i < pindex->entries && pindex->index[i]->page == ref->page) { + *pindexp = pindex; + *slotp = i; + return; + } + while (++i < pindex->entries) + if (pindex->index[i]->page == ref->page) { + *pindexp = pindex; + *slotp = ref->pindex_hint = i; + return; + } + for (i = 0; i < pindex->entries; ++i) + if (pindex->index[i]->page == ref->page) { + *pindexp = pindex; + *slotp = ref->pindex_hint = i; + return; + } + + /* + * If we don't find our reference, the page split into a new level and + * our home pointer references the wrong page. After internal pages + * deepen, their reference structure home value are updated; yield and + * wait for that to happen. + */ + __wt_yield(); + goto retry; +} + +/* * __wt_tree_walk -- * Move to the next/previous page in the tree. */ @@ -21,10 +81,11 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; - int prev, skip; + int empty_internal, prev, skip; uint32_t slot; btree = S2BT(session); + empty_internal = 0; /* * Tree walks are special: they look inside page structures that splits @@ -99,7 +160,7 @@ ascend: /* } /* Figure out the current slot in the WT_REF array. */ - __wt_page_refp(session, ref, &pindex, &slot); + __page_refp(session, ref, &pindex, &slot); for (;;) { /* @@ -111,6 +172,15 @@ ascend: /* (!prev && slot == pindex->entries - 1)) { ref = ref->home->pg_intl_parent_ref; + /* + * If we got all the way through an internal page and + * all of the child pages were deleted, evict it. + */ + if (empty_internal) { + __wt_page_evict_soon(ref->page); + empty_internal = 0; + } + /* Optionally skip internal pages. */ if (LF_ISSET(WT_READ_SKIP_INTL)) goto ascend; @@ -134,19 +204,13 @@ ascend: /* * parent of the current child page, our parent * reference can't have split or been evicted. */ - __wt_page_refp(session, ref, &pindex, &slot); + __page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } - - /* - * Set the reference hint (used when we continue - * the walk). - */ - ref->ref_hint = slot; } *refp = ref; @@ -162,13 +226,22 @@ ascend: /* ++*walkcntp; for (;;) { + /* + * Move to the next slot, and set the reference hint if + * it's wrong (used when we continue the walk). We don't + * update those hints when splitting, so it's common for + * them to be incorrect in some workloads. + */ ref = pindex->index[slot]; + if (ref->pindex_hint != slot) + ref->pindex_hint = slot; /* - * Set the reference hint (used when we continue the - * walk). + * If we see any child states other than deleted, the + * page isn't empty. */ - ref->ref_hint = slot; + if (ref->state != WT_REF_DELETED) + empty_internal = 0; if (LF_ISSET(WT_READ_CACHE)) { /* @@ -270,7 +343,7 @@ ascend: /* couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; - __wt_page_refp(session, ref, &pindex, &slot); + __page_refp(session, ref, &pindex, &slot); if (couple == couple_orig) break; } @@ -282,10 +355,10 @@ ascend: /* */ descend: couple = ref; page = ref->page; - if (page->type == WT_PAGE_ROW_INT || - page->type == WT_PAGE_COL_INT) { - pindex = WT_INTL_INDEX_COPY(page); + if (WT_PAGE_IS_INTERNAL(page)) { + WT_INTL_INDEX_GET(session, page, pindex); slot = prev ? pindex->entries - 1 : 0; + empty_internal = 1; } else { *refp = ref; goto done; diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index dda56c19636..01db31057fc 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -160,7 +160,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ - if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0) + if (cbt->ins_stack[0] == NULL || recno == 0) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index db1b565b439..a34a223168d 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -50,7 +50,7 @@ restart: page = current->page; WT_ASSERT(session, current->key.recno == page->pg_intl_recno); WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); base = pindex->entries; descent = pindex->index[base - 1]; diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index d56b44bbd95..2dd42de5900 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -47,13 +47,13 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) */ int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove) + WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove) { WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; WT_PAGE *page; - WT_UPDATE *old_upd, **upd_entry; + WT_UPDATE *old_upd, *upd, **upd_entry; size_t ins_size, upd_size; uint32_t ins_slot; u_int i, skipdepth; @@ -61,6 +61,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, ins = NULL; page = cbt->ref->page; + upd = upd_arg; logged = 0; /* This code expects a remove to have a NULL value. */ @@ -90,7 +91,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, } else upd_entry = &cbt->ins->upd; - if (upd == NULL) { + if (upd_arg == NULL) { /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check( session, old_upd = *upd_entry)); @@ -165,7 +166,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, cbt->ins_head = ins_head; cbt->ins = ins; - if (upd == NULL) { + if (upd_arg == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); @@ -191,7 +192,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ - if (WT_SKIP_FIRST(ins_head) == NULL) + if (cbt->ins_stack[0] == NULL) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; @@ -218,7 +219,8 @@ err: /* __wt_txn_unmodify(session); __wt_free(session, ins); cbt->ins = NULL; - __wt_free(session, upd); + if (upd_arg == NULL) + __wt_free(session, upd); } return (ret); diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 9967c5ecb0c..6a8ca5f401c 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -196,7 +196,7 @@ restart: page = current->page; break; WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); /* * Fast-path internal pages with one child, a common case for @@ -489,9 +489,9 @@ restart: break; WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(page); + WT_INTL_INDEX_GET(session, page, pindex); descent = pindex->index[ - __wt_random(session->rnd) % pindex->entries]; + __wt_random(&session->rnd) % pindex->entries]; /* * Swap the parent page for the child page; return on error, @@ -524,9 +524,9 @@ restart: cbt->ref = current; cbt->compare = 0; WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(btree->root.page); + WT_INTL_INDEX_GET(session, btree->root.page, pindex); cbt->slot = pindex->entries < 2 ? - __wt_random(session->rnd) % page->pg_row_entries : 0; + __wt_random(&session->rnd) % page->pg_row_entries : 0; return (__wt_row_leaf_key(session, page, page->pg_row_d + cbt->slot, &cbt->search_key, 0)); diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index b41cad25914..dacbb0539a9 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1630,7 +1630,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, session = conn->default_session = &conn->dummy_session; session->iface.connection = &conn->iface; session->name = "wiredtiger_open"; - __wt_random_init(session->rnd); + __wt_random_init(&session->rnd); __wt_event_handler_set(session, event_handler); /* Remaining basic initialization of the connection structure. */ diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 85d9bb08d26..1e5e322016c 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -369,7 +369,7 @@ __log_wrlsn_server(void *arg) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; - WT_LOG_WRLSN_ENTRY written[SLOT_POOL]; + WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL]; WT_LOGSLOT *slot; WT_SESSION_IMPL *session; size_t written_i; @@ -392,7 +392,7 @@ __log_wrlsn_server(void *arg) * Walk the array once saving any slots that are in the * WT_LOG_SLOT_WRITTEN state. */ - while (i < SLOT_POOL) { + while (i < WT_SLOT_POOL) { save_i = i; slot = &log->slot_pool[i++]; if (slot->slot_state != WT_LOG_SLOT_WRITTEN) @@ -433,7 +433,7 @@ __log_wrlsn_server(void *arg) /* * Signal the close thread if needed. */ - if (F_ISSET(slot, SLOT_CLOSEFH)) + if (F_ISSET(slot, WT_SLOT_CLOSEFH)) WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); WT_ERR(__wt_log_slot_free(session, slot)); @@ -541,9 +541,9 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) &log->log_archive_lock, "log archive lock")); if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG)) log->allocsize = - WT_MAX((uint32_t)conn->buffer_alignment, LOG_ALIGN); + WT_MAX((uint32_t)conn->buffer_alignment, WT_LOG_ALIGN); else - log->allocsize = LOG_ALIGN; + log->allocsize = WT_LOG_ALIGN; WT_INIT_LSN(&log->alloc_lsn); WT_INIT_LSN(&log->ckpt_lsn); WT_INIT_LSN(&log->first_lsn); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 63a905539ce..d99d90ec323 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -157,7 +157,6 @@ __evict_server(void *arg) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_EVICT_WORKER *worker; WT_SESSION_IMPL *session; session = arg; @@ -172,30 +171,6 @@ __evict_server(void *arg) break; /* - * If we have caught up and there are more than the minimum - * number of eviction workers running, shut one down. - */ - if (conn->evict_workers > conn->evict_workers_min) { - WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER, - "Stopping evict worker: %"PRIu32"\n", - conn->evict_workers)); - worker = &conn->evict_workctx[--conn->evict_workers]; - F_CLR(worker, WT_EVICT_WORKER_RUN); - WT_TRET(__wt_cond_signal( - session, cache->evict_waiter_cond)); - WT_TRET(__wt_thread_join(session, worker->tid)); - /* - * Flag errors here with a message, but don't shut down - * the eviction server - that's fatal. - */ - WT_ASSERT(session, ret == 0); - if (ret != 0) { - (void)__wt_msg(session, - "Error stopping eviction worker: %d", ret); - ret = 0; - } - } - /* * Clear the walks so we don't pin pages while asleep, * otherwise we can block applications evicting large pages. */ @@ -692,7 +667,7 @@ __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref) __wt_txn_update_oldest(session, 1); txn = &session->txn; saved_iso = txn->isolation; - txn->isolation = TXN_ISO_EVICTION; + txn->isolation = WT_ISO_EVICTION; /* * Sanity check: if a transaction has updates, its updates should not diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 23b17ef2cd3..1c04af1aef3 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -422,8 +422,17 @@ struct __wt_page { /* * Macros to copy/set the index because the name is obscured to ensure * the field isn't read multiple times. + * + * There are two versions of WT_INTL_INDEX_GET because the session split + * generation is usually set, but it's not always required: for example, + * if a page is locked for splitting, or being created or destroyed. */ -#define WT_INTL_INDEX_COPY(page) ((page)->u.intl.__index) +#define WT_INTL_INDEX_GET_SAFE(page) \ + ((page)->u.intl.__index) +#define WT_INTL_INDEX_GET(session, page, pindex) do { \ + WT_ASSERT(session, session->split_gen != 0); \ + (pindex) = WT_INTL_INDEX_GET_SAFE(page); \ +} while (0) #define WT_INTL_INDEX_SET(page, v) do { \ WT_WRITE_BARRIER(); \ ((page)->u.intl.__index) = (v); \ @@ -439,7 +448,7 @@ struct __wt_page { WT_PAGE_INDEX *__pindex; \ WT_REF **__refp; \ uint32_t __entries; \ - for (__pindex = WT_INTL_INDEX_COPY(page), \ + for (__pindex = WT_INTL_INDEX_GET_SAFE(page), \ __refp = __pindex->index, \ __entries = __pindex->entries; __entries > 0; --__entries) {\ (ref) = *__refp++; @@ -541,7 +550,7 @@ struct __wt_page { #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ #define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ -#define WT_PAGE_SPLITTING 0x40 /* An internal page is growing */ +#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* @@ -672,7 +681,7 @@ struct __wt_ref { * up our slot in the page's index structure. */ WT_PAGE * volatile home; /* Reference page */ - uint32_t ref_hint; /* Reference page index hint */ + uint32_t pindex_hint; /* Reference page index hint */ volatile WT_PAGE_STATE state; /* Page state */ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index aac430988be..23cb54a4179 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -269,62 +269,6 @@ __wt_page_evict_soon(WT_PAGE *page) } /* - * __wt_page_refp -- - * Return the page's index and slot for a reference. - */ -static inline void -__wt_page_refp(WT_SESSION_IMPL *session, - WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp) -{ - WT_PAGE_INDEX *pindex; - uint32_t i; - - WT_ASSERT(session, session->split_gen != 0); - - /* - * Copy the parent page's index value: the page can split at any time, - * but the index's value is always valid, even if it's not up-to-date. - */ -retry: pindex = WT_INTL_INDEX_COPY(ref->home); - - /* - * Use the page's reference hint: it should be correct unless the page - * split before our slot. If the page splits after our slot, the hint - * will point earlier in the array than our actual slot, so the first - * loop is from the hint to the end of the list, and the second loop - * is from the start of the list to the end of the list. (The second - * loop overlaps the first, but that only happen in cases where we've - * deepened the tree and aren't going to find our slot at all, that's - * not worth optimizing.) - * - * It's not an error for the reference hint to be wrong, it just means - * the first retrieval (which sets the hint for subsequent retrievals), - * is slower. - */ - for (i = ref->ref_hint; i < pindex->entries; ++i) - if (pindex->index[i]->page == ref->page) { - *pindexp = pindex; - *slotp = ref->ref_hint = i; - return; - } - for (i = 0; i < pindex->entries; ++i) - if (pindex->index[i]->page == ref->page) { - *pindexp = pindex; - *slotp = ref->ref_hint = i; - return; - } - - /* - * If we don't find our reference, the page split into a new level and - * our home pointer references the wrong page. After internal pages - * deepen, their reference structure home value are updated; yield and - * wait for that to happen. - */ - __wt_yield(); - goto retry; -} - -/* * __wt_page_modify_init -- * A page is about to be modified, allocate the modification structure. */ @@ -1219,19 +1163,19 @@ __wt_skip_choose_depth(WT_SESSION_IMPL *session) u_int d; for (d = 1; d < WT_SKIP_MAXDEPTH && - __wt_random(session->rnd) < WT_SKIP_PROBABILITY; d++) + __wt_random(&session->rnd) < WT_SKIP_PROBABILITY; d++) ; return (d); } /* - * __wt_btree_size_overflow -- + * __wt_btree_lsm_size -- * Check if the size of an in-memory tree with a single leaf page is over * a specified maximum. If called on anything other than a simple tree with a * single leaf page, returns true so the calling code will switch to a new tree. */ static inline int -__wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize) +__wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) { WT_BTREE *btree; WT_PAGE *child, *root; @@ -1250,7 +1194,7 @@ __wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize) return (1); /* Check for a tree with a single leaf page. */ - pindex = WT_INTL_INDEX_COPY(root); + pindex = WT_INTL_INDEX_GET_SAFE(root); if (pindex->entries != 1) /* > 1 child page, switch */ return (1); diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 606fee53749..47b772377c0 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -187,6 +187,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter) if (reenter) WT_RET(__curfile_leave(cbt)); + /* + * Any old insert position is now invalid. We rely on this being + * cleared to detect if a new skiplist is installed after a search. + */ + cbt->ins_stack[0] = NULL; + /* If the transaction is idle, check that the cache isn't full. */ WT_RET(__wt_txn_idle_cache_check(session)); diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 63b6bb2cbc5..a11f3dcd73c 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -26,14 +26,14 @@ extern int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp); extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp); extern int __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live); -extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size); +extern int __wt_block_off_remove_overlap(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size); extern int __wt_block_alloc( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size); extern int __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size); extern int __wt_block_off_free( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size); extern int __wt_block_extlist_check( WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl); extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci); -extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b); -extern int __wt_block_insert_ext( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size); +extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *a, WT_EXTLIST *b); +extern int __wt_block_insert_ext(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size); extern int __wt_block_extlist_read_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size); extern int __wt_block_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size); extern int __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional); @@ -171,7 +171,7 @@ extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, c extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref); extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref); extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page); -extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove); +extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove); extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep); extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep); extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); @@ -363,7 +363,6 @@ extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize); extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks); @@ -644,8 +643,8 @@ extern uint32_t __wt_nlpo2(uint32_t v); extern uint32_t __wt_log2_int(uint32_t n); extern int __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); -extern void __wt_random_init(uint32_t *rnd); -extern uint32_t __wt_random(uint32_t *rnd); +extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state); +extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h index 2efbb20b39a..7135bd479c7 100644 --- a/src/third_party/wiredtiger/src/include/gcc.h +++ b/src/third_party/wiredtiger/src/include/gcc.h @@ -87,18 +87,25 @@ * To avoid locking shared data structures such as statistics and to permit * atomic state changes, we rely on the WT_ATOMIC_ADD and WT_ATOMIC_CAS * (compare and swap) operations. - * - * Note that we avoid __sync_bool_compare_and_swap due to problems with - * optimization with some versions of clang. See - * http://llvm.org/bugs/show_bug.cgi?id=21499 for details. */ #define __WT_ATOMIC_ADD(v, val, n) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_add_and_fetch(&(v), val)) #define __WT_ATOMIC_FETCH_ADD(v, val, n) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_fetch_and_add(&(v), val)) +#ifdef __clang__ +/* + * We avoid __sync_bool_compare_and_swap with due to problems with + * optimization with some versions of clang. See + * http://llvm.org/bugs/show_bug.cgi?id=21499 for details. + */ #define __WT_ATOMIC_CAS(v, old, new, n) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), \ __sync_val_compare_and_swap(&(v), old, new) == (old)) +#else +#define __WT_ATOMIC_CAS(v, old, new, n) \ + (WT_STATIC_ASSERT(sizeof(v) == (n)), \ + __sync_bool_compare_and_swap(&(v), old, new)) +#endif #define __WT_ATOMIC_CAS_VAL(v, old, new, n) \ (WT_STATIC_ASSERT(sizeof(v) == (n)), \ __sync_val_compare_and_swap(&(v), old, new)) diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index 66f346e2fc3..7a8a13327fa 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -11,8 +11,8 @@ #define WT_LOG_TMPNAME "WiredTigerTmplog" /* Log temporary name */ /* Logging subsystem declarations. */ -#define LOG_ALIGN 128 -#define WT_LOG_SLOT_BUF_INIT_SIZE 64 * 1024 +#define WT_LOG_ALIGN 128 +#define WT_LOG_SLOT_BUF_SIZE 256 * 1024 #define WT_INIT_LSN(l) do { \ (l)->file = 1; \ @@ -81,7 +81,7 @@ typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { volatile int64_t slot_state; /* Slot state */ uint64_t slot_group_size; /* Group size */ int32_t slot_error; /* Error value */ -#define SLOT_INVALID_INDEX 0xffffffff +#define WT_SLOT_INVALID_INDEX 0xffffffff uint32_t slot_index; /* Active slot index */ wt_off_t slot_start_offset; /* Starting file offset */ WT_LSN slot_release_lsn; /* Slot release LSN */ @@ -91,15 +91,14 @@ typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { WT_ITEM slot_buf; /* Buffer for grouped writes */ int32_t slot_churn; /* Active slots are scarce. */ -#define SLOT_BUF_GROW 0x01 /* Grow buffer on release */ -#define SLOT_BUFFERED 0x02 /* Buffer writes */ -#define SLOT_CLOSEFH 0x04 /* Close old fh on release */ -#define SLOT_SYNC 0x08 /* Needs sync on release */ -#define SLOT_SYNC_DIR 0x10 /* Directory sync on release */ +#define WT_SLOT_BUFFERED 0x01 /* Buffer writes */ +#define WT_SLOT_CLOSEFH 0x02 /* Close old fh on release */ +#define WT_SLOT_SYNC 0x04 /* Needs sync on release */ +#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */ uint32_t flags; /* Flags */ } WT_LOGSLOT; -#define SLOT_INIT_FLAGS (SLOT_BUFFERED) +#define WT_SLOT_INIT_FLAGS (WT_SLOT_BUFFERED) typedef struct { WT_LOGSLOT *slot; @@ -150,16 +149,17 @@ typedef struct { /* * Consolidation array information - * SLOT_ACTIVE must be less than SLOT_POOL. + * WT_SLOT_ACTIVE must be less than WT_SLOT_POOL. * Our testing shows that the more consolidation we generate the * better the performance we see which equates to an active slot * slot count of one. */ -#define SLOT_ACTIVE 1 -#define SLOT_POOL 128 +#define WT_SLOT_ACTIVE 1 +#define WT_SLOT_POOL 128 uint32_t pool_index; /* Global pool index */ - WT_LOGSLOT *slot_array[SLOT_ACTIVE]; /* Active slots */ - WT_LOGSLOT slot_pool[SLOT_POOL]; /* Pool of all slots */ + WT_LOGSLOT *slot_array[WT_SLOT_ACTIVE]; /* Active slots */ + WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ + size_t slot_buf_size; /* Buffer size for slots */ #define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */ uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index 12cf2dec375..ba12f00f672 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -215,3 +215,11 @@ #define __wt_page_swap(session, held, want, flags) \ __wt_page_swap_func(session, held, want, flags) #endif + +/* Random number generator state. */ +union __wt_rand_state { + uint64_t v; + struct { + uint32_t w, z; + } x; +}; diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h index 07aa740c525..8f3cd168193 100644 --- a/src/third_party/wiredtiger/src/include/mutex.h +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -32,7 +32,9 @@ typedef union { /* Read/write lock */ WiredTiger read/write locks require modification for big-endian systems. #else uint64_t u; - uint32_t us; + struct { + uint32_t us; + } i; struct { uint16_t writers; uint16_t readers; diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h index ba5d95657d5..edb59b0f521 100644 --- a/src/third_party/wiredtiger/src/include/os.h +++ b/src/third_party/wiredtiger/src/include/os.h @@ -56,7 +56,7 @@ typedef enum { case EMFILE: \ case ENFILE: \ case ENOSPC: \ - __wt_sleep(0L, 500000L); \ + __wt_sleep(0L, 50000L); \ continue; \ default: \ break; \ diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index 9e6b0f7916c..0fc23348800 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -30,11 +30,11 @@ __page_write_gen_wrapped_check(WT_PAGE *page) } /* - * __insert_serial_func -- - * Worker function to add a WT_INSERT entry to a skiplist. + * __insert_simple_func -- + * Worker function to add a WT_INSERT entry to the middle of a skiplist. */ static inline int -__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, +__insert_simple_func(WT_SESSION_IMPL *session, WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth) { u_int i; @@ -42,31 +42,62 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_UNUSED(session); /* - * Confirm we are still in the expected position, and no item has been - * added where our insert belongs. Take extra care at the beginning - * and end of the list (at each level): retry if we race there. + * Update the skiplist elements referencing the new WT_INSERT item. + * If we fail connecting one of the upper levels in the skiplist, + * return success: the levels we updated are correct and sufficient. + * Even though we don't get the benefit of the memory we allocated, + * we can't roll back. * - * !!! - * Note the test for ins_stack[0] == NULL: that's the test for an - * uninitialized cursor, ins_stack[0] is cleared as part of - * initializing a cursor for a search. + * All structure setup must be flushed before the structure is entered + * into the list. We need a write barrier here, our callers depend on + * it. Don't pass complex arguments to the macro, some implementations + * read the old value multiple times. */ for (i = 0; i < skipdepth; i++) { - if (ins_stack[i] == NULL || - *ins_stack[i] != new_ins->next[i]) - return (WT_RESTART); - if (new_ins->next[i] == NULL && - ins_head->tail[i] != NULL && - ins_stack[i] != &ins_head->tail[i]->next[i]) - return (WT_RESTART); + WT_INSERT *old_ins = *ins_stack[i]; + if (old_ins != new_ins->next[i] || + !WT_ATOMIC_CAS8(*ins_stack[i], old_ins, new_ins)) + return (i == 0 ? WT_RESTART : 0); } - /* Update the skiplist elements referencing the new WT_INSERT item. */ + return (0); +} + +/* + * __insert_serial_func -- + * Worker function to add a WT_INSERT entry to a skiplist. + */ +static inline int +__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, + WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth) +{ + u_int i; + + /* The cursor should be positioned. */ + WT_ASSERT(session, ins_stack[0] != NULL); + + /* + * Update the skiplist elements referencing the new WT_INSERT item. + * + * Confirm we are still in the expected position, and no item has been + * added where our insert belongs. If we fail connecting one of the + * upper levels in the skiplist, return success: the levels we updated + * are correct and sufficient. Even though we don't get the benefit of + * the memory we allocated, we can't roll back. + * + * All structure setup must be flushed before the structure is entered + * into the list. We need a write barrier here, our callers depend on + * it. Don't pass complex arguments to the macro, some implementations + * read the old value multiple times. + */ for (i = 0; i < skipdepth; i++) { + WT_INSERT *old_ins = *ins_stack[i]; + if (old_ins != new_ins->next[i] || + !WT_ATOMIC_CAS8(*ins_stack[i], old_ins, new_ins)) + return (i == 0 ? WT_RESTART : 0); if (ins_head->tail[i] == NULL || ins_stack[i] == &ins_head->tail[i]->next[i]) ins_head->tail[i] = new_ins; - *ins_stack[i] = new_ins; } return (0); @@ -128,20 +159,20 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT *new_ins = *new_insp; WT_DECL_RET; - /* Clear references to memory we now own. */ - *new_insp = NULL; - /* Check for page write generation wrap. */ WT_RET(__page_write_gen_wrapped_check(page)); + /* Clear references to memory we now own and must free on error. */ + *new_insp = NULL; + /* Acquire the page's spinlock, call the worker function. */ WT_PAGE_LOCK(session, page); ret = __col_append_serial_func( session, ins_head, ins_stack, new_ins, recnop, skipdepth); WT_PAGE_UNLOCK(session, page); - /* Free unused memory on error. */ if (ret != 0) { + /* Free unused memory on error. */ __wt_free(session, new_ins); return (ret); } @@ -171,21 +202,32 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, { WT_INSERT *new_ins = *new_insp; WT_DECL_RET; - - /* Clear references to memory we now own. */ - *new_insp = NULL; + int simple; + u_int i; /* Check for page write generation wrap. */ WT_RET(__page_write_gen_wrapped_check(page)); - /* Acquire the page's spinlock, call the worker function. */ - WT_PAGE_LOCK(session, page); - ret = __insert_serial_func( - session, ins_head, ins_stack, new_ins, skipdepth); - WT_PAGE_UNLOCK(session, page); + /* Clear references to memory we now own and must free on error. */ + *new_insp = NULL; + + simple = 1; + for (i = 0; i < skipdepth; i++) + if (new_ins->next[i] == NULL) + simple = 0; + + if (simple) + ret = __insert_simple_func( + session, ins_stack, new_ins, skipdepth); + else { + WT_PAGE_LOCK(session, page); + ret = __insert_serial_func( + session, ins_head, ins_stack, new_ins, skipdepth); + WT_PAGE_UNLOCK(session, page); + } - /* Free unused memory on error. */ if (ret != 0) { + /* Free unused memory on error. */ __wt_free(session, new_ins); return (ret); } @@ -215,17 +257,19 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DECL_RET; WT_UPDATE *obsolete, *upd = *updp; - /* Clear references to memory we now own. */ - *updp = NULL; - /* Check for page write generation wrap. */ WT_RET(__page_write_gen_wrapped_check(page)); + /* Clear references to memory we now own and must free on error. */ + *updp = NULL; + /* + * All structure setup must be flushed before the structure is entered + * into the list. We need a write barrier here, our callers depend on + * it. + * * Swap the update into place. If that fails, a new update was added - * after our search, we raced. Check if our update is still permitted, - * and if it is, do a full-barrier to ensure the update's next pointer - * is set before we update the linked list and try again. + * after our search, we raced. Check if our update is still permitted. */ while (!WT_ATOMIC_CAS8(*srch_upd, upd->next, upd)) { if ((ret = __wt_txn_update_check( @@ -234,7 +278,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, __wt_free(session, upd); return (ret); } - WT_WRITE_BARRIER(); } /* diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 2c88727c662..3efb8011e3b 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -146,9 +146,9 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { * to clear everything but the fields that persist. */ #define WT_SESSION_CLEAR_SIZE(s) \ - (WT_PTRDIFF(&(s)->rnd[0], s)) + (WT_PTRDIFF(&(s)->rnd, s)) - uint32_t rnd[2]; /* Random number generation state */ + WT_RAND_STATE rnd; /* Random number generation state */ /* Hashed handle reference list array */ SLIST_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash; diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 8624ebb456e..2acaad39b0e 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -203,7 +203,6 @@ struct __wt_connection_stats { WT_STATS dh_session_handles; WT_STATS dh_session_sweeps; WT_STATS file_open; - WT_STATS log_buffer_grow; WT_STATS log_buffer_size; WT_STATS log_bytes_payload; WT_STATS log_bytes_written; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index d2b369a41c4..7a31ed2f3fe 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -63,10 +63,10 @@ struct __wt_txn_global { }; typedef enum __wt_txn_isolation { - TXN_ISO_EVICTION, /* Internal: eviction context */ - TXN_ISO_READ_UNCOMMITTED, - TXN_ISO_READ_COMMITTED, - TXN_ISO_SNAPSHOT + WT_ISO_EVICTION, /* Internal: eviction context */ + WT_ISO_READ_UNCOMMITTED, + WT_ISO_READ_COMMITTED, + WT_ISO_SNAPSHOT } WT_TXN_ISOLATION; /* diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index a9b19ca1ff5..95a8f99cf1b 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -170,7 +170,7 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) * Eviction only sees globally visible updates, or if there is a * checkpoint transaction running, use its transaction. */ - if (txn->isolation == TXN_ISO_EVICTION) + if (txn->isolation == WT_ISO_EVICTION) return (__wt_txn_visible_all(session, id)); /* @@ -183,7 +183,7 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) * Metadata updates use non-transactional techniques (such as the * schema and metadata locks) to protect access to in-flight updates. */ - if (txn->isolation == TXN_ISO_READ_UNCOMMITTED || + if (txn->isolation == WT_ISO_READ_UNCOMMITTED || session->dhandle == session->meta_dhandle) return (1); @@ -192,7 +192,7 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) return (1); /* - * TXN_ISO_SNAPSHOT, TXN_ISO_READ_COMMITTED: the ID is visible if it is + * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is * not the result of a concurrent transaction, that is, if was * committed before the snapshot was taken. * @@ -222,19 +222,19 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) txn->isolation = session->isolation; txn->txn_logsync = S2C(session)->txn_logsync; - if (cfg != NULL) - WT_RET(__wt_txn_config(session, cfg)); + if (cfg != NULL) + WT_RET(__wt_txn_config(session, cfg)); F_SET(txn, TXN_RUNNING); - if (txn->isolation == TXN_ISO_SNAPSHOT) { + if (txn->isolation == WT_ISO_SNAPSHOT) { if (session->ncursors > 0) WT_RET(__wt_session_copy_values(session)); - /* - * We're about to allocate a snapshot: if we need to block for - * eviction, it's better to do it beforehand. - */ - WT_RET(__wt_cache_full_check(session)); + /* + * We're about to allocate a snapshot: if we need to block for + * eviction, it's better to do it beforehand. + */ + WT_RET(__wt_cache_full_check(session)); __wt_txn_get_snapshot(session); } return (0); @@ -385,7 +385,7 @@ __wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) WT_TXN *txn; txn = &session->txn; - if (txn->isolation == TXN_ISO_SNAPSHOT) + if (txn->isolation == WT_ISO_SNAPSHOT) while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) { if (upd->txnid != WT_TXN_ABORTED) { WT_STAT_FAST_DATA_INCR( @@ -411,7 +411,7 @@ __wt_txn_read_last(WT_SESSION_IMPL *session) /* Release the snap_min ID we put in the global table. */ if (!F_ISSET(txn, TXN_RUNNING) || - txn->isolation != TXN_ISO_SNAPSHOT) + txn->isolation != WT_ISO_SNAPSHOT) __wt_txn_release_snapshot(session); } @@ -446,12 +446,12 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) * further forward, so that once a read-uncommitted cursor is * positioned on a value, it can't be freed. */ - if (txn->isolation == TXN_ISO_READ_UNCOMMITTED && + if (txn->isolation == WT_ISO_READ_UNCOMMITTED && !F_ISSET(txn, TXN_HAS_ID) && TXNID_LT(txn_state->snap_min, txn_global->last_running)) txn_state->snap_min = txn_global->last_running; - if (txn->isolation != TXN_ISO_READ_UNCOMMITTED && + if (txn->isolation != WT_ISO_READ_UNCOMMITTED && !F_ISSET(txn, TXN_HAS_SNAPSHOT)) __wt_txn_get_snapshot(session); } diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 4804290acba..c28ce83d122 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -3345,150 +3345,148 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_DH_SESSION_SWEEPS 1069 /*! connection: files currently open */ #define WT_STAT_CONN_FILE_OPEN 1070 -/*! log: log buffer size increases */ -#define WT_STAT_CONN_LOG_BUFFER_GROW 1071 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1072 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1071 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1073 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1072 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1074 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1073 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1075 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1074 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1076 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1075 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1077 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1076 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1078 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1077 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1079 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1078 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1080 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1079 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1081 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1080 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1082 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1081 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1083 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1082 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1084 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1083 /*! log: log read operations */ -#define WT_STAT_CONN_LOG_READS 1085 +#define WT_STAT_CONN_LOG_READS 1084 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1086 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1085 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1087 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1086 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1088 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1087 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1089 +#define WT_STAT_CONN_LOG_SCANS 1088 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1090 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1089 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1091 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1090 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1092 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1091 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1093 +#define WT_STAT_CONN_LOG_SLOT_RACES 1092 /*! log: slots selected for switching that were unavailable */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1094 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1093 /*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1095 +#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1094 /*! log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1096 +#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1095 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1097 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1096 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1098 +#define WT_STAT_CONN_LOG_SYNC 1097 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1099 +#define WT_STAT_CONN_LOG_SYNC_DIR 1098 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1100 +#define WT_STAT_CONN_LOG_WRITE_LSN 1099 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1101 +#define WT_STAT_CONN_LOG_WRITES 1100 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1102 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1101 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1103 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1102 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1104 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1103 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1105 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1104 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1106 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1105 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1107 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1106 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1108 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1107 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1109 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1108 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1110 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1109 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1111 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1110 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1112 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1111 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1113 +#define WT_STAT_CONN_MEMORY_FREE 1112 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1114 +#define WT_STAT_CONN_MEMORY_GROW 1113 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1115 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1114 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1116 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1115 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1117 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1116 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1118 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1117 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1119 +#define WT_STAT_CONN_PAGE_SLEEP 1118 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1120 +#define WT_STAT_CONN_READ_IO 1119 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1121 +#define WT_STAT_CONN_REC_PAGES 1120 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1122 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1121 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1123 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1122 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1124 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1123 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1125 +#define WT_STAT_CONN_RWLOCK_READ 1124 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1126 +#define WT_STAT_CONN_RWLOCK_WRITE 1125 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1127 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1126 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1128 +#define WT_STAT_CONN_SESSION_OPEN 1127 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1129 +#define WT_STAT_CONN_TXN_BEGIN 1128 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1130 +#define WT_STAT_CONN_TXN_CHECKPOINT 1129 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1131 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1130 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1132 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1131 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1133 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1132 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1134 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1133 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1135 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1134 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1136 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1135 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1137 +#define WT_STAT_CONN_TXN_COMMIT 1136 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1138 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1137 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1139 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1138 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1140 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1139 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1141 +#define WT_STAT_CONN_TXN_ROLLBACK 1140 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1142 +#define WT_STAT_CONN_WRITE_IO 1141 /*! * @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 576827bebcd..fa25fc872f4 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -265,6 +265,8 @@ struct __wt_upd_skipped; typedef struct __wt_upd_skipped WT_UPD_SKIPPED; struct __wt_update; typedef struct __wt_update WT_UPDATE; +union __wt_rand_state; + typedef union __wt_rand_state WT_RAND_STATE; /* * Forward type declarations for internal types: END * DO NOT EDIT: automatically built by dist/s_typedef. diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 5c1d76105cb..76cf5f55f7b 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -363,7 +363,7 @@ __log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) if (!__log_size_fit(session, &log->alloc_lsn, recsize)) { WT_RET(__wt_log_newfile(session, 0, &created_log)); if (log->log_close_fh != NULL) - F_SET(slot, SLOT_CLOSEFH); + F_SET(slot, WT_SLOT_CLOSEFH); } /* @@ -456,8 +456,9 @@ __log_fill(WT_SESSION_IMPL *session, logrec = (WT_LOG_RECORD *)record->mem; /* * Call __wt_write. For now the offset is the real byte offset. - * If the offset becomes a unit of LOG_ALIGN this is where we would - * multiply by LOG_ALIGN to get the real file byte offset for write(). + * If the offset becomes a unit of WT_LOG_ALIGN this is where we would + * multiply by WT_LOG_ALIGN to get the real file byte offset for + * write(). */ if (direct) WT_ERR(__wt_write(session, myslot->slot->slot_fh, @@ -567,7 +568,7 @@ __log_openfile(WT_SESSION_IMPL *session, log = S2C(session)->log; if (log == NULL) - allocsize = LOG_ALIGN; + allocsize = WT_LOG_ALIGN; else allocsize = log->allocsize; WT_RET(__wt_scr_alloc(session, 0, &buf)); @@ -943,7 +944,7 @@ __log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *eof) *eof = 0; WT_RET(__wt_filesize(session, fh, &log_size)); if (log == NULL) - allocsize = LOG_ALIGN; + allocsize = WT_LOG_ALIGN; else allocsize = log->allocsize; @@ -1031,7 +1032,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) *freep = 1; /* Write the buffered records */ - if (F_ISSET(slot, SLOT_BUFFERED)) { + if (F_ISSET(slot, WT_SLOT_BUFFERED)) { write_size = (size_t) (slot->slot_end_lsn.offset - slot->slot_start_offset); WT_ERR(__wt_write(session, slot->slot_fh, @@ -1045,8 +1046,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * off to the worker thread. The caller is responsible for freeing * the slot in that case. Otherwise the worker thread will free it. */ - if (F_ISSET(slot, SLOT_BUFFERED) && - !F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) { + if (F_ISSET(slot, WT_SLOT_BUFFERED) && + !F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { *freep = 0; slot->slot_state = WT_LOG_SLOT_WRITTEN; /* @@ -1076,7 +1077,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) /* * Signal the close thread if needed. */ - if (F_ISSET(slot, SLOT_CLOSEFH)) + if (F_ISSET(slot, WT_SLOT_CLOSEFH)) WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); /* @@ -1084,7 +1085,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * so that threads finishing writing to the log will wait while the * current fsync completes and advance log->sync_lsn. */ - while (F_ISSET(slot, SLOT_SYNC | SLOT_SYNC_DIR)) { + while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { /* * We have to wait until earlier log files have finished their * sync operations. The most recent one will set the LSN to the @@ -1109,7 +1110,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * not yet stable in its parent directory. Do that * now if needed. */ - if (F_ISSET(slot, SLOT_SYNC_DIR) && + if (F_ISSET(slot, WT_SLOT_SYNC_DIR) && (log->sync_dir_lsn.file < sync_lsn.file)) { WT_ASSERT(session, log->log_dir_fh != NULL); WT_ERR(__wt_verbose(session, WT_VERB_LOG, @@ -1124,7 +1125,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) /* * Sync the log file if needed. */ - if (F_ISSET(slot, SLOT_SYNC) && + if (F_ISSET(slot, WT_SLOT_SYNC) && LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_release: sync log %s", log->log_fh->name)); @@ -1136,7 +1137,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) /* * Clear the flags before leaving the loop. */ - F_CLR(slot, SLOT_SYNC | SLOT_SYNC_DIR); + F_CLR(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR); locked = 0; __wt_spin_unlock(session, &log->log_sync_lock); break; @@ -1421,7 +1422,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, * records and larger allocation boundaries should always be * a multiple of this. */ - allocsize = LOG_ALIGN; + allocsize = WT_LOG_ALIGN; lastlog = 0; firstlog = UINT32_MAX; WT_RET(__log_get_files(session, @@ -1447,7 +1448,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, session, 0, &log_fh, WT_LOG_FILENAME, start_lsn.file)); WT_ERR(__log_filesize(session, log_fh, &log_size)); rd_lsn = start_lsn; - WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN)); + WT_ERR(__wt_buf_initsize(session, &buf, WT_LOG_ALIGN)); for (;;) { if (rd_lsn.offset + allocsize > log_size) { advance: @@ -1620,9 +1621,9 @@ __log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, locked = 1; if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) - F_SET(&tmp, SLOT_SYNC_DIR); + F_SET(&tmp, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FSYNC)) - F_SET(&tmp, SLOT_SYNC); + F_SET(&tmp, WT_SLOT_SYNC); WT_ERR(__log_acquire(session, record->size, &tmp)); __wt_spin_unlock(session, &log->log_slot_lock); locked = 0; @@ -1820,11 +1821,6 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, session, record, lsnp, flags)) == EAGAIN) ; WT_ERR(ret); - /* - * Increase the buffer size of any slots we can get access - * to, so future consolidations are likely to succeed. - */ - WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len)); return (0); } WT_ERR(ret); diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c index f35a7058511..bd830687df2 100644 --- a/src/third_party/wiredtiger/src/log/log_auto.c +++ b/src/third_party/wiredtiger/src/log/log_auto.c @@ -7,7 +7,8 @@ __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp) { WT_ITEM *logrec; - WT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec)); + WT_RET( + __wt_scr_alloc(session, WT_ALIGN(size + 1, WT_LOG_ALIGN), &logrec)); WT_CLEAR(*(WT_LOG_RECORD *)logrec->data); logrec->size = offsetof(WT_LOG_RECORD, record); diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index 02b3056be6f..a08a9aff001 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -35,15 +35,15 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) conn = S2C(session); log = conn->log; - for (i = 0; i < SLOT_POOL; i++) { + for (i = 0; i < WT_SLOT_POOL; i++) { log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE; - log->slot_pool[i].slot_index = SLOT_INVALID_INDEX; + log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX; } /* * Set up the available slots from the pool the first time. */ - for (i = 0; i < SLOT_ACTIVE; i++) { + for (i = 0; i < WT_SLOT_ACTIVE; i++) { slot = &log->slot_pool[i]; slot->slot_index = (uint32_t)i; slot->slot_state = WT_LOG_SLOT_READY; @@ -53,14 +53,18 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) /* * Allocate memory for buffers now that the arrays are setup. Split * this out to make error handling simpler. + * + * Cap the slot buffer to the log file size. */ - for (i = 0; i < SLOT_POOL; i++) { + log->slot_buf_size = + WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE); + for (i = 0; i < WT_SLOT_POOL; i++) { WT_ERR(__wt_buf_init(session, - &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE)); - F_SET(&log->slot_pool[i], SLOT_INIT_FLAGS); + &log->slot_pool[i].slot_buf, log->slot_buf_size)); + F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); } WT_STAT_FAST_CONN_INCRV(session, - log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL); + log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); if (0) { err: while (--i >= 0) __wt_buf_free(session, &log->slot_pool[i].slot_buf); @@ -82,7 +86,7 @@ __wt_log_slot_destroy(WT_SESSION_IMPL *session) conn = S2C(session); log = conn->log; - for (i = 0; i < SLOT_POOL; i++) + for (i = 0; i < WT_SLOT_POOL; i++) __wt_buf_free(session, &log->slot_pool[i].slot_buf); return (0); } @@ -101,13 +105,18 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_LOG *log; WT_LOGSLOT *slot; int64_t cur_state, new_state, old_state; - uint32_t allocated_slot, slot_grow_attempts; + uint32_t allocated_slot, slot_attempts; conn = S2C(session); log = conn->log; - slot_grow_attempts = 0; + slot_attempts = 0; + + if (mysize >= (uint64_t)log->slot_buf_size) { + WT_STAT_FAST_CONN_INCR(session, log_slot_toobig); + return (ENOMEM); + } find_slot: - allocated_slot = __wt_random(session->rnd) % SLOT_ACTIVE; + allocated_slot = __wt_random(&session->rnd) % WT_SLOT_ACTIVE; slot = log->slot_array[allocated_slot]; old_state = slot->slot_state; join_slot: @@ -131,12 +140,11 @@ join_slot: goto find_slot; } /* - * If the slot buffer isn't big enough to hold this update, mark - * the slot for a buffer size increase and find another slot. + * If the slot buffer isn't big enough to hold this update, try + * to find another slot. */ if (new_state > (int64_t)slot->slot_buf.memsize) { - F_SET(slot, SLOT_BUF_GROW); - if (++slot_grow_attempts > 5) { + if (++slot_attempts > 5) { WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall); return (ENOMEM); } @@ -159,9 +167,9 @@ join_slot: */ WT_STAT_FAST_CONN_INCR(session, log_slot_joins); if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) - F_SET(slot, SLOT_SYNC_DIR); + F_SET(slot, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FSYNC)) - F_SET(slot, SLOT_SYNC); + F_SET(slot, WT_SLOT_SYNC); myslotp->slot = slot; myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY; return (0); @@ -193,7 +201,7 @@ retry: */ pool_i = log->pool_index; newslot = &log->slot_pool[pool_i]; - if (++log->pool_index >= SLOT_POOL) + if (++log->pool_index >= WT_SLOT_POOL) log->pool_index = 0; if (newslot->slot_state != WT_LOG_SLOT_FREE) { WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails); @@ -203,7 +211,7 @@ retry: * churn is used to change how long we pause before closing * the slot - which leads to more consolidation and less churn. */ - if (++switch_fails % SLOT_POOL == 0 && slot->slot_churn < 5) + if (++switch_fails % WT_SLOT_POOL == 0 && slot->slot_churn < 5) ++slot->slot_churn; __wt_yield(); goto retry; @@ -297,90 +305,13 @@ __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size) int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { - WT_DECL_RET; - - ret = 0; - /* - * Grow the buffer if needed before returning it to the pool. - */ - if (F_ISSET(slot, SLOT_BUF_GROW)) { - WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); - WT_STAT_FAST_CONN_INCRV(session, - log_buffer_size, slot->slot_buf.memsize); - WT_ERR(__wt_buf_grow(session, - &slot->slot_buf, slot->slot_buf.memsize * 2)); - } -err: - /* - * No matter if there is an error, we always want to free - * the slot back to the pool. - */ + WT_UNUSED(session); /* * Make sure flags don't get retained between uses. * We have to reset them them here because multiple threads may * change the flags when joining the slot. */ - slot->flags = SLOT_INIT_FLAGS; + slot->flags = WT_SLOT_INIT_FLAGS; slot->slot_state = WT_LOG_SLOT_FREE; - return (ret); -} - -/* - * __wt_log_slot_grow_buffers -- - * Increase the buffer size of all available slots in the buffer pool. - * Go to some lengths to include active (but unused) slots to handle - * the case where all log write record sizes exceed the size of the - * active buffer. - */ -int -__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize) -{ - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - WT_LOGSLOT *slot; - int64_t orig_state; - uint64_t old_size, total_growth; - int i; - - conn = S2C(session); - log = conn->log; - total_growth = 0; - WT_STAT_FAST_CONN_INCR(session, log_buffer_grow); - /* - * Take the log slot lock to prevent other threads growing buffers - * at the same time. Could tighten the scope of this lock, or have - * a separate lock if there is contention. - */ - __wt_spin_lock(session, &log->log_slot_lock); - for (i = 0; i < SLOT_POOL; i++) { - slot = &log->slot_pool[i]; - /* Avoid atomic operations if they won't succeed. */ - if (slot->slot_state != WT_LOG_SLOT_FREE && - slot->slot_state != WT_LOG_SLOT_READY) - continue; - /* Don't keep growing unrelated buffers. */ - if (slot->slot_buf.memsize > (10 * newsize) && - !F_ISSET(slot, SLOT_BUF_GROW)) - continue; - orig_state = WT_ATOMIC_CAS_VAL8( - slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING); - if (orig_state != WT_LOG_SLOT_FREE) { - orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state, - WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING); - if (orig_state != WT_LOG_SLOT_READY) - continue; - } - - /* We have a slot - now go ahead and grow the buffer. */ - old_size = slot->slot_buf.memsize; - F_CLR(slot, SLOT_BUF_GROW); - WT_ERR(__wt_buf_grow(session, &slot->slot_buf, - WT_MAX(slot->slot_buf.memsize * 2, newsize))); - slot->slot_state = orig_state; - total_growth += slot->slot_buf.memsize - old_size; - } -err: __wt_spin_unlock(session, &log->log_slot_lock); - WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth); - return (ret); + return (0); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 111de7a2be1..0962da7768b 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -132,10 +132,11 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) hard_limit = F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH) ? 1 : 0; if (have_primary) { + WT_ENTER_PAGE_INDEX(session); WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree, - ovfl = __wt_btree_size_overflow( - session, hard_limit ? + ovfl = __wt_btree_lsm_size(session, hard_limit ? 2 * lsm_tree->chunk_size : lsm_tree->chunk_size)); + WT_LEAVE_PAGE_INDEX(session); /* If there was no overflow, we're done. */ if (!ovfl) @@ -206,7 +207,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update) if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) goto open; - if (session->txn.isolation == TXN_ISO_SNAPSHOT) + if (session->txn.isolation == WT_ISO_SNAPSHOT) __wt_txn_cursor_op(session); /* @@ -219,7 +220,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update) * conflict. */ clsm->nupdates = 1; - if (session->txn.isolation == TXN_ISO_SNAPSHOT && + if (session->txn.isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { WT_ASSERT(session, F_ISSET(&session->txn, TXN_HAS_SNAPSHOT)); @@ -245,7 +246,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update) * - a read operation and the cursor is open for reading. */ if ((!update || - session->txn.isolation != TXN_ISO_SNAPSHOT || + session->txn.isolation != WT_ISO_SNAPSHOT || F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) && ((update && clsm->primary_chunk != NULL) || (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ)))) @@ -417,7 +418,7 @@ __clsm_open_cursors( * Ensure that any snapshot update has cursors on the right set of * chunks to guarantee visibility is correct. */ - if (update && txn->isolation == TXN_ISO_SNAPSHOT) + if (update && txn->isolation == WT_ISO_SNAPSHOT) F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT); /* @@ -1533,9 +1534,11 @@ __wt_clsm_open(WT_SESSION_IMPL *session, if (bulk && (ret == EBUSY || (ret == 0 && lsm_tree->nchunks > 1))) WT_ERR_MSG(session, EINVAL, "bulk-load is only supported on newly created LSM trees"); - WT_ASSERT(session, !bulk || lsm_tree->exclusive); /* Flag any errors from the tree get. */ - WT_RET(ret); + WT_ERR(ret); + + /* Make sure we have exclusive access if and only if we want it */ + WT_ASSERT(session, !bulk || lsm_tree->exclusive); WT_ERR(__wt_calloc_one(session, &clsm)); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c index 12b24984fcb..84c509158d1 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_manager.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c @@ -227,7 +227,7 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) for (i = 0; i < WT_LSM_MAX_WORKERS; i++) { WT_ERR(__wt_open_internal_session( S2C(session), "lsm-worker", 1, 0, &worker_session)); - worker_session->isolation = TXN_ISO_READ_UNCOMMITTED; + worker_session->isolation = WT_ISO_READ_UNCOMMITTED; manager->lsm_worker_cookies[i].session = worker_session; } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 99140f89c51..4f5e1516f1c 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -109,7 +109,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, * enough to trigger checkpoints. */ if (evict_chunk != NULL && flush_chunk != NULL) { - chunk = (__wt_random(session->rnd) & 1) ? + chunk = (__wt_random(&session->rnd) & 1) ? evict_chunk : flush_chunk; WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_FLUSH, 0, lsm_tree)); @@ -307,7 +307,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { saved_isolation = session->txn.isolation; - session->txn.isolation = TXN_ISO_EVICTION; + session->txn.isolation = WT_ISO_EVICTION; ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index 66e7e3977f4..189a095ae74 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -67,6 +67,18 @@ __meta_track_next(WT_SESSION_IMPL *session, WT_META_TRACK **trkp) } /* + * __meta_track_clear -- + * Clear the structure. + */ +static void +__meta_track_clear(WT_SESSION_IMPL *session, WT_META_TRACK *trk) +{ + __wt_free(session, trk->a); + __wt_free(session, trk->b); + memset(trk, 0, sizeof(WT_META_TRACK)); +} + +/* * __wt_meta_track_discard -- * Cleanup metadata tracking when closing a session. */ @@ -96,31 +108,21 @@ __wt_meta_track_on(WT_SESSION_IMPL *session) * Apply the changes in a metadata tracking record. */ static int -__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) +__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; int tret; - /* - * Unlock handles and complete checkpoints regardless of whether we are - * unrolling. - */ - if (!unroll && trk->op != WT_ST_CHECKPOINT && - trk->op != WT_ST_DROP_COMMIT && trk->op != WT_ST_LOCK) - goto free; - switch (trk->op) { case WT_ST_EMPTY: /* Unused slot */ break; case WT_ST_CHECKPOINT: /* Checkpoint, see above */ - if (!unroll) { - btree = trk->dhandle->handle; - bm = btree->bm; - WT_WITH_DHANDLE(session, trk->dhandle, - WT_TRET(bm->checkpoint_resolve(bm, session))); - } + btree = trk->dhandle->handle; + bm = btree->bm; + WT_WITH_DHANDLE(session, trk->dhandle, + WT_TRET(bm->checkpoint_resolve(bm, session))); break; case WT_ST_DROP_COMMIT: if ((tret = __wt_remove_if_exists(session, trk->a)) != 0) { @@ -130,8 +132,40 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) WT_TRET(tret); } break; + case WT_ST_LOCK: + WT_WITH_DHANDLE(session, trk->dhandle, + WT_TRET(__wt_session_release_btree(session))); + break; + case WT_ST_FILEOP: + case WT_ST_REMOVE: + case WT_ST_SET: + break; + WT_ILLEGAL_VALUE(session); + } + + __meta_track_clear(session, trk); + return (ret); +} + +/* + * __meta_track_unroll -- + * Undo the changes in a metadata tracking record. + */ +static int +__meta_track_unroll(WT_SESSION_IMPL *session, WT_META_TRACK *trk) +{ + WT_DECL_RET; + int tret; + + switch (trk->op) { + case WT_ST_EMPTY: /* Unused slot */ + break; + case WT_ST_CHECKPOINT: /* Checkpoint, see above */ + break; + case WT_ST_DROP_COMMIT: + break; case WT_ST_LOCK: /* Handle lock, see above */ - if (unroll && trk->created) + if (trk->created) F_SET(trk->dhandle, WT_DHANDLE_DISCARD); WT_WITH_DHANDLE(session, trk->dhandle, WT_TRET(__wt_session_release_btree(session))); @@ -185,11 +219,7 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) WT_ILLEGAL_VALUE(session); } -free: trk->op = WT_ST_EMPTY; - __wt_free(session, trk->a); - __wt_free(session, trk->b); - trk->dhandle = NULL; - + __meta_track_clear(session, trk); return (ret); } @@ -253,33 +283,38 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, int need_sync, int unroll) if (trk == trk_orig) return (0); - while (--trk >= trk_orig) - WT_TRET(__meta_track_apply(session, trk, unroll)); + if (unroll) { + while (--trk >= trk_orig) + WT_TRET(__meta_track_unroll(session, trk)); + /* Unroll operations don't need to flush the metadata. */ + return (ret); + } /* - * Unroll operations don't need to flush the metadata. - * - * Also, if we don't have the metadata handle (e.g, we're in the - * process of creating the metadata), we can't sync it. + * If we don't have the metadata handle (e.g, we're in the process of + * creating the metadata), we can't sync it. */ - if (unroll || ret != 0 || !need_sync || session->meta_dhandle == NULL) - return (ret); + if (!need_sync || session->meta_dhandle == NULL) + goto done; /* If we're logging, make sure the metadata update was flushed. */ if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) { - if (!FLD_ISSET(S2C(session)->txn_logsync, - WT_LOG_DSYNC | WT_LOG_FSYNC)) - WT_WITH_DHANDLE(session, session->meta_dhandle, - ret = __wt_txn_checkpoint_log(session, - 0, WT_TXN_LOG_CKPT_SYNC, NULL)); + WT_WITH_DHANDLE(session, session->meta_dhandle, + ret = __wt_txn_checkpoint_log(session, + 0, WT_TXN_LOG_CKPT_SYNC, NULL)); + WT_RET(ret); } else { WT_WITH_DHANDLE(session, session->meta_dhandle, ret = __wt_checkpoint(session, NULL)); WT_RET(ret); WT_WITH_DHANDLE(session, session->meta_dhandle, ret = __wt_checkpoint_sync(session, NULL)); + WT_RET(ret); } +done: /* Apply any tracked operations post-commit. */ + for (; trk_orig < trk; trk_orig++) + WT_TRET(__meta_track_apply(session, trk_orig)); return (ret); } @@ -316,7 +351,7 @@ __wt_meta_track_sub_off(WT_SESSION_IMPL *session) session->meta_track_next = session->meta_track_sub = NULL; while (--trk >= trk_orig) - WT_TRET(__meta_track_apply(session, trk, 0)); + WT_TRET(__meta_track_apply(session, trk)); session->meta_track_next = trk_orig; return (ret); diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c index 3a3b0e0d74f..c3ae43b605f 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c @@ -216,7 +216,7 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) ++copy.s.writers; ++copy.s.readers; - l->us = copy.us; + l->i.us = copy.i.us; return (0); } diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c index c70a04c8df7..10eeef558bc 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_thread.c +++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c @@ -19,7 +19,8 @@ __wt_thread_create(WT_SESSION_IMPL *session, WT_DECL_RET; /* Spawn a new thread of control. */ - if ((ret = pthread_create(tidret, NULL, func, arg)) == 0) + WT_SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret); + if (ret == 0) return (0); WT_RET_MSG(session, ret, "pthread_create"); } @@ -33,7 +34,8 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) { WT_DECL_RET; - if ((ret = pthread_join(tid, NULL)) == 0) + WT_SYSCALL_RETRY(pthread_join(tid, NULL), ret); + if (ret == 0) return (0); WT_RET_MSG(session, ret, "pthread_join"); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 14ab05fbb25..e11490ac7fc 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -343,11 +343,12 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_RECONCILE *r; - int locked; + int page_lock, scan_lock, split_lock; conn = S2C(session); page = ref->page; mod = page->modify; + page_lock = scan_lock = split_lock = 0; /* We're shouldn't get called with a clean page, that's an error. */ if (!__wt_page_is_modified(page)) @@ -386,22 +387,38 @@ __wt_reconcile(WT_SESSION_IMPL *session, /* * The compaction process looks at the page's modification information; - * if compaction is running, lock the page down. - * - * Otherwise, flip on the scanning flag: obsolete updates cannot be - * freed while reconciliation is in progress. + * if compaction is running, acquire the page's lock. */ - locked = 0; if (conn->compact_in_memory_pass) { - locked = 1; WT_PAGE_LOCK(session, page); - } else + page_lock = 1; + } + + /* + * Reconciliation reads the lists of updates, so obsolete updates cannot + * be discarded while reconciliation is in progress. + */ + for (;;) { + F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); + if (ret == 0) + break; + __wt_yield(); + } + scan_lock = 1; + + /* + * Mark internal pages as splitting to ensure we don't deadlock when + * performing an in-memory split during a checkpoint. + */ + if (WT_PAGE_IS_INTERNAL(page)) { for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); + F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret); if (ret == 0) break; __wt_yield(); } + split_lock = 1; + } /* Reconcile the page. */ switch (page->type) { @@ -434,11 +451,13 @@ __wt_reconcile(WT_SESSION_IMPL *session, else WT_TRET(__rec_write_wrapup_err(session, r, page)); - /* Release the page lock if we're holding one. */ - if (locked) - WT_PAGE_UNLOCK(session, page); - else + /* Release the locks we're holding. */ + if (split_lock) + F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED); + if (scan_lock) F_CLR_ATOMIC(page, WT_PAGE_SCANNING); + if (page_lock) + WT_PAGE_UNLOCK(session, page); /* * Clean up the boundary structures: some workloads result in millions @@ -523,7 +542,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) } WT_ASSERT(session, session->split_gen != 0); - pindex = WT_INTL_INDEX_COPY(next); + WT_INTL_INDEX_GET(session, next, pindex); for (i = 0; i < mod->mod_multi_entries; ++i) { WT_ERR(__wt_multi_to_ref(session, next, &mod->mod_multi[i], &pindex->index[i], NULL)); @@ -2961,7 +2980,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) "bulk-load is only possible for newly created trees"); /* Get a reference to the empty leaf page. */ - pindex = WT_INTL_INDEX_COPY(btree->root.page); + pindex = WT_INTL_INDEX_GET_SAFE(btree->root.page); cbulk->ref = pindex->index[0]; cbulk->leaf = cbulk->ref->page; @@ -5046,6 +5065,9 @@ err: __wt_scr_free(session, &tkey); WT_FULL_BARRIER(); } else { mod->rec_max_txn = r->max_txn; + if (!F_ISSET(r, WT_EVICTING) && + TXNID_LT(btree->rec_max_txn, r->max_txn)) + btree->rec_max_txn = r->max_txn; if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0)) __wt_cache_dirty_decr(session, page); diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 599c7bdf44a..b042e73f7d5 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -202,9 +202,9 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config) if (cval.len != 0) session->isolation = session->txn.isolation = WT_STRING_MATCH("snapshot", cval.str, cval.len) ? - TXN_ISO_SNAPSHOT : + WT_ISO_SNAPSHOT : WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ? - TXN_ISO_READ_UNCOMMITTED : TXN_ISO_READ_COMMITTED; + WT_ISO_READ_UNCOMMITTED : WT_ISO_READ_COMMITTED; err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -1064,7 +1064,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, WT_ERR(__wt_cond_alloc(session, "session", 0, &session_ret->cond)); if (WT_SESSION_FIRST_USE(session_ret)) - __wt_random_init(session_ret->rnd); + __wt_random_init(&session_ret->rnd); __wt_event_handler_set(session_ret, event_handler == NULL ? session->event_handler : event_handler); @@ -1087,7 +1087,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, } /* Initialize transaction support: default to read-committed. */ - session_ret->isolation = TXN_ISO_READ_COMMITTED; + session_ret->isolation = WT_ISO_READ_COMMITTED; WT_ERR(__wt_txn_init(session_ret)); /* diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c index bd51b2ea0d5..4d0f90b87dc 100644 --- a/src/third_party/wiredtiger/src/support/rand.c +++ b/src/third_party/wiredtiger/src/support/rand.c @@ -29,19 +29,22 @@ #include "wt_internal.h" #undef M_W -#define M_W (rnd)[0] +#define M_W(r) r.x.w #undef M_Z -#define M_Z (rnd)[1] +#define M_Z(r) r.x.z /* * __wt_random_init -- * Initialize return of a 32-bit pseudo-random number. */ void -__wt_random_init(uint32_t *rnd) +__wt_random_init(WT_RAND_STATE volatile * rnd_state) { - M_W = 521288629; - M_Z = 362436069; + WT_RAND_STATE rnd; + + M_W(rnd) = 521288629; + M_Z(rnd) = 362436069; + *rnd_state = rnd; } /* @@ -60,11 +63,32 @@ __wt_random_init(uint32_t *rnd) * forever. Take local copies of the shared values to avoid this. */ uint32_t -__wt_random(uint32_t *rnd) +__wt_random(WT_RAND_STATE volatile * rnd_state) { - uint32_t w = M_W, z = M_Z; + WT_RAND_STATE rnd; + uint32_t w, z; + + /* + * Take a copy of the random state so we can ensure that the + * calculation operates on the state consistently regardless of + * concurrent calls with the same random state. + */ + rnd = *rnd_state; + w = M_W(rnd); + z = M_Z(rnd); + + /* + * Check if the value goes to 0 (from which we won't recover), and reset + * to the initial state. This has additional benefits if a caller fails + * to initialize the state, or initializes with a seed that results in a + * short period. + */ + if (z == 0 || w == 0) + __wt_random_init(rnd_state); + + M_Z(rnd) = z = 36969 * (z & 65535) + (z >> 16); + M_W(rnd) = w = 18000 * (w & 65535) + (w >> 16); + *rnd_state = rnd; - M_Z = z = 36969 * (z & 65535) + (z >> 16); - M_W = w = 18000 * (w & 65535) + (w >> 16); - return (z << 16) + (w & 65535); + return ((z << 16) + (w & 65535)); } diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 824914bf8bf..8db47646b11 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -447,7 +447,6 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) stats->log_slot_joins.desc = "log: consolidated slot joins"; stats->log_slot_toosmall.desc = "log: failed to find a slot large enough for record"; - stats->log_buffer_grow.desc = "log: log buffer size increases"; stats->log_bytes_payload.desc = "log: log bytes of payload data"; stats->log_bytes_written.desc = "log: log bytes written"; stats->log_reads.desc = "log: log read operations"; @@ -622,7 +621,6 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->log_slot_transitions.v = 0; stats->log_slot_joins.v = 0; stats->log_slot_toosmall.v = 0; - stats->log_buffer_grow.v = 0; stats->log_bytes_payload.v = 0; stats->log_bytes_written.v = 0; stats->log_reads.v = 0; diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index f6f5a695b4f..a391ec8be88 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -59,7 +59,7 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE || - session->txn.isolation == TXN_ISO_READ_UNCOMMITTED || + session->txn.isolation == WT_ISO_READ_UNCOMMITTED || !__wt_txn_visible_all(session, txn_state->snap_min)); txn_state->snap_min = WT_TXN_NONE; @@ -87,20 +87,6 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); - current_id = snap_min = txn_global->current; - prev_oldest_id = txn_global->oldest_id; - - /* For pure read-only workloads, avoid scanning. */ - if (prev_oldest_id == current_id) { - txn_state->snap_min = current_id; - __txn_sort_snapshot(session, 0, current_id); - - /* Check that the oldest ID has not moved in the meantime. */ - if (prev_oldest_id == txn_global->oldest_id && - txn_global->scan_count == 0) - return; - } - /* * We're going to scan. Increment the count of scanners to prevent the * oldest ID from moving forwards. Spin if the count is negative, @@ -112,9 +98,21 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) } while (count < 0 || !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1)); - /* The oldest ID cannot change until the scan count goes to zero. */ - prev_oldest_id = txn_global->oldest_id; current_id = snap_min = txn_global->current; + prev_oldest_id = txn_global->oldest_id; + + /* For pure read-only workloads, avoid scanning. */ + if (prev_oldest_id == current_id) { + txn_state->snap_min = current_id; + __txn_sort_snapshot(session, 0, current_id); + + /* Check that the oldest ID has not moved in the meantime. */ + if (prev_oldest_id == txn_global->oldest_id) { + WT_ASSERT(session, txn_global->scan_count > 0); + (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); + return; + } + } /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); @@ -299,9 +297,9 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) if (cval.len != 0) txn->isolation = WT_STRING_MATCH("snapshot", cval.str, cval.len) ? - TXN_ISO_SNAPSHOT : + WT_ISO_SNAPSHOT : WT_STRING_MATCH("read-committed", cval.str, cval.len) ? - TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED; + WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED; /* * The default sync setting is inherited from the connection, but can @@ -333,6 +331,7 @@ __wt_txn_release(WT_SESSION_IMPL *session) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; + int was_oldest; txn = &session->txn; WT_ASSERT(session, txn->mod_count == 0); @@ -340,6 +339,7 @@ __wt_txn_release(WT_SESSION_IMPL *session) txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); + was_oldest = 0; /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { @@ -353,6 +353,9 @@ __wt_txn_release(WT_SESSION_IMPL *session) WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); + + /* Quick check for the oldest transaction. */ + was_oldest = (txn->id == txn_global->last_running); txn->id = WT_TXN_NONE; } @@ -369,7 +372,16 @@ __wt_txn_release(WT_SESSION_IMPL *session) */ __wt_txn_release_snapshot(session); txn->isolation = session->isolation; - F_CLR(txn, TXN_ERROR | TXN_HAS_ID | TXN_RUNNING); + /* Ensure the transaction flags are cleared on exit */ + txn->flags = 0; + + /* + * When the oldest transaction in the system completes, bump the oldest + * ID. This is racy and so not guaranteed, but in practice it keeps + * the oldest ID from falling too far behind. + */ + if (was_oldest) + __wt_txn_update_oldest(session, 1); } /* diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 08d8b778371..1ae99fb1c97 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -358,10 +358,10 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) u_int i; conn = S2C(session); + txn = &session->txn; txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); saved_isolation = session->isolation; - txn = &session->txn; full = idle = logging = tracking = 0; /* Ensure the metadata table is open before taking any locks. */ @@ -373,6 +373,9 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_RET(__checkpoint_apply_all(session, cfg, NULL, &full)); + /* Configure logging only if doing a full checkpoint. */ + logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED); + /* * Get a list of handles we want to flush; this may pull closed objects * into the session cache, but we're going to do that eventually anyway. @@ -400,7 +403,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) "starting write leaves", &verb_timer)); /* Flush dirty leaf pages before we start the checkpoint. */ - session->isolation = txn->isolation = TXN_ISO_READ_COMMITTED; + session->isolation = txn->isolation = WT_ISO_READ_COMMITTED; WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_write_leaves)); /* @@ -421,7 +424,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) tracking = 1; /* Tell logging that we are about to start a database checkpoint. */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && full) + if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_PREPARE, NULL)); @@ -491,11 +494,9 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) txn_state->id = txn_state->snap_min = WT_TXN_NONE; /* Tell logging that we have started a database checkpoint. */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && full) { + if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_START, NULL)); - logging = 1; - } WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint)); @@ -531,22 +532,29 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_commit(session, NULL)); /* - * Disable metadata tracking during the metadata checkpoint. - * - * We don't lock old checkpoints in the metadata file: there is no way - * to open one. We are holding other handle locks, it is not safe to - * lock conn->spinlock. + * Ensure that the metadata changes are durable before the checkpoint + * is resolved. Do this by either checkpointing the metadata or syncing + * the log file. + * Recovery relies on the checkpoint LSN in the metadata only being + * updated by full checkpoints so only checkpoint the metadata for + * full or non-logged checkpoints. */ - session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED; - saved_meta_next = session->meta_track_next; - session->meta_track_next = NULL; - WT_WITH_DHANDLE(session, - session->meta_dhandle, ret = __wt_checkpoint(session, cfg)); - session->meta_track_next = saved_meta_next; - WT_ERR(ret); - - WT_ERR(__checkpoint_verbose_track(session, - "metadata sync completed", &verb_timer)); + if (full || !logging) { + session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; + /* Disable metadata tracking during the metadata checkpoint. */ + saved_meta_next = session->meta_track_next; + session->meta_track_next = NULL; + WT_WITH_DHANDLE(session, + session->meta_dhandle, ret = __wt_checkpoint(session, cfg)); + session->meta_track_next = saved_meta_next; + WT_ERR(ret); + + WT_ERR(__checkpoint_verbose_track(session, + "metadata sync completed", &verb_timer)); + } else + WT_WITH_DHANDLE(session, session->meta_dhandle, + ret = __wt_txn_checkpoint_log(session, + 0, WT_TXN_LOG_CKPT_SYNC, NULL)); if (full) { WT_ERR(__wt_epoch(session, &stop)); @@ -566,7 +574,7 @@ err: /* * overwritten the checkpoint, so what ends up on disk is not * consistent. */ - session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED; + session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; if (tracking) WT_TRET(__wt_meta_track_off(session, 0, ret != 0)); @@ -585,8 +593,8 @@ err: /* * Tell logging that we have finished a database checkpoint. Do not * write a log record if the database was idle. */ - if (logging) { - if (ret == 0 && full && + if (full && logging) { + if (ret == 0 && F_ISSET((WT_BTREE *)session->meta_dhandle->handle, WT_BTREE_SKIP_CKPT)) idle = 1; @@ -1170,19 +1178,21 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final) } /* - * If closing a modified file, checkpoint the file and optionally flush - * the writes (the checkpoint call will discard the blocks, there's no - * additional step needed). - * * We should already have the schema lock unless we're finishing a bulk * load -- the only other paths to closing files (sweep and LSM) have * already checked for read-only trees. */ - if (!final) - WT_ASSERT(session, - bulk || F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + WT_ASSERT(session, + final || bulk || F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); + + /* + * Turn on metadata tracking if: + * - The session is not already doing metadata tracking. + * - The file was bulk loaded. + * - The close is not during connection close. + */ + need_tracking = !WT_META_TRACKING(session) && !bulk && !final; - need_tracking = !bulk && !final && !WT_META_TRACKING(session); if (need_tracking) WT_RET(__wt_meta_track_on(session)); diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c index e35b6f16ea1..36d42a8996f 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ext.c +++ b/src/third_party/wiredtiger/src/txn/txn_ext.c @@ -40,9 +40,9 @@ __wt_ext_transaction_isolation_level( session = (WT_SESSION_IMPL *)wt_session; txn = &session->txn; - if (txn->isolation == TXN_ISO_READ_COMMITTED) + if (txn->isolation == WT_ISO_READ_COMMITTED) return (WT_TXN_ISO_READ_COMMITTED); - if (txn->isolation == TXN_ISO_READ_UNCOMMITTED) + if (txn->isolation == WT_ISO_READ_UNCOMMITTED) return (WT_TXN_ISO_READ_UNCOMMITTED); return (WT_TXN_ISO_SNAPSHOT); } |