diff options
author | Alexander Gorrod <alexander.gorrod@mongodb.com> | 2015-07-23 23:23:52 +0000 |
---|---|---|
committer | Alexander Gorrod <alexander.gorrod@mongodb.com> | 2015-07-23 23:23:52 +0000 |
commit | 455aa3de2fe23454b8acd2a6d4ae575f2bb1aa74 (patch) | |
tree | 5f06bd87e9e5fbbeb7f709f919965f998cc8ec1b /src/third_party | |
parent | 92f1bacdb1dbc17919e7a0f77f0d6c5b981933da (diff) | |
download | mongo-455aa3de2fe23454b8acd2a6d4ae575f2bb1aa74.tar.gz |
Import wiredtiger-wiredtiger-2.6.1-332-gfdedd36.tar.gz from wiredtiger branch mongodb-3.2
Diffstat (limited to 'src/third_party')
25 files changed, 475 insertions, 325 deletions
diff --git a/src/third_party/wiredtiger/dist/s_all b/src/third_party/wiredtiger/dist/s_all index 60e8b8f1551..c624db06a97 100755 --- a/src/third_party/wiredtiger/dist/s_all +++ b/src/third_party/wiredtiger/dist/s_all @@ -42,7 +42,7 @@ errchk() return fi - echo "####################### ERROR ############################" + echo "####################### MESSAGE ############################" echo "s_all run of: \"$1\" resulted in:" sed -e 's/^/ /' $2 echo "#######################" diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index ba5717d1b4a..1ed92b79ba8 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -506,6 +506,7 @@ dmsg doxgen doxygen ds +dsb dsk dsrc dst @@ -648,6 +649,7 @@ lang latencies lbrace lbracket +ld len lenp level's @@ -897,6 +899,7 @@ subtree sunique superset sw +sy sys t's tV diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index ee9c8782594..caf68364696 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -241,12 +241,11 @@ connection_stats = [ LogStat('log_writes', 'log write operations'), LogStat('log_write_lsn', 'log server thread advances write LSN'), + LogStat('log_slot_coalesced', 'written slots coalesced'), LogStat('log_slot_consolidated', 'logging bytes consolidated'), LogStat('log_slot_closes', 'consolidated slot closures'), LogStat('log_slot_joins', 'consolidated slot joins'), LogStat('log_slot_races', 'consolidated slot join races'), - LogStat('log_slot_switch_fails', - 'slots selected for switching that were unavailable'), LogStat('log_slot_toobig', 'record size exceeded maximum'), LogStat('log_slot_toosmall', 'failed to find a slot large enough for record'), diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c index 15a9da169fc..c88c44fb9c3 100644 --- a/src/third_party/wiredtiger/src/block/block_ckpt.c +++ b/src/third_party/wiredtiger/src/block/block_ckpt.c @@ -315,7 +315,7 @@ __ckpt_extlist_fblocks( * file that contains a previous checkpoint's extents. */ return (__wt_block_insert_ext( - session, &block->live.ckpt_avail, el->offset, el->size)); + session, block, &block->live.ckpt_avail, el->offset, el->size)); } #ifdef HAVE_DIAGNOSTIC @@ -537,7 +537,7 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) * must be paired in the checkpoint. */ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) - WT_ERR(__wt_block_insert_ext(session, + WT_ERR(__wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size)); /* @@ -554,10 +554,10 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( - session, &a->alloc, &b->alloc)); + session, block, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( - session, &a->discard, &b->discard)); + session, block, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with @@ -775,7 +775,8 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block) block->ckpt_inprogress = 0; __wt_spin_lock(session, &block->live_lock); - ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail); + ret = __wt_block_extlist_merge( + session, block, &ci->ckpt_avail, &ci->avail); __wt_spin_unlock(session, &block->live_lock); /* Discard the lists remaining after the checkpoint call. */ diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c index e89c70060f3..d593537446b 100644 --- a/src/third_party/wiredtiger/src/block/block_ext.c +++ b/src/third_party/wiredtiger/src/block/block_ext.c @@ -8,12 +8,25 @@ #include "wt_internal.h" -static int __block_append(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t); +/* + * WT_BLOCK_RET -- + * Handle extension list errors that would normally panic the system but + * which should fail gracefully when verifying. + */ +#define WT_BLOCK_RET(session, block, v, ...) do { \ + int __ret = (v); \ + __wt_err(session, __ret, __VA_ARGS__); \ + return ((block)->verify ? __ret : __wt_panic(session)); \ +} while (0) + +static int __block_append(WT_SESSION_IMPL *, + WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t); static int __block_ext_overlap(WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **); static int __block_extlist_dump( WT_SESSION_IMPL *, const char *, WT_EXTLIST *, int); -static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t); +static int __block_merge(WT_SESSION_IMPL *, + WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t); /* * __block_off_srch_last -- @@ -308,8 +321,8 @@ __wt_block_misplaced(WT_SESSION_IMPL *session, * Remove a record from an extent list. */ static int -__block_off_remove( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, WT_EXT **extp) +__block_off_remove(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, WT_EXT **extp) { WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH]; WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH]; @@ -370,7 +383,7 @@ __block_off_remove( return (0); corrupt: - WT_PANIC_RET(session, EINVAL, + WT_BLOCK_RET(session, block, EINVAL, "attempt to remove non-existent offset from an extent list"); } @@ -380,8 +393,8 @@ corrupt: * overlapping entry. */ int -__wt_block_off_remove_overlap( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +__wt_block_off_remove_overlap(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, wt_off_t size) { WT_EXT *before, *after, *ext; wt_off_t a_off, a_size, b_off, b_size; @@ -393,7 +406,8 @@ __wt_block_off_remove_overlap( /* If "before" or "after" overlaps, retrieve the overlapping entry. */ if (before != NULL && before->off + before->size > off) { - WT_RET(__block_off_remove(session, el, before->off, &ext)); + WT_RET(__block_off_remove( + session, block, el, before->off, &ext)); /* Calculate overlapping extents. */ a_off = ext->off; @@ -401,7 +415,8 @@ __wt_block_off_remove_overlap( b_off = off + size; b_size = ext->size - (a_size + size); } else if (after != NULL && off + size > after->off) { - WT_RET(__block_off_remove(session, el, after->off, &ext)); + WT_RET(__block_off_remove( + session, block, el, after->off, &ext)); /* * Calculate overlapping extents. There's no initial overlap @@ -525,7 +540,7 @@ __wt_block_alloc( __block_size_srch(block->live.avail.sz, size, sstack); if ((szp = *sstack[0]) == NULL) { append: WT_RET(__block_extend(session, block, offp, size)); - WT_RET(__block_append(session, + WT_RET(__block_append(session, block, &block->live.alloc, *offp, (wt_off_t)size)); return (0); } @@ -535,7 +550,8 @@ append: WT_RET(__block_extend(session, block, offp, size)); } /* Remove the record, and set the returned offset. */ - WT_RET(__block_off_remove(session, &block->live.avail, ext->off, &ext)); + WT_RET(__block_off_remove( + session, block, &block->live.avail, ext->off, &ext)); *offp = ext->off; /* If doing a partial allocation, adjust the record and put it back. */ @@ -561,7 +577,7 @@ append: WT_RET(__block_extend(session, block, offp, size)); /* Add the newly allocated extent to the list of allocations. */ WT_RET(__block_merge( - session, &block->live.alloc, *offp, (wt_off_t)size)); + session, block, &block->live.alloc, *offp, (wt_off_t)size)); return (0); } @@ -618,12 +634,12 @@ __wt_block_off_free( * list. */ if ((ret = __wt_block_off_remove_overlap( - session, &block->live.alloc, offset, size)) == 0) - ret = __block_merge( - session, &block->live.avail, offset, (wt_off_t)size); + session, block, &block->live.alloc, offset, size)) == 0) + ret = __block_merge(session, block, + &block->live.avail, offset, (wt_off_t)size); else if (ret == WT_NOTFOUND) - ret = __block_merge( - session, &block->live.discard, offset, (wt_off_t)size); + ret = __block_merge(session, block, + &block->live.discard, offset, (wt_off_t)size); return (ret); } @@ -770,9 +786,12 @@ __block_ext_overlap(WT_SESSION_IMPL *session, */ *ap = (*ap)->next[0]; *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, ael, a->off, NULL)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge( + session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove( + session, block, ael, a->off, NULL)); + WT_RET(__block_off_remove( + session, block, bel, b->off, NULL)); } else if (a->size > b->size) { /* Case #4 */ /* @@ -780,7 +799,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Increment/Decrement A's offset/size by the size of B * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove( + session, block, ael, a->off, &a)); a->off += b->size; a->size -= b->size; WT_RET(__block_ext_insert(session, ael, a)); @@ -791,15 +811,18 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete B */ *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge( + session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove( + session, block, bel, b->off, NULL)); } else { /* Case #9 */ /* * Remove B from its list * Increment/Decrement B's offset/size by the size of A * Insert B on its list */ - WT_RET(__block_off_remove(session, bel, b->off, &b)); + WT_RET(__block_off_remove( + session, block, bel, b->off, &b)); b->off += a->size; b->size -= a->size; WT_RET(__block_ext_insert(session, bel, b)); @@ -810,8 +833,10 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete A */ *ap = (*ap)->next[0]; - WT_RET(__block_merge(session, avail, a->off, a->size)); - WT_RET(__block_off_remove(session, ael, a->off, NULL)); + WT_RET(__block_merge( + session, block, avail, a->off, a->size)); + WT_RET(__block_off_remove( + session, block, ael, a->off, NULL)); } /* Case #6 */ } else if (a->off + a->size == b->off + b->size) { /* @@ -819,7 +844,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Decrement A's size by the size of B * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove(session, block, ael, a->off, &a)); a->size -= b->size; WT_RET(__block_ext_insert(session, ael, a)); @@ -829,8 +854,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete B */ *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge(session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove(session, block, bel, b->off, NULL)); } else if /* Case #3, #7 */ (a->off + a->size < b->off + b->size) { /* @@ -838,14 +863,14 @@ __block_ext_overlap(WT_SESSION_IMPL *session, */ off = b->off; size = (a->off + a->size) - b->off; - WT_RET(__block_merge(session, avail, off, size)); + WT_RET(__block_merge(session, block, avail, off, size)); /* * Remove A from its list * Decrement A's size by the overlap * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove(session, block, ael, a->off, &a)); a->size -= size; WT_RET(__block_ext_insert(session, ael, a)); @@ -854,7 +879,7 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Increment/Decrement B's offset/size by the overlap * Insert B on its list */ - WT_RET(__block_off_remove(session, bel, b->off, &b)); + WT_RET(__block_off_remove(session, block, bel, b->off, &b)); b->off += size; b->size -= size; WT_RET(__block_ext_insert(session, bel, b)); @@ -868,12 +893,12 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Decrement A's size by trailing part of A plus B's size * Insert A on its list */ - WT_RET(__block_off_remove(session, ael, a->off, &a)); + WT_RET(__block_off_remove(session, block, ael, a->off, &a)); a->size = b->off - a->off; WT_RET(__block_ext_insert(session, ael, a)); /* Add trailing part of A to A's list as a new element. */ - WT_RET(__block_merge(session, ael, off, size)); + WT_RET(__block_merge(session, block, ael, off, size)); /* * Move caller's B to the next element @@ -881,8 +906,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Delete B */ *bp = (*bp)->next[0]; - WT_RET(__block_merge(session, avail, b->off, b->size)); - WT_RET(__block_off_remove(session, bel, b->off, NULL)); + WT_RET(__block_merge(session, block, avail, b->off, b->size)); + WT_RET(__block_off_remove(session, block, bel, b->off, NULL)); } return (0); @@ -893,7 +918,8 @@ __block_ext_overlap(WT_SESSION_IMPL *session, * Merge one extent list into another. */ int -__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b) +__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *a, WT_EXTLIST *b) { WT_EXT *ext; WT_EXTLIST tmp; @@ -923,7 +949,7 @@ __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b) } WT_EXT_FOREACH(ext, a->off) - WT_RET(__block_merge(session, b, ext->off, ext->size)); + WT_RET(__block_merge(session, block, b, ext->off, ext->size)); return (0); } @@ -933,12 +959,13 @@ __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b) * Append a new entry to the allocation list. */ static int -__block_append( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +__block_append(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, wt_off_t size) { WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH]; u_int i; + WT_UNUSED(block); WT_ASSERT(session, el->track_size == 0); /* @@ -979,8 +1006,8 @@ __block_append( * Insert an extent into an extent list, merging if possible. */ int -__wt_block_insert_ext( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +__wt_block_insert_ext(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, wt_off_t size) { /* * There are currently two copies of this function (this code is a one- @@ -993,7 +1020,7 @@ __wt_block_insert_ext( * Callers of this function are expected to have already acquired any * locks required to manipulate the extent list. */ - return (__block_merge(session, el, off, size)); + return (__block_merge(session, block, el, off, size)); } /* @@ -1002,8 +1029,8 @@ __wt_block_insert_ext( * version). */ static int -__block_merge( - WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size) +__block_merge(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_EXTLIST *el, wt_off_t off, wt_off_t size) { WT_EXT *ext, *after, *before; @@ -1014,7 +1041,7 @@ __block_merge( __block_off_srch_pair(el, off, &before, &after); if (before != NULL) { if (before->off + before->size > off) - WT_PANIC_RET(session, EINVAL, + WT_BLOCK_RET(session, block, EINVAL, "%s: existing range %" PRIdMAX "-%" PRIdMAX " overlaps with merge range %" PRIdMAX "-%" PRIdMAX, el->name, @@ -1025,8 +1052,8 @@ __block_merge( before = NULL; } if (after != NULL) { - if (off + size > after->off) - WT_PANIC_RET(session, EINVAL, + if (off + size > after->off) { + WT_BLOCK_RET(session, block, EINVAL, "%s: merge range %" PRIdMAX "-%" PRIdMAX " overlaps with existing range %" PRIdMAX "-%" PRIdMAX, @@ -1034,6 +1061,7 @@ __block_merge( (intmax_t)off, (intmax_t)(off + size), (intmax_t)after->off, (intmax_t)(after->off + after->size)); + } if (off + size != after->off) after = NULL; } @@ -1053,7 +1081,8 @@ __block_merge( * the record we're going to use, adjust it and re-insert it. */ if (before == NULL) { - WT_RET(__block_off_remove(session, el, after->off, &ext)); + WT_RET(__block_off_remove( + session, block, el, after->off, &ext)); WT_RET(__wt_verbose(session, WT_VERB_BLOCK, "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" @@ -1067,10 +1096,11 @@ __block_merge( } else { if (after != NULL) { size += after->size; - WT_RET( - __block_off_remove(session, el, after->off, NULL)); + WT_RET(__block_off_remove( + session, block, el, after->off, NULL)); } - WT_RET(__block_off_remove(session, el, before->off, &ext)); + WT_RET(__block_off_remove( + session, block, el, before->off, &ext)); WT_RET(__wt_verbose(session, WT_VERB_BLOCK, "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %" @@ -1115,8 +1145,8 @@ __wt_block_extlist_read_avail(WT_SESSION_IMPL *session, * Extent blocks are allocated from the available list: if reading the * avail list, the extent blocks might be included, remove them. */ - WT_ERR_NOTFOUND_OK( - __wt_block_off_remove_overlap(session, el, el->offset, el->size)); + WT_ERR_NOTFOUND_OK(__wt_block_off_remove_overlap( + session, block, el, el->offset, el->size)); err: #ifdef HAVE_DIAGNOSTIC @@ -1137,7 +1167,8 @@ __wt_block_extlist_read(WT_SESSION_IMPL *session, WT_DECL_ITEM(tmp); WT_DECL_RET; wt_off_t off, size; - int (*func)(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t); + int (*func)( + WT_SESSION_IMPL *, WT_BLOCK *, WT_EXTLIST *, wt_off_t, wt_off_t); const uint8_t *p; /* If there isn't a list, we're done. */ @@ -1187,14 +1218,16 @@ __wt_block_extlist_read(WT_SESSION_IMPL *session, if (off < block->allocsize || off % block->allocsize != 0 || size % block->allocsize != 0 || - off + size > ckpt_size) -corrupted: WT_PANIC_RET(session, WT_ERROR, + off + size > ckpt_size) { +corrupted: __wt_scr_free(session, &tmp); + WT_BLOCK_RET(session, block, WT_ERROR, "file contains a corrupted %s extent list, range %" PRIdMAX "-%" PRIdMAX " past end-of-file", el->name, (intmax_t)off, (intmax_t)(off + size)); + } - WT_ERR(func(session, el, off, size)); + WT_ERR(func(session, block, el, off, size)); } if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK)) @@ -1290,7 +1323,7 @@ __wt_block_extlist_write(WT_SESSION_IMPL *session, * blocks never appear on any allocation list. */ WT_TRET(__wt_block_off_remove_overlap( - session, &block->live.alloc, el->offset, el->size)); + session, block, &block->live.alloc, el->offset, el->size)); WT_ERR(__wt_verbose(session, WT_VERB_BLOCK, "%s written %" PRIdMAX "/%" PRIu32, @@ -1331,7 +1364,7 @@ __wt_block_extlist_truncate( */ orig = fh->size; size = ext->off; - WT_RET(__block_off_remove(session, el, size, NULL)); + WT_RET(__block_off_remove(session, block, el, size, NULL)); fh->size = size; /* diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c index 517fb92491e..c78a6c39942 100644 --- a/src/third_party/wiredtiger/src/block/block_slvg.c +++ b/src/third_party/wiredtiger/src/block/block_slvg.c @@ -53,7 +53,7 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block) * any blocks we don't want as we process the file. */ WT_RET(__wt_block_insert_ext( - session, &block->live.alloc, allocsize, len - allocsize)); + session, block, &block->live.alloc, allocsize, len - allocsize)); return (0); } diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c index 2a279a174dc..dc9662bd5e0 100644 --- a/src/third_party/wiredtiger/src/block/block_vrfy.c +++ b/src/third_party/wiredtiger/src/block/block_vrfy.c @@ -87,6 +87,12 @@ __wt_block_verify_start(WT_SESSION_IMPL *session, WT_RET(__bit_alloc(session, block->frags, &block->fragfile)); /* + * Set this before reading any extent lists: don't panic if we see + * corruption. + */ + block->verify = 1; + + /* * We maintain an allocation list that is rolled forward through the * set of checkpoints. */ @@ -102,8 +108,6 @@ __wt_block_verify_start(WT_SESSION_IMPL *session, /* Configuration: strict behavior on any error. */ WT_RET(__wt_config_gets(session, cfg, "strict", &cval)); block->verify_strict = cval.val ? 1 : 0; - - block->verify = 1; return (0); } @@ -228,7 +232,7 @@ __wt_verify_ckpt_load( WT_RET(__wt_block_extlist_read( session, block, el, ci->file_size)); WT_RET(__wt_block_extlist_merge( - session, el, &block->verify_alloc)); + session, block, el, &block->verify_alloc)); __wt_block_extlist_free(session, el); } el = &ci->discard; @@ -236,7 +240,7 @@ __wt_verify_ckpt_load( WT_RET(__wt_block_extlist_read( session, block, el, ci->file_size)); WT_EXT_FOREACH(ext, el->off) - WT_RET(__wt_block_off_remove_overlap(session, + WT_RET(__wt_block_off_remove_overlap(session, block, &block->verify_alloc, ext->off, ext->size)); __wt_block_extlist_free(session, el); } @@ -265,7 +269,7 @@ __wt_verify_ckpt_load( * checkpoints. */ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) - WT_RET(__wt_block_off_remove_overlap(session, + WT_RET(__wt_block_off_remove_overlap(session, block, &block->verify_alloc, ci->root_offset, ci->root_size)); /* diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c index ec7d3109c0c..a8bbf8a0266 100644 --- a/src/third_party/wiredtiger/src/btree/bt_io.c +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -73,7 +73,13 @@ __wt_bt_read(WT_SESSION_IMPL *session, ip = etmp; dsk = ip->data; - } + } else if (btree->kencryptor != NULL && + !F_ISSET(btree, WT_BTREE_VERIFY) && + !F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) + WT_ERR_MSG(session, WT_ERROR, + "encryption configured, and existing file is not " + "encrypted"); + if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) { if (btree->compressor == NULL || btree->compressor->decompress == NULL) diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index f5b7c4661b6..f257a955801 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -9,6 +9,66 @@ #include "wt_internal.h" /* + * __page_refp -- + * Return the page's index and slot for a reference. + */ +static inline void +__page_refp(WT_SESSION_IMPL *session, + WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp) +{ + WT_PAGE_INDEX *pindex; + uint32_t i; + + /* + * Copy the parent page's index value: the page can split at any time, + * but the index's value is always valid, even if it's not up-to-date. + */ +retry: WT_INTL_INDEX_GET(session, ref->home, pindex); + + /* + * Use the page's reference hint: it should be correct unless the page + * split before our slot. If the page splits after our slot, the hint + * will point earlier in the array than our actual slot, so the first + * loop is from the hint to the end of the list, and the second loop + * is from the start of the list to the end of the list. (The second + * loop overlaps the first, but that only happen in cases where we've + * deepened the tree and aren't going to find our slot at all, that's + * not worth optimizing.) + * + * It's not an error for the reference hint to be wrong, it just means + * the first retrieval (which sets the hint for subsequent retrievals), + * is slower. + */ + i = ref->pindex_hint; + if (i < pindex->entries && pindex->index[i]->page == ref->page) { + *pindexp = pindex; + *slotp = i; + return; + } + while (++i < pindex->entries) + if (pindex->index[i]->page == ref->page) { + *pindexp = pindex; + *slotp = ref->pindex_hint = i; + return; + } + for (i = 0; i < pindex->entries; ++i) + if (pindex->index[i]->page == ref->page) { + *pindexp = pindex; + *slotp = ref->pindex_hint = i; + return; + } + + /* + * If we don't find our reference, the page split into a new level and + * our home pointer references the wrong page. After internal pages + * deepen, their reference structure home value are updated; yield and + * wait for that to happen. + */ + __wt_yield(); + goto retry; +} + +/* * __wt_tree_walk -- * Move to the next/previous page in the tree. */ @@ -99,7 +159,7 @@ ascend: /* } /* Figure out the current slot in the WT_REF array. */ - __wt_page_refp(session, ref, &pindex, &slot); + __page_refp(session, ref, &pindex, &slot); for (;;) { /* @@ -134,19 +194,13 @@ ascend: /* * parent of the current child page, our parent * reference can't have split or been evicted. */ - __wt_page_refp(session, ref, &pindex, &slot); + __page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } - - /* - * Set the reference hint (used when we continue - * the walk). - */ - ref->pindex_hint = slot; } *refp = ref; @@ -162,13 +216,15 @@ ascend: /* ++*walkcntp; for (;;) { - ref = pindex->index[slot]; - /* - * Set the reference hint (used when we continue the - * walk). + * Move to the next slot, and set the reference hint if + * it's wrong (used when we continue the walk). We don't + * update those hints when splitting, so it's common for + * them to be incorrect in some workloads. */ - ref->pindex_hint = slot; + ref = pindex->index[slot]; + if (ref->pindex_hint != slot) + ref->pindex_hint = slot; if (LF_ISSET(WT_READ_CACHE)) { /* @@ -270,7 +326,7 @@ ascend: /* couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; - __wt_page_refp(session, ref, &pindex, &slot); + __page_refp(session, ref, &pindex, &slot); if (couple == couple_orig) break; } diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index d56b44bbd95..f0a10cdf528 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -47,13 +47,13 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) */ int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove) + WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove) { WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; WT_PAGE *page; - WT_UPDATE *old_upd, **upd_entry; + WT_UPDATE *old_upd, *upd, **upd_entry; size_t ins_size, upd_size; uint32_t ins_slot; u_int i, skipdepth; @@ -61,6 +61,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, ins = NULL; page = cbt->ref->page; + upd = upd_arg; logged = 0; /* This code expects a remove to have a NULL value. */ @@ -90,7 +91,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, } else upd_entry = &cbt->ins->upd; - if (upd == NULL) { + if (upd_arg == NULL) { /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check( session, old_upd = *upd_entry)); @@ -165,7 +166,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, cbt->ins_head = ins_head; cbt->ins = ins; - if (upd == NULL) { + if (upd_arg == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); @@ -218,7 +219,8 @@ err: /* __wt_txn_unmodify(session); __wt_free(session, ins); cbt->ins = NULL; - __wt_free(session, upd); + if (upd_arg == NULL) + __wt_free(session, upd); } return (ret); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index be7ce2e9344..de4bf7268ed 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -392,100 +392,154 @@ typedef struct { (entry1).lsn.offset < (entry2).lsn.offset)) /* - * __log_wrlsn_server -- - * The log wrlsn server thread. + * __wt_log_wrlsn -- + * Process written log slots and attempt to coalesce them if the LSNs + * are contiguous. Returns 1 if slots were freed, 0 if no slots were + * freed in the progress arg. Must be called with the log slot lock held. */ -static WT_THREAD_RET -__log_wrlsn_server(void *arg) +int +__wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) { WT_CONNECTION_IMPL *conn; - WT_DECL_RET; WT_LOG *log; WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL]; - WT_LOGSLOT *slot; - WT_SESSION_IMPL *session; + WT_LOGSLOT *coalescing, *slot; size_t written_i; uint32_t i, save_i; - int yield; - session = arg; conn = S2C(session); log = conn->log; - yield = 0; - while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { - /* - * No need to use the log_slot_lock because the slot pool - * is statically allocated and any slot in the - * WT_LOG_SLOT_WRITTEN state is exclusively ours for now. - */ - i = 0; - written_i = 0; + coalescing = NULL; + written_i = 0; + i = 0; + if (free_i != NULL) + *free_i = WT_SLOT_POOL; + + /* + * Walk the array once saving any slots that are in the + * WT_LOG_SLOT_WRITTEN state. + */ + while (i < WT_SLOT_POOL) { + save_i = i; + slot = &log->slot_pool[i++]; + if (free_i != NULL && *free_i == WT_SLOT_POOL && + slot->slot_state == WT_LOG_SLOT_FREE) + *free_i = save_i; + if (slot->slot_state != WT_LOG_SLOT_WRITTEN) + continue; + written[written_i].slot_index = save_i; + written[written_i++].lsn = slot->slot_release_lsn; + } + /* + * If we found any written slots process them. We sort them + * based on the release LSN, and then look for them in order. + */ + if (written_i > 0) { /* - * Walk the array once saving any slots that are in the - * WT_LOG_SLOT_WRITTEN state. + * If wanted, reset the yield variable to indicate that we + * have found written slots. */ - while (i < WT_SLOT_POOL) { - save_i = i; - slot = &log->slot_pool[i++]; - if (slot->slot_state != WT_LOG_SLOT_WRITTEN) - continue; - written[written_i].slot_index = save_i; - written[written_i++].lsn = slot->slot_release_lsn; - } + if (yield != NULL) + *yield = 0; + WT_INSERTION_SORT(written, written_i, + WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT); + /* - * If we found any written slots process them. We sort them - * based on the release LSN, and then look for them in order. + * We know the written array is sorted by LSN. Go + * through them either advancing write_lsn or coalesce + * contiguous ranges of written slots. */ - if (written_i > 0) { - yield = 0; - WT_INSERTION_SORT(written, written_i, - WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT); - - /* - * We know the written array is sorted by LSN. Go - * through them either advancing write_lsn or stop - * as soon as one is not in order. - */ - for (i = 0; i < written_i; i++) { - if (WT_LOG_CMP(&log->write_lsn, - &written[i].lsn) != 0) - break; + for (i = 0; i < written_i; i++) { + slot = &log->slot_pool[written[i].slot_index]; + if (coalescing != NULL) { + if (WT_LOG_CMP(&coalescing->slot_end_lsn, + &written[i].lsn) != 0) { + coalescing = slot; + continue; + } + /* + * If we get here we have a slot to coalesce + * and free. + */ + coalescing->slot_end_lsn = slot->slot_end_lsn; + WT_STAT_FAST_CONN_INCR( + session, log_slot_coalesced); + /* + * Copy the flag for later closing. + */ + if (F_ISSET(slot, WT_SLOT_CLOSEFH)) + F_SET(coalescing, WT_SLOT_CLOSEFH); + } else { + /* + * If this written slot is not the next LSN, + * try to start coalescing with later slots. + */ + if (WT_LOG_CMP( + &log->write_lsn, &written[i].lsn) != 0) { + coalescing = slot; + continue; + } /* * If we get here we have a slot to process. * Advance the LSN and process the slot. */ - slot = &log->slot_pool[written[i].slot_index]; WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn, &slot->slot_release_lsn) == 0); log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; - WT_ERR(__wt_cond_signal(session, - log->log_write_cond)); + WT_RET(__wt_cond_signal( + session, log->log_write_cond)); WT_STAT_FAST_CONN_INCR(session, log_write_lsn); - /* * Signal the close thread if needed. */ if (F_ISSET(slot, WT_SLOT_CLOSEFH)) - WT_ERR(__wt_cond_signal(session, - conn->log_file_cond)); - WT_ERR(__wt_log_slot_free(session, slot)); + WT_RET(__wt_cond_signal( + session, conn->log_file_cond)); } + WT_RET(__wt_log_slot_free(session, slot)); + if (free_i != NULL && *free_i == WT_SLOT_POOL && + slot->slot_state == WT_LOG_SLOT_FREE) + *free_i = save_i; } - /* - * If we saw a later write, we always want to yield because - * we know something is in progress. - */ - if (yield++ < 1000) + } + return (0); +} + +/* + * __log_wrlsn_server -- + * The log wrlsn server thread. + */ +static WT_THREAD_RET +__log_wrlsn_server(void *arg) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_SESSION_IMPL *session; + int locked, yield; + + session = arg; + conn = S2C(session); + log = conn->log; + locked = yield = 0; + while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + __wt_spin_lock(session, &log->log_slot_lock); + locked = 1; + WT_ERR(__wt_log_wrlsn(session, NULL, &yield)); + locked = 0; + __wt_spin_unlock(session, &log->log_slot_lock); + if (++yield < 1000) __wt_yield(); else - /* Wait until the next event. */ WT_ERR(__wt_cond_wait(session, conn->log_wrlsn_cond, 100000)); } - - if (0) + if (0) { err: __wt_err(session, ret, "log wrlsn server error"); + } + if (locked) + __wt_spin_unlock(session, &log->log_slot_lock); return (WT_THREAD_RET_VALUE); } diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index caa6c469b30..d13ec1972fb 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -274,60 +274,6 @@ __wt_page_evict_soon(WT_PAGE *page) } /* - * __wt_page_refp -- - * Return the page's index and slot for a reference. - */ -static inline void -__wt_page_refp(WT_SESSION_IMPL *session, - WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp) -{ - WT_PAGE_INDEX *pindex; - uint32_t i; - - /* - * Copy the parent page's index value: the page can split at any time, - * but the index's value is always valid, even if it's not up-to-date. - */ -retry: WT_INTL_INDEX_GET(session, ref->home, pindex); - - /* - * Use the page's reference hint: it should be correct unless the page - * split before our slot. If the page splits after our slot, the hint - * will point earlier in the array than our actual slot, so the first - * loop is from the hint to the end of the list, and the second loop - * is from the start of the list to the end of the list. (The second - * loop overlaps the first, but that only happen in cases where we've - * deepened the tree and aren't going to find our slot at all, that's - * not worth optimizing.) - * - * It's not an error for the reference hint to be wrong, it just means - * the first retrieval (which sets the hint for subsequent retrievals), - * is slower. - */ - for (i = ref->pindex_hint; i < pindex->entries; ++i) - if (pindex->index[i]->page == ref->page) { - *pindexp = pindex; - *slotp = ref->pindex_hint = i; - return; - } - for (i = 0; i < pindex->entries; ++i) - if (pindex->index[i]->page == ref->page) { - *pindexp = pindex; - *slotp = ref->pindex_hint = i; - return; - } - - /* - * If we don't find our reference, the page split into a new level and - * our home pointer references the wrong page. After internal pages - * deepen, their reference structure home value are updated; yield and - * wait for that to happen. - */ - __wt_yield(); - goto retry; -} - -/* * __wt_page_modify_init -- * A page is about to be modified, allocate the modification structure. */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 87099ac839f..0826fa7b10b 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -26,14 +26,14 @@ extern int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp); extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp); extern int __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live); -extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size); +extern int __wt_block_off_remove_overlap(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size); extern int __wt_block_alloc( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size); extern int __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size); extern int __wt_block_off_free( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size); extern int __wt_block_extlist_check( WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl); extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci); -extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b); -extern int __wt_block_insert_ext( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size); +extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *a, WT_EXTLIST *b); +extern int __wt_block_insert_ext(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t off, wt_off_t size); extern int __wt_block_extlist_read_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size); extern int __wt_block_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size); extern int __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional); @@ -171,7 +171,7 @@ extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, c extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref); extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref); extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page); -extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove); +extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove); extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep); extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep); extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); @@ -237,6 +237,7 @@ extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn); extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn); extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]); +extern int __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield); extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_logmgr_open(WT_SESSION_IMPL *session); extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session); diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h index 889fd908388..1c61768d372 100644 --- a/src/third_party/wiredtiger/src/include/gcc.h +++ b/src/third_party/wiredtiger/src/include/gcc.h @@ -169,6 +169,18 @@ #define WT_READ_BARRIER() WT_FULL_BARRIER() #define WT_WRITE_BARRIER() WT_FULL_BARRIER() +#elif defined(__aarch64__) +#define WT_PAUSE() __asm__ volatile("yield" ::: "memory") +#define WT_FULL_BARRIER() do { \ + __asm__ volatile ("dsb sy" ::: "memory"); \ +} while (0) +#define WT_READ_BARRIER() do { \ + __asm__ volatile ("dsb ld" ::: "memory"); \ +} while (0) +#define WT_WRITE_BARRIER() do { \ + __asm__ volatile ("dsb st" ::: "memory"); \ +} while (0) + #else #error "No write barrier implementation for this hardware" #endif diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index 051f9fb262e..fbb0a3e3842 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -158,10 +158,9 @@ typedef struct { */ #define WT_SLOT_ACTIVE 1 #define WT_SLOT_POOL 128 - uint32_t pool_index; /* Global pool index */ WT_LOGSLOT *slot_array[WT_SLOT_ACTIVE]; /* Active slots */ WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ - uint32_t slot_buf_size; /* Buffer size for slots */ + size_t slot_buf_size; /* Buffer size for slots */ #define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */ uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h index fd7fd16dea7..7a5028d6a28 100644 --- a/src/third_party/wiredtiger/src/include/mutex.h +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -32,7 +32,9 @@ typedef union { /* Read/write lock */ WiredTiger read/write locks require modification for big-endian systems. #else uint64_t u; - uint32_t us; + struct { + uint32_t us; + } i; struct { uint16_t writers; uint16_t readers; diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index f05d87c058b..6dc9282a613 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -221,10 +221,10 @@ struct __wt_connection_stats { WT_STATS log_scan_rereads; WT_STATS log_scans; WT_STATS log_slot_closes; + WT_STATS log_slot_coalesced; WT_STATS log_slot_consolidated; WT_STATS log_slot_joins; WT_STATS log_slot_races; - WT_STATS log_slot_switch_fails; WT_STATS log_slot_toobig; WT_STATS log_slot_toosmall; WT_STATS log_slot_transitions; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 096fea3eeb3..e8f3b9958ce 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -3720,14 +3720,14 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LOG_SCANS 1087 /*! log: consolidated slot closures */ #define WT_STAT_CONN_LOG_SLOT_CLOSES 1088 +/*! log: written slots coalesced */ +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1089 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1089 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1090 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1090 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1091 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1091 -/*! log: slots selected for switching that were unavailable */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1092 +#define WT_STAT_CONN_LOG_SLOT_RACES 1092 /*! log: record size exceeded maximum */ #define WT_STAT_CONN_LOG_SLOT_TOOBIG 1093 /*! log: failed to find a slot large enough for record */ diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 77ae0383cbe..4242571fe53 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -1217,6 +1217,7 @@ __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created) */ while (log->log_close_fh != NULL) { WT_STAT_FAST_CONN_INCR(session, log_close_yields); + WT_RET(__wt_log_wrlsn(session, NULL, NULL)); __wt_yield(); } log->log_close_fh = log->log_fh; diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c index 8723d492e13..0b580af4526 100644 --- a/src/third_party/wiredtiger/src/log/log_slot.c +++ b/src/third_party/wiredtiger/src/log/log_slot.c @@ -53,15 +53,14 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) /* * Allocate memory for buffers now that the arrays are setup. Split * this out to make error handling simpler. - */ - /* + * * Cap the slot buffer to the log file size. */ - log->slot_buf_size = (uint32_t)WT_MIN( - conn->log_file_max, WT_LOG_SLOT_BUF_SIZE); + log->slot_buf_size = + WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE); for (i = 0; i < WT_SLOT_POOL; i++) { WT_ERR(__wt_buf_init(session, - &log->slot_pool[i].slot_buf, (size_t)log->slot_buf_size)); + &log->slot_pool[i].slot_buf, log->slot_buf_size)); F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); } WT_STAT_FAST_CONN_INCRV(session, @@ -189,6 +188,36 @@ join_slot: } /* + * __log_slot_find_free -- + * Find and return a free log slot. + */ +static int +__log_slot_find_free(WT_SESSION_IMPL *session, WT_LOGSLOT **slot) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + uint32_t pool_i; + + conn = S2C(session); + log = conn->log; + WT_ASSERT(session, slot != NULL); + /* + * Encourage processing and moving the write LSN forward. + * That process has to walk the slots anyway, so do that + * work and let it give us the index of a free slot along + * the way. + */ + WT_RET(__wt_log_wrlsn(session, &pool_i, NULL)); + while (pool_i == WT_SLOT_POOL) { + __wt_yield(); + WT_RET(__wt_log_wrlsn(session, &pool_i, NULL)); + } + *slot = &log->slot_pool[pool_i]; + WT_ASSERT(session, (*slot)->slot_state == WT_LOG_SLOT_FREE); + return (0); +} + +/* * __wt_log_slot_close -- * Close a slot and do not allow any other threads to join this slot. * Remove this from the active slot array and move a new slot from @@ -202,40 +231,13 @@ __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_LOG *log; WT_LOGSLOT *newslot; int64_t old_state; - int32_t yields; - uint32_t pool_i, switch_fails; conn = S2C(session); log = conn->log; - switch_fails = 0; -retry: /* * Find an unused slot in the pool. */ - pool_i = log->pool_index; - newslot = &log->slot_pool[pool_i]; - if (++log->pool_index >= WT_SLOT_POOL) - log->pool_index = 0; - if (newslot->slot_state != WT_LOG_SLOT_FREE) { - WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails); - /* - * If it takes a number of attempts to find an available slot - * it's likely all slots are waiting to be released. This - * churn is used to change how long we pause before closing - * the slot - which leads to more consolidation and less churn. - */ - if (++switch_fails % WT_SLOT_POOL == 0 && slot->slot_churn < 5) - ++slot->slot_churn; - __wt_yield(); - goto retry; - } else if (slot->slot_churn > 0) { - --slot->slot_churn; - WT_ASSERT(session, slot->slot_churn >= 0); - } - - /* Pause to allow other threads a chance to consolidate. */ - for (yields = slot->slot_churn; yields >= 0; yields--) - __wt_yield(); + WT_RET(__log_slot_find_free(session, &newslot)); /* * Swap out the slot we're going to use and put a free one in the @@ -244,7 +246,7 @@ retry: WT_STAT_FAST_CONN_INCR(session, log_slot_closes); newslot->slot_state = WT_LOG_SLOT_READY; newslot->slot_index = slot->slot_index; - log->slot_array[newslot->slot_index] = &log->slot_pool[pool_i]; + log->slot_array[newslot->slot_index] = newslot; old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING); slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY); /* diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index c887af58540..2ac1bfa71a1 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -125,31 +125,21 @@ __wt_meta_track_on(WT_SESSION_IMPL *session) * Apply the changes in a metadata tracking record. */ static int -__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) +__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; int tret; - /* - * Unlock handles and complete checkpoints regardless of whether we are - * unrolling. - */ - if (!unroll && trk->op != WT_ST_CHECKPOINT && - trk->op != WT_ST_DROP_COMMIT && trk->op != WT_ST_LOCK) - goto free; - switch (trk->op) { case WT_ST_EMPTY: /* Unused slot */ break; case WT_ST_CHECKPOINT: /* Checkpoint, see above */ - if (!unroll) { - btree = trk->dhandle->handle; - bm = btree->bm; - WT_WITH_DHANDLE(session, trk->dhandle, - WT_TRET(bm->checkpoint_resolve(bm, session))); - } + btree = trk->dhandle->handle; + bm = btree->bm; + WT_WITH_DHANDLE(session, trk->dhandle, + WT_TRET(bm->checkpoint_resolve(bm, session))); break; case WT_ST_DROP_COMMIT: if ((tret = __wt_remove_if_exists(session, trk->a)) != 0) { @@ -159,8 +149,40 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) WT_TRET(tret); } break; + case WT_ST_LOCK: + WT_WITH_DHANDLE(session, trk->dhandle, + WT_TRET(__wt_session_release_btree(session))); + break; + case WT_ST_FILEOP: + case WT_ST_REMOVE: + case WT_ST_SET: + break; + WT_ILLEGAL_VALUE(session); + } + + __meta_track_clear(session, trk); + return (ret); +} + +/* + * __meta_track_unroll -- + * Undo the changes in a metadata tracking record. + */ +static int +__meta_track_unroll(WT_SESSION_IMPL *session, WT_META_TRACK *trk) +{ + WT_DECL_RET; + int tret; + + switch (trk->op) { + case WT_ST_EMPTY: /* Unused slot */ + break; + case WT_ST_CHECKPOINT: /* Checkpoint, see above */ + break; + case WT_ST_DROP_COMMIT: + break; case WT_ST_LOCK: /* Handle lock, see above */ - if (unroll && trk->created) + if (trk->created) F_SET(trk->dhandle, WT_DHANDLE_DISCARD); WT_WITH_DHANDLE(session, trk->dhandle, WT_TRET(__wt_session_release_btree(session))); @@ -214,8 +236,7 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) WT_ILLEGAL_VALUE(session); } -free: __meta_track_clear(session, trk); - + __meta_track_clear(session, trk); return (ret); } @@ -279,33 +300,38 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, int need_sync, int unroll) if (trk == trk_orig) return (0); - while (--trk >= trk_orig) - WT_TRET(__meta_track_apply(session, trk, unroll)); + if (unroll) { + while (--trk >= trk_orig) + WT_TRET(__meta_track_unroll(session, trk)); + /* Unroll operations don't need to flush the metadata. */ + return (ret); + } /* - * Unroll operations don't need to flush the metadata. - * - * Also, if we don't have the metadata handle (e.g, we're in the - * process of creating the metadata), we can't sync it. + * If we don't have the metadata handle (e.g, we're in the process of + * creating the metadata), we can't sync it. */ - if (unroll || ret != 0 || !need_sync || session->meta_dhandle == NULL) - return (ret); + if (!need_sync || session->meta_dhandle == NULL) + goto done; /* If we're logging, make sure the metadata update was flushed. */ if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) { - if (!FLD_ISSET(S2C(session)->txn_logsync, - WT_LOG_DSYNC | WT_LOG_FSYNC)) - WT_WITH_DHANDLE(session, session->meta_dhandle, - ret = __wt_txn_checkpoint_log(session, - 0, WT_TXN_LOG_CKPT_SYNC, NULL)); + WT_WITH_DHANDLE(session, session->meta_dhandle, + ret = __wt_txn_checkpoint_log(session, + 0, WT_TXN_LOG_CKPT_SYNC, NULL)); + WT_RET(ret); } else { WT_WITH_DHANDLE(session, session->meta_dhandle, ret = __wt_checkpoint(session, NULL)); WT_RET(ret); WT_WITH_DHANDLE(session, session->meta_dhandle, ret = __wt_checkpoint_sync(session, NULL)); + WT_RET(ret); } +done: /* Apply any tracked operations post-commit. */ + for (; trk_orig < trk; trk_orig++) + WT_TRET(__meta_track_apply(session, trk_orig)); return (ret); } @@ -342,7 +368,7 @@ __wt_meta_track_sub_off(WT_SESSION_IMPL *session) session->meta_track_next = session->meta_track_sub = NULL; while (--trk >= trk_orig) - WT_TRET(__meta_track_apply(session, trk, 0)); + WT_TRET(__meta_track_apply(session, trk)); session->meta_track_next = trk_orig; return (ret); diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c index 19183ed9030..cdd4f8a24e1 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c @@ -216,7 +216,7 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) ++copy.s.writers; ++copy.s.readers; - l->us = copy.us; + l->i.us = copy.i.us; return (0); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 703bebb1597..53a73b44feb 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -5108,7 +5108,7 @@ err: __wt_scr_free(session, &tkey); */ mod->rec_max_txn = r->max_txn; if (!F_ISSET(r, WT_EVICTING) && - !WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) + WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) btree->rec_max_txn = r->max_txn; /* diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 0310fdc207c..b0e7d660587 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -484,12 +484,11 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) stats->log_prealloc_used.desc = "log: pre-allocated log files used"; stats->log_slot_toobig.desc = "log: record size exceeded maximum"; stats->log_scan_records.desc = "log: records processed by log scan"; - stats->log_slot_switch_fails.desc = - "log: slots selected for switching that were unavailable"; stats->log_compress_mem.desc = "log: total in-memory size of compressed records"; stats->log_buffer_size.desc = "log: total log buffer size"; stats->log_compress_len.desc = "log: total size of compressed records"; + stats->log_slot_coalesced.desc = "log: written slots coalesced"; stats->log_close_yields.desc = "log: yields waiting for previous log file close"; stats->lsm_work_queue_app.desc = @@ -647,9 +646,9 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->log_prealloc_used.v = 0; stats->log_slot_toobig.v = 0; stats->log_scan_records.v = 0; - stats->log_slot_switch_fails.v = 0; stats->log_compress_mem.v = 0; stats->log_compress_len.v = 0; + stats->log_slot_coalesced.v = 0; stats->log_close_yields.v = 0; stats->lsm_rows_merged.v = 0; stats->lsm_checkpoint_throttle.v = 0; diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index f317a3dc697..49fcd69ffed 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -352,7 +352,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN_STATE *txn_state; void *saved_meta_next; u_int i; - int full, fullckpt_logging, idle, tracking; + int full, idle, logging, tracking; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; @@ -361,7 +361,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); saved_isolation = session->isolation; - full = fullckpt_logging = idle = tracking = 0; + full = idle = logging= tracking = 0; /* Ensure the metadata table is open before taking any locks. */ WT_RET(__wt_metadata_open(session)); @@ -373,8 +373,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__checkpoint_apply_all(session, cfg, NULL, &full)); /* Configure logging only if doing a full checkpoint. */ - fullckpt_logging = - full && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED); + logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED); /* * Get a list of handles we want to flush; this may pull closed objects @@ -424,7 +423,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) tracking = 1; /* Tell logging that we are about to start a database checkpoint. */ - if (fullckpt_logging) + if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_PREPARE, NULL)); @@ -494,7 +493,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) txn_state->id = txn_state->snap_min = WT_TXN_NONE; /* Tell logging that we have started a database checkpoint. */ - if (fullckpt_logging) + if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_START, NULL)); @@ -532,26 +531,29 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_commit(session, NULL)); /* - * If any tree was dirty, we will have updated the metadata with the - * new checkpoint information. If the metadata is clean, all other - * trees must have been clean. - * - * Disable metadata tracking during the metadata checkpoint. - * - * We don't lock old checkpoints in the metadata file: there is no way - * to open one. We are holding other handle locks, it is not safe to - * lock conn->spinlock. + * Ensure that the metadata changes are durable before the checkpoint + * is resolved. Do this by either checkpointing the metadata or syncing + * the log file. + * Recovery relies on the checkpoint LSN in the metadata only being + * updated by full checkpoints so only checkpoint the metadata for + * full or non-logged checkpoints. */ - session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; - saved_meta_next = session->meta_track_next; - session->meta_track_next = NULL; - WT_WITH_DHANDLE(session, - session->meta_dhandle, ret = __wt_checkpoint(session, cfg)); - session->meta_track_next = saved_meta_next; - WT_ERR(ret); - - WT_ERR(__checkpoint_verbose_track(session, - "metadata sync completed", &verb_timer)); + if (full || !logging) { + session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; + /* Disable metadata tracking during the metadata checkpoint. */ + saved_meta_next = session->meta_track_next; + session->meta_track_next = NULL; + WT_WITH_DHANDLE(session, + session->meta_dhandle, ret = __wt_checkpoint(session, cfg)); + session->meta_track_next = saved_meta_next; + WT_ERR(ret); + + WT_ERR(__checkpoint_verbose_track(session, + "metadata sync completed", &verb_timer)); + } else + WT_WITH_DHANDLE(session, session->meta_dhandle, + ret = __wt_txn_checkpoint_log(session, + 0, WT_TXN_LOG_CKPT_SYNC, NULL)); if (full) { WT_ERR(__wt_epoch(session, &stop)); @@ -590,7 +592,7 @@ err: /* * Tell logging that we have finished a database checkpoint. Do not * write a log record if the database was idle. */ - if (fullckpt_logging) { + if (full && logging) { if (ret == 0 && F_ISSET((WT_BTREE *)session->meta_dhandle->handle, WT_BTREE_SKIP_CKPT)) @@ -1174,19 +1176,21 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final) } /* - * If closing a modified file, checkpoint the file and optionally flush - * the writes (the checkpoint call will discard the blocks, there's no - * additional step needed). - * * We should already have the schema lock unless we're finishing a bulk * load -- the only other paths to closing files (sweep and LSM) have * already checked for read-only trees. */ - if (!final) - WT_ASSERT(session, - bulk || F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); + WT_ASSERT(session, + final || bulk || F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); + + /* + * Turn on metadata tracking if: + * - The session is not already doing metadata tracking. + * - The file was bulk loaded. + * - The close is not during connection close. + */ + need_tracking = !WT_META_TRACKING(session) && !bulk && !final; - need_tracking = !bulk && !final && !WT_META_TRACKING(session); if (need_tracking) WT_RET(__wt_meta_track_on(session)); |