summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@mongodb.com>2017-06-24 03:01:13 +1000
committerMichael Cahill <michael.cahill@mongodb.com>2017-06-24 03:01:13 +1000
commitcf713d9b8cf0436f08facc8171ffb407d380ea85 (patch)
tree86df947be6762f82e30f5fe411024039d8b339dd
parenta02c5e1cf53e83de05d3c98ed38673d91a350ce0 (diff)
downloadmongo-cf713d9b8cf0436f08facc8171ffb407d380ea85.tar.gz
Import wiredtiger: 2b048c9ad2dfde7ad9c2a1329bc082dfc882aec7 from branch mongodb-3.6
ref: 47e8c3d1d2..2b048c9ad2 for: 3.5.10 SERVER-29439 WiredTiger turtle file "MoveFileExW: Access is denied." error. WT-3251 Remove interim buffer used to split pages during reconciliation WT-3367 Switch OSX builders to 10.12 WT-3370 Heap use after free in txn recover code WT-3371 Make Windows/MSVC build warnings fatal. WT-3379 Avoid a performance regression on secondaries
-rw-r--r--src/third_party/wiredtiger/SConstruct10
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_page.c28
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_table.c4
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_turtle.c6
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c484
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c10
-rw-r--r--src/third_party/wiredtiger/test/mciproject.yml17
8 files changed, 339 insertions, 222 deletions
diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct
index 22f869e02e7..2661807594d 100644
--- a/src/third_party/wiredtiger/SConstruct
+++ b/src/third_party/wiredtiger/SConstruct
@@ -67,15 +67,11 @@ var.Add('CPPPATH', 'C Preprocessor include path', [
])
var.Add('CFLAGS', 'C Compiler Flags', [
- "/Z7", # Generate debugging symbols
"/wd4090", # Ignore warning about mismatched const qualifiers
"/wd4996", # Ignore deprecated functions
"/W3", # Warning level 3
- #"/we4244", # Possible loss of data
- "/we4013", # Error on undefined functions
- #"/we4047", # Indirection differences in types
- #"/we4024", # Differences in parameter types
- #"/we4100", # Unreferenced local parameter
+ "/WX", # Warnings are fatal
+ "/Z7", # Generate debugging symbols
"/TC", # Compile as C code
#"/Od", # Disable optimization
"/Ob1", # inline expansion
@@ -338,6 +334,8 @@ if GetOption("lang-python"):
"-nodefaultctor",
"-nodefaultdtor",
])
+ # Ignore warnings in swig-generated code.
+ pythonEnv['CFLAGS'].remove("/WX")
swiglib = pythonEnv.SharedLibrary('_wiredtiger',
[ 'lang\python\wiredtiger.i'],
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index ec6d864a530..fdde764ea7b 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "47e8c3d1d22018eaaa09f91dfd78addb49e0b49b",
+ "commit": "2b048c9ad2dfde7ad9c2a1329bc082dfc882aec7",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-3.6"
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 01818f106fc..d50326afb1e 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -420,7 +420,7 @@ __evict_review(
WT_DECL_RET;
WT_PAGE *page;
uint32_t flags;
- bool lookaside_retry, modified;
+ bool lookaside_retry, *lookaside_retryp, modified;
conn = S2C(session);
flags = WT_EVICTING;
@@ -541,6 +541,9 @@ __evict_review(
* the page and keep it in memory.
*/
cache = conn->cache;
+ lookaside_retry = false;
+ lookaside_retryp = NULL;
+
if (closing)
LF_SET(WT_VISIBILITY_ERR);
else if (!WT_PAGE_IS_INTERNAL(page)) {
@@ -552,23 +555,26 @@ __evict_review(
if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB))
LF_SET(WT_EVICT_SCRUB);
+
+ /*
+ * Check if reconciliation suggests trying the
+ * lookaside table.
+ */
+ lookaside_retryp = &lookaside_retry;
}
}
/* Reconcile the page. */
- ret = __wt_reconcile(session, ref, NULL, flags, &lookaside_retry);
+ ret = __wt_reconcile(session, ref, NULL, flags, lookaside_retryp);
/*
- * If reconciliation fails, eviction is stuck and reconciliation reports
- * it might succeed if we use the lookaside table (the page didn't have
- * uncommitted updates, it was not-yet-globally visible updates causing
- * the problem), configure reconciliation to write those updates to the
- * lookaside table, allowing the eviction of pages we'd otherwise have
- * to retain in cache to support older readers.
+ * If reconciliation fails, eviction is stuck and reconciliation
+ * reports it might succeed if we use the lookaside table, then
+ * configure reconciliation to write those updates to the lookaside
+ * table, allowing the eviction of pages we'd otherwise have to retain
+ * in cache to support older readers.
*/
- if (ret == EBUSY &&
- !F_ISSET(conn, WT_CONN_IN_MEMORY) &&
- __wt_cache_stuck(session) && lookaside_retry) {
+ if (ret == EBUSY && lookaside_retry && __wt_cache_stuck(session)) {
LF_CLR(WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE);
LF_SET(WT_EVICT_LOOKASIDE);
ret = __wt_reconcile(session, ref, NULL, flags, NULL);
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
index 65835a16c8b..326ad12bd33 100644
--- a/src/third_party/wiredtiger/src/meta/meta_table.c
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -267,7 +267,9 @@ __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char **valuep)
* that Coverity complains a lot, add an error check to get some
* peace and quiet.
*/
- if ((ret = __wt_turtle_read(session, key, valuep)) != 0)
+ WT_WITH_TURTLE_LOCK(session,
+ ret = __wt_turtle_read(session, key, valuep));
+ if (ret != 0)
__wt_free(session, *valuep);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
index 7a99df6b83b..362a3aa2bbe 100644
--- a/src/third_party/wiredtiger/src/meta/meta_turtle.c
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -246,6 +246,9 @@ __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep)
*valuep = NULL;
+ /* Require single-threading. */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TURTLE));
+
/*
* Open the turtle file; there's one case where we won't find the turtle
* file, yet still succeed. We create the metadata file before creating
@@ -302,6 +305,9 @@ __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value)
fs = NULL;
+ /* Require single-threading. */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TURTLE));
+
/*
* Create the turtle setup file: we currently re-write it from scratch
* every time.
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index f7df73c4ecb..1c266496ec8 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -25,12 +25,25 @@ typedef struct {
WT_PAGE *page;
uint32_t flags; /* Caller's configuration */
- WT_ITEM disk_image; /* Temporary disk-image buffer */
/*
- * Temporary buffer used to write out a disk image when managing two
- * chunks worth of data in memory
- */
- WT_ITEM *interim_buf;
+ * Reconciliation can end up requiring two temporary disk image buffers
+ * if a page split is involved. These two disk images are pointed to by
+ * current and the previous image pointers. During initialization the
+ * first image is allocated and pointed to by the current image pointer.
+ * If and when a split is involved the second image gets allocated and
+ * is pointed to by the current image pointer. The previous image
+ * pointer is made to refer the first image at this point. Two images
+ * are kept in memory to redistribute data among them in case the last
+ * split chunk ends up being smaller than the minimum required. As
+ * reconciliation generates more split chunks, the image referred to by
+ * the previous image pointer is written to the disk, the current and
+ * the previous image pointers are swapped, making space for another
+ * split chunk to be reconciled in the buffer that was just written out
+ * to the disk.
+ */
+ WT_ITEM disk_image[2]; /* Temporary disk-image buffers */
+ WT_ITEM *cur_img_ptr;
+ WT_ITEM *prev_img_ptr;
/*
* Track start/stop write generation to decide if all changes to the
@@ -146,17 +159,6 @@ typedef struct {
* that references all of our split pages.
*/
struct __rec_boundary {
- /*
- * Offset is the byte offset in the initial split buffer of the
- * first byte of the split chunk, recorded before we decide to
- * split the page; the difference between chunk[1]'s offset and
- * chunk[0]'s offset is chunk[0]'s length.
- *
- * Once we split a page, we stop filling in offset values, we're
- * writing the split chunks as we find them.
- */
- size_t offset; /* Split's first byte */
-
WT_ADDR addr; /* Split's written location */
uint32_t size; /* Split's size */
uint32_t checksum; /* Split's checksum */
@@ -338,7 +340,8 @@ static int __rec_split_write(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, bool);
static int __rec_update_las(
WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *);
-static int __rec_write_check_complete(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int __rec_write_check_complete(
+ WT_SESSION_IMPL *, WT_RECONCILE *, bool *);
static int __rec_write_init(WT_SESSION_IMPL *,
WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *);
@@ -438,7 +441,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
/* Checks for a successful reconciliation. */
if (ret == 0)
- ret = __rec_write_check_complete(session, r);
+ ret = __rec_write_check_complete(session, r, lookaside_retryp);
/* Wrap up the page reconciliation. */
if (ret == 0 && (ret = __rec_write_wrapup(session, r, page)) == 0)
@@ -449,14 +452,6 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
/* Release the reconciliation lock. */
WT_PAGE_UNLOCK(session, page);
- /*
- * If our caller can configure lookaside table reconciliation, flag if
- * that's worth trying. The lookaside table doesn't help if we skipped
- * updates, it can only help with older readers preventing eviction.
- */
- if (lookaside_retryp != NULL && r->update_mem_uncommitted == 0)
- *lookaside_retryp = true;
-
/* Update statistics. */
WT_STAT_CONN_INCR(session, rec_pages);
WT_STAT_DATA_INCR(session, rec_pages);
@@ -561,7 +556,8 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* Check that reconciliation should complete.
*/
static int
-__rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+__rec_write_check_complete(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *lookaside_retryp)
{
/*
* Tests in this function are lookaside tests and tests to decide if
@@ -581,17 +577,62 @@ __rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r)
return (EBUSY);
/*
- * If when doing update/restore based eviction, we didn't split and
- * didn't apply any updates, then give up.
+ * Eviction can configure lookaside table reconciliation, consider if
+ * it's worth giving up this reconciliation attempt and falling back to
+ * using the lookaside table. We continue with evict/restore if
+ * switching to the lookaside doesn't make sense for any reason: we
+ * won't retry an evict/restore reconciliation until/unless the
+ * transactional system moves forward, so at worst it's a single wasted
+ * effort.
*
- * This may lead to saving the page to the lookaside table: that
- * decision is made by eviction.
+ * First, check if the lookaside table is a possible alternative.
*/
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && r->bnd_next == 1 &&
- r->update_mem_all != 0 && r->update_mem_all == r->update_mem_saved)
- return (EBUSY);
+ if (lookaside_retryp == NULL)
+ return (0);
- return (0);
+ /*
+ * We only suggest lookaside if currently in an evict/restore attempt
+ * and some updates were saved. Our caller sets the evict/restore flag
+ * based on various conditions (like if this is a leaf page), which is
+ * why we're testing that flag instead of a set of other conditions.
+ * If no updates were saved, eviction will succeed without needing to
+ * restore anything.
+ */
+ if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE) || r->bnd->supd == NULL)
+ return (0);
+
+ /*
+ * Check if this reconciliation attempt is making progress. If there's
+ * any sign of progress, don't fall back to the lookaside table.
+ *
+ * Check if the current reconciliation split, in which case we'll
+ * likely get to write at least one of the blocks. If that page is
+ * empty, that's also progress.
+ */
+ if (r->bnd_next != 1)
+ return (0);
+
+ /*
+ * Check if the current reconciliation applied some updates, in which
+ * case evict/restore should gain us some space.
+ */
+ if (r->update_mem_saved != r->update_mem_all)
+ return (0);
+
+ /*
+ * Check if lookaside eviction is possible. If any of the updates we
+ * saw were uncommitted, the lookaside table cannot be used: it only
+ * helps with older readers preventing eviction.
+ */
+ if (r->update_mem_uncommitted != 0)
+ return (0);
+
+ /*
+ * The current evict/restore approach shows no signs of being useful,
+ * lookaside is possible, suggest the lookaside table.
+ */
+ *lookaside_retryp = true;
+ return (EBUSY);
}
/*
@@ -832,7 +873,8 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->last = &r->_last;
/* Disk buffers need to be aligned for writing. */
- F_SET(&r->disk_image, WT_ITEM_ALIGNED);
+ F_SET(&r->disk_image[0], WT_ITEM_ALIGNED);
+ F_SET(&r->disk_image[1], WT_ITEM_ALIGNED);
}
/* Reconciliation is not re-entrant, make sure that doesn't happen. */
@@ -977,8 +1019,8 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
return;
*(WT_RECONCILE **)reconcilep = NULL;
- __wt_buf_free(session, &r->disk_image);
- __wt_scr_free(session, &r->interim_buf);
+ __wt_buf_free(session, &r->disk_image[0]);
+ __wt_buf_free(session, &r->disk_image[1]);
__wt_free(session, r->raw_entries);
__wt_free(session, r->raw_offsets);
@@ -1766,7 +1808,7 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
*/
WT_ASSERT(session, r->space_avail >= size);
WT_ASSERT(session, WT_BLOCK_FITS(
- r->first_free, size, r->disk_image.mem, r->disk_image.memsize));
+ r->first_free, size, r->cur_img_ptr->mem, r->cur_img_ptr->memsize));
r->entries += v;
r->space_avail -= size;
@@ -1853,7 +1895,7 @@ __rec_dict_replace(
* copy cell instead.
*/
if (dp->offset == 0)
- dp->offset = WT_PTRDIFF32(r->first_free, r->disk_image.mem);
+ dp->offset = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem);
else {
/*
* The offset is the byte offset from this cell to the previous,
@@ -1861,7 +1903,7 @@ __rec_dict_replace(
* page.
*/
offset = (uint64_t)WT_PTRDIFF(r->first_free,
- (uint8_t *)r->disk_image.mem + dp->offset);
+ (uint8_t *)r->cur_img_ptr->mem + dp->offset);
val->len = val->cell_len =
__wt_cell_pack_copy(&val->cell, rle, offset);
val->buf.data = NULL;
@@ -1997,7 +2039,6 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
static void
__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
{
- bnd->offset = 0;
bnd->max_bnd_recno = WT_RECNO_OOB;
bnd->max_bnd_entries = 0;
@@ -2210,15 +2251,14 @@ __rec_split_init(WT_SESSION_IMPL *session,
* Ensure the disk image buffer is large enough for the max object, as
* corrected by the underlying block manager.
*
- * The buffer that we build disk image in, needs to hold two chunks
- * worth of data. Since we want to support split_size more than the page
- * size (to allow for adjustments based on the compression), this buffer
- * should be greater of twice of split_size and page_size.
+ * Since we want to support split_size more than the page size (to allow
+ * for adjustments based on the compression), this buffer should be
+ * greater of split_size and page_size.
*/
corrected_page_size = r->page_size;
- disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size);
WT_RET(bm->write_size(bm, session, &corrected_page_size));
- WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size));
+ disk_img_buf_size = WT_MAX(corrected_page_size, r->split_size);
+ WT_RET(__wt_buf_init(session, &r->disk_image[0], disk_img_buf_size));
/*
* Clear the disk page header to ensure all of it is initialized, even
@@ -2228,15 +2268,17 @@ __rec_split_init(WT_SESSION_IMPL *session,
* fixed-length column-store sets bits in bytes, where the bytes are
* assumed to initially be 0.
*/
- memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ?
+ memset(r->disk_image[0].mem, 0, page->type == WT_PAGE_COL_FIX ?
disk_img_buf_size : WT_PAGE_HEADER_SIZE);
/*
* Set the page type (the type doesn't change, and setting it later
* would require additional code in a few different places).
*/
- dsk = r->disk_image.mem;
+ dsk = r->disk_image[0].mem;
dsk->type = page->type;
+ r->cur_img_ptr = &r->disk_image[0];
+ r->prev_img_ptr = NULL;
r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
@@ -2245,7 +2287,6 @@ __rec_split_init(WT_SESSION_IMPL *session,
WT_RET(__rec_split_bnd_grow(session, r));
__rec_split_bnd_init(session, &r->bnd[0]);
r->bnd[0].max_bnd_recno = recno;
- r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree);
/* Initialize the entry counter. */
r->entries = 0;
@@ -2451,21 +2492,18 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
{
WT_BM *bm;
WT_BTREE *btree;
- size_t corrected_page_size, inuse, len;
+ size_t corrected_page_size, inuse;
btree = S2BT(session);
bm = btree->bm;
- len = WT_PTRDIFF(r->first_free, r->disk_image.mem);
- inuse = (len - r->bnd[r->bnd_next].offset) +
- WT_PAGE_HEADER_BYTE_SIZE(btree);
+ inuse = WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem);
corrected_page_size = inuse + add_len;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
- /* Need to account for buffer carrying two chunks worth of data */
- WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size));
+ WT_RET(__wt_buf_grow(session, r->cur_img_ptr, corrected_page_size));
- r->first_free = (uint8_t *)r->disk_image.mem + len;
+ r->first_free = (uint8_t *)r->cur_img_ptr->mem + inuse;
WT_ASSERT(session, corrected_page_size >= inuse);
r->space_avail = corrected_page_size - inuse;
WT_ASSERT(session, r->space_avail >= add_len);
@@ -2474,89 +2512,55 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
}
/*
- * __rec_split_write_prev_and_shift_cur --
- * Write the previous split chunk to the disk as a page. Shift the contents
- * of the current chunk to the start of the buffer, making space for a new
- * chunk to be written.
- * If the caller asks for a chunk resizing, the boundary between the two
- * chunks is readjusted to the minimum split size boundary details stored
- * in the previous chunk, letting the current chunk grow at the cost of the
- * previous chunk.
+ * __rec_split_write_prev_and_swap_buf --
+ * If there is a previous split chunk held in the memory, write it to the
+ * disk as a page. If there isn't one, this is the first time we are
+ * splitting and need to initialize a second buffer. Also, swap the
+ * previous and the current buffer pointers.
*/
static int
-__rec_split_write_prev_and_shift_cur(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks)
+__rec_split_write_prev_and_swap_buf(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
- WT_BM *bm;
- WT_BOUNDARY *bnd_cur, *bnd_prev;
- WT_BTREE *btree;
- WT_PAGE_HEADER *dsk, *dsk_tmp;
- size_t cur_len, len;
- uint8_t *dsk_start;
-
- WT_ASSERT(session, r->bnd_next != 0);
-
- btree = S2BT(session);
- bm = btree->bm;
- bnd_cur = &r->bnd[r->bnd_next];
- bnd_prev = bnd_cur - 1;
- dsk = r->disk_image.mem;
- cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset;
-
- /*
- * Resize chunks if the current is smaller than the minimum, and there
- * are details on the minimum split size boundary available in the
- * previous boundary details.
- *
- * There is a possibility that we do not have a minimum boundary set, in
- * such a case we skip chunk resizing. Such a condition is possible for
- * instance when we are building the image in the buffer and the first
- * K/V pair is large enough that it surpasses both the minimum split
- * size and the split size the application has set. In such a case we
- * split the chunk without saving any minimum boundary.
- */
- if (resize_chunks &&
- cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) {
- bnd_cur->offset = bnd_prev->min_bnd_offset;
- bnd_cur->max_bnd_entries +=
- bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries;
- bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries;
- bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno;
-
- WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key,
- bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size));
-
- /* Update current chunk's length */
- cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset;
+ WT_BOUNDARY *bnd_prev;
+ WT_ITEM *tmp_img_ptr;
+ WT_PAGE_HEADER *dsk;
+ size_t disk_img_size;
+
+ WT_ASSERT(session, (r->prev_img_ptr == NULL && r->bnd_next == 0) ||
+ (r->prev_img_ptr != NULL && r->bnd_next != 0));
+
+ /* Write previous chunk, if there is one */
+ if (r->prev_img_ptr != NULL) {
+ bnd_prev = &r->bnd[r->bnd_next - 1];
+ dsk = r->prev_img_ptr->mem;
+ dsk->recno = bnd_prev->max_bnd_recno;
+ dsk->u.entries = bnd_prev->max_bnd_entries;
+ dsk->mem_size = (uint32_t)bnd_prev->size;
+ r->prev_img_ptr->size = dsk->mem_size;
+ WT_RET(__rec_split_write(session,
+ r, bnd_prev, r->prev_img_ptr, false));
+ } else {
+ /*
+ * If we do not have a previous buffer, we should initialize the
+ * second buffer before proceeding. We will create the second
+ * buffer of the same size as the current buffer.
+ */
+ disk_img_size = r->cur_img_ptr->memsize;
+ WT_RET(__wt_buf_init(session,
+ &r->disk_image[1], disk_img_size));
+ r->prev_img_ptr = &r->disk_image[1];
+ dsk = r->prev_img_ptr->mem;
+ memset(dsk, 0,
+ r->page->type == WT_PAGE_COL_FIX ?
+ disk_img_size : WT_PAGE_HEADER_SIZE);
+ dsk->type = r->page->type;
}
- /*
- * Create an interim buffer if not already done to prepare the previous
- * chunk's disk image.
- */
- len = bnd_cur->offset;
- WT_RET(bm->write_size(bm, session, &len));
- if (r->interim_buf == NULL)
- WT_RET(__wt_scr_alloc(session, len, &r->interim_buf));
- else
- WT_RET(__wt_buf_init(session, r->interim_buf, len));
-
- dsk_tmp = r->interim_buf->mem;
- memcpy(dsk_tmp, dsk, bnd_cur->offset);
- dsk_tmp->recno = bnd_prev->max_bnd_recno;
- dsk_tmp->u.entries = bnd_prev->max_bnd_entries;
- dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset);
- r->interim_buf->size = dsk_tmp->mem_size;
- WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false));
-
- /* Shift the current chunk to the start of the buffer */
- dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
- (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len);
-
- /* Fix boundary offset */
- bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree);
- /* Fix where free points */
- r->first_free = dsk_start + cur_len;
+ /* swap previous and current buffers */
+ tmp_img_ptr = r->prev_img_ptr;
+ r->prev_img_ptr = r->cur_img_ptr;
+ r->cur_img_ptr = tmp_img_ptr;
+
return (0);
}
@@ -2574,7 +2578,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
size_t inuse;
btree = S2BT(session);
- dsk = r->disk_image.mem;
+ dsk = r->cur_img_ptr->mem;
/* Fixed length col store can call with next_len 0 */
WT_ASSERT(session, next_len == 0 || r->space_avail < next_len);
@@ -2588,9 +2592,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
"%s page too large, attempted split during salvage",
__wt_page_type_string(r->page->type));
- last = &r->bnd[r->bnd_next];
- inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) +
- WT_PAGE_HEADER_BYTE_SIZE(btree);
+ inuse = WT_PTRDIFF(r->first_free, dsk);
/*
* We can get here if the first key/value pair won't fit.
@@ -2603,8 +2605,10 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
/* All page boundaries reset the dictionary. */
__rec_dictionary_reset(r);
- /* Set the number of entries for the just finished chunk. */
+ /* Set the number of entries and size for the just finished chunk. */
+ last = &r->bnd[r->bnd_next];
last->max_bnd_entries = r->entries;
+ last->size = (uint32_t)inuse;
/*
* In case of bulk load, write out chunks as we get them. Otherwise we
@@ -2616,19 +2620,22 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
dsk->recno = last->max_bnd_recno;
dsk->u.entries = last->max_bnd_entries;
dsk->mem_size = (uint32_t)inuse;
- r->disk_image.size = dsk->mem_size;
- WT_RET(__rec_split_write(
- session, r, last, &r->disk_image, false));
- /* Fix where free points */
- r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
- } else if (r->bnd_next != 0)
- WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false));
+ r->cur_img_ptr->size = dsk->mem_size;
+ WT_RET(__rec_split_write(session,
+ r, last, r->cur_img_ptr, false));
+ } else {
+ WT_RET(__rec_split_write_prev_and_swap_buf(session, r));
+ /* current image we are writing to has changed */
+ dsk = r->cur_img_ptr->mem;
+ }
+
+ /* Fix where free points */
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
/* Prepare the next boundary */
WT_RET(__rec_split_bnd_grow(session, r));
r->bnd_next++;
next = &r->bnd[r->bnd_next];
- next->offset = WT_PTRDIFF(r->first_free, dsk);
/* Set the key for the next chunk. */
next->max_bnd_recno = r->recno;
if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF)
@@ -2687,9 +2694,8 @@ __rec_split_crossing_bnd(
!WT_CROSSING_SPLIT_BND(r, next_len)) {
btree = S2BT(session);
bnd = &r->bnd[r->bnd_next];
- dsk = r->disk_image.mem;
- min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) -
- bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree);
+ dsk = r->cur_img_ptr->mem;
+ min_bnd_offset = WT_PTRDIFF(r->first_free, dsk);
if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree))
/*
* This is possible if the first record doesn't fit in
@@ -2750,7 +2756,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
unpack = &_unpack;
compressor = btree->compressor;
dst = &r->raw_destination;
- dsk = r->disk_image.mem;
+ dsk = r->cur_img_ptr->mem;
WT_RET(__rec_split_bnd_grow(session, r));
last = &r->bnd[r->bnd_next];
@@ -3066,7 +3072,7 @@ no_slots:
r->first_free = dsk_start + len;
r->space_avail += r->raw_offsets[result_slots];
WT_ASSERT(session, r->first_free + r->space_avail <=
- (uint8_t *)r->disk_image.mem + r->disk_image.memsize);
+ (uint8_t *)r->cur_img_ptr->mem + r->cur_img_ptr->memsize);
/*
* Set the key for the next block (before writing the block, a
@@ -3105,13 +3111,13 @@ no_slots:
dsk->recno = last->max_bnd_recno;
dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
dsk->u.entries = r->entries;
- r->disk_image.size = dsk->mem_size;
+ r->cur_img_ptr->size = dsk->mem_size;
r->entries = 0;
r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
- write_ref = &r->disk_image;
+ write_ref = r->cur_img_ptr;
last->already_compressed = false;
} else {
/*
@@ -3139,7 +3145,7 @@ no_slots:
last_block && __rec_is_checkpoint(session, r, last)) {
if (write_ref == dst)
WT_RET(__wt_buf_set(
- session, &r->disk_image, dst->mem, dst->size));
+ session, r->cur_img_ptr, dst->mem, dst->size));
} else
WT_RET(
__rec_split_write(session, r, last, write_ref, last_block));
@@ -3173,15 +3179,120 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
}
/*
+ * __rec_split_finish_process_prev --
+ * If the two split chunks together fit in a single page, merge them into
+ * one. If they do not fit in a single page but the last is smaller than
+ * the minimum desired, move some data from the penultimate chunk to the
+ * last chunk and write out the previous/penultimate. Finally, update the
+ * pointer to the current image buffer. After this function exits, we will
+ * have one (last) buffer in memory, pointed to by the current image
+ * pointer.
+ */
+static int
+__rec_split_finish_process_prev(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *chunks_merged)
+{
+ WT_BOUNDARY *bnd_cur, *bnd_prev;
+ WT_BTREE *btree;
+ WT_PAGE_HEADER *dsk;
+ size_t len_to_move;
+ uint32_t combined_size;
+ uint8_t *cur_dsk_start;
+
+ WT_ASSERT(session, r->prev_img_ptr != NULL);
+
+ btree = S2BT(session);
+ bnd_cur = &r->bnd[r->bnd_next];
+ bnd_prev = bnd_cur - 1;
+ *chunks_merged = false;
+ /*
+ * The sizes referred to in the boundary structure include the header,
+ * so when calculating the combined size, make sure not to include the
+ * header twice.
+ */
+ combined_size = bnd_prev->size +
+ (bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree));
+
+ if (combined_size <= r->page_size) {
+ /*
+ * We have two boundaries, but the data in the buffers can fit a
+ * single page. Merge the boundaries and create a single chunk.
+ */
+ dsk = r->cur_img_ptr->mem;
+ memcpy((uint8_t *)r->prev_img_ptr->mem + bnd_prev->size,
+ WT_PAGE_HEADER_BYTE(btree, dsk),
+ bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree));
+ bnd_prev->size = combined_size;
+ bnd_prev->max_bnd_entries += bnd_cur->max_bnd_entries;
+ r->bnd_next--;
+ *chunks_merged = true;
+ } else {
+ if (bnd_cur->size < r->min_split_size &&
+ bnd_prev->min_bnd_offset != 0 ) {
+ /*
+ * The last chunk, pointed to by the current image
+ * pointer, has less than the minimum data. Let's move
+ * any data more than the minimum from the previous
+ * image into the current.
+ */
+ len_to_move = bnd_prev->size - bnd_prev->min_bnd_offset;
+ /* Grow current buffer if it is not large enough */
+ if (r->space_avail < len_to_move)
+ WT_RET(__rec_split_grow(session,
+ r, len_to_move));
+ cur_dsk_start = WT_PAGE_HEADER_BYTE(btree,
+ r->cur_img_ptr->mem);
+
+ /*
+ * Shift the contents of the current buffer to make
+ * space for the data that will be prepended into the
+ * current buffer
+ */
+ memmove(cur_dsk_start + len_to_move,
+ cur_dsk_start, bnd_cur->size -
+ WT_PAGE_HEADER_BYTE_SIZE(btree));
+ /*
+ * copy any data more than the minimum, from the
+ * previous buffer to the start of the current.
+ */
+ memcpy(cur_dsk_start, (uint8_t *)r->prev_img_ptr->mem +
+ bnd_prev->min_bnd_offset, len_to_move);
+
+ /* Update boundary information */
+ bnd_cur->size += (uint32_t)len_to_move;
+ bnd_prev->size -= (uint32_t)len_to_move;
+ bnd_cur->max_bnd_entries += bnd_prev->max_bnd_entries -
+ bnd_prev->min_bnd_entries;
+ bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries;
+ bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno;
+ WT_RET(__wt_buf_set(session,
+ &bnd_cur->max_bnd_key, bnd_prev->min_bnd_key.data,
+ bnd_prev->min_bnd_key.size));
+ }
+
+ /* Write out the previous image */
+ WT_RET(__rec_split_write_prev_and_swap_buf(session, r));
+ }
+
+ /*
+ * At this point, there is only one disk image in the memory, pointed to
+ * by the previous image pointer. Update the current image pointer to
+ * this image.
+ */
+ r->cur_img_ptr = r->prev_img_ptr;
+ return (0);
+}
+
+/*
* __rec_split_finish_std --
* Finish processing a page, standard version.
*/
static int
__rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
- WT_BOUNDARY *bnd_cur, *bnd_prev;
+ WT_BOUNDARY *bnd_cur;
WT_PAGE_HEADER *dsk;
- bool grow_bnd;
+ bool chunks_merged;
/*
* We may arrive here with no entries to write if the page was entirely
@@ -3208,50 +3319,22 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
return (EBUSY);
}
- dsk = r->disk_image.mem;
-
- /* Set the number of entries for the just finished chunk. */
+ /* Set the number of entries and size for the just finished chunk. */
bnd_cur = &r->bnd[r->bnd_next];
bnd_cur->max_bnd_entries = r->entries;
+ bnd_cur->size = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem);
- grow_bnd = true;
- /*
- * We can reach here even with raw_compression when the last split chunk
- * is too small to be sent for raw compression.
- */
- if (!r->is_bulk_load && !r->raw_compression) {
- if (WT_PTRDIFF(r->first_free, dsk) > r->page_size &&
- r->bnd_next != 0) {
- /*
- * We hold two boundaries worth of data in the buffer,
- * and this data doesn't fit in a single page. If the
- * last chunk is too small, readjust the boundary to a
- * pre-computed minimum.
- * Write out the penultimate chunk to the disk as a page
- */
- WT_RET(__rec_split_write_prev_and_shift_cur(
- session, r, true));
- } else
- if (r->bnd_next != 0) {
- /*
- * We have two boundaries, but the data in the
- * buffer can fit a single page. Merge the
- * boundaries to create a single chunk.
- */
- bnd_prev = bnd_cur - 1;
- bnd_prev->max_bnd_entries +=
- bnd_cur->max_bnd_entries;
- r->bnd_next--;
- grow_bnd = false;
- }
- }
+ chunks_merged = false;
+ if (r->prev_img_ptr != NULL)
+ WT_RET(__rec_split_finish_process_prev(session,
+ r, &chunks_merged));
/*
* We already have space for an extra boundary if we merged two
* boundaries above, in that case we do not need to grow the boundary
* structure.
*/
- if (grow_bnd)
+ if (!chunks_merged)
WT_RET(__rec_split_bnd_grow(session, r));
bnd_cur = &r->bnd[r->bnd_next];
r->bnd_next++;
@@ -3260,14 +3343,15 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* Current boundary now has all the remaining data/last page now.
* Let's write it to the disk
*/
+ dsk = r->cur_img_ptr->mem;
dsk->recno = bnd_cur->max_bnd_recno;
dsk->u.entries = bnd_cur->max_bnd_entries;
- dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk);
- r->disk_image.size = dsk->mem_size;
+ dsk->mem_size = bnd_cur->size;
+ r->cur_img_ptr->size = dsk->mem_size;
/* If this is a checkpoint, we're done, otherwise write the page. */
return (__rec_is_checkpoint(session, r, bnd_cur) ?
- 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true));
+ 0 : __rec_split_write(session, r, bnd_cur, r->cur_img_ptr, true));
}
/*
@@ -3289,7 +3373,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
if (r->raw_compression && r->entries != 0) {
while (r->entries != 0) {
data_size =
- WT_PTRDIFF(r->first_free, r->disk_image.mem);
+ WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem);
if (data_size <= btree->allocsize)
break;
WT_RET(__rec_split_raw_worker(session, r, 0, true));
@@ -5882,7 +5966,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* write the buffer so we know what to do here.
*/
if (bnd->addr.addr == NULL)
- WT_RET(__wt_bt_write(session, &r->disk_image,
+ WT_RET(__wt_bt_write(session, r->cur_img_ptr,
NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING),
bnd->already_compressed));
else {
@@ -6546,7 +6630,7 @@ __rec_dictionary_lookup(
for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
dp != NULL && dp->hash == hash; dp = dp->next[0]) {
WT_RET(__wt_cell_pack_data_match(
- (WT_CELL *)((uint8_t *)r->disk_image.mem + dp->offset),
+ (WT_CELL *)((uint8_t *)r->cur_img_ptr->mem + dp->offset),
&val->cell, val->buf.data, &match));
if (match) {
WT_STAT_DATA_INCR(session, rec_dictionary);
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 590e17b6a2a..58f4f0750d7 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -458,6 +458,11 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
* larger than any checkpoint LSN we have from the earlier time.
*/
WT_ERR(__recovery_file_scan(&r));
+ /*
+ * The array can be re-allocated in recovery_file_scan. Reset
+ * our pointer after scanning all the files.
+ */
+ metafile = &r.files[WT_METAFILE_ID];
conn->next_file_id = r.max_fileid;
if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
@@ -509,6 +514,11 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
/* Scan the metadata to find the live files and their IDs. */
WT_ERR(__recovery_file_scan(&r));
+ /*
+ * Clear this out. We no longer need it and it could have been
+ * re-allocated when scanning the files.
+ */
+ metafile = NULL;
/*
* We no longer need the metadata cursor: close it to avoid pinning any
diff --git a/src/third_party/wiredtiger/test/mciproject.yml b/src/third_party/wiredtiger/test/mciproject.yml
index 6456475aa00..72022fe46ec 100644
--- a/src/third_party/wiredtiger/test/mciproject.yml
+++ b/src/third_party/wiredtiger/test/mciproject.yml
@@ -95,6 +95,16 @@ tasks:
script: |
set -o errexit
set -o verbose
+
+ # On 10.12, change the binary location with install_name_tool since DYLD_LIBRARY_PATH
+ # appears not to work for dynamic modules loaded by python. For wt, the libtool generated
+ # script has the wrong path for running on test machines.
+ if [ "$(uname -s)" == "Darwin" ]; then
+ WT_VERSION=$(m4 build_posix/aclocal/version.m4)
+ install_name_tool -change /usr/local/lib/libwiredtiger-$WT_VERSION.dylib $(pwd)/.libs/libwiredtiger-$WT_VERSION.dylib lang/python/_wiredtiger.so
+ install_name_tool -change /usr/local/lib/libwiredtiger-$WT_VERSION.dylib $(pwd)/.libs/libwiredtiger-$WT_VERSION.dylib .libs/wt
+ fi
+
${test_env_vars|} python ./test/suite/run.py -v 2 ${smp_command|} 2>&1
- name: compile-windows-alt
@@ -182,10 +192,10 @@ buildvariants:
#- name: format - Enable when we have a solution for hangs and crashses
- name: fops
-- name: osx-1010
- display_name: OS X 10.10
+- name: macos-1012
+ display_name: OS X 10.12
run_on:
- - osx-1010
+ - macos-1012
expansions:
smp_command: -j $(sysctl -n hw.logicalcpu)
configure_env_vars: PATH=/opt/local/bin:$PATH
@@ -195,3 +205,4 @@ buildvariants:
- name: compile
- name: unit-test
- name: fops
+