diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2017-06-24 03:01:13 +1000 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2017-06-24 03:01:13 +1000 |
commit | cf713d9b8cf0436f08facc8171ffb407d380ea85 (patch) | |
tree | 86df947be6762f82e30f5fe411024039d8b339dd | |
parent | a02c5e1cf53e83de05d3c98ed38673d91a350ce0 (diff) | |
download | mongo-cf713d9b8cf0436f08facc8171ffb407d380ea85.tar.gz |
Import wiredtiger: 2b048c9ad2dfde7ad9c2a1329bc082dfc882aec7 from branch mongodb-3.6
ref: 47e8c3d1d2..2b048c9ad2
for: 3.5.10
SERVER-29439 WiredTiger turtle file "MoveFileExW: Access is denied." error.
WT-3251 Remove interim buffer used to split pages during reconciliation
WT-3367 Switch OSX builders to 10.12
WT-3370 Heap use after free in txn recover code
WT-3371 Make Windows/MSVC build warnings fatal.
WT-3379 Avoid a performance regression on secondaries
-rw-r--r-- | src/third_party/wiredtiger/SConstruct | 10 | ||||
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/evict/evict_page.c | 28 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/meta/meta_table.c | 4 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/meta/meta_turtle.c | 6 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_write.c | 484 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_recover.c | 10 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/mciproject.yml | 17 |
8 files changed, 339 insertions, 222 deletions
diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct index 22f869e02e7..2661807594d 100644 --- a/src/third_party/wiredtiger/SConstruct +++ b/src/third_party/wiredtiger/SConstruct @@ -67,15 +67,11 @@ var.Add('CPPPATH', 'C Preprocessor include path', [ ]) var.Add('CFLAGS', 'C Compiler Flags', [ - "/Z7", # Generate debugging symbols "/wd4090", # Ignore warning about mismatched const qualifiers "/wd4996", # Ignore deprecated functions "/W3", # Warning level 3 - #"/we4244", # Possible loss of data - "/we4013", # Error on undefined functions - #"/we4047", # Indirection differences in types - #"/we4024", # Differences in parameter types - #"/we4100", # Unreferenced local parameter + "/WX", # Warnings are fatal + "/Z7", # Generate debugging symbols "/TC", # Compile as C code #"/Od", # Disable optimization "/Ob1", # inline expansion @@ -338,6 +334,8 @@ if GetOption("lang-python"): "-nodefaultctor", "-nodefaultdtor", ]) + # Ignore warnings in swig-generated code. + pythonEnv['CFLAGS'].remove("/WX") swiglib = pythonEnv.SharedLibrary('_wiredtiger', [ 'lang\python\wiredtiger.i'], diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index ec6d864a530..fdde764ea7b 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "47e8c3d1d22018eaaa09f91dfd78addb49e0b49b", + "commit": "2b048c9ad2dfde7ad9c2a1329bc082dfc882aec7", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.6" diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 01818f106fc..d50326afb1e 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -420,7 +420,7 @@ __evict_review( WT_DECL_RET; WT_PAGE *page; uint32_t flags; - bool lookaside_retry, modified; + bool lookaside_retry, *lookaside_retryp, modified; conn = S2C(session); flags = WT_EVICTING; @@ -541,6 +541,9 @@ __evict_review( * the page and keep it in memory. */ cache = conn->cache; + lookaside_retry = false; + lookaside_retryp = NULL; + if (closing) LF_SET(WT_VISIBILITY_ERR); else if (!WT_PAGE_IS_INTERNAL(page)) { @@ -552,23 +555,26 @@ __evict_review( if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB)) LF_SET(WT_EVICT_SCRUB); + + /* + * Check if reconciliation suggests trying the + * lookaside table. + */ + lookaside_retryp = &lookaside_retry; } } /* Reconcile the page. */ - ret = __wt_reconcile(session, ref, NULL, flags, &lookaside_retry); + ret = __wt_reconcile(session, ref, NULL, flags, lookaside_retryp); /* - * If reconciliation fails, eviction is stuck and reconciliation reports - * it might succeed if we use the lookaside table (the page didn't have - * uncommitted updates, it was not-yet-globally visible updates causing - * the problem), configure reconciliation to write those updates to the - * lookaside table, allowing the eviction of pages we'd otherwise have - * to retain in cache to support older readers. + * If reconciliation fails, eviction is stuck and reconciliation + * reports it might succeed if we use the lookaside table, then + * configure reconciliation to write those updates to the lookaside + * table, allowing the eviction of pages we'd otherwise have to retain + * in cache to support older readers. */ - if (ret == EBUSY && - !F_ISSET(conn, WT_CONN_IN_MEMORY) && - __wt_cache_stuck(session) && lookaside_retry) { + if (ret == EBUSY && lookaside_retry && __wt_cache_stuck(session)) { LF_CLR(WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE); LF_SET(WT_EVICT_LOOKASIDE); ret = __wt_reconcile(session, ref, NULL, flags, NULL); diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c index 65835a16c8b..326ad12bd33 100644 --- a/src/third_party/wiredtiger/src/meta/meta_table.c +++ b/src/third_party/wiredtiger/src/meta/meta_table.c @@ -267,7 +267,9 @@ __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char **valuep) * that Coverity complains a lot, add an error check to get some * peace and quiet. */ - if ((ret = __wt_turtle_read(session, key, valuep)) != 0) + WT_WITH_TURTLE_LOCK(session, + ret = __wt_turtle_read(session, key, valuep)); + if (ret != 0) __wt_free(session, *valuep); return (ret); } diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c index 7a99df6b83b..362a3aa2bbe 100644 --- a/src/third_party/wiredtiger/src/meta/meta_turtle.c +++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c @@ -246,6 +246,9 @@ __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep) *valuep = NULL; + /* Require single-threading. */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TURTLE)); + /* * Open the turtle file; there's one case where we won't find the turtle * file, yet still succeed. We create the metadata file before creating @@ -302,6 +305,9 @@ __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value) fs = NULL; + /* Require single-threading. */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TURTLE)); + /* * Create the turtle setup file: we currently re-write it from scratch * every time. diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index f7df73c4ecb..1c266496ec8 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -25,12 +25,25 @@ typedef struct { WT_PAGE *page; uint32_t flags; /* Caller's configuration */ - WT_ITEM disk_image; /* Temporary disk-image buffer */ /* - * Temporary buffer used to write out a disk image when managing two - * chunks worth of data in memory - */ - WT_ITEM *interim_buf; + * Reconciliation can end up requiring two temporary disk image buffers + * if a page split is involved. These two disk images are pointed to by + * current and the previous image pointers. During initialization the + * first image is allocated and pointed to by the current image pointer. + * If and when a split is involved the second image gets allocated and + * is pointed to by the current image pointer. The previous image + * pointer is made to refer the first image at this point. Two images + * are kept in memory to redistribute data among them in case the last + * split chunk ends up being smaller than the minimum required. As + * reconciliation generates more split chunks, the image referred to by + * the previous image pointer is written to the disk, the current and + * the previous image pointers are swapped, making space for another + * split chunk to be reconciled in the buffer that was just written out + * to the disk. + */ + WT_ITEM disk_image[2]; /* Temporary disk-image buffers */ + WT_ITEM *cur_img_ptr; + WT_ITEM *prev_img_ptr; /* * Track start/stop write generation to decide if all changes to the @@ -146,17 +159,6 @@ typedef struct { * that references all of our split pages. */ struct __rec_boundary { - /* - * Offset is the byte offset in the initial split buffer of the - * first byte of the split chunk, recorded before we decide to - * split the page; the difference between chunk[1]'s offset and - * chunk[0]'s offset is chunk[0]'s length. - * - * Once we split a page, we stop filling in offset values, we're - * writing the split chunks as we find them. - */ - size_t offset; /* Split's first byte */ - WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t checksum; /* Split's checksum */ @@ -338,7 +340,8 @@ static int __rec_split_write(WT_SESSION_IMPL *, WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, bool); static int __rec_update_las( WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *); -static int __rec_write_check_complete(WT_SESSION_IMPL *, WT_RECONCILE *); +static int __rec_write_check_complete( + WT_SESSION_IMPL *, WT_RECONCILE *, bool *); static int __rec_write_init(WT_SESSION_IMPL *, WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *); @@ -438,7 +441,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, /* Checks for a successful reconciliation. */ if (ret == 0) - ret = __rec_write_check_complete(session, r); + ret = __rec_write_check_complete(session, r, lookaside_retryp); /* Wrap up the page reconciliation. */ if (ret == 0 && (ret = __rec_write_wrapup(session, r, page)) == 0) @@ -449,14 +452,6 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, /* Release the reconciliation lock. */ WT_PAGE_UNLOCK(session, page); - /* - * If our caller can configure lookaside table reconciliation, flag if - * that's worth trying. The lookaside table doesn't help if we skipped - * updates, it can only help with older readers preventing eviction. - */ - if (lookaside_retryp != NULL && r->update_mem_uncommitted == 0) - *lookaside_retryp = true; - /* Update statistics. */ WT_STAT_CONN_INCR(session, rec_pages); WT_STAT_DATA_INCR(session, rec_pages); @@ -561,7 +556,8 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) * Check that reconciliation should complete. */ static int -__rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r) +__rec_write_check_complete( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *lookaside_retryp) { /* * Tests in this function are lookaside tests and tests to decide if @@ -581,17 +577,62 @@ __rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); /* - * If when doing update/restore based eviction, we didn't split and - * didn't apply any updates, then give up. + * Eviction can configure lookaside table reconciliation, consider if + * it's worth giving up this reconciliation attempt and falling back to + * using the lookaside table. We continue with evict/restore if + * switching to the lookaside doesn't make sense for any reason: we + * won't retry an evict/restore reconciliation until/unless the + * transactional system moves forward, so at worst it's a single wasted + * effort. * - * This may lead to saving the page to the lookaside table: that - * decision is made by eviction. + * First, check if the lookaside table is a possible alternative. */ - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && r->bnd_next == 1 && - r->update_mem_all != 0 && r->update_mem_all == r->update_mem_saved) - return (EBUSY); + if (lookaside_retryp == NULL) + return (0); - return (0); + /* + * We only suggest lookaside if currently in an evict/restore attempt + * and some updates were saved. Our caller sets the evict/restore flag + * based on various conditions (like if this is a leaf page), which is + * why we're testing that flag instead of a set of other conditions. + * If no updates were saved, eviction will succeed without needing to + * restore anything. + */ + if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE) || r->bnd->supd == NULL) + return (0); + + /* + * Check if this reconciliation attempt is making progress. If there's + * any sign of progress, don't fall back to the lookaside table. + * + * Check if the current reconciliation split, in which case we'll + * likely get to write at least one of the blocks. If that page is + * empty, that's also progress. + */ + if (r->bnd_next != 1) + return (0); + + /* + * Check if the current reconciliation applied some updates, in which + * case evict/restore should gain us some space. + */ + if (r->update_mem_saved != r->update_mem_all) + return (0); + + /* + * Check if lookaside eviction is possible. If any of the updates we + * saw were uncommitted, the lookaside table cannot be used: it only + * helps with older readers preventing eviction. + */ + if (r->update_mem_uncommitted != 0) + return (0); + + /* + * The current evict/restore approach shows no signs of being useful, + * lookaside is possible, suggest the lookaside table. + */ + *lookaside_retryp = true; + return (EBUSY); } /* @@ -832,7 +873,8 @@ __rec_write_init(WT_SESSION_IMPL *session, r->last = &r->_last; /* Disk buffers need to be aligned for writing. */ - F_SET(&r->disk_image, WT_ITEM_ALIGNED); + F_SET(&r->disk_image[0], WT_ITEM_ALIGNED); + F_SET(&r->disk_image[1], WT_ITEM_ALIGNED); } /* Reconciliation is not re-entrant, make sure that doesn't happen. */ @@ -977,8 +1019,8 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) return; *(WT_RECONCILE **)reconcilep = NULL; - __wt_buf_free(session, &r->disk_image); - __wt_scr_free(session, &r->interim_buf); + __wt_buf_free(session, &r->disk_image[0]); + __wt_buf_free(session, &r->disk_image[1]); __wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -1766,7 +1808,7 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) */ WT_ASSERT(session, r->space_avail >= size); WT_ASSERT(session, WT_BLOCK_FITS( - r->first_free, size, r->disk_image.mem, r->disk_image.memsize)); + r->first_free, size, r->cur_img_ptr->mem, r->cur_img_ptr->memsize)); r->entries += v; r->space_avail -= size; @@ -1853,7 +1895,7 @@ __rec_dict_replace( * copy cell instead. */ if (dp->offset == 0) - dp->offset = WT_PTRDIFF32(r->first_free, r->disk_image.mem); + dp->offset = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem); else { /* * The offset is the byte offset from this cell to the previous, @@ -1861,7 +1903,7 @@ __rec_dict_replace( * page. */ offset = (uint64_t)WT_PTRDIFF(r->first_free, - (uint8_t *)r->disk_image.mem + dp->offset); + (uint8_t *)r->cur_img_ptr->mem + dp->offset); val->len = val->cell_len = __wt_cell_pack_copy(&val->cell, rle, offset); val->buf.data = NULL; @@ -1997,7 +2039,6 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { - bnd->offset = 0; bnd->max_bnd_recno = WT_RECNO_OOB; bnd->max_bnd_entries = 0; @@ -2210,15 +2251,14 @@ __rec_split_init(WT_SESSION_IMPL *session, * Ensure the disk image buffer is large enough for the max object, as * corrected by the underlying block manager. * - * The buffer that we build disk image in, needs to hold two chunks - * worth of data. Since we want to support split_size more than the page - * size (to allow for adjustments based on the compression), this buffer - * should be greater of twice of split_size and page_size. + * Since we want to support split_size more than the page size (to allow + * for adjustments based on the compression), this buffer should be + * greater of split_size and page_size. */ corrected_page_size = r->page_size; - disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size); WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size)); + disk_img_buf_size = WT_MAX(corrected_page_size, r->split_size); + WT_RET(__wt_buf_init(session, &r->disk_image[0], disk_img_buf_size)); /* * Clear the disk page header to ensure all of it is initialized, even @@ -2228,15 +2268,17 @@ __rec_split_init(WT_SESSION_IMPL *session, * fixed-length column-store sets bits in bytes, where the bytes are * assumed to initially be 0. */ - memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? + memset(r->disk_image[0].mem, 0, page->type == WT_PAGE_COL_FIX ? disk_img_buf_size : WT_PAGE_HEADER_SIZE); /* * Set the page type (the type doesn't change, and setting it later * would require additional code in a few different places). */ - dsk = r->disk_image.mem; + dsk = r->disk_image[0].mem; dsk->type = page->type; + r->cur_img_ptr = &r->disk_image[0]; + r->prev_img_ptr = NULL; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); @@ -2245,7 +2287,6 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_RET(__rec_split_bnd_grow(session, r)); __rec_split_bnd_init(session, &r->bnd[0]); r->bnd[0].max_bnd_recno = recno; - r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree); /* Initialize the entry counter. */ r->entries = 0; @@ -2451,21 +2492,18 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) { WT_BM *bm; WT_BTREE *btree; - size_t corrected_page_size, inuse, len; + size_t corrected_page_size, inuse; btree = S2BT(session); bm = btree->bm; - len = WT_PTRDIFF(r->first_free, r->disk_image.mem); - inuse = (len - r->bnd[r->bnd_next].offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); + inuse = WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem); corrected_page_size = inuse + add_len; WT_RET(bm->write_size(bm, session, &corrected_page_size)); - /* Need to account for buffer carrying two chunks worth of data */ - WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size)); + WT_RET(__wt_buf_grow(session, r->cur_img_ptr, corrected_page_size)); - r->first_free = (uint8_t *)r->disk_image.mem + len; + r->first_free = (uint8_t *)r->cur_img_ptr->mem + inuse; WT_ASSERT(session, corrected_page_size >= inuse); r->space_avail = corrected_page_size - inuse; WT_ASSERT(session, r->space_avail >= add_len); @@ -2474,89 +2512,55 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) } /* - * __rec_split_write_prev_and_shift_cur -- - * Write the previous split chunk to the disk as a page. Shift the contents - * of the current chunk to the start of the buffer, making space for a new - * chunk to be written. - * If the caller asks for a chunk resizing, the boundary between the two - * chunks is readjusted to the minimum split size boundary details stored - * in the previous chunk, letting the current chunk grow at the cost of the - * previous chunk. + * __rec_split_write_prev_and_swap_buf -- + * If there is a previous split chunk held in the memory, write it to the + * disk as a page. If there isn't one, this is the first time we are + * splitting and need to initialize a second buffer. Also, swap the + * previous and the current buffer pointers. */ static int -__rec_split_write_prev_and_shift_cur( - WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks) +__rec_split_write_prev_and_swap_buf(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BM *bm; - WT_BOUNDARY *bnd_cur, *bnd_prev; - WT_BTREE *btree; - WT_PAGE_HEADER *dsk, *dsk_tmp; - size_t cur_len, len; - uint8_t *dsk_start; - - WT_ASSERT(session, r->bnd_next != 0); - - btree = S2BT(session); - bm = btree->bm; - bnd_cur = &r->bnd[r->bnd_next]; - bnd_prev = bnd_cur - 1; - dsk = r->disk_image.mem; - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; - - /* - * Resize chunks if the current is smaller than the minimum, and there - * are details on the minimum split size boundary available in the - * previous boundary details. - * - * There is a possibility that we do not have a minimum boundary set, in - * such a case we skip chunk resizing. Such a condition is possible for - * instance when we are building the image in the buffer and the first - * K/V pair is large enough that it surpasses both the minimum split - * size and the split size the application has set. In such a case we - * split the chunk without saving any minimum boundary. - */ - if (resize_chunks && - cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) { - bnd_cur->offset = bnd_prev->min_bnd_offset; - bnd_cur->max_bnd_entries += - bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries; - bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; - bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; - - WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key, - bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size)); - - /* Update current chunk's length */ - cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + WT_BOUNDARY *bnd_prev; + WT_ITEM *tmp_img_ptr; + WT_PAGE_HEADER *dsk; + size_t disk_img_size; + + WT_ASSERT(session, (r->prev_img_ptr == NULL && r->bnd_next == 0) || + (r->prev_img_ptr != NULL && r->bnd_next != 0)); + + /* Write previous chunk, if there is one */ + if (r->prev_img_ptr != NULL) { + bnd_prev = &r->bnd[r->bnd_next - 1]; + dsk = r->prev_img_ptr->mem; + dsk->recno = bnd_prev->max_bnd_recno; + dsk->u.entries = bnd_prev->max_bnd_entries; + dsk->mem_size = (uint32_t)bnd_prev->size; + r->prev_img_ptr->size = dsk->mem_size; + WT_RET(__rec_split_write(session, + r, bnd_prev, r->prev_img_ptr, false)); + } else { + /* + * If we do not have a previous buffer, we should initialize the + * second buffer before proceeding. We will create the second + * buffer of the same size as the current buffer. + */ + disk_img_size = r->cur_img_ptr->memsize; + WT_RET(__wt_buf_init(session, + &r->disk_image[1], disk_img_size)); + r->prev_img_ptr = &r->disk_image[1]; + dsk = r->prev_img_ptr->mem; + memset(dsk, 0, + r->page->type == WT_PAGE_COL_FIX ? + disk_img_size : WT_PAGE_HEADER_SIZE); + dsk->type = r->page->type; } - /* - * Create an interim buffer if not already done to prepare the previous - * chunk's disk image. - */ - len = bnd_cur->offset; - WT_RET(bm->write_size(bm, session, &len)); - if (r->interim_buf == NULL) - WT_RET(__wt_scr_alloc(session, len, &r->interim_buf)); - else - WT_RET(__wt_buf_init(session, r->interim_buf, len)); - - dsk_tmp = r->interim_buf->mem; - memcpy(dsk_tmp, dsk, bnd_cur->offset); - dsk_tmp->recno = bnd_prev->max_bnd_recno; - dsk_tmp->u.entries = bnd_prev->max_bnd_entries; - dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset); - r->interim_buf->size = dsk_tmp->mem_size; - WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false)); - - /* Shift the current chunk to the start of the buffer */ - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len); - - /* Fix boundary offset */ - bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* Fix where free points */ - r->first_free = dsk_start + cur_len; + /* swap previous and current buffers */ + tmp_img_ptr = r->prev_img_ptr; + r->prev_img_ptr = r->cur_img_ptr; + r->cur_img_ptr = tmp_img_ptr; + return (0); } @@ -2574,7 +2578,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) size_t inuse; btree = S2BT(session); - dsk = r->disk_image.mem; + dsk = r->cur_img_ptr->mem; /* Fixed length col store can call with next_len 0 */ WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); @@ -2588,9 +2592,7 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - last = &r->bnd[r->bnd_next]; - inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) + - WT_PAGE_HEADER_BYTE_SIZE(btree); + inuse = WT_PTRDIFF(r->first_free, dsk); /* * We can get here if the first key/value pair won't fit. @@ -2603,8 +2605,10 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) /* All page boundaries reset the dictionary. */ __rec_dictionary_reset(r); - /* Set the number of entries for the just finished chunk. */ + /* Set the number of entries and size for the just finished chunk. */ + last = &r->bnd[r->bnd_next]; last->max_bnd_entries = r->entries; + last->size = (uint32_t)inuse; /* * In case of bulk load, write out chunks as we get them. Otherwise we @@ -2616,19 +2620,22 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) dsk->recno = last->max_bnd_recno; dsk->u.entries = last->max_bnd_entries; dsk->mem_size = (uint32_t)inuse; - r->disk_image.size = dsk->mem_size; - WT_RET(__rec_split_write( - session, r, last, &r->disk_image, false)); - /* Fix where free points */ - r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); - } else if (r->bnd_next != 0) - WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false)); + r->cur_img_ptr->size = dsk->mem_size; + WT_RET(__rec_split_write(session, + r, last, r->cur_img_ptr, false)); + } else { + WT_RET(__rec_split_write_prev_and_swap_buf(session, r)); + /* current image we are writing to has changed */ + dsk = r->cur_img_ptr->mem; + } + + /* Fix where free points */ + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); /* Prepare the next boundary */ WT_RET(__rec_split_bnd_grow(session, r)); r->bnd_next++; next = &r->bnd[r->bnd_next]; - next->offset = WT_PTRDIFF(r->first_free, dsk); /* Set the key for the next chunk. */ next->max_bnd_recno = r->recno; if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF) @@ -2687,9 +2694,8 @@ __rec_split_crossing_bnd( !WT_CROSSING_SPLIT_BND(r, next_len)) { btree = S2BT(session); bnd = &r->bnd[r->bnd_next]; - dsk = r->disk_image.mem; - min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) - - bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree); + dsk = r->cur_img_ptr->mem; + min_bnd_offset = WT_PTRDIFF(r->first_free, dsk); if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree)) /* * This is possible if the first record doesn't fit in @@ -2750,7 +2756,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, unpack = &_unpack; compressor = btree->compressor; dst = &r->raw_destination; - dsk = r->disk_image.mem; + dsk = r->cur_img_ptr->mem; WT_RET(__rec_split_bnd_grow(session, r)); last = &r->bnd[r->bnd_next]; @@ -3066,7 +3072,7 @@ no_slots: r->first_free = dsk_start + len; r->space_avail += r->raw_offsets[result_slots]; WT_ASSERT(session, r->first_free + r->space_avail <= - (uint8_t *)r->disk_image.mem + r->disk_image.memsize); + (uint8_t *)r->cur_img_ptr->mem + r->cur_img_ptr->memsize); /* * Set the key for the next block (before writing the block, a @@ -3105,13 +3111,13 @@ no_slots: dsk->recno = last->max_bnd_recno; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; - r->disk_image.size = dsk->mem_size; + r->cur_img_ptr->size = dsk->mem_size; r->entries = 0; r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - write_ref = &r->disk_image; + write_ref = r->cur_img_ptr; last->already_compressed = false; } else { /* @@ -3139,7 +3145,7 @@ no_slots: last_block && __rec_is_checkpoint(session, r, last)) { if (write_ref == dst) WT_RET(__wt_buf_set( - session, &r->disk_image, dst->mem, dst->size)); + session, r->cur_img_ptr, dst->mem, dst->size)); } else WT_RET( __rec_split_write(session, r, last, write_ref, last_block)); @@ -3173,15 +3179,120 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) } /* + * __rec_split_finish_process_prev -- + * If the two split chunks together fit in a single page, merge them into + * one. If they do not fit in a single page but the last is smaller than + * the minimum desired, move some data from the penultimate chunk to the + * last chunk and write out the previous/penultimate. Finally, update the + * pointer to the current image buffer. After this function exits, we will + * have one (last) buffer in memory, pointed to by the current image + * pointer. + */ +static int +__rec_split_finish_process_prev( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *chunks_merged) +{ + WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk; + size_t len_to_move; + uint32_t combined_size; + uint8_t *cur_dsk_start; + + WT_ASSERT(session, r->prev_img_ptr != NULL); + + btree = S2BT(session); + bnd_cur = &r->bnd[r->bnd_next]; + bnd_prev = bnd_cur - 1; + *chunks_merged = false; + /* + * The sizes referred to in the boundary structure include the header, + * so when calculating the combined size, make sure not to include the + * header twice. + */ + combined_size = bnd_prev->size + + (bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree)); + + if (combined_size <= r->page_size) { + /* + * We have two boundaries, but the data in the buffers can fit a + * single page. Merge the boundaries and create a single chunk. + */ + dsk = r->cur_img_ptr->mem; + memcpy((uint8_t *)r->prev_img_ptr->mem + bnd_prev->size, + WT_PAGE_HEADER_BYTE(btree, dsk), + bnd_cur->size - WT_PAGE_HEADER_BYTE_SIZE(btree)); + bnd_prev->size = combined_size; + bnd_prev->max_bnd_entries += bnd_cur->max_bnd_entries; + r->bnd_next--; + *chunks_merged = true; + } else { + if (bnd_cur->size < r->min_split_size && + bnd_prev->min_bnd_offset != 0 ) { + /* + * The last chunk, pointed to by the current image + * pointer, has less than the minimum data. Let's move + * any data more than the minimum from the previous + * image into the current. + */ + len_to_move = bnd_prev->size - bnd_prev->min_bnd_offset; + /* Grow current buffer if it is not large enough */ + if (r->space_avail < len_to_move) + WT_RET(__rec_split_grow(session, + r, len_to_move)); + cur_dsk_start = WT_PAGE_HEADER_BYTE(btree, + r->cur_img_ptr->mem); + + /* + * Shift the contents of the current buffer to make + * space for the data that will be prepended into the + * current buffer + */ + memmove(cur_dsk_start + len_to_move, + cur_dsk_start, bnd_cur->size - + WT_PAGE_HEADER_BYTE_SIZE(btree)); + /* + * copy any data more than the minimum, from the + * previous buffer to the start of the current. + */ + memcpy(cur_dsk_start, (uint8_t *)r->prev_img_ptr->mem + + bnd_prev->min_bnd_offset, len_to_move); + + /* Update boundary information */ + bnd_cur->size += (uint32_t)len_to_move; + bnd_prev->size -= (uint32_t)len_to_move; + bnd_cur->max_bnd_entries += bnd_prev->max_bnd_entries - + bnd_prev->min_bnd_entries; + bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; + bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; + WT_RET(__wt_buf_set(session, + &bnd_cur->max_bnd_key, bnd_prev->min_bnd_key.data, + bnd_prev->min_bnd_key.size)); + } + + /* Write out the previous image */ + WT_RET(__rec_split_write_prev_and_swap_buf(session, r)); + } + + /* + * At this point, there is only one disk image in the memory, pointed to + * by the previous image pointer. Update the current image pointer to + * this image. + */ + r->cur_img_ptr = r->prev_img_ptr; + return (0); +} + +/* * __rec_split_finish_std -- * Finish processing a page, standard version. */ static int __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BOUNDARY *bnd_cur; WT_PAGE_HEADER *dsk; - bool grow_bnd; + bool chunks_merged; /* * We may arrive here with no entries to write if the page was entirely @@ -3208,50 +3319,22 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); } - dsk = r->disk_image.mem; - - /* Set the number of entries for the just finished chunk. */ + /* Set the number of entries and size for the just finished chunk. */ bnd_cur = &r->bnd[r->bnd_next]; bnd_cur->max_bnd_entries = r->entries; + bnd_cur->size = WT_PTRDIFF32(r->first_free, r->cur_img_ptr->mem); - grow_bnd = true; - /* - * We can reach here even with raw_compression when the last split chunk - * is too small to be sent for raw compression. - */ - if (!r->is_bulk_load && !r->raw_compression) { - if (WT_PTRDIFF(r->first_free, dsk) > r->page_size && - r->bnd_next != 0) { - /* - * We hold two boundaries worth of data in the buffer, - * and this data doesn't fit in a single page. If the - * last chunk is too small, readjust the boundary to a - * pre-computed minimum. - * Write out the penultimate chunk to the disk as a page - */ - WT_RET(__rec_split_write_prev_and_shift_cur( - session, r, true)); - } else - if (r->bnd_next != 0) { - /* - * We have two boundaries, but the data in the - * buffer can fit a single page. Merge the - * boundaries to create a single chunk. - */ - bnd_prev = bnd_cur - 1; - bnd_prev->max_bnd_entries += - bnd_cur->max_bnd_entries; - r->bnd_next--; - grow_bnd = false; - } - } + chunks_merged = false; + if (r->prev_img_ptr != NULL) + WT_RET(__rec_split_finish_process_prev(session, + r, &chunks_merged)); /* * We already have space for an extra boundary if we merged two * boundaries above, in that case we do not need to grow the boundary * structure. */ - if (grow_bnd) + if (!chunks_merged) WT_RET(__rec_split_bnd_grow(session, r)); bnd_cur = &r->bnd[r->bnd_next]; r->bnd_next++; @@ -3260,14 +3343,15 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) * Current boundary now has all the remaining data/last page now. * Let's write it to the disk */ + dsk = r->cur_img_ptr->mem; dsk->recno = bnd_cur->max_bnd_recno; dsk->u.entries = bnd_cur->max_bnd_entries; - dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); - r->disk_image.size = dsk->mem_size; + dsk->mem_size = bnd_cur->size; + r->cur_img_ptr->size = dsk->mem_size; /* If this is a checkpoint, we're done, otherwise write the page. */ return (__rec_is_checkpoint(session, r, bnd_cur) ? - 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true)); + 0 : __rec_split_write(session, r, bnd_cur, r->cur_img_ptr, true)); } /* @@ -3289,7 +3373,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (r->raw_compression && r->entries != 0) { while (r->entries != 0) { data_size = - WT_PTRDIFF(r->first_free, r->disk_image.mem); + WT_PTRDIFF(r->first_free, r->cur_img_ptr->mem); if (data_size <= btree->allocsize) break; WT_RET(__rec_split_raw_worker(session, r, 0, true)); @@ -5882,7 +5966,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * write the buffer so we know what to do here. */ if (bnd->addr.addr == NULL) - WT_RET(__wt_bt_write(session, &r->disk_image, + WT_RET(__wt_bt_write(session, r->cur_img_ptr, NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING), bnd->already_compressed)); else { @@ -6546,7 +6630,7 @@ __rec_dictionary_lookup( for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash); dp != NULL && dp->hash == hash; dp = dp->next[0]) { WT_RET(__wt_cell_pack_data_match( - (WT_CELL *)((uint8_t *)r->disk_image.mem + dp->offset), + (WT_CELL *)((uint8_t *)r->cur_img_ptr->mem + dp->offset), &val->cell, val->buf.data, &match)); if (match) { WT_STAT_DATA_INCR(session, rec_dictionary); diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 590e17b6a2a..58f4f0750d7 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -458,6 +458,11 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * larger than any checkpoint LSN we have from the earlier time. */ WT_ERR(__recovery_file_scan(&r)); + /* + * The array can be re-allocated in recovery_file_scan. Reset + * our pointer after scanning all the files. + */ + metafile = &r.files[WT_METAFILE_ID]; conn->next_file_id = r.max_fileid; if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && @@ -509,6 +514,11 @@ __wt_txn_recover(WT_SESSION_IMPL *session) /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); + /* + * Clear this out. We no longer need it and it could have been + * re-allocated when scanning the files. + */ + metafile = NULL; /* * We no longer need the metadata cursor: close it to avoid pinning any diff --git a/src/third_party/wiredtiger/test/mciproject.yml b/src/third_party/wiredtiger/test/mciproject.yml index 6456475aa00..72022fe46ec 100644 --- a/src/third_party/wiredtiger/test/mciproject.yml +++ b/src/third_party/wiredtiger/test/mciproject.yml @@ -95,6 +95,16 @@ tasks: script: | set -o errexit set -o verbose + + # On 10.12, change the binary location with install_name_tool since DYLD_LIBRARY_PATH + # appears not to work for dynamic modules loaded by python. For wt, the libtool generated + # script has the wrong path for running on test machines. + if [ "$(uname -s)" == "Darwin" ]; then + WT_VERSION=$(m4 build_posix/aclocal/version.m4) + install_name_tool -change /usr/local/lib/libwiredtiger-$WT_VERSION.dylib $(pwd)/.libs/libwiredtiger-$WT_VERSION.dylib lang/python/_wiredtiger.so + install_name_tool -change /usr/local/lib/libwiredtiger-$WT_VERSION.dylib $(pwd)/.libs/libwiredtiger-$WT_VERSION.dylib .libs/wt + fi + ${test_env_vars|} python ./test/suite/run.py -v 2 ${smp_command|} 2>&1 - name: compile-windows-alt @@ -182,10 +192,10 @@ buildvariants: #- name: format - Enable when we have a solution for hangs and crashses - name: fops -- name: osx-1010 - display_name: OS X 10.10 +- name: macos-1012 + display_name: OS X 10.12 run_on: - - osx-1010 + - macos-1012 expansions: smp_command: -j $(sysctl -n hw.logicalcpu) configure_env_vars: PATH=/opt/local/bin:$PATH @@ -195,3 +205,4 @@ buildvariants: - name: compile - name: unit-test - name: fops + |