diff options
author | Luke Chen <luke.chen@mongodb.com> | 2017-10-19 15:51:08 +1100 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2017-10-19 15:54:58 +1100 |
commit | 2f9f471cd5d5217023f9645fff83ff79167a8fbf (patch) | |
tree | 0df4e8ea44eecb0dbe23a3b69c27d718a99fe00e | |
parent | 59f7859bc5db014940c36bf60f339992c3b1e672 (diff) | |
download | mongo-2f9f471cd5d5217023f9645fff83ff79167a8fbf.tar.gz |
Import wiredtiger: 4ecdb8f1a327067a178258ad025806eeefc1267b from branch mongodb-3.6
ref: 4b5ade6072..4ecdb8f1a3
for: 3.6.0-rc1
WT-3553 Change test utility error handling to abort rather than exit
WT-3611 Backup comment doesn't match the code.
WT-3619 Make compaction more aware of checkpoints and eviction
WT-3635 Coverity 1381606 & Friday builds & lint.
WT-3650 test_timestamp07(table-cg.nolog.1000keys)
WT-3651 Reduce runtime of Python lookaside test
WT-3655 Don't dirty pages to induce lookaside eviction
WT-3657 Timestamp and lookaside related automated test failures
WT-3660 WiredTiger documentation refers to WT_CURSOR::first.
WT-3662 Write lookaside after reconciliation has succeeded
WT-3663 lookaside records ignored unless a backing disk block written
WT-3665 change format to configure the WiredTiger checkpoint thread
36 files changed, 775 insertions, 529 deletions
diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index df897bcb91e..7ddbff62a63 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -38,7 +38,6 @@ flags = { 'rec_write' : [ 'REC_CHECKPOINT', 'REC_EVICT', - 'REC_INMEM_SPLIT', 'REC_IN_MEMORY', 'REC_LOOKASIDE', 'REC_SCRUB', diff --git a/src/third_party/wiredtiger/dist/function.py b/src/third_party/wiredtiger/dist/function.py index f7118e91874..0e36a539cc4 100644 --- a/src/third_party/wiredtiger/dist/function.py +++ b/src/third_party/wiredtiger/dist/function.py @@ -20,11 +20,12 @@ def missing_comment(): (f, s[:m.start(2)].count('\n'), m.group(2)) # Sort helper function, discard * operators so a pointer doesn't necessarily -# sort before non-pointers, ignore const/volatile keywords. +# sort before non-pointers, ignore const/static/volatile keywords. def function_args_alpha(text): s = text.strip() s = re.sub("[*]","", s) s = re.sub("^const ","", s) + s = re.sub("^static ","", s) s = re.sub("^volatile ","", s) return s @@ -68,9 +69,10 @@ types = [ # Return the sort order of a variable declaration, or no-match. # This order isn't defensible: it's roughly how WiredTiger looked when we # settled on a style, and it's roughly what the KNF/BSD styles look like. -def function_args(line): +def function_args(name, line): line = line.strip() line = re.sub("^const ", "", line) + line = re.sub("^static ", "", line) line = re.sub("^volatile ", "", line) # Let WT_UNUSED terminate the parse. It often appears at the beginning @@ -86,8 +88,7 @@ def function_args(line): # Check for illegal types. for m in illegal_types: if re.search('^' + m + "\s*[\w(*]", line): - print >>sys.stderr, \ - m + ": illegal declaration: " + line.strip() + print >>sys.stderr, name + ": illegal type: " + line.strip() sys.exit(1) # Check for matching types. @@ -117,17 +118,36 @@ def function_declaration(): if not tracking: tfile.write(line) if re.search('^{$', line): - r = [[] for i in range(len(types))] + list = [[] for i in range(len(types))] + static_list = [[] for i in range(len(types))] tracking = True; continue - found,n = function_args(line) + found,n = function_args(name, line) if found: - r[n].append(line) - else : + # List statics first. + if re.search("^\sstatic", line): + static_list[n].append(line) + continue + + # Disallow assignments in the declaration. Ignore braces + # to allow automatic array initialization using constant + # initializers (and we've already skipped statics, which + # are also typically initialized in the declaration). + if re.search("\s=\s[-\w]", line): + print >>sys.stderr, \ + name + ": assignment in string: " + line.strip() + sys.exit(1); + + list[n].append(line) + else: # Sort the resulting lines (we don't yet sort declarations - # within a single line). - for arg in filter(None, r): + # within a single line). It's two passes, first to catch + # the statics, then to catch everything else. + for arg in filter(None, static_list): + for p in sorted(arg, key=function_args_alpha): + tfile.write(p) + for arg in filter(None, list): for p in sorted(arg, key=function_args_alpha): tfile.write(p) tfile.write(line) diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index f5e704d7625..3c022218fb0 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "4b5ade6072d548fdebe3b376f94e0d672eea5359", + "commit": "4ecdb8f1a327067a178258ad025806eeefc1267b", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.6" diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c index fe860034f88..75bb46aaf89 100644 --- a/src/third_party/wiredtiger/src/btree/bt_compact.c +++ b/src/third_party/wiredtiger/src/btree/bt_compact.c @@ -31,17 +31,6 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) mod = page->modify; /* - * Ignore the root: it may not have a replacement address, and besides, - * if anything else gets written, so will it. - */ - if (__wt_ref_is_root(ref)) - return (0); - - /* Ignore currently dirty pages, they will be written regardless. */ - if (__wt_page_is_modified(page)) - return (0); - - /* * If the page is clean, test the original addresses. * If the page is a replacement, test the replacement addresses. * Ignore empty pages, they get merged into the parent. @@ -86,6 +75,45 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) } /* + * __compact_rewrite_lock -- + * Lock out checkpoints and return if a page needs to be re-written. + */ +static int +__compact_rewrite_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) +{ + WT_BTREE *btree; + WT_DECL_RET; + + *skipp = true; /* Default skip. */ + + btree = S2BT(session); + + /* + * Reviewing in-memory pages requires looking at page reconciliation + * results, because we care about where the page is stored now, not + * where the page was stored when we first read it into the cache. + * We need to ensure we don't race with page reconciliation as it's + * writing the page modify information. + * + * There are two ways we call reconciliation: checkpoints and eviction. + * Get the tree's flush lock which blocks threads writing pages for + * checkpoints. If checkpoint is holding the lock, quit working this + * file, we'll visit it again in our next pass. + * + * Serializing with eviction is not quite as simple, and it gets done + * in the underlying function that checks modification information. + */ + WT_RET(__wt_spin_trylock(session, &btree->flush_lock)); + + ret = __compact_rewrite(session, ref, skipp); + + /* Unblock threads writing leaf pages. */ + __wt_spin_unlock(session, &btree->flush_lock); + + return (ret); +} + +/* * __wt_compact -- * Compact a file. */ @@ -93,14 +121,12 @@ int __wt_compact(WT_SESSION_IMPL *session) { WT_BM *bm; - WT_BTREE *btree; WT_DECL_RET; WT_REF *ref; u_int i; bool skip; - btree = S2BT(session); - bm = btree->bm; + bm = S2BT(session)->bm; ref = NULL; WT_STAT_DATA_INCR(session, session_compact); @@ -114,28 +140,29 @@ __wt_compact(WT_SESSION_IMPL *session) if (skip) return (0); - /* - * Reviewing in-memory pages requires looking at page reconciliation - * results, because we care about where the page is stored now, not - * where the page was stored when we first read it into the cache. - * We need to ensure we don't race with page reconciliation as it's - * writing the page modify information. - * - * There are two ways we call reconciliation: checkpoints and eviction. - * Get the tree's flush lock which blocks threads writing pages for - * checkpoints. - */ - __wt_spin_lock(session, &btree->flush_lock); - /* Walk the tree reviewing pages to see if they should be re-written. */ for (i = 0;;) { - /* Periodically check if we've run out of time. */ + /* + * Periodically check if we've timed out or eviction is stuck. + * Quit if eviction is stuck, we're making the problem worse. + */ if (++i > 100) { WT_ERR(__wt_session_compact_check_timeout(session)); + + if (__wt_cache_stuck(session)) + WT_ERR(EBUSY); + i = 0; } /* + * Compact pulls pages into cache during the walk without + * checking whether the cache is full. Check now to throttle + * compact to match eviction speed. + */ + WT_ERR(__wt_cache_eviction_check(session, false, NULL)); + + /* * Pages read for compaction aren't "useful"; don't update the * read generation of pages already in memory, and if a page is * read, set its generation to a low value so it is evicted @@ -147,25 +174,34 @@ __wt_compact(WT_SESSION_IMPL *session) if (ref == NULL) break; - WT_ERR(__compact_rewrite(session, ref, &skip)); - if (skip) + /* + * Cheap checks that don't require locking. + * + * Ignore the root: it may not have a replacement address, and + * besides, if anything else gets written, so will it. + * + * Ignore dirty pages, checkpoint writes them regardless. + */ + if (__wt_ref_is_root(ref)) + continue; + if (__wt_page_is_modified(ref->page)) continue; - session->compact_state = WT_COMPACT_SUCCESS; + WT_ERR(__compact_rewrite_lock(session, ref, &skip)); + if (skip) + continue; /* Rewrite the page: mark the page and tree dirty. */ WT_ERR(__wt_page_modify_init(session, ref->page)); __wt_page_modify_set(session, ref->page); + session->compact_state = WT_COMPACT_SUCCESS; WT_STAT_DATA_INCR(session, btree_compact_rewrite); } err: if (ref != NULL) WT_TRET(__wt_page_release(session, ref, 0)); - /* Unblock threads writing leaf pages. */ - __wt_spin_unlock(session, &btree->flush_lock); - return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index f0388bd1f07..caa960d78ae 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -478,8 +478,8 @@ __debug_dsk_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk) static char * __debug_tree_shape_info(WT_PAGE *page) { - uint64_t v; static char buf[128]; + uint64_t v; const char *unit; v = page->memory_footprint; diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 9d4e860f8fd..0246c1eca66 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -300,12 +300,11 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) { struct timespec start, stop; WT_BTREE *btree; - WT_CURSOR *las_cursor; WT_DECL_RET; WT_ITEM tmp; WT_PAGE *page; size_t addr_size; - uint32_t new_state, previous_state, session_flags; + uint32_t new_state, previous_state; const uint8_t *addr; bool timer; @@ -411,12 +410,9 @@ skip_read: * entries. Note that we are discarding updates so the page * must be marked available even if these operations fail. */ - __wt_las_cursor(session, &las_cursor, &session_flags); WT_TRET(__wt_las_remove_block( - session, las_cursor, btree->id, ref->page_las->las_pageid)); + session, NULL, btree->id, ref->page_las->las_pageid)); __wt_free(session, ref->page_las); - WT_TRET(__wt_las_cursor_close( - session, &las_cursor, session_flags)); } done: WT_PUBLISH(ref->state, WT_REF_MEM); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 71007e76dfd..1ccb27c2296 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1495,8 +1495,8 @@ __split_multi_inmem( * tombstone away: we may need it to correctly resolve * modifications. */ - if (supd->onpage_upd->type == WT_UPDATE_DELETED && - prev_upd != NULL) + if (prev_upd != NULL && + prev_upd->type == WT_UPDATE_DELETED) prev_upd = prev_upd->next; if (prev_upd != NULL) { __wt_update_obsolete_free( @@ -1620,8 +1620,11 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, break; } - /* There should be an address or a disk image (or both). */ - WT_ASSERT(session, + /* + * There can be an address or a disk image or both, but if there is + * neither, there must be a backing lookaside page. + */ + WT_ASSERT(session, multi->las_pageid != 0 || multi->addr.addr != NULL || multi->disk_image != NULL); /* If closing the file, there better be an address. */ @@ -1652,16 +1655,23 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, addr->type = multi->addr.type; WT_RET(__wt_memdup(session, multi->addr.addr, addr->size, &addr->addr)); - if (multi->las_pageid != 0) { - WT_RET(__wt_calloc_one(session, &ref->page_las)); - ref->page_las->las_pageid = multi->las_pageid; + + ref->state = WT_REF_DISK; + } + + /* + * Copy any associated lookaside reference, potentially resetting + * WT_REF.state. Regardless of a backing address, WT_REF_LOOKASIDE + * overrides WT_REF_DISK. + */ + if (multi->las_pageid != 0) { + WT_RET(__wt_calloc_one(session, &ref->page_las)); + ref->page_las->las_pageid = multi->las_pageid; #ifdef HAVE_TIMESTAMPS - __wt_timestamp_set(&ref->page_las->min_timestamp, - &multi->las_min_timestamp); + __wt_timestamp_set( + &ref->page_las->min_timestamp, &multi->las_min_timestamp); #endif - ref->state = WT_REF_LOOKASIDE; - } else - ref->state = WT_REF_DISK; + ref->state = WT_REF_LOOKASIDE; } /* diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index ccf16674a68..00dafb680da 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -259,6 +259,184 @@ __wt_las_cursor_close( } /* + * __las_insert_block_verbose -- + * Display a verbose message once per checkpoint with details about the + * cache state when performing a lookaside table write. + */ +static void +__las_insert_block_verbose( + WT_SESSION_IMPL *session, uint32_t btree_id, uint64_t las_pageid) +{ +#ifdef HAVE_VERBOSE + WT_CONNECTION_IMPL *conn; + uint64_t ckpt_gen_current, ckpt_gen_last; + uint32_t pct_dirty, pct_full; + + if (!WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE)) + return; + + conn = S2C(session); + ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); + ckpt_gen_last = conn->las_verb_gen_write; + + /* + * This message is throttled to one per checkpoint. To do this we + * track the generation of the last checkpoint for which the message + * was printed and check against the current checkpoint generation. + */ + if (ckpt_gen_current > ckpt_gen_last) { + /* + * Attempt to atomically replace the last checkpoint generation + * for which this message was printed. If the atomic swap fails + * we have raced and the winning thread will print the message. + */ + if (__wt_atomic_casv64(&conn->las_verb_gen_write, + ckpt_gen_last, ckpt_gen_current)) { + (void)__wt_eviction_clean_needed(session, &pct_full); + (void)__wt_eviction_dirty_needed(session, &pct_dirty); + + __wt_verbose(session, WT_VERB_LOOKASIDE, + "Page reconciliation triggered lookaside write" + "file ID %" PRIu32 ", page ID %" PRIu64 ". " + "Entries now in lookaside file: %" PRId64 ", " + "cache dirty: %" PRIu32 "%% , " + "cache use: %" PRIu32 "%%", + btree_id, las_pageid, + WT_STAT_READ(conn->stats, cache_lookaside_entries), + pct_dirty, pct_full); + } + } +#else + WT_UNUSED(session); + WT_UNUSED(btree_id); + WT_UNUSED(las_pageid); +#endif +} + +/* + * __wt_las_insert_block -- + * Copy one set of saved updates into the database's lookaside buffer. + */ +int +__wt_las_insert_block(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key) +{ + WT_ITEM las_timestamp, las_value; + WT_SAVE_UPD *list; + WT_UPDATE *upd; + uint64_t insert_cnt, las_counter, las_pageid; + uint32_t btree_id, i, slot; + uint8_t *p; + + WT_CLEAR(las_timestamp); + WT_CLEAR(las_value); + insert_cnt = 0; + + btree_id = S2BT(session)->id; + las_pageid = multi->las_pageid = + __wt_atomic_add64(&S2BT(session)->las_pageid, 1); + + /* + * Make sure there are no leftover entries (e.g., from a handle + * reopen). + */ + WT_RET(__wt_las_remove_block(session, cursor, btree_id, las_pageid)); + + /* Enter each update in the boundary's list into the lookaside store. */ + for (las_counter = 0, i = 0, + list = multi->supd; i < multi->supd_entries; ++i, ++list) { + /* Lookaside table key component: source key. */ + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = key->mem; + WT_RET( + __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); + key->size = WT_PTRDIFF(p, key->data); + break; + case WT_PAGE_ROW_LEAF: + if (list->ins == NULL) + WT_RET(__wt_row_leaf_key( + session, page, list->ripcip, key, false)); + else { + key->data = WT_INSERT_KEY(list->ins); + key->size = WT_INSERT_KEY_SIZE(list->ins); + } + break; + WT_ILLEGAL_VALUE(session); + } + + /* + * Lookaside table value component: update reference. Updates + * come from the row-store insert list (an inserted item), or + * update array (an update to an original on-page item), or from + * a column-store insert list (column-store format has no update + * array, the insert list contains both inserted items and + * updates to original on-page items). When rolling forward a + * modify update from an original on-page item, we need an + * on-page slot so we can find the original on-page item. When + * rolling forward from an inserted item, no on-page slot is + * possible. + */ + slot = UINT32_MAX; /* Impossible slot */ + if (list->ripcip != NULL) + slot = page->type == WT_PAGE_ROW_LEAF ? + WT_ROW_SLOT(page, list->ripcip) : + WT_COL_SLOT(page, list->ripcip); + upd = list->ins == NULL ? + page->modify->mod_row_update[slot] : list->ins->upd; + + /* + * Walk the list of updates, storing each key/value pair into + * the lookaside table. Skip aborted items (there's no point + * to restoring them), and assert we never see a reserved item. + */ + do { + if (upd->txnid == WT_TXN_ABORTED) + continue; + + switch (upd->type) { + case WT_UPDATE_DELETED: + las_value.size = 0; + break; + case WT_UPDATE_MODIFIED: + case WT_UPDATE_STANDARD: + las_value.data = upd->data; + las_value.size = upd->size; + break; + case WT_UPDATE_RESERVED: + WT_ASSERT(session, + upd->type != WT_UPDATE_RESERVED); + continue; + } + + cursor->set_key(cursor, + btree_id, las_pageid, ++las_counter, key); + +#ifdef HAVE_TIMESTAMPS + las_timestamp.data = &upd->timestamp; + las_timestamp.size = WT_TIMESTAMP_SIZE; +#endif + cursor->set_value(cursor, + upd->txnid, &las_timestamp, upd->type, &las_value); + + WT_RET(cursor->insert(cursor)); + ++insert_cnt; + } while ((upd = upd->next) != NULL); + } + + __wt_free(session, multi->supd); + multi->supd_entries = 0; + + if (insert_cnt > 0) { + WT_STAT_CONN_INCRV( + session, cache_lookaside_entries, insert_cnt); + __las_insert_block_verbose(session, btree_id, las_pageid); + } + return (0); +} + +/* * __wt_las_remove_block -- * Remove all records matching a key prefix from the lookaside store. */ @@ -269,10 +447,18 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, WT_DECL_RET; WT_ITEM las_key; uint64_t las_counter, las_pageid, remove_cnt; - uint32_t las_id; + uint32_t las_id, session_flags; int exact; + bool local_cursor; remove_cnt = 0; + session_flags = 0; /* [-Wconditional-uninitialized] */ + + local_cursor = false; + if (cursor == NULL) { + __wt_las_cursor(session, &cursor, &session_flags); + local_cursor = true; + } /* * Search for the block's unique prefix and step through all matching @@ -301,6 +487,9 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, } WT_ERR_NOTFOUND_OK(ret); -err: WT_STAT_CONN_DECRV(session, cache_lookaside_entries, remove_cnt); +err: if (local_cursor) + WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + WT_STAT_CONN_DECRV(session, cache_lookaside_entries, remove_cnt); return (ret); } diff --git a/src/third_party/wiredtiger/src/config/config.c b/src/third_party/wiredtiger/src/config/config.c index 9669d5bb39f..dd46aa55ad1 100644 --- a/src/third_party/wiredtiger/src/config/config.c +++ b/src/third_party/wiredtiger/src/config/config.c @@ -340,12 +340,15 @@ static const int8_t goesc[256] = { static int __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) { - WT_CONFIG_ITEM *out = key; - int utf8_remain = 0; + WT_CONFIG_ITEM *out; + int utf8_remain; static const WT_CONFIG_ITEM true_value = { "", 0, 1, WT_CONFIG_ITEM_BOOL }; + out = key; + utf8_remain = 0; + key->len = 0; /* Keys with no value default to true. */ *value = true_value; diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index 9781688217f..ecb0e02929f 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -253,16 +253,11 @@ __backup_start( WT_ERR(__wt_fopen(session, WT_BACKUP_TMP, WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &cb->bfs)); /* - * If a list of targets was specified, work our way through them. - * Else, generate a list of all database objects. - * - * Include log files if doing a full backup, and copy them before - * copying data files to avoid rolling the metadata forward across - * a checkpoint that completes during the backup. + * If targets were specified, add them to the list. Otherwise it is a + * full backup, add all database objects and log files to the list. */ target_list = false; WT_ERR(__backup_uri(session, cfg, &target_list, &log_only)); - if (!target_list) { WT_ERR(__backup_log_append(session, cb, true)); WT_ERR(__backup_all(session)); diff --git a/src/third_party/wiredtiger/src/docs/basic-api.dox b/src/third_party/wiredtiger/src/docs/basic-api.dox index 2b810e6676b..dceb82b06ba 100644 --- a/src/third_party/wiredtiger/src/docs/basic-api.dox +++ b/src/third_party/wiredtiger/src/docs/basic-api.dox @@ -106,7 +106,7 @@ by a previous run of the example). No data extraction or conversion is required in the application. Because the cursor was positioned in the table after the WT_CURSOR::insert -call, we had to re-position it using the WT_CURSOR::first call; if we +call, we had to re-position it using the WT_CURSOR::reset call; if we weren't using the cursor for the call to WT_CURSOR::insert above, this loop would simplify to: diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index f2a09a0a769..bd70de8bddb 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -16,12 +16,11 @@ int __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { WT_BTREE *btree; - WT_CURSOR *las_cursor; WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; - uint32_t session_flags, walk_flags; + uint32_t walk_flags; dhandle = session->dhandle; btree = dhandle->handle; @@ -58,12 +57,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { WT_ASSERT(session, !WT_IS_METADATA(dhandle)); - __wt_las_cursor(session, &las_cursor, &session_flags); - WT_TRET(__wt_las_remove_block( - session, las_cursor, btree->id, 0)); - WT_TRET(__wt_las_cursor_close( - session, &las_cursor, session_flags)); - WT_RET(ret); + WT_RET(__wt_las_remove_block(session, NULL, btree->id, 0)); } else FLD_SET(walk_flags, WT_READ_LOOKASIDE); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 2bc359df4ae..02208e0f84a 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -113,9 +113,11 @@ __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref) static int WT_CDECL __evict_lru_cmp(const void *a_arg, const void *b_arg) { - const WT_EVICT_ENTRY *a = a_arg, *b = b_arg; + const WT_EVICT_ENTRY *a, *b; uint64_t a_score, b_score; + a = a_arg; + b = b_arg; a_score = (a->ref == NULL ? UINT64_MAX : a->score); b_score = (b->ref == NULL ? UINT64_MAX : b->score); @@ -1884,24 +1886,6 @@ __evict_walk_file(WT_SESSION_IMPL *session, F_ISSET(btree, WT_BTREE_LOOKASIDE)) goto fast; - /* - * If application threads are blocked waiting for eviction (so - * we are going to consider lookaside), and the only thing - * preventing a clean page from being evicted is that it - * contains historical data, mark it dirty so we can do - * lookaside eviction. - */ - if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD | - WT_CACHE_EVICT_DIRTY_HARD) && - !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) && - !modified && page->modify != NULL && - !__wt_txn_visible_all(session, page->modify->rec_max_txn, - WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp))) { - __wt_page_only_modify_set(session, page); - modified = true; - goto fast; - } - /* Skip clean pages if appropriate. */ if (!modified && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) continue; diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 4b7c71c19ee..edf80ec4460 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -10,7 +10,7 @@ static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, bool); static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, bool); -static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool, uint32_t *); +static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool, bool *); /* * __evict_exclusive_clear -- @@ -122,8 +122,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; - uint32_t flags; - bool clean_page, tree_dead; + bool clean_page, inmem_split, tree_dead; conn = S2C(session); @@ -143,13 +142,13 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * to make this check for clean pages, too: while unlikely eviction * would choose an internal page with children, it's not disallowed. */ - WT_ERR(__evict_review(session, ref, closing, &flags)); + WT_ERR(__evict_review(session, ref, closing, &inmem_split)); /* * If there was an in-memory split, the tree has been left in the state * we want: there is nothing more to do. */ - if (LF_ISSET(WT_REC_INMEM_SPLIT)) + if (inmem_split) goto done; /* Count evictions of internal pages during normal operation. */ @@ -428,7 +427,7 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) */ static int __evict_review( - WT_SESSION_IMPL *session, WT_REF *ref, bool closing, uint32_t *flagsp) + WT_SESSION_IMPL *session, WT_REF *ref, bool closing, bool *inmem_splitp) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; @@ -437,11 +436,12 @@ __evict_review( uint32_t flags; bool lookaside_retry, *lookaside_retryp, modified; + *inmem_splitp = false; + conn = S2C(session); flags = WT_REC_EVICT; if (!WT_SESSION_IS_CHECKPOINT(session)) LF_SET(WT_REC_VISIBLE_ALL); - *flagsp = flags; /* * Get exclusive access to the page if our caller doesn't have the tree @@ -508,9 +508,8 @@ __evict_review( WT_RET(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT)); - if (!__wt_page_can_evict(session, ref, flagsp)) + if (!__wt_page_can_evict(session, ref, inmem_splitp)) return (EBUSY); - flags = *flagsp; /* * Check for an append-only workload needing an in-memory @@ -519,7 +518,7 @@ __evict_review( * the page stays in memory and the tree is left in the desired * state: avoid the usual cleanup. */ - if (LF_ISSET(WT_REC_INMEM_SPLIT)) + if (*inmem_splitp) return (__wt_split_insert(session, ref)); } @@ -609,7 +608,6 @@ __evict_review( ret = __wt_reconcile(session, ref, NULL, flags, NULL); } - *flagsp = flags; WT_RET(ret); /* diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 35c7d5d5a1a..8803f3b907d 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -462,7 +462,7 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page, bool rewrite) * real progress. */ if (rewrite) - (void)__wt_atomic_subv64(&cache->pages_inmem, 1); + (void)__wt_atomic_sub64(&cache->pages_inmem, 1); else (void)__wt_atomic_addv64(&cache->pages_evict, 1); } @@ -1287,13 +1287,15 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * Check whether a page can be evicted. */ static inline bool -__wt_page_can_evict( - WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *evict_flagsp) +__wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) { WT_PAGE *page; WT_PAGE_MODIFY *mod; bool modified; + if (inmem_splitp != NULL) + *inmem_splitp = false; + page = ref->page; mod = page->modify; @@ -1318,8 +1320,8 @@ __wt_page_can_evict( * won't be written or discarded from the cache. */ if (__wt_leaf_page_can_split(session, page)) { - if (evict_flagsp != NULL) - FLD_SET(*evict_flagsp, WT_REC_INMEM_SPLIT); + if (inmem_splitp != NULL) + *inmem_splitp = true; return (true); } diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 0b9e82ee1ef..3f890a50d2b 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -372,8 +372,8 @@ struct __wt_connection_impl { * checkpoint. To accomplish this we track the checkpoint generation * for the most recent read and write verbose messages. */ - volatile uint64_t las_verb_gen_read; - volatile uint64_t las_verb_gen_write; + uint64_t las_verb_gen_read; + uint64_t las_verb_gen_write; /* Set of btree IDs not being rolled back */ uint8_t *stable_rollback_bitstring; diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 23897a05dfb..71bda687659 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -206,6 +206,7 @@ extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE extern int __wt_las_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_checksum_sw(const void *chunk, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_checksum_init(void); @@ -434,7 +435,7 @@ extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTR extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot); extern int64_t __wt_log_slot_release(WT_MYSLOT *myslot, int64_t size); extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int __wt_log_system_record( WT_SESSION_IMPL *session, WT_FH *log_fh, WT_LSN *lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_log_system_record(WT_SESSION_IMPL *session, WT_FH *log_fh, WT_LSN *lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_log_recover_system(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *lsnp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verbose_dump_log(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 65b4ce34752..ec0ef0a5311 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -45,13 +45,12 @@ #define WT_READ_WONT_NEED 0x00000800 #define WT_REC_CHECKPOINT 0x00000001 #define WT_REC_EVICT 0x00000002 -#define WT_REC_INMEM_SPLIT 0x00000004 -#define WT_REC_IN_MEMORY 0x00000008 -#define WT_REC_LOOKASIDE 0x00000010 -#define WT_REC_SCRUB 0x00000020 -#define WT_REC_UPDATE_RESTORE 0x00000040 -#define WT_REC_VISIBILITY_ERR 0x00000080 -#define WT_REC_VISIBLE_ALL 0x00000100 +#define WT_REC_IN_MEMORY 0x00000004 +#define WT_REC_LOOKASIDE 0x00000008 +#define WT_REC_SCRUB 0x00000010 +#define WT_REC_UPDATE_RESTORE 0x00000020 +#define WT_REC_VISIBILITY_ERR 0x00000040 +#define WT_REC_VISIBLE_ALL 0x00000080 #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_INTERNAL 0x00000002 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i index 01023b1ba88..e1cf158c660 100644 --- a/src/third_party/wiredtiger/src/include/packing.i +++ b/src/third_party/wiredtiger/src/include/packing.i @@ -719,8 +719,10 @@ __wt_struct_unpackv(WT_SESSION_IMPL *session, static inline void __wt_struct_size_adjust(WT_SESSION_IMPL *session, size_t *sizep) { - size_t curr_size = *sizep; - size_t field_size, prev_field_size = 1; + size_t curr_size, field_size, prev_field_size; + + curr_size = *sizep; + prev_field_size = 1; while ((field_size = __wt_vsize_uint(curr_size)) != prev_field_size) { curr_size += field_size - prev_field_size; diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index fb22d7b013f..89fe64c6f18 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -2577,11 +2577,13 @@ __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) WT_DECL_ITEM(logrec); WT_DECL_RET; size_t header_size, len; - uint32_t rectype = WT_LOGREC_MESSAGE; - const char *rec_fmt = WT_UNCHECKED_STRING(I); + uint32_t rectype; + const char *rec_fmt; va_list ap_copy; conn = S2C(session); + rectype = WT_LOGREC_MESSAGE; + rec_fmt = WT_UNCHECKED_STRING(I); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); diff --git a/src/third_party/wiredtiger/src/log/log_sys.c b/src/third_party/wiredtiger/src/log/log_sys.c index 32297fd6280..ad65aaf9a8a 100644 --- a/src/third_party/wiredtiger/src/log/log_sys.c +++ b/src/third_party/wiredtiger/src/log/log_sys.c @@ -13,8 +13,7 @@ * Write a system log record for the previous LSN. */ int -__wt_log_system_record( - WT_SESSION_IMPL *session, WT_FH *log_fh, WT_LSN *lsn) +__wt_log_system_record(WT_SESSION_IMPL *session, WT_FH *log_fh, WT_LSN *lsn) { WT_DECL_ITEM(logrec_buf); WT_DECL_RET; @@ -23,10 +22,13 @@ __wt_log_system_record( WT_LOG_RECORD *logrec; WT_MYSLOT myslot; size_t recsize; - uint32_t rectype = WT_LOGREC_SYSTEM; - const char *fmt = WT_UNCHECKED_STRING(I); + uint32_t rectype; + const char *fmt; log = S2C(session)->log; + rectype = WT_LOGREC_SYSTEM; + fmt = WT_UNCHECKED_STRING(I); + WT_RET(__wt_logrec_alloc(session, log->allocsize, &logrec_buf)); memset((uint8_t *)logrec_buf->mem, 0, log->allocsize); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index a62489cb661..4056722a13c 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -199,6 +199,7 @@ typedef struct { WT_SAVE_UPD *supd; /* Saved updates */ uint32_t supd_next; size_t supd_allocated; + size_t supd_memsize; /* Size of saved update structures */ /* List of pages we've written so far. */ WT_MULTI *multi; @@ -313,6 +314,8 @@ static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *, static int __rec_destroy_session(WT_SESSION_IMPL *); static int __rec_init(WT_SESSION_IMPL *, WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); +static int __rec_las_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *); +static int __rec_las_wrapup_err(WT_SESSION_IMPL *, WT_RECONCILE *); static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t); static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); @@ -326,8 +329,6 @@ static int __rec_split_row_promote( WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t); static int __rec_split_write( WT_SESSION_IMPL *, WT_RECONCILE *, WT_CHUNK *, WT_ITEM *, bool); -static int __rec_update_las( - WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_MULTI *); static int __rec_write_check_complete( WT_SESSION_IMPL *, WT_RECONCILE *, int, bool *); static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *); @@ -340,8 +341,6 @@ static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int); static int __rec_dictionary_lookup( WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **); static void __rec_dictionary_reset(WT_RECONCILE *); -static void __rec_verbose_lookaside_write( - WT_SESSION_IMPL *, uint32_t, uint64_t); /* * __wt_reconcile -- @@ -629,7 +628,7 @@ __rec_write_check_complete( * Check if lookaside eviction is possible. If any of the updates we * saw were uncommitted, the lookaside table cannot be used. */ - if (r->update_used || r->update_uncommitted) + if (r->update_uncommitted || r->update_used) return (0); *lookaside_retryp = true; @@ -808,8 +807,8 @@ err: __wt_page_out(session, &next); * Configure raw compression. */ static inline bool -__rec_raw_compression_config( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +__rec_raw_compression_config(WT_SESSION_IMPL *session, + uint32_t flags, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) { WT_BTREE *btree; @@ -825,6 +824,14 @@ __rec_raw_compression_config( return (false); /* + * XXX + * Turn off if lookaside is configured: lookaside potentially writes + * blocks without entries and raw compression isn't ready for that. + */ + if (LF_ISSET(WT_REC_LOOKASIDE)) + return (false); + + /* * Raw compression cannot support dictionary compression. (Technically, * we could still use the raw callback on column-store variable length * internal pages with dictionary compression configured, because @@ -956,14 +963,14 @@ __rec_init(WT_SESSION_IMPL *session, #endif /* Track if updates were used and/or uncommitted. */ - r->update_used = r->update_uncommitted = false; + r->update_uncommitted = r->update_used = false; /* Track if the page can be marked clean. */ r->leave_dirty = false; /* Raw compression. */ r->raw_compression = - __rec_raw_compression_config(session, page, salvage); + __rec_raw_compression_config(session, flags, page, salvage); r->raw_destination.flags = WT_ITEM_ALIGNED; /* Track overflow items. */ @@ -975,6 +982,7 @@ __rec_init(WT_SESSION_IMPL *session, /* The list of saved updates is reused. */ r->supd_next = 0; + r->supd_memsize = 0; /* The list of pages we've written. */ r->multi = NULL; @@ -1125,8 +1133,8 @@ __rec_destroy_session(WT_SESSION_IMPL *session) * Save a WT_UPDATE list for later restoration. */ static int -__rec_update_save(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_UPDATE *onpage_upd) +__rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, + WT_INSERT *ins, void *ripcip, WT_UPDATE *onpage_upd, size_t upd_memsize) { WT_RET(__wt_realloc_def( session, &r->supd_allocated, r->supd_next + 1, &r->supd)); @@ -1134,6 +1142,7 @@ __rec_update_save(WT_SESSION_IMPL *session, r->supd[r->supd_next].ripcip = ripcip; r->supd[r->supd_next].onpage_upd = onpage_upd; ++r->supd_next; + r->supd_memsize += upd_memsize; return (0); } @@ -1143,22 +1152,24 @@ __rec_update_save(WT_SESSION_IMPL *session, */ static int __rec_append_orig_value(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_UPDATE *first_upd, WT_CELL_UNPACK *unpack) + WT_PAGE *page, WT_UPDATE *upd, WT_CELL_UNPACK *unpack) { WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_UPDATE *append, *upd; + WT_UPDATE *append; size_t size; - /* - * If at least one self-contained update is globally visible, we're - * done. - */ - for (upd = first_upd; upd != NULL; upd = upd->next) + /* Done if at least one self-contained update is globally visible. */ + for (;; upd = upd->next) { if (WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd)) return (0); + /* Leave reference at the last item in the chain. */ + if (upd->next == NULL) + break; + } + /* * We need the original on-page value for some reader: get a copy and * append it to the end of the update list with a transaction ID that @@ -1187,8 +1198,6 @@ __rec_append_orig_value(WT_SESSION_IMPL *session, * * Append the new entry to the update list. */ - for (upd = first_upd; upd->next != NULL; upd = upd->next) - ; WT_PUBLISH(upd->next, append); __wt_cache_page_inmem_incr(session, page, size); @@ -1206,15 +1215,22 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { WT_PAGE *page; - WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd; + WT_UPDATE *first_txn_upd, *first_upd, *upd; wt_timestamp_t *timestampp; + size_t upd_memsize; uint64_t max_txn, txnid; bool all_visible, uncommitted; +#ifdef HAVE_TIMESTAMPS + WT_UPDATE *first_ts_upd; + first_ts_upd = NULL; +#endif + *updp = NULL; page = r->page; - first_ts_upd = first_txn_upd = NULL; + first_txn_upd = NULL; + upd_memsize = 0; max_txn = WT_TXN_NONE; uncommitted = false; @@ -1253,6 +1269,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (WT_TXNID_LE(r->last_running, txnid)) uncommitted = r->update_uncommitted = true; + upd_memsize += WT_UPDATE_MEMSIZE(upd); + /* * Find the first update we can use. * @@ -1340,7 +1358,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, #ifdef HAVE_TIMESTAMPS timestampp = first_ts_upd == NULL ? NULL : &first_ts_upd->timestamp; #else - WT_UNUSED(first_ts_upd); timestampp = NULL; #endif all_visible = *updp == first_txn_upd && @@ -1388,12 +1405,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * The order of the updates on the list matters, we can't move only the * unresolved updates, move the entire update list. */ - WT_RET(__rec_update_save(session, r, ins, ripcip, *updp)); + WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize)); #ifdef HAVE_TIMESTAMPS /* Track the oldest saved timestamp for lookaside. */ - if (F_ISSET(r, WT_REC_LOOKASIDE)) - for (upd = first_upd; upd->next != NULL; upd = upd->next) + if (first_ts_upd == NULL) + __wt_timestamp_set_zero(&r->min_saved_timestamp); + else if (F_ISSET(r, WT_REC_LOOKASIDE)) + for (upd = first_upd; upd != NULL; upd = upd->next) if (upd->txnid != WT_TXN_ABORTED && upd->txnid != WT_TXN_NONE && __wt_timestamp_cmp( @@ -1410,9 +1429,8 @@ check_original_value: * image is rewritten), or any reconciliation of a backing overflow * record that will be physically removed once it's no longer needed. */ - if (*updp != NULL && - (F_ISSET(r, WT_REC_LOOKASIDE) || - (*updp != NULL && vpack != NULL && + if (*updp != NULL && (F_ISSET(r, WT_REC_LOOKASIDE) || + (vpack != NULL && vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM))) WT_RET( __rec_append_orig_value(session, page, first_upd, vpack)); @@ -1994,25 +2012,32 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (page_size * 2); } -#define WT_REC_MAX_SAVED_UPDATES 100 - /* * __rec_need_split -- * Check whether adding some bytes to the page requires a split. - * - * This takes into account the disk image growing across a boundary, and - * also triggers a split for row store leaf pages when a threshold number - * of saved updates is reached. This allows pages to split for update / - * restore and lookaside eviction when there is no visible data that - * causes the disk image to grow. */ static bool __rec_need_split(WT_RECONCILE *r, size_t len) { - if (r->page->type == WT_PAGE_ROW_LEAF && - r->supd_next >= WT_REC_MAX_SAVED_UPDATES) - return (true); - + /* + * In the case of a row-store leaf page, trigger a split if a threshold + * number of saved updates is reached. This allows pages to split for + * update/restore and lookaside eviction when there is no visible data + * causing the disk image to grow. + * + * In the case of small pages or large keys, we might try to split when + * a page has no updates or entries, which isn't possible. To consider + * update/restore or lookaside information, require either page entries + * or updates that will be attached to the image. The limit is one of + * either, but it doesn't make sense to create pages or images with few + * entries or updates, even where page sizes are small (especially as + * updates that will eventually become overflow items can throw off our + * calculations). Bound the combination at something reasonable. + */ + if (r->page->type == WT_PAGE_ROW_LEAF && r->entries + r->supd_next > 10) + len += r->supd_memsize; + + /* Check for the disk image crossing a boundary. */ return (r->raw_compression ? len > r->space_avail : WT_CHECK_CROSSING_BND(r, len)); } @@ -2619,11 +2644,11 @@ __rec_split_crossing_bnd( } /* - * __rec_split_raw_worker -- - * Handle the raw compression page reconciliation bookkeeping. + * __rec_split_raw -- + * Raw compression. */ static int -__rec_split_raw_worker(WT_SESSION_IMPL *session, +__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len, bool no_more_rows) { WT_BM *bm; @@ -3003,16 +3028,6 @@ split_grow: /* } /* - * __rec_split_raw -- - * Raw compression split routine. - */ -static inline int -__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) -{ - return (__rec_split_raw_worker(session, r, next_len, false)); -} - -/* * __rec_split_finish_process_prev -- * If the two split chunks together fit in a single page, merge them into * one. If they do not fit in a single page but the last is smaller than @@ -3129,7 +3144,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) WT_PTRDIFF(r->first_free, r->cur_ptr->image.mem); if (data_size <= btree->allocsize) break; - WT_RET(__rec_split_raw_worker(session, r, 0, true)); + WT_RET(__rec_split_raw(session, r, 0, true)); } if (r->entries == 0) return (0); @@ -3193,6 +3208,7 @@ __rec_split_write_supd(WT_SESSION_IMPL *session, WT_DECL_RET; WT_PAGE *page; WT_SAVE_UPD *supd; + WT_UPDATE *upd; uint32_t i, j; int cmp; @@ -3208,6 +3224,7 @@ __rec_split_write_supd(WT_SESSION_IMPL *session, if (last_block) { WT_RET(__rec_supd_move(session, multi, r->supd, r->supd_next)); r->supd_next = 0; + r->supd_memsize = 0; return (0); } @@ -3252,8 +3269,19 @@ __rec_split_write_supd(WT_SESSION_IMPL *session, * saved updates in sorted order, new saved updates must be * appended to the list). */ - for (j = 0; i < r->supd_next; ++j, ++i) + r->supd_memsize = 0; + for (j = 0; i < r->supd_next; ++j, ++i) { + /* Account for the remaining update memory. */ + if (r->supd[i].ins == NULL) + upd = page->modify->mod_row_update[ + page->type == WT_PAGE_ROW_LEAF ? + WT_ROW_SLOT(page, r->supd[i].ripcip) : + WT_COL_SLOT(page, r->supd[i].ripcip)]; + else + upd = r->supd[i].ins->upd; + r->supd_memsize += __wt_update_list_memsize(upd); r->supd[j] = r->supd[i]; + } r->supd_next = j; } @@ -3296,10 +3324,8 @@ __rec_split_write_header(WT_SESSION_IMPL *session, * and we found updates that weren't globally visible when reconciling * this page. */ - if (F_ISSET(r, WT_REC_LOOKASIDE) && multi->supd != NULL) { + if (F_ISSET(r, WT_REC_LOOKASIDE) && multi->supd != NULL) F_SET(dsk, WT_PAGE_LAS_UPDATE); - r->cache_write_lookaside = true; - } dsk->unused[0] = dsk->unused[1] = 0; @@ -3492,28 +3518,40 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, goto copy_image; /* - * If there are saved updates, we are either doing update/restore - * eviction or lookaside eviction. Update/restore never writes the - * disk image. - * - * Lookaside does write disk images, but also needs to cope with the - * case where no updates could be written, which means there are no - * entries in the page image to write. + * If there are saved updates, either doing update/restore eviction or + * lookaside eviction. */ - if (multi->supd != NULL && - (F_ISSET(r, WT_REC_UPDATE_RESTORE) || chunk->entries == 0)) { + if (multi->supd != NULL) { /* + * XXX * If no entries were used, the page is empty and we can only - * restore updates against an empty row store leaf page. - * (Column store modify will attempt to allocate a zero-length - * array). + * restore eviction/restore or lookaside updates against + * empty row-store leaf pages, column-store modify attempts to + * allocate a zero-length array. */ - if (r->page->type != WT_PAGE_ROW_LEAF && - chunk->entries == 0 && multi->supd != NULL) + if (r->page->type != WT_PAGE_ROW_LEAF && chunk->entries == 0) return (EBUSY); - r->cache_write_restore = true; - goto update_las; + if (F_ISSET(r, WT_REC_LOOKASIDE)) { + r->cache_write_lookaside = true; + + /* + * Lookaside eviction writes disk images, but if no + * entries were used, there's no disk image to write. + * There's no more work to do in this case, lookaside + * eviction doesn't copy disk images. + */ + if (chunk->entries == 0) + return (0); + } else { + r->cache_write_restore = true; + + /* + * Update/restore never writes a disk image, but always + * copies a disk image. + */ + goto copy_image; + } } /* @@ -3525,6 +3563,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, last_block)) goto copy_image; + /* Write the disk image and get an address. */ WT_RET(__wt_bt_write(session, compressed_image == NULL ? &chunk->image : compressed_image, addr, &addr_size, false, F_ISSET(r, WT_REC_CHECKPOINT), @@ -3535,15 +3574,6 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_RET(__wt_memdup(session, addr, addr_size, &multi->addr.addr)); multi->addr.size = (uint8_t)addr_size; -update_las: - /* - * If using the lookaside table eviction path and we found updates that - * weren't globally visible when reconciling this page, copy them into - * the database's lookaside store. - */ - if (F_ISSET(r, WT_REC_LOOKASIDE) && multi->supd != NULL) - WT_RET(__rec_update_las(session, r, btree->id, multi)); - copy_image: #ifdef HAVE_DIAGNOSTIC /* @@ -3569,150 +3599,6 @@ copy_image: } /* - * __rec_update_las -- - * Copy a set of updates into the database's lookaside buffer. - */ -static int -__rec_update_las(WT_SESSION_IMPL *session, - WT_RECONCILE *r, uint32_t btree_id, WT_MULTI *multi) -{ - WT_CURSOR *cursor; - WT_DECL_ITEM(key); - WT_DECL_RET; - WT_ITEM las_timestamp, las_value; - WT_PAGE *page; - WT_SAVE_UPD *list; - WT_UPDATE *upd; - uint64_t insert_cnt, las_counter, las_pageid; - uint32_t i, session_flags, slot; - uint8_t *p; - - cursor = NULL; - WT_CLEAR(las_timestamp); - WT_CLEAR(las_value); - page = r->page; - insert_cnt = las_pageid = 0; - - __wt_las_cursor(session, &cursor, &session_flags); - - /* Ensure enough room for a column-store key without checking. */ - WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); - - /* - * Each key in the lookaside table is associated with a unique - * identifier, allocated sequentially per tree. - */ - las_pageid = multi->las_pageid = - __wt_atomic_add64(&S2BT(session)->las_pageid, 1); - - /* The zero page ID is reserved, check we don't see it. */ - WT_ASSERT(session, las_pageid != 0); - - /* - * Make sure there are no left over entries (e.g., from a handle - * reopen). - */ - WT_ERR(__wt_las_remove_block(session, cursor, btree_id, las_pageid)); - - /* Enter each update in the boundary's list into the lookaside store. */ - for (las_counter = 0, i = 0, - list = multi->supd; i < multi->supd_entries; ++i, ++list) { - /* Lookaside table key component: source key. */ - switch (page->type) { - case WT_PAGE_COL_FIX: - case WT_PAGE_COL_VAR: - p = key->mem; - WT_ERR( - __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); - key->size = WT_PTRDIFF(p, key->data); - break; - case WT_PAGE_ROW_LEAF: - if (list->ins == NULL) - WT_ERR(__wt_row_leaf_key( - session, page, list->ripcip, key, false)); - else { - key->data = WT_INSERT_KEY(list->ins); - key->size = WT_INSERT_KEY_SIZE(list->ins); - } - break; - WT_ILLEGAL_VALUE_ERR(session); - } - - /* - * Lookaside table value component: update reference. Updates - * come from the row-store insert list (an inserted item), or - * update array (an update to an original on-page item), or from - * a column-store insert list (column-store format has no update - * array, the insert list contains both inserted items and - * updates to original on-page items). When rolling forward a - * modify update from an original on-page item, we need an - * on-page slot so we can find the original on-page item. When - * rolling forward from an inserted item, no on-page slot is - * possible. - */ - slot = UINT32_MAX; /* Impossible slot */ - if (list->ripcip != NULL) - slot = page->type == WT_PAGE_ROW_LEAF ? - WT_ROW_SLOT(page, list->ripcip) : - WT_COL_SLOT(page, list->ripcip); - upd = list->ins == NULL ? - page->modify->mod_row_update[slot] : list->ins->upd; - - /* - * Walk the list of updates, storing each key/value pair into - * the lookaside table. Skip aborted items (there's no point - * to restoring them), and assert we never see a reserved item. - */ - do { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - switch (upd->type) { - case WT_UPDATE_DELETED: - las_value.size = 0; - break; - case WT_UPDATE_MODIFIED: - case WT_UPDATE_STANDARD: - las_value.data = upd->data; - las_value.size = upd->size; - break; - case WT_UPDATE_RESERVED: - WT_ASSERT(session, - upd->type != WT_UPDATE_RESERVED); - continue; - } - - cursor->set_key(cursor, - btree_id, las_pageid, ++las_counter, key); - -#ifdef HAVE_TIMESTAMPS - las_timestamp.data = &upd->timestamp; - las_timestamp.size = WT_TIMESTAMP_SIZE; -#endif - cursor->set_value(cursor, - upd->txnid, &las_timestamp, upd->type, &las_value); - - WT_ERR(cursor->insert(cursor)); - ++insert_cnt; - } while ((upd = upd->next) != NULL); - } - - __wt_free(session, multi->supd); - multi->supd_entries = 0; - -err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); - - if (insert_cnt > 0) { - WT_STAT_CONN_INCRV( - session, cache_lookaside_entries, insert_cnt); - __rec_verbose_lookaside_write(session, btree_id, las_pageid); - } - - __wt_scr_free(session, &key); - return (ret); -} - -/* * __wt_bulk_init -- * Bulk insert initialization. */ @@ -3825,7 +3711,7 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) if (r->raw_compression) { if (key->len + val->len > r->space_avail) WT_RET(__rec_split_raw( - session, r, key->len + val->len)); + session, r, key->len + val->len, false)); } else if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) { /* @@ -3992,7 +3878,7 @@ __wt_bulk_insert_var( /* Boundary: split or write the page. */ if (r->raw_compression) { if (val->len > r->space_avail) - WT_RET(__rec_split_raw(session, r, val->len)); + WT_RET(__rec_split_raw(session, r, val->len, false)); } else if (WT_CROSSING_SPLIT_BND(r, val->len)) WT_RET(__rec_split_crossing_bnd(session, r, val->len)); @@ -4133,7 +4019,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) /* Boundary: split or write the page. */ if (__rec_need_split(r, val->len)) { if (r->raw_compression) - WT_ERR(__rec_split_raw(session, r, val->len)); + WT_ERR(__rec_split_raw( + session, r, val->len, false)); else WT_ERR(__rec_split_crossing_bnd( session, r, val->len)); @@ -4181,7 +4068,8 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Boundary: split or write the page. */ if (__rec_need_split(r, val->len)) { if (r->raw_compression) - WT_RET(__rec_split_raw(session, r, val->len)); + WT_RET(__rec_split_raw( + session, r, val->len, false)); else WT_RET(__rec_split_crossing_bnd( session, r, val->len)); @@ -4454,7 +4342,7 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, /* Boundary: split or write the page. */ if (__rec_need_split(r, val->len)) { if (r->raw_compression) - WT_RET(__rec_split_raw(session, r, val->len)); + WT_RET(__rec_split_raw(session, r, val->len, false)); else WT_RET(__rec_split_crossing_bnd(session, r, val->len)); } @@ -5156,7 +5044,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) if (__rec_need_split(r, key->len + val->len)) { if (r->raw_compression) WT_ERR(__rec_split_raw( - session, r, key->len + val->len)); + session, r, key->len + val->len, false)); else { /* * In one path above, we copied address blocks @@ -5226,7 +5114,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) if (__rec_need_split(r, key->len + val->len)) { if (r->raw_compression) WT_RET(__rec_split_raw( - session, r, key->len + val->len)); + session, r, key->len + val->len, false)); else WT_RET(__rec_split_crossing_bnd( session, r, key->len + val->len)); @@ -5573,7 +5461,7 @@ build: if (__rec_need_split(r, key->len + val->len)) { if (r->raw_compression) WT_ERR(__rec_split_raw( - session, r, key->len + val->len)); + session, r, key->len + val->len, false)); else { /* * If we copied address blocks from the page @@ -5656,59 +5544,34 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) { WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL) { - /* - * Look for an update. If nothing is visible and not in - * evict/restore, there's no work to do. - */ - if (!F_ISSET(r, WT_REC_UPDATE_RESTORE)) - continue; + /* If no updates are visible there's no work to do. */ + if (upd == NULL) + continue; + switch (upd->type) { + case WT_UPDATE_DELETED: + continue; + case WT_UPDATE_MODIFIED: /* - * When doing evict/restore, move the insert key to the - * page, with an empty value (this allows us to split - * the page if there's a huge, pinned insert list). The - * on-page key must never be read, make sure there is a - * globally visible update in the chain. - * - * __rec_txn_read also returns a NULL update when all of - * the updates were aborted, without saving the update - * list to the evict/restore array, so we can't append - * a delete update. Ugly, but the alternative is another - * parameter to __rec_txn_read. + * Impossible slot, there's no backing on-page + * item. */ - if (r->supd_next == 0 || - r->supd[r->supd_next - 1].ins != ins) - continue; - - WT_RET(__rec_append_orig_value( - session, r->page, ins->upd, NULL)); - val->len = 0; - } else - switch (upd->type) { - case WT_UPDATE_DELETED: - continue; - case WT_UPDATE_MODIFIED: - /* - * Impossible slot, there's no backing on-page - * item. - */ - cbt->slot = UINT32_MAX; - WT_RET(__wt_value_return(session, cbt, upd)); - WT_RET(__rec_cell_build_val(session, r, - cbt->iface.value.data, - cbt->iface.value.size, (uint64_t)0)); - break; - case WT_UPDATE_STANDARD: - if (upd->size == 0) - val->len = 0; - else - WT_RET(__rec_cell_build_val(session, - r, upd->data, upd->size, - (uint64_t)0)); - break; - WT_ILLEGAL_VALUE(session); - } + cbt->slot = UINT32_MAX; + WT_RET(__wt_value_return(session, cbt, upd)); + WT_RET(__rec_cell_build_val(session, r, + cbt->iface.value.data, + cbt->iface.value.size, (uint64_t)0)); + break; + case WT_UPDATE_STANDARD: + if (upd->size == 0) + val->len = 0; + else + WT_RET(__rec_cell_build_val(session, + r, upd->data, upd->size, + (uint64_t)0)); + break; + WT_ILLEGAL_VALUE(session); + } /* Build key cell. */ WT_RET(__rec_cell_build_leaf_key(session, r, WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); @@ -5717,7 +5580,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) if (__rec_need_split(r, key->len + val->len)) { if (r->raw_compression) WT_RET(__rec_split_raw( - session, r, key->len + val->len)); + session, r, key->len + val->len, false)); else { /* * Turn off prefix compression until a full key @@ -5923,6 +5786,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) mod->rec_result = 0; /* + * If using the lookaside table eviction path and we found updates that + * weren't globally visible when reconciling this page, copy them into + * the database's lookaside store. + */ + if (F_ISSET(r, WT_REC_LOOKASIDE)) + WT_RET(__rec_las_wrapup(session, r)); + + /* * Wrap up overflow tracking. If we are about to create a checkpoint, * the system must be entirely consistent at that point (the underlying * block manager is presumably going to do some action to resolve the @@ -5932,8 +5803,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__wt_ovfl_track_wrapup(session, page)); __wt_verbose(session, WT_VERB_RECONCILE, - "%p reconciled into %" PRIu32 " pages", - (void *)ref, r->multi_next); + "%p reconciled into %" PRIu32 " pages", (void *)ref, r->multi_next); switch (r->multi_next) { case 0: /* Page delete */ @@ -5988,7 +5858,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __wt_timestamp_set(&mod->mod_replace_las_min_timestamp, &r->min_saved_timestamp); #endif - r->multi->las_pageid = 0; } else WT_RET(__wt_bt_write(session, r->wrapup_checkpoint, NULL, NULL, true, F_ISSET(r, WT_REC_CHECKPOINT), @@ -6066,7 +5935,77 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) multi->addr.addr, multi->addr.size)); } + /* + * If using the lookaside table eviction path and we found updates that + * weren't globally visible when reconciling this page, we might have + * already copied them into the database's lookaside store. Remove them. + */ + if (F_ISSET(r, WT_REC_LOOKASIDE)) + WT_TRET(__rec_las_wrapup_err(session, r)); + WT_TRET(__wt_ovfl_track_wrapup_err(session, page)); + + return (ret); +} + +/* + * __rec_las_wrapup -- + * Copy all of the saved updates into the database's lookaside buffer. + */ +static int +__rec_las_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_CURSOR *cursor; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_MULTI *multi; + uint32_t i, session_flags; + + /* Check if there's work to do. */ + for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) + if (multi->supd != NULL) + break; + if (i == r->multi_next) + return (0); + + /* Ensure enough room for a column-store key without checking. */ + WT_RET(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); + + __wt_las_cursor(session, &cursor, &session_flags); + + for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) + if (multi->supd != NULL) + WT_ERR(__wt_las_insert_block( + session, r->page, cursor, multi, key)); + +err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + __wt_scr_free(session, &key); + return (ret); +} + +/* + * __rec_las_wrapup_err -- + * Discard any saved updates from the database's lookaside buffer. + */ +static int +__rec_las_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_DECL_RET; + WT_MULTI *multi; + uint32_t btree_id, i; + + btree_id = S2BT(session)->id; + + /* + * Note the additional check for a non-zero lookaside page ID, that + * flags if lookaside table entries for this page have been written. + */ + for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) + if (multi->supd != NULL && multi->las_pageid != 0) + WT_TRET(__wt_las_remove_block( + session, NULL, btree_id, multi->las_pageid)); + return (ret); } @@ -6552,57 +6491,3 @@ __rec_dictionary_lookup( *dpp = next; return (0); } - -/* - * __rec_verbose_lookaside_write -- - * Create a verbose message to display once per checkpoint with details - * about the cache state when performing a lookaside table write. - */ -static void -__rec_verbose_lookaside_write( - WT_SESSION_IMPL *session, uint32_t las_id, uint64_t las_pageid) -{ -#ifdef HAVE_VERBOSE - WT_CONNECTION_IMPL *conn; - uint64_t ckpt_gen_current, ckpt_gen_last; - uint32_t pct_dirty, pct_full; - - if (!WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE)) return; - - conn = S2C(session); - ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT); - ckpt_gen_last = conn->las_verb_gen_write; - - /* - * This message is throttled to one per checkpoint. To do this we - * track the generation of the last checkpoint for which the message - * was printed and check against the current checkpoint generation. - */ - if (ckpt_gen_current > ckpt_gen_last) { - /* - * Attempt to atomically replace the last checkpoint generation - * for which this message was printed. If the atomic swap fails - * we have raced and the winning thread will print the message. - */ - if (__wt_atomic_casv64(&conn->las_verb_gen_write, - ckpt_gen_last, ckpt_gen_current)) { - (void)__wt_eviction_clean_needed(session, &pct_full); - (void)__wt_eviction_dirty_needed(session, &pct_dirty); - - __wt_verbose(session, WT_VERB_LOOKASIDE, - "Page reconciliation triggered lookaside write" - "file ID %" PRIu32 ", page ID %" PRIu64 ". " - "Entries now in lookaside file: %" PRId64 ", " - "cache dirty: %" PRIu32 "%% , " - "cache use: %" PRIu32 "%%", - las_id, las_pageid, - WT_STAT_READ(conn->stats, cache_lookaside_entries), - pct_dirty, pct_full); - } - } -#else - WT_UNUSED(session); - WT_UNUSED(las_id); - WT_UNUSED(las_pageid); -#endif -} diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c index a284b8a2229..6ccf3161229 100644 --- a/src/third_party/wiredtiger/src/session/session_compact.c +++ b/src/third_party/wiredtiger/src/session/session_compact.c @@ -250,7 +250,7 @@ __compact_worker(WT_SESSION_IMPL *session) { WT_DECL_RET; u_int i, loop; - bool didwork; + bool another_pass; /* * Reset the handles' compaction skip flag (we don't bother setting @@ -274,7 +274,8 @@ __compact_worker(WT_SESSION_IMPL *session) */ for (loop = 0; loop < 100; ++loop) { /* Step through the list of files being compacted. */ - for (didwork = false, i = 0; i < session->op_handle_next; ++i) { + for (another_pass = false, + i = 0; i < session->op_handle_next; ++i) { /* Skip objects where there's no more work. */ if (session->op_handle[i]->compact_skip) continue; @@ -282,15 +283,43 @@ __compact_worker(WT_SESSION_IMPL *session) session->compact_state = WT_COMPACT_RUNNING; WT_WITH_DHANDLE(session, session->op_handle[i], ret = __wt_compact(session)); - WT_ERR(ret); - /* If we did no work, skip this file in the future. */ - if (session->compact_state == WT_COMPACT_SUCCESS) - didwork = true; - else - session->op_handle[i]->compact_skip = true; + /* + * If successful and we did work, schedule another pass. + * If successful and we did no work, skip this file in + * the future. + */ + if (ret == 0) { + if (session-> + compact_state == WT_COMPACT_SUCCESS) + another_pass = true; + else + session-> + op_handle[i]->compact_skip = true; + continue; + } + + /* + * If compaction failed because checkpoint was running, + * continue with the next handle. We might continue to + * race with checkpoint on each handle, but that's OK, + * we'll step through all the handles, and then we'll + * block until a checkpoint completes. + * + * Just quit if eviction is the problem. + */ + if (ret == EBUSY) { + if (__wt_cache_stuck(session)) { + WT_ERR_MSG(session, EBUSY, + "compaction halted by eviction " + "pressure"); + } + ret = 0; + another_pass = true; + } + WT_ERR(ret); } - if (!didwork) + if (!another_pass) break; /* @@ -320,10 +349,25 @@ __wt_session_compact( WT_DECL_RET; WT_SESSION_IMPL *session; u_int i; + bool no_eviction_set; + + no_eviction_set = false; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, compact, config, cfg); + /* + * Don't highjack the compaction thread for eviction; it's holding locks + * blocking checkpoints and once an application is tapped for eviction, + * it can spend a long time doing nothing else. (And, if we're tapping + * application threads for eviction, compaction should quit, it's not + * making anything better.) + */ + if (!F_ISSET(session, WT_SESSION_NO_EVICTION)) { + no_eviction_set = true; + F_SET(session, WT_SESSION_NO_EVICTION); + } + /* In-memory ignores compaction operations. */ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) goto err; @@ -393,6 +437,9 @@ err: session->compact = NULL; */ WT_TRET(__wt_session_release_resources(session)); + if (no_eviction_set) + F_CLR(session, WT_SESSION_NO_EVICTION); + if (ret != 0) WT_STAT_CONN_INCR(session, session_table_compact_fail); else diff --git a/src/third_party/wiredtiger/src/support/pow.c b/src/third_party/wiredtiger/src/support/pow.c index cd770a514b2..2fb193afca8 100644 --- a/src/third_party/wiredtiger/src/support/pow.c +++ b/src/third_party/wiredtiger/src/support/pow.c @@ -89,8 +89,9 @@ __wt_nlpo2(uint32_t v) uint32_t __wt_log2_int(uint32_t n) { - uint32_t l = 0; + uint32_t l; + l = 0; while (n >>= 1) l++; return (l); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 3215a372d36..cfdb7d26498 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -15,14 +15,16 @@ static uint32_t __snapsort_partition(uint64_t *array, uint32_t f, uint32_t l, uint64_t pivot) { - uint32_t i = f - 1, j = l + 1; + uint32_t i, j; + i = f - 1; + j = l + 1; for (;;) { while (pivot < array[--j]) ; while (array[++i] < pivot) ; - if (i<j) { + if (i < j) { uint64_t tmp = array[i]; array[i] = array[j]; array[j] = tmp; diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 2137d5b16ef..afb3cba1db6 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -519,7 +519,7 @@ __checkpoint_stats( conn = S2C(session); /* - * Get time diff in microseconds. + * Get time diff in milliseconds. */ msec = WT_TIMEDIFF_MS(*stop, *start); @@ -549,11 +549,11 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, __wt_epoch(session, &stop); /* - * Get time diff in microseconds. + * Get time diff in milliseconds. */ msec = WT_TIMEDIFF_MS(stop, *start); __wt_verbose(session, - WT_VERB_CHECKPOINT, "time: %" PRIu64 " us, gen: %" PRIu64 + WT_VERB_CHECKPOINT, "time: %" PRIu64 " ms, gen: %" PRIu64 ": Full database checkpoint %s", msec, __wt_gen(session, WT_GEN_CHECKPOINT), msg); diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index c627335283d..bd1b2239e2d 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -187,10 +187,13 @@ __txn_logrec_init(WT_SESSION_IMPL *session) WT_DECL_RET; WT_TXN *txn; size_t header_size; - uint32_t rectype = WT_LOGREC_COMMIT; - const char *fmt = WT_UNCHECKED_STRING(Iq); + uint32_t rectype; + const char *fmt; txn = &session->txn; + rectype = WT_LOGREC_COMMIT; + fmt = WT_UNCHECKED_STRING(Iq); + if (txn->logrec != NULL) return (0); @@ -295,13 +298,14 @@ __txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp) WT_DECL_ITEM(logrec); WT_DECL_RET; size_t header_size; - uint32_t rectype = WT_LOGREC_FILE_SYNC; - int start; - const char *fmt = WT_UNCHECKED_STRING(III); + uint32_t rectype, start; + const char *fmt; bool need_sync; btree = S2BT(session); - start = LF_ISSET(WT_TXN_LOG_CKPT_START); + rectype = WT_LOGREC_FILE_SYNC; + start = LF_ISSET(WT_TXN_LOG_CKPT_START) ? 1 : 0; + fmt = WT_UNCHECKED_STRING(III); need_sync = LF_ISSET(WT_TXN_LOG_CKPT_SYNC); WT_RET(__wt_struct_size( @@ -331,7 +335,9 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session, WT_ITEM ckpt_snapshot_unused; uint32_t ckpt_file, ckpt_offset; u_int ckpt_nsnapshot_unused; - const char *fmt = WT_UNCHECKED_STRING(IIIu); + const char *fmt; + + fmt = WT_UNCHECKED_STRING(IIIu); if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, &ckpt_file, &ckpt_offset, diff --git a/src/third_party/wiredtiger/src/utilities/util_backup.c b/src/third_party/wiredtiger/src/utilities/util_backup.c index 7d809c2a624..bf4d7b67671 100644 --- a/src/third_party/wiredtiger/src/utilities/util_backup.c +++ b/src/third_party/wiredtiger/src/utilities/util_backup.c @@ -18,9 +18,9 @@ static int usage(void); static int append_target(WT_SESSION *session, const char *target, char **bufp) { - static bool first = true; static size_t len = 0, remain = 0; static char *buf = NULL; + static bool first = true; /* 20 bytes of slop */ if (buf == NULL || remain < strlen(target) + 20) { diff --git a/src/third_party/wiredtiger/test/format/compact.c b/src/third_party/wiredtiger/test/format/compact.c index 8a558d2b35b..c1a73bea64b 100644 --- a/src/third_party/wiredtiger/test/format/compact.c +++ b/src/third_party/wiredtiger/test/format/compact.c @@ -64,11 +64,11 @@ compact(void *arg) break; /* - * Compact can return EBUSY if concurrent with alter. + * Compact can return EBUSY if concurrent with alter or if there + * is eviction pressure, or we collide with checkpoints. */ - while ((ret = session->compact(session, g.uri, NULL)) == EBUSY) - __wt_yield(); - if (ret != 0 && ret != WT_ROLLBACK) + ret = session->compact(session, g.uri, NULL); + if (ret != 0 && ret != EBUSY && ret != WT_ROLLBACK) testutil_die(ret, "session.compact"); } diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index df5dc3e5378..049a655cb79 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -29,6 +29,7 @@ #include "format.h" #include "config.h" +static void config_checkpoint(void); static void config_checksum(void); static void config_compression(const char *); static void config_encryption(void); @@ -39,6 +40,7 @@ static void config_in_memory_reset(void); static int config_is_perm(const char *); static void config_isolation(void); static void config_lrt(void); +static void config_map_checkpoint(const char *, u_int *); static void config_map_checksum(const char *, u_int *); static void config_map_compression(const char *, u_int *); static void config_map_encryption(const char *, u_int *); @@ -159,6 +161,7 @@ config_setup(void) if (!g.replay && g.run_cnt % 20 == 19 && !config_is_perm("threads")) g.c_threads = 1; + config_checkpoint(); config_checksum(); config_compression("compression"); config_compression("logging_compression"); @@ -234,6 +237,28 @@ config_setup(void) } /* + * config_checkpoint -- + * Checkpoint configuration. + */ +static void +config_checkpoint(void) +{ + /* Choose a checkpoint mode if nothing was specified. */ + if (!config_is_perm("checkpoints")) + switch (mmrand(NULL, 1, 20)) { + case 1: case 2: case 3: case 4: /* 20% */ + config_single("checkpoints=wiredtiger", 0); + break; + case 5: /* 5 % */ + config_single("checkpoints=off", 0); + break; + default: /* 75% */ + config_single("checkpoints=on", 0); + break; + } +} + +/* * config_checksum -- * Checksum configuration. */ @@ -823,7 +848,10 @@ config_single(const char *s, int perm) *cp->vstr = NULL; } - if (strncmp(s, "checksum", strlen("checksum")) == 0) { + if (strncmp(s, "checkpoints", strlen("checkpoints")) == 0) { + config_map_checkpoint(ep, &g.c_checkpoint_flag); + *cp->vstr = dstrdup(ep); + } else if (strncmp(s, "checksum", strlen("checksum")) == 0) { config_map_checksum(ep, &g.c_checksum_flag); *cp->vstr = dstrdup(ep); } else if (strncmp( @@ -834,12 +862,12 @@ config_single(const char *s, int perm) s, "encryption", strlen("encryption")) == 0) { config_map_encryption(ep, &g.c_encryption_flag); *cp->vstr = dstrdup(ep); - } else if (strncmp(s, "isolation", strlen("isolation")) == 0) { - config_map_isolation(ep, &g.c_isolation_flag); - *cp->vstr = dstrdup(ep); } else if (strncmp(s, "file_type", strlen("file_type")) == 0) { config_map_file_type(ep, &g.type); *cp->vstr = dstrdup(config_file_type(g.type)); + } else if (strncmp(s, "isolation", strlen("isolation")) == 0) { + config_map_isolation(ep, &g.c_isolation_flag); + *cp->vstr = dstrdup(ep); } else if (strncmp(s, "logging_compression", strlen("logging_compression")) == 0) { config_map_compression(ep, @@ -905,6 +933,24 @@ config_map_file_type(const char *s, u_int *vp) } /* + * config_map_checkpoint -- + * Map a checkpoint configuration to a flag. + */ +static void +config_map_checkpoint(const char *s, u_int *vp) +{ + /* Checkpoint configuration used to be 1/0, let it continue to work. */ + if (strcmp(s, "on") == 0 || strcmp(s, "1") == 0) + *vp = CHECKPOINT_ON; + else if (strcmp(s, "off") == 0 || strcmp(s, "0") == 0) + *vp = CHECKPOINT_OFF; + else if (strcmp(s, "wiredtiger") == 0) + *vp = CHECKPOINT_WIREDTIGER; + else + testutil_die(EINVAL, "illegal checkpoint configuration: %s", s); +} + +/* * config_map_checksum -- * Map a checksum configuration to a flag. */ diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index 7ac71d7877b..6fb4071074d 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -102,8 +102,16 @@ static CONFIG c[] = { 0x0, 1, 100, 100 * 1024, &g.c_cache, NULL }, { "checkpoints", - "if periodic checkpoints are done", /* 95% */ - C_BOOL, 95, 0, 0, &g.c_checkpoints, NULL }, + "type of checkpoints (on | off | wiredtiger)", + C_IGNORE|C_STRING, 0, 0, 0, NULL, &g.c_checkpoint}, + + { "checkpoint_log_size", + "MB of log to wait if wiredtiger checkpoints configured", + 0x0, 20, 200, 1024, &g.c_checkpoint_log_size, NULL}, + + { "checkpoint_wait", + "seconds to wait if wiredtiger checkpoints configured", + 0x0, 5, 100, 3600, &g.c_checkpoint_wait, NULL}, { "checksum", "type of checksums (on | off | uncompressed)", @@ -222,6 +230,10 @@ static CONFIG c[] = { "type of logging compression " COMPRESSION_LIST, C_IGNORE|C_STRING, 0, 0, 0, NULL, &g.c_logging_compression }, + { "logging_file_max", + "maximum log file size in KB", + 0x0, 100, 512000, 2097152, &g.c_logging_file_max, NULL }, + { "logging_prealloc", "if log file pre-allocation configured", /* 50% */ C_BOOL, 50, 0, 0, &g.c_logging_prealloc, NULL }, diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index f35e71f58aa..96e1a0fe335 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -119,7 +119,6 @@ typedef struct { bool workers_finished; /* Operations completed */ pthread_rwlock_t backup_lock; /* Backup running */ - pthread_rwlock_t checkpoint_lock; /* Checkpoint running */ WT_RAND_STATE rnd; /* Global RNG state */ @@ -151,7 +150,9 @@ typedef struct { uint32_t c_bloom_hash_count; uint32_t c_bloom_oldest; uint32_t c_cache; - uint32_t c_checkpoints; + char *c_checkpoint; + uint32_t c_checkpoint_log_size; + uint32_t c_checkpoint_wait; char *c_checksum; uint32_t c_chunk_size; uint32_t c_compact; @@ -182,6 +183,7 @@ typedef struct { uint32_t c_logging; uint32_t c_logging_archive; char *c_logging_compression; + uint32_t c_logging_file_max; uint32_t c_logging_prealloc; uint32_t c_long_running_txn; uint32_t c_lsm_worker_threads; @@ -216,6 +218,11 @@ typedef struct { #define VAR 3 u_int type; /* File type's flag value */ +#define CHECKPOINT_OFF 1 +#define CHECKPOINT_ON 2 +#define CHECKPOINT_WIREDTIGER 3 + u_int c_checkpoint_flag; /* Checkpoint flag value */ + #define CHECKSUM_OFF 1 #define CHECKSUM_ON 2 #define CHECKSUM_UNCOMPRESSED 3 diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 4fed18d12b4..607dd43a8f3 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -175,7 +175,7 @@ wts_ops(int lastrun) if (g.c_backups) testutil_check( __wt_thread_create(NULL, &backup_tid, backup, NULL)); - if (g.c_checkpoints) + if (g.c_checkpoint_flag == CHECKPOINT_ON) testutil_check(__wt_thread_create( NULL, &checkpoint_tid, checkpoint, NULL)); if (g.c_compact) @@ -252,7 +252,7 @@ wts_ops(int lastrun) testutil_check(__wt_thread_join(NULL, alter_tid)); if (g.c_backups) testutil_check(__wt_thread_join(NULL, backup_tid)); - if (g.c_checkpoints) + if (g.c_checkpoint_flag == CHECKPOINT_ON) testutil_check(__wt_thread_join(NULL, checkpoint_tid)); if (g.c_compact) testutil_check(__wt_thread_join(NULL, compact_tid)); @@ -988,8 +988,8 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno) { static int sn = 0; WT_SESSION *session; - int exact, ret; uint8_t bitfield; + int exact, ret; session = cursor->session; diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c index 30493a41912..ddcd14cfd55 100644 --- a/src/third_party/wiredtiger/test/format/wts.c +++ b/src/third_party/wiredtiger/test/format/wts.c @@ -185,6 +185,12 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) if (DATASOURCE("lsm") || g.c_cache < 20) CONFIG_APPEND(p, ",eviction_dirty_trigger=95"); + /* Checkpoints. */ + if (g.c_checkpoint_flag == CHECKPOINT_WIREDTIGER) + CONFIG_APPEND(p, + ",checkpoint=(wait=%" PRIu32 ",log_size=%" PRIu32 ")", + g.c_checkpoint_wait, MEGABYTE(g.c_checkpoint_log_size)); + /* Eviction worker configuration. */ if (g.c_evict_max != 0) CONFIG_APPEND(p, @@ -193,12 +199,14 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) /* Logging configuration. */ if (g.c_logging) CONFIG_APPEND(p, - ",log=(enabled=true,archive=%d,prealloc=%d" - ",compressor=\"%s\")", + ",log=(enabled=true,archive=%d," + "prealloc=%d,file_max=%" PRIu32 ",compressor=\"%s\")", g.c_logging_archive ? 1 : 0, g.c_logging_prealloc ? 1 : 0, + KILOBYTE(g.c_logging_file_max), compressor(g.c_logging_compression_flag)); + /* Encryption. */ if (g.c_encryption) CONFIG_APPEND(p, ",encryption=(name=%s)", encryptor(g.c_encryption_flag)); diff --git a/src/third_party/wiredtiger/test/suite/test_las.py b/src/third_party/wiredtiger/test/suite/test_las.py index 52a0b2d7300..07938c6d80b 100644 --- a/src/third_party/wiredtiger/test/suite/test_las.py +++ b/src/third_party/wiredtiger/test/suite/test_las.py @@ -38,13 +38,13 @@ def timestamp_str(t): class test_las(wttest.WiredTigerTestCase): # Force a small cache. def conn_config(self): - return 'cache_size=1GB' + return 'cache_size=50MB' def large_updates(self, session, uri, value, ds, nrows, timestamp=False): # Insert a large number of records, we'll hang if the lookaside table # isn't doing its thing. cursor = session.open_cursor(uri) - for i in range(1, 1000000): + for i in range(1, 10000): if timestamp == True: session.begin_transaction() cursor.set_key(ds.key(nrows + i)) @@ -73,7 +73,6 @@ class test_las(wttest.WiredTigerTestCase): session.close() conn.close() - @wttest.longtest('lookaside table smoke test') def test_las(self): # Create a small table. uri = "table:test_las" @@ -84,7 +83,7 @@ class test_las(wttest.WiredTigerTestCase): # Initially load huge data cursor = self.session.open_cursor(uri) - for i in range(1, 1000000): + for i in range(1, 10000): cursor.set_key(ds.key(nrows + i)) cursor.set_value(bigvalue) self.assertEquals(cursor.insert(), 0) diff --git a/src/third_party/wiredtiger/test/utility/misc.c b/src/third_party/wiredtiger/test/utility/misc.c index 0d751cd0df8..9d8fa28d3d7 100644 --- a/src/third_party/wiredtiger/test/utility/misc.c +++ b/src/third_party/wiredtiger/test/utility/misc.c @@ -31,8 +31,8 @@ void (*custom_die)(void) = NULL; const char *progname = "program name not set"; /* - * die -- - * Report an error and quit. + * testutil_die -- + * Report an error and abort. */ void testutil_die(int e, const char *fmt, ...) @@ -53,8 +53,9 @@ testutil_die(int e, const char *fmt, ...) if (e != 0) fprintf(stderr, ": %s", wiredtiger_strerror(e)); fprintf(stderr, "\n"); + fprintf(stderr, "process aborting\n"); - exit(EXIT_FAILURE); + abort(); } /* |