diff options
68 files changed, 5987 insertions, 4266 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 6908a52f5e0..3e5f2806de0 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -430,6 +430,18 @@ connection_runtime_config = [ for space to be available in cache before giving up. Default will wait forever''', min=0), + Config('cache_overflow', '', r''' + cache overflow configuration options''', + type='category', subconfig=[ + Config('file_max', '0', r''' + The maximum number of bytes that WiredTiger is allowed to use for + its cache overflow mechanism. If the cache overflow file exceeds + this size, a panic will be triggered. The default value means that + the cache overflow file is unbounded and may use as much space as + the filesystem will accommodate. The minimum non-zero setting is + 100MB.''', # !!! Must match WT_LAS_FILE_MIN + min='0') + ]), Config('cache_overhead', '8', r''' assume the heap allocator overhead is the specified percentage, and adjust the cache usage by that amount (for example, if there is 10GB @@ -456,6 +468,31 @@ connection_runtime_config = [ above 0 configures periodic checkpoints''', min='0', max='100000'), ]), + Config('debug_mode', '', r''' + control the settings of various extended debugging features''', + type='category', subconfig=[ + Config('checkpoint_retention', '0', r''' + adjust log archiving to retain the log records of this number + of checkpoints. Zero or one means perform normal archiving.''', + min='0', max='1024'), + Config('eviction', 'false', r''' + if true, modify internal algorithms to change skew to force + lookaside eviction to happen more aggressively. 
This includes but + is not limited to not skewing newest, not favoring leaf pages, + and modifying the eviction score mechanism.''', + type='boolean'), + Config('rollback_error', '0', r''' + return a WT_ROLLBACK error from a transaction operation about + every Nth operation to simulate a collision''', + min='0', max='10M'), + Config('table_logging', 'false', r''' + if true, write transaction related information to the log for all + operations, even operations for tables with logging turned off. + This setting introduces a log format change that may break older + versions of WiredTiger. These operations are informational and + skipped in recovery.''', + type='boolean'), + ]), Config('error_prefix', '', r''' prefix string for error messages'''), Config('eviction', '', r''' diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 73fa6819e94..036b1a8b1a9 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -163,7 +163,12 @@ src/os_win/os_yield.c WINDOWS_HOST src/packing/pack_api.c src/packing/pack_impl.c src/packing/pack_stream.c +src/reconcile/rec_child.c +src/reconcile/rec_col.c +src/reconcile/rec_dictionary.c +src/reconcile/rec_row.c src/reconcile/rec_track.c +src/reconcile/rec_visibility.c src/reconcile/rec_write.c src/schema/schema_alter.c src/schema/schema_create.c diff --git a/src/third_party/wiredtiger/dist/log.py b/src/third_party/wiredtiger/dist/log.py index 4669b6bcc73..b3e6a71b63a 100644 --- a/src/third_party/wiredtiger/dist/log.py +++ b/src/third_party/wiredtiger/dist/log.py @@ -18,6 +18,9 @@ field_types = { 'WT_ERR(__logrec_make_hex_str(session, &escaped, &arg));']), 'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg', [ '' ]), 'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg', [ '' ]), + # The fileid may have the high bit set. Print in both decimal and hex. 
+ 'uint32_id' : ('uint32_t', 'I', + '%" PRIu32 " 0x%" PRIx32 "', 'arg, arg', [ '' ]), 'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg', [ '' ]), } diff --git a/src/third_party/wiredtiger/dist/log_data.py b/src/third_party/wiredtiger/dist/log_data.py index 9e1538ccf04..18f368eaad0 100644 --- a/src/third_party/wiredtiger/dist/log_data.py +++ b/src/third_party/wiredtiger/dist/log_data.py @@ -36,7 +36,7 @@ rectypes = [ # the allocated LSN to reduce the amount of work recovery has to do, and # they are useful for debugging recovery. LogRecordType('file_sync', 'file sync', [ - ('uint32', 'fileid'), ('int', 'start')]), + ('uint32_id', 'fileid'), ('int', 'start')]), # Debugging message in the log LogRecordType('message', 'message', [('string', 'message')]), @@ -62,25 +62,39 @@ class LogOperationType: optypes = [ # commit operations LogOperationType('col_modify', 'column modify', - [('uint32', 'fileid'), ('recno', 'recno'), ('item', 'value')]), + [('uint32_id', 'fileid'), ('recno', 'recno'), ('item', 'value')]), LogOperationType('col_put', 'column put', - [('uint32', 'fileid'), ('recno', 'recno'), ('item', 'value')]), + [('uint32_id', 'fileid'), ('recno', 'recno'), ('item', 'value')]), LogOperationType('col_remove', 'column remove', - [('uint32', 'fileid'), ('recno', 'recno')]), + [('uint32_id', 'fileid'), ('recno', 'recno')]), LogOperationType('col_truncate', 'column truncate', - [('uint32', 'fileid'), ('recno', 'start'), ('recno', 'stop')]), + [('uint32_id', 'fileid'), ('recno', 'start'), ('recno', 'stop')]), LogOperationType('row_modify', 'row modify', - [('uint32', 'fileid'), ('item', 'key'), ('item', 'value')]), + [('uint32_id', 'fileid'), ('item', 'key'), ('item', 'value')]), LogOperationType('row_put', 'row put', - [('uint32', 'fileid'), ('item', 'key'), ('item', 'value')]), + [('uint32_id', 'fileid'), ('item', 'key'), ('item', 'value')]), LogOperationType('row_remove', 'row remove', - [('uint32', 'fileid'), ('item', 'key')]), + [('uint32_id', 'fileid'), ('item', 
'key')]), LogOperationType('row_truncate', 'row truncate', - [('uint32', 'fileid'), ('item', 'start'), ('item', 'stop'), + [('uint32_id', 'fileid'), ('item', 'start'), ('item', 'stop'), ('uint32', 'mode')]), # system operations LogOperationType('checkpoint_start', 'checkpoint start', []), LogOperationType('prev_lsn', 'previous LSN', [('WT_LSN', 'prev_lsn')]), + +# diagnostic operations +# Operations used only for diagnostic purposes should be have their type +# values in the diagnostic range in src/include/wiredtiger.in so that they +# are always ignored by recovery. + # + # We need to know the base size/type of a 'struct timespec'. Cast its + # parts to uint64_t and split it into seconds and nanoseconds. + # + LogOperationType('txn_timestamp', 'txn_timestamp', + [('uint64', 'time_sec'), ('uint64', 'time_nsec'), + ('uint64', 'commit_ts'), ('uint64', 'durable_ts'), + ('uint64', 'first_ts'), ('uint64', 'prepare_ts'), + ('uint64', 'read_ts')]), ] diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index f199900e860..4ed32778cbb 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -20,6 +20,7 @@ WT_BLOCK_HEADER_SIZE WT_CACHE_LINE_ALIGNMENT WT_CACHE_LINE_PAD_BEGIN WT_CACHE_LINE_PAD_END +WT_CELL_UNUSED_BIT4 WT_CLOCKDIFF_NS WT_CONN_CHECK_PANIC WT_DEADLOCK diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index c251c99f2fe..3f336d0443b 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -103,6 +103,7 @@ DbEnv Decrement Decrypt DeleteFileW +Dh EACCES EAGAIN EB diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void index 6c2b8b34040..2bb8b7abf0a 100755 --- a/src/third_party/wiredtiger/dist/s_void +++ b/src/third_party/wiredtiger/dist/s_void @@ -119,6 +119,7 @@ func_ok() -e '/int snappy_pre_size$/d' \ -e '/int 
snappy_terminate$/d' \ -e '/int subtest_error_handler$/d' \ + -e '/int test_las_workload$/d' \ -e '/int uri2name$/d' \ -e '/int usage$/d' \ -e '/int util_err$/d' \ diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 8b26fa2e9af..34d957a75ec 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -274,6 +274,8 @@ connection_stats = [ CacheStat('cache_lookaside_cursor_wait_internal', 'cache overflow cursor internal thread wait time (usecs)'), CacheStat('cache_lookaside_entries', 'cache overflow table entries', 'no_clear,no_scale'), CacheStat('cache_lookaside_insert', 'cache overflow table insert calls'), + CacheStat('cache_lookaside_ondisk', 'cache overflow table on-disk size', 'no_clear,no_scale,size'), + CacheStat('cache_lookaside_ondisk_max', 'cache overflow table max on-disk size', 'no_clear,no_scale,size'), CacheStat('cache_lookaside_remove', 'cache overflow table remove calls'), CacheStat('cache_lookaside_score', 'cache overflow score', 'no_clear,no_scale'), CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 43dc53c86e3..a6cf0bc879e 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "4051e4941c894655cdb7d3dec97a7e32e7defbe6", + "commit": "4a3194b043b8cffb5339c12e1554d0bd42ed1b1f", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.0" diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 6047be0be14..55b41ad4b21 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -1423,12 +1423,13 @@ __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt) upd != NULL && upd->type == WT_UPDATE_MODIFY; ++i, upd = 
upd->next) { upd_size += WT_UPDATE_MEMSIZE(upd); - if (upd_size >= WT_MODIFY_MEM_FACTOR * cursor->value.size) + if (i >= WT_MAX_MODIFY_UPDATE && + upd_size * WT_MODIFY_MEM_FRACTION >= cursor->value.size) return (true); } - if (upd != NULL && upd->type == WT_UPDATE_STANDARD && - __wt_txn_upd_visible_all(session, upd) && - i >= WT_MAX_MODIFY_UPDATE) + if (i >= WT_MAX_MODIFY_UPDATE && upd != NULL && + upd->type == WT_UPDATE_STANDARD && + __wt_txn_upd_visible_all(session, upd)) return (true); return (false); } diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index b0fd6a58edf..63ee4a3bc7c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -81,7 +81,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); - ret = __wt_evict(session, ref, false, previous_state); + ret = __wt_evict(session, ref, previous_state, 0); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); ret = 0; diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 8dd918e8011..87f47f20aeb 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -776,7 +776,7 @@ read: /* if (force_attempts < 10 && __evict_force_check(session, ref)) { ++force_attempts; - ret = __wt_page_release_evict(session, ref); + ret = __wt_page_release_evict(session, ref, 0); /* If forced eviction fails, stall. 
*/ if (ret == EBUSY) { WT_NOT_READ(ret, 0); diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index bc85dcee4f5..4b42221865e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -201,6 +201,13 @@ __wt_value_return_upd(WT_SESSION_IMPL *session, memcpy(listp, list, sizeof(list)); } listp[i++] = upd; + + /* + * Once a modify is found, all previously committed + * modifications should be applied regardless of + * visibility. + */ + ignore_visibility = true; } } diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index f8f2552dc0a..5b0f2a5569a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -328,7 +328,8 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) */ if (ss->root_ref.page != NULL) { btree->ckpt = ckptbase; - ret = __wt_evict(session, &ss->root_ref, true, WT_REF_MEM); + ret = __wt_evict(session, &ss->root_ref, WT_REF_MEM, + WT_EVICT_CALL_CLOSING); ss->root_ref.page = NULL; btree->ckpt = NULL; } @@ -1300,7 +1301,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) ret = __wt_page_release(session, ref, 0); if (ret == 0) - ret = __wt_evict(session, ref, true, WT_REF_MEM); + ret = __wt_evict(session, ref, WT_REF_MEM, + WT_EVICT_CALL_CLOSING); if (0) { err: WT_TRET(__wt_page_release(session, ref, 0)); @@ -2019,7 +2021,8 @@ __slvg_row_build_leaf( */ ret = __wt_page_release(session, ref, 0); if (ret == 0) - ret = __wt_evict(session, ref, true, WT_REF_MEM); + ret = __wt_evict(session, ref, WT_REF_MEM, + WT_EVICT_CALL_CLOSING); if (0) { err: WT_TRET(__wt_page_release(session, ref, 0)); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 113b95e6ff9..9321cc88282 100644 --- 
a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1406,6 +1406,25 @@ err: if (parent != NULL) return (0); } +#ifdef HAVE_DIAGNOSTIC +/* + * __check_upd_list -- + * Sanity check an update list. + * In particular, make sure there no birthmarks. + */ +static void +__check_upd_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) +{ + int birthmark_count; + + for (birthmark_count = 0; upd != NULL; upd = upd->next) + if (upd->type == WT_UPDATE_BIRTHMARK) + ++birthmark_count; + + WT_ASSERT(session, birthmark_count <= 1); +} +#endif + /* * __split_multi_inmem -- * Instantiate a page from a disk image. @@ -1501,6 +1520,10 @@ __split_multi_inmem( key->size = WT_INSERT_KEY_SIZE(supd->ins); } +#ifdef HAVE_DIAGNOSTIC + __check_upd_list(session, upd); +#endif + /* Search the page. */ WT_ERR(__wt_row_search( session, key, ref, &cbt, true, true)); @@ -1802,9 +1825,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) { key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - } else + } else { + WT_ASSERT(session, page->entries > 0); WT_ERR(__wt_row_leaf_key( session, page, &page->pg_row[0], key, true)); + } WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); parent_incr += sizeof(WT_IKEY) + key->size; __wt_scr_free(session, &key); diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index c7d17abd202..7113f4d9724 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -143,7 +143,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * checkpoint, the on-disk version is correct. If the truncate is * visible, we skip over the child page when writing its parent. We * check whether a truncate is visible in the checkpoint as part of - * reconciling internal pages (specifically in __rec_child_modify). 
+ * reconciling internal pages (specifically in __wt_rec_child_modify). */ LF_SET(WT_READ_DELETED_SKIP); @@ -326,7 +326,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) page->read_gen == WT_READGEN_WONT_NEED && !tried_eviction) { WT_ERR_BUSY_OK( - __wt_page_release_evict(session, walk)); + __wt_page_release_evict(session, walk, 0)); walk = prev; prev = NULL; tried_eviction = true; diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index 77614e9c9e4..0e9f4f04f46 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -57,6 +57,46 @@ __las_entry_count(WT_CACHE *cache) } /* + * __wt_las_config -- + * Configure the lookaside table. + */ +int +__wt_las_config(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_CONFIG_ITEM cval; + WT_CURSOR_BTREE *las_cursor; + WT_SESSION_IMPL *las_session; + + WT_RET(__wt_config_gets( + session, cfg, "cache_overflow.file_max", &cval)); + + if (cval.val != 0 && cval.val < WT_LAS_FILE_MIN) + WT_RET_MSG(session, EINVAL, + "max cache overflow size %" PRId64 " below minimum %d", + cval.val, WT_LAS_FILE_MIN); + + /* This is expected for in-memory configurations. */ + las_session = S2C(session)->cache->las_session[0]; + WT_ASSERT(session, + las_session != NULL || F_ISSET(S2C(session), WT_CONN_IN_MEMORY)); + + if (las_session == NULL) + return (0); + + /* + * We need to set file_max on the btree associated with one of the + * lookaside sessions. + */ + las_cursor = (WT_CURSOR_BTREE *)las_session->las_cursor; + las_cursor->btree->file_max = (uint64_t)cval.val; + + WT_STAT_CONN_SET( + session, cache_lookaside_ondisk_max, las_cursor->btree->file_max); + + return (0); +} + +/* * __wt_las_empty -- * Return when there are entries in the lookaside table. */ @@ -126,7 +166,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) * Initialize the database's lookaside store. 
*/ int -__wt_las_create(WT_SESSION_IMPL *session) +__wt_las_create(WT_SESSION_IMPL *session, const char **cfg) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; @@ -166,6 +206,8 @@ __wt_las_create(WT_SESSION_IMPL *session) WT_RET(__wt_las_cursor_open(cache->las_session[i])); } + WT_RET(__wt_las_config(session, cfg)); + /* The statistics server is already running, make sure we don't race. */ WT_WRITE_BARRIER(); F_SET(conn, WT_CONN_LOOKASIDE_OPEN); @@ -609,8 +651,10 @@ __wt_las_insert_block(WT_CURSOR *cursor, WT_SAVE_UPD *list; WT_SESSION_IMPL *session; WT_TXN_ISOLATION saved_isolation; - WT_UPDATE *upd; - uint64_t insert_cnt, las_counter, las_pageid, prepared_insert_cnt; + WT_UPDATE *first_upd, *upd; + wt_off_t las_size; + uint64_t insert_cnt, las_counter, las_pageid, max_las_size; + uint64_t prepared_insert_cnt; uint32_t btree_id, i, slot; uint8_t *p; bool local_txn; @@ -688,7 +732,7 @@ __wt_las_insert_block(WT_CURSOR *cursor, slot = page->type == WT_PAGE_ROW_LEAF ? WT_ROW_SLOT(page, list->ripcip) : WT_COL_SLOT(page, list->ripcip); - upd = list->ins == NULL ? + first_upd = upd = list->ins == NULL ? 
page->modify->mod_row_update[slot] : list->ins->upd; /* @@ -707,6 +751,9 @@ __wt_las_insert_block(WT_CURSOR *cursor, las_value.size = upd->size; break; case WT_UPDATE_BIRTHMARK: + WT_ASSERT(session, upd != first_upd || + multi->page_las.skew_newest); + /* FALLTHROUGH */ case WT_UPDATE_TOMBSTONE: las_value.size = 0; break; @@ -727,6 +774,8 @@ __wt_las_insert_block(WT_CURSOR *cursor, (upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_MODIFY)) { las_value.size = 0; + WT_ASSERT(session, upd != first_upd || + multi->page_las.skew_newest); cursor->set_value(cursor, upd->txnid, upd->timestamp, upd->prepare_state, WT_UPDATE_BIRTHMARK, &las_value); @@ -748,6 +797,14 @@ __wt_las_insert_block(WT_CURSOR *cursor, } while ((upd = upd->next) != NULL); } + WT_ERR(__wt_block_manager_named_size(session, WT_LAS_FILE, &las_size)); + WT_STAT_CONN_SET(session, cache_lookaside_ondisk, las_size); + max_las_size = ((WT_CURSOR_BTREE *)cursor)->btree->file_max; + if (max_las_size != 0 && (uint64_t)las_size > max_las_size) + WT_PANIC_MSG(session, WT_PANIC, + "WiredTigerLAS: file size of %" PRIu64 " exceeds maximum " + "size %" PRIu64, (uint64_t)las_size, max_las_size); + err: /* Resolve the transaction. */ if (local_txn) { if (ret == 0) @@ -773,6 +830,7 @@ err: /* Resolve the transaction. 
*/ __las_insert_block_verbose(session, btree, multi); } + WT_UNUSED(first_upd); return (ret); } diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 521f3d4bdc8..9e78e669cbb 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -61,6 +61,12 @@ static const WT_CONFIG_CHECK }; static const WT_CONFIG_CHECK + confchk_wiredtiger_open_cache_overflow_subconfigs[] = { + { "file_max", "int", NULL, "min=0", NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + +static const WT_CONFIG_CHECK confchk_wiredtiger_open_checkpoint_subconfigs[] = { { "log_size", "int", NULL, "min=0,max=2GB", NULL, 0 }, { "wait", "int", NULL, "min=0,max=100000", NULL, 0 }, @@ -74,6 +80,17 @@ static const WT_CONFIG_CHECK }; static const WT_CONFIG_CHECK + confchk_wiredtiger_open_debug_mode_subconfigs[] = { + { "checkpoint_retention", "int", + NULL, "min=0,max=1024", + NULL, 0 }, + { "eviction", "boolean", NULL, NULL, NULL, 0 }, + { "rollback_error", "int", NULL, "min=0,max=10M", NULL, 0 }, + { "table_logging", "boolean", NULL, NULL, NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + +static const WT_CONFIG_CHECK confchk_wiredtiger_open_eviction_subconfigs[] = { { "threads_max", "int", NULL, "min=1,max=20", NULL, 0 }, { "threads_min", "int", NULL, "min=1,max=20", NULL, 0 }, @@ -148,6 +165,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { NULL, NULL, confchk_wiredtiger_open_async_subconfigs, 3 }, { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, + { "cache_overflow", "category", + NULL, NULL, + confchk_wiredtiger_open_cache_overflow_subconfigs, 1 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -156,6 +176,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { { "compatibility", "category", NULL, NULL, 
confchk_WT_CONNECTION_reconfigure_compatibility_subconfigs, 1 }, + { "debug_mode", "category", + NULL, NULL, + confchk_wiredtiger_open_debug_mode_subconfigs, 4 }, { "error_prefix", "string", NULL, NULL, NULL, 0 }, { "eviction", "category", NULL, NULL, @@ -839,6 +862,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "builtin_extension_config", "string", NULL, NULL, NULL, 0 }, { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, + { "cache_overflow", "category", + NULL, NULL, + confchk_wiredtiger_open_cache_overflow_subconfigs, 1 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -850,6 +876,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { confchk_wiredtiger_open_compatibility_subconfigs, 3 }, { "config_base", "boolean", NULL, NULL, NULL, 0 }, { "create", "boolean", NULL, NULL, NULL, 0 }, + { "debug_mode", "category", + NULL, NULL, + confchk_wiredtiger_open_debug_mode_subconfigs, 4 }, { "direct_io", "list", NULL, "choices=[\"checkpoint\",\"data\",\"log\"]", NULL, 0 }, @@ -948,6 +977,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "builtin_extension_config", "string", NULL, NULL, NULL, 0 }, { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, + { "cache_overflow", "category", + NULL, NULL, + confchk_wiredtiger_open_cache_overflow_subconfigs, 1 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -959,6 +991,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { confchk_wiredtiger_open_compatibility_subconfigs, 3 }, { "config_base", "boolean", NULL, NULL, NULL, 0 }, { "create", "boolean", NULL, NULL, NULL, 0 }, + { "debug_mode", "category", + NULL, NULL, + 
confchk_wiredtiger_open_debug_mode_subconfigs, 4 }, { "direct_io", "list", NULL, "choices=[\"checkpoint\",\"data\",\"log\"]", NULL, 0 }, @@ -1058,6 +1093,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { { "builtin_extension_config", "string", NULL, NULL, NULL, 0 }, { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, + { "cache_overflow", "category", + NULL, NULL, + confchk_wiredtiger_open_cache_overflow_subconfigs, 1 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -1067,6 +1105,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { { "compatibility", "category", NULL, NULL, confchk_wiredtiger_open_compatibility_subconfigs, 3 }, + { "debug_mode", "category", + NULL, NULL, + confchk_wiredtiger_open_debug_mode_subconfigs, 4 }, { "direct_io", "list", NULL, "choices=[\"checkpoint\",\"data\",\"log\"]", NULL, 0 }, @@ -1162,6 +1203,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { { "builtin_extension_config", "string", NULL, NULL, NULL, 0 }, { "cache_cursors", "boolean", NULL, NULL, NULL, 0 }, { "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 }, + { "cache_overflow", "category", + NULL, NULL, + confchk_wiredtiger_open_cache_overflow_subconfigs, 1 }, { "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 }, { "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 }, { "checkpoint", "category", @@ -1171,6 +1215,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { { "compatibility", "category", NULL, NULL, confchk_wiredtiger_open_compatibility_subconfigs, 3 }, + { "debug_mode", "category", + NULL, NULL, + confchk_wiredtiger_open_debug_mode_subconfigs, 4 }, { "direct_io", "list", NULL, "choices=[\"checkpoint\",\"data\",\"log\"]", NULL, 0 }, @@ -1307,8 +1354,10 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { 
"WT_CONNECTION.reconfigure", "async=(enabled=false,ops_max=1024,threads=2),cache_max_wait_ms=0" - ",cache_overhead=8,cache_size=100MB,checkpoint=(log_size=0," - "wait=0),compatibility=(release=),error_prefix=," + ",cache_overflow=(file_max=0),cache_overhead=8,cache_size=100MB," + "checkpoint=(log_size=0,wait=0),compatibility=(release=)," + "debug_mode=(checkpoint_retention=0,eviction=false," + "rollback_error=0,table_logging=false),error_prefix=," "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=1,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" @@ -1321,7 +1370,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "statistics=none,statistics_log=(json=false,on_close=false," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,verbose=", - confchk_WT_CONNECTION_reconfigure, 24 + confchk_WT_CONNECTION_reconfigure, 26 }, { "WT_CONNECTION.rollback_to_stable", "", @@ -1556,19 +1605,22 @@ static const WT_CONFIG_ENTRY config_entries[] = { { "wiredtiger_open", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_cursors=true," - "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB," - "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," - "compatibility=(release=,require_max=,require_min=)," - "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=8," - "threads_min=1),eviction_checkpoint_target=1," - "eviction_dirty_target=5,eviction_dirty_trigger=20," - "eviction_target=80,eviction_trigger=95,exclusive=false," - "extensions=,file_extend=,file_manager=(close_handle_minimum=250," - "close_idle_time=30,close_scan_interval=10),hazard_max=1000," - "in_memory=false,io_capacity=(total=0),log=(archive=true," - "compressor=,enabled=false,file_max=100MB,os_cache_dirty_pct=0," - "path=\".\",prealloc=true,recover=on,zero_fill=false)," + 
"cache_max_wait_ms=0,cache_overflow=(file_max=0),cache_overhead=8" + ",cache_size=100MB,checkpoint=(log_size=0,wait=0)," + "checkpoint_sync=true,compatibility=(release=,require_max=," + "require_min=),config_base=true,create=false," + "debug_mode=(checkpoint_retention=0,eviction=false," + "rollback_error=0,table_logging=false),direct_io=," + "encryption=(keyid=,name=,secretkey=),error_prefix=," + "eviction=(threads_max=8,threads_min=1)," + "eviction_checkpoint_target=1,eviction_dirty_target=5," + "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" + ",exclusive=false,extensions=,file_extend=," + "file_manager=(close_handle_minimum=250,close_idle_time=30," + "close_scan_interval=10),hazard_max=1000,in_memory=false," + "io_capacity=(total=0),log=(archive=true,compressor=," + "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\"," + "prealloc=true,recover=on,zero_fill=false)," "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," "mmap=true,multiprocess=false,operation_tracking=(enabled=false," "path=\".\"),readonly=false,salvage=false,session_max=100," @@ -1579,24 +1631,27 @@ static const WT_CONFIG_ENTRY config_entries[] = { "timing_stress_for_test=,transaction_sync=(enabled=false," "method=fsync),use_environment=true,use_environment_priv=false," "verbose=,write_through=", - confchk_wiredtiger_open, 48 + confchk_wiredtiger_open, 50 }, { "wiredtiger_open_all", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_cursors=true," - "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB," - "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," - "compatibility=(release=,require_max=,require_min=)," - "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=8," - "threads_min=1),eviction_checkpoint_target=1," - "eviction_dirty_target=5,eviction_dirty_trigger=20," - "eviction_target=80,eviction_trigger=95,exclusive=false," - 
"extensions=,file_extend=,file_manager=(close_handle_minimum=250," - "close_idle_time=30,close_scan_interval=10),hazard_max=1000," - "in_memory=false,io_capacity=(total=0),log=(archive=true," - "compressor=,enabled=false,file_max=100MB,os_cache_dirty_pct=0," - "path=\".\",prealloc=true,recover=on,zero_fill=false)," + "cache_max_wait_ms=0,cache_overflow=(file_max=0),cache_overhead=8" + ",cache_size=100MB,checkpoint=(log_size=0,wait=0)," + "checkpoint_sync=true,compatibility=(release=,require_max=," + "require_min=),config_base=true,create=false," + "debug_mode=(checkpoint_retention=0,eviction=false," + "rollback_error=0,table_logging=false),direct_io=," + "encryption=(keyid=,name=,secretkey=),error_prefix=," + "eviction=(threads_max=8,threads_min=1)," + "eviction_checkpoint_target=1,eviction_dirty_target=5," + "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" + ",exclusive=false,extensions=,file_extend=," + "file_manager=(close_handle_minimum=250,close_idle_time=30," + "close_scan_interval=10),hazard_max=1000,in_memory=false," + "io_capacity=(total=0),log=(archive=true,compressor=," + "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\"," + "prealloc=true,recover=on,zero_fill=false)," "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," "mmap=true,multiprocess=false,operation_tracking=(enabled=false," "path=\".\"),readonly=false,salvage=false,session_max=100," @@ -1607,14 +1662,16 @@ static const WT_CONFIG_ENTRY config_entries[] = { "timing_stress_for_test=,transaction_sync=(enabled=false," "method=fsync),use_environment=true,use_environment_priv=false," "verbose=,version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_all, 49 + confchk_wiredtiger_open_all, 51 }, { "wiredtiger_open_basecfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_cursors=true," - "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB," - 
"checkpoint=(log_size=0,wait=0),checkpoint_sync=true," - "compatibility=(release=,require_max=,require_min=),direct_io=," + "cache_max_wait_ms=0,cache_overflow=(file_max=0),cache_overhead=8" + ",cache_size=100MB,checkpoint=(log_size=0,wait=0)," + "checkpoint_sync=true,compatibility=(release=,require_max=," + "require_min=),debug_mode=(checkpoint_retention=0,eviction=false," + "rollback_error=0,table_logging=false),direct_io=," "encryption=(keyid=,name=,secretkey=),error_prefix=," "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=1,eviction_dirty_target=5," @@ -1633,14 +1690,16 @@ static const WT_CONFIG_ENTRY config_entries[] = { "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,transaction_sync=(enabled=false," "method=fsync),verbose=,version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_basecfg, 43 + confchk_wiredtiger_open_basecfg, 45 }, { "wiredtiger_open_usercfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_cursors=true," - "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB," - "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," - "compatibility=(release=,require_max=,require_min=),direct_io=," + "cache_max_wait_ms=0,cache_overflow=(file_max=0),cache_overhead=8" + ",cache_size=100MB,checkpoint=(log_size=0,wait=0)," + "checkpoint_sync=true,compatibility=(release=,require_max=," + "require_min=),debug_mode=(checkpoint_retention=0,eviction=false," + "rollback_error=0,table_logging=false),direct_io=," "encryption=(keyid=,name=,secretkey=),error_prefix=," "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=1,eviction_dirty_target=5," @@ -1659,7 +1718,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,transaction_sync=(enabled=false," "method=fsync),verbose=,write_through=", - confchk_wiredtiger_open_usercfg, 42 + 
confchk_wiredtiger_open_usercfg, 44 }, { NULL, NULL, NULL, 0 } }; diff --git a/src/third_party/wiredtiger/src/conn/api_calc_modify.c b/src/third_party/wiredtiger/src/conn/api_calc_modify.c index 4a435a85ef1..a8091498ee6 100644 --- a/src/third_party/wiredtiger/src/conn/api_calc_modify.c +++ b/src/third_party/wiredtiger/src/conn/api_calc_modify.c @@ -69,16 +69,31 @@ static void __cm_extend(WT_CM_STATE *cms, const uint8_t *m1, const uint8_t *m2, WT_CM_MATCH *match) { + ptrdiff_t n; const uint8_t *p1, *p2; - /* Step past the end and before the beginning of the matching block. */ + p1 = m1; + p2 = m2; + + /* + * Keep skipping half of the remaining bytes while they compare equal. + * This is significantly faster than our byte-at-a-time loop below. + */ for (p1 = m1, p2 = m2; - p1 < cms->e1 && p2 < cms->e2 && *p1 == *p2; - p1++, p2++) + (n = WT_MIN(cms->e1 - p1, cms->e2 - p2) / 2) > 8 && + memcmp(p1, p2, (size_t)n) == 0; + p1 += n, p2 += n) + ; + + /* Step past the end and before the beginning of the matching block. */ + for (n = WT_MIN(cms->e1 - p1, cms->e2 - p2); + n > 0 && *p1 == *p2; + n--, p1++, p2++) ; - for (; m1 >= cms->used1 && m2 >= cms->used2 && *m1 == *m2; - m1--, m2--) + for (n = WT_MIN(m1 - cms->used1, m2 - cms->used2); + n > 0 && *m1 == *m2; + n--, m1--, m2--) ; match->m1 = m1 + 1; diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index ef0072c45ac..54199fd38ad 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1816,6 +1816,57 @@ err: /* return (ret); } +/* + * __wt_debug_mode_config -- + * Set debugging configuration. 
+ */ +int +__wt_debug_mode_config(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CACHE *cache; + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + + conn = S2C(session); + cache = conn->cache; + txn_global = &conn->txn_global; + + WT_RET(__wt_config_gets(session, + cfg, "debug_mode.checkpoint_retention", &cval)); + conn->debug_ckpt_cnt = (uint32_t)cval.val; + if (cval.val == 0) { + if (conn->debug_ckpt != NULL) + __wt_free(session, conn->debug_ckpt); + conn->debug_ckpt = NULL; + } else if (conn->debug_ckpt != NULL) + WT_RET(__wt_realloc(session, NULL, + conn->debug_ckpt_cnt, &conn->debug_ckpt)); + else + WT_RET(__wt_calloc_def(session, + conn->debug_ckpt_cnt, &conn->debug_ckpt)); + + WT_RET(__wt_config_gets(session, + cfg, "debug_mode.eviction", &cval)); + if (cval.val) + F_SET(cache, WT_CACHE_EVICT_DEBUG_MODE); + else + F_CLR(cache, WT_CACHE_EVICT_DEBUG_MODE); + + WT_RET(__wt_config_gets(session, + cfg, "debug_mode.rollback_error", &cval)); + txn_global->debug_rollback = (uint64_t)cval.val; + + WT_RET(__wt_config_gets(session, + cfg, "debug_mode.table_logging", &cval)); + if (cval.val) + FLD_SET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE); + else + FLD_CLR(conn->log_flags, WT_CONN_LOG_DEBUG_MODE); + + return (0); +} + /* Simple structure for name and flag configuration searches. */ typedef struct { const char *name; @@ -2707,6 +2758,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, session = conn->default_session; /* + * This function expects the cache to be created so parse this after + * the rest of the connection is set up. + */ + WT_ERR(__wt_debug_mode_config(session, cfg)); + + /* * Load the extensions after initialization completes; extensions expect * everything else to be in place, and the extensions call back into the * library. 
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c index a3818b3c914..faee6216ed7 100644 --- a/src/third_party/wiredtiger/src/conn/conn_handle.c +++ b/src/third_party/wiredtiger/src/conn/conn_handle.c @@ -132,8 +132,9 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) /* Free allocated memory. */ __wt_free(session, conn->cfg); - __wt_free(session, conn->home); + __wt_free(session, conn->debug_ckpt); __wt_free(session, conn->error_prefix); + __wt_free(session, conn->home); __wt_free(session, conn->sessions); __wt_stat_connection_discard(session, conn); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 8bc111346c5..cd93e459e0a 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -372,9 +372,19 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) */ if (backup_file != 0) min_lognum = WT_MIN(log->ckpt_lsn.l.file, backup_file); - else - min_lognum = WT_MIN( - log->ckpt_lsn.l.file, log->sync_lsn.l.file); + else { + /* + * Figure out the minimum log file to archive. Use the + * LSN in the debugging array if necessary. + */ + if (conn->debug_ckpt_cnt == 0) + min_lognum = WT_MIN( + log->ckpt_lsn.l.file, log->sync_lsn.l.file); + else + min_lognum = WT_MIN( + conn->debug_ckpt[conn->debug_ckpt_cnt - 1].l.file, + log->sync_lsn.l.file); + } __wt_verbose(session, WT_VERB_LOG, "log_archive: archive to log number %" PRIu32, min_lognum); diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 7a2b52f40f9..fba1132ecb7 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -238,7 +238,7 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_meta_track_init(session)); /* Create the lookaside table. 
*/ - WT_RET(__wt_las_create(session)); + WT_RET(__wt_las_create(session, cfg)); /* * Start eviction threads. diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c index 1cd589c32c9..fa0726a1306 100644 --- a/src/third_party/wiredtiger/src/conn/conn_reconfig.c +++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c @@ -488,12 +488,14 @@ __wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_ERR(__wt_cache_config(session, true, cfg)); WT_ERR(__wt_capacity_server_create(session, cfg)); WT_ERR(__wt_checkpoint_server_create(session, cfg)); + WT_ERR(__wt_debug_mode_config(session, cfg)); + WT_ERR(__wt_las_config(session, cfg)); WT_ERR(__wt_logmgr_reconfig(session, cfg)); WT_ERR(__wt_lsm_manager_reconfig(session, cfg)); WT_ERR(__wt_statlog_create(session, cfg)); WT_ERR(__wt_sweep_config(session, cfg)); - WT_ERR(__wt_verbose_config(session, cfg)); WT_ERR(__wt_timing_stress_config(session, cfg)); + WT_ERR(__wt_verbose_config(session, cfg)); /* Third, merge everything together, creating a new connection state. */ WT_ERR(__wt_config_merge(session, cfg, NULL, &p)); diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index b9747d1b681..0e806f20608 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -95,7 +95,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * Ensure the ref state is restored to the previous * value if eviction fails. 
*/ - WT_ERR(__wt_evict(session, ref, true, ref->state)); + WT_ERR(__wt_evict(session, ref, ref->state, + WT_EVICT_CALL_CLOSING)); break; case WT_SYNC_DISCARD: /* diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 3001f3d23da..f40ed758a19 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -107,6 +107,25 @@ __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref) } /* + * __evict_lru_cmp_debug -- + * Qsort function: sort the eviction array. + * Version for eviction debug mode. + */ +static int WT_CDECL +__evict_lru_cmp_debug(const void *a_arg, const void *b_arg) +{ + const WT_EVICT_ENTRY *a, *b; + uint64_t a_score, b_score; + + a = a_arg; + b = b_arg; + a_score = (a->ref == NULL ? UINT64_MAX : 0); + b_score = (b->ref == NULL ? UINT64_MAX : 0); + + return ((a_score < b_score) ? -1 : (a_score == b_score) ? 0 : 1); +} + +/* * __evict_lru_cmp -- * Qsort function: sort the eviction array. */ @@ -1257,8 +1276,17 @@ __evict_lru_walk(WT_SESSION_IMPL *session) queue->evict_current = NULL; entries = queue->evict_entries; - __wt_qsort(queue->evict_queue, - entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp); + /* + * Style note: __wt_qsort is a macro that can leave a dangling + * else. Full curly braces are needed here for the compiler. + */ + if (F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE)) { + __wt_qsort(queue->evict_queue, + entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp_debug); + } else { + __wt_qsort(queue->evict_queue, + entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp); + } /* Trim empty entries from the end. */ while (entries > 0 && queue->evict_queue[entries - 1].ref == NULL) @@ -1975,12 +2003,14 @@ __evict_walk_tree(WT_SESSION_IMPL *session, * cache (indicated by seeing an internal page that is the * parent of the last page we saw). 
* - * Also skip internal page unless we get aggressive or the tree - * is idle (indicated by the tree being skipped for walks). + * Also skip internal page unless we get aggressive, the tree + * is idle (indicated by the tree being skipped for walks), + * or we are in eviction debug mode. * The goal here is that if trees become completely idle, we * eventually push them out of cache completely. */ - if (WT_PAGE_IS_INTERNAL(page)) { + if (!F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE) && + WT_PAGE_IS_INTERNAL(page)) { if (page == last_parent) continue; if (btree->evict_walk_period == 0 && @@ -2320,7 +2350,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) __wt_cache_read_gen_bump(session, ref->page); WT_WITH_BTREE(session, btree, - ret = __wt_evict(session, ref, false, previous_state)); + ret = __wt_evict(session, ref, previous_state, 0)); (void)__wt_atomic_subv32(&btree->evict_busy, 1); diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index e75f0ef1bed..2510815401f 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -8,9 +8,9 @@ #include "wt_internal.h" -static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, bool); -static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, bool); -static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool, bool *); +static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, uint32_t); +static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, uint32_t); +static int __evict_review(WT_SESSION_IMPL *, WT_REF *, uint32_t, bool *); /* * __evict_exclusive_clear -- @@ -51,19 +51,20 @@ __evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref) * Release a reference to a page, and attempt to immediately evict it. 
*/ int -__wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; uint64_t time_start, time_stop; - uint32_t previous_state; + uint32_t evict_flags, previous_state; bool locked, too_big; btree = S2BT(session); locked = false; page = ref->page; time_start = __wt_clock(session); + evict_flags = LF_ISSET(WT_READ_NO_SPLIT) ? WT_EVICT_CALL_NO_SPLIT : 0; /* * This function always releases the hazard pointer - ensure that's @@ -89,7 +90,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) * Track how long the call to evict took. If eviction is successful then * we have one of two pairs of stats to increment. */ - ret = __wt_evict(session, ref, false, previous_state); + ret = __wt_evict(session, ref, previous_state, evict_flags); time_stop = __wt_clock(session); if (ret == 0) { if (too_big) { @@ -124,20 +125,25 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) */ int __wt_evict(WT_SESSION_IMPL *session, - WT_REF *ref, bool closing, uint32_t previous_state) + WT_REF *ref, uint32_t previous_state, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; - bool clean_page, inmem_split, local_gen, tree_dead; + bool clean_page, closing, inmem_split, local_gen, tree_dead; conn = S2C(session); page = ref->page; + closing = LF_ISSET(WT_EVICT_CALL_CLOSING); local_gen = false; __wt_verbose(session, WT_VERB_EVICT, "page %p (%s)", (void *)page, __wt_page_type_string(page->type)); + tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD); + if (tree_dead) + LF_SET(WT_EVICT_CALL_NO_SPLIT); + /* * Enter the eviction generation. If we re-enter eviction, leave the * previous eviction generation (which must be as low as the current @@ -171,7 +177,7 @@ __wt_evict(WT_SESSION_IMPL *session, * Make this check for clean pages, too: while unlikely eviction would * choose an internal page with children, it's not disallowed. 
*/ - WT_ERR(__evict_review(session, ref, closing, &inmem_split)); + WT_ERR(__evict_review(session, ref, flags, &inmem_split)); /* * If there was an in-memory split, the tree has been left in the state @@ -208,7 +214,6 @@ __wt_evict(WT_SESSION_IMPL *session, } /* Update the reference and discard the page. */ - tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD); if (__wt_ref_is_root(ref)) __wt_ref_out(session, ref); else if ((clean_page && !F_ISSET(conn, WT_CONN_IN_MEMORY)) || tree_dead) @@ -216,10 +221,9 @@ __wt_evict(WT_SESSION_IMPL *session, * Pages that belong to dead trees never write back to disk * and can't support page splits. */ - WT_ERR(__evict_page_clean_update( - session, ref, tree_dead || closing)); + WT_ERR(__evict_page_clean_update(session, ref, flags)); else - WT_ERR(__evict_page_dirty_update(session, ref, closing)); + WT_ERR(__evict_page_dirty_update(session, ref, flags)); if (clean_page) { WT_STAT_CONN_INCR(session, cache_eviction_clean); @@ -250,7 +254,7 @@ done: /* Leave any local eviction generation. */ * split. */ static int -__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) +__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_DECL_RET; WT_PAGE *parent; @@ -264,7 +268,7 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * Avoid doing reverse splits when closing the file, it is wasted work * and some structures may have already been freed. */ - if (!closing) { + if (!LF_ISSET(WT_EVICT_CALL_NO_SPLIT | WT_EVICT_CALL_CLOSING)) { parent = ref->home; WT_INTL_INDEX_GET(session, parent, pindex); ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1); @@ -302,9 +306,12 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * Update a clean page's reference on eviction. 
*/ static int -__evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) +__evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) { WT_DECL_RET; + bool closing; + + closing = LF_ISSET(WT_EVICT_CALL_CLOSING); /* * Before discarding a page, assert that all updates are globally @@ -334,7 +341,7 @@ __evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_REF_SET_STATE(ref, WT_REF_LOOKASIDE); } else if (ref->addr == NULL) { WT_WITH_PAGE_INDEX(session, - ret = __evict_delete_ref(session, ref, closing)); + ret = __evict_delete_ref(session, ref, flags)); WT_RET_BUSY_OK(ret); } else WT_REF_SET_STATE(ref, WT_REF_DISK); @@ -347,14 +354,17 @@ __evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * Update a dirty page's reference on eviction. */ static int -__evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) +__evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, + uint32_t evict_flags) { WT_ADDR *addr; WT_DECL_RET; WT_MULTI multi; WT_PAGE_MODIFY *mod; + bool closing; mod = ref->page->modify; + closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING); WT_ASSERT(session, ref->addr == NULL); @@ -370,7 +380,7 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) */ __wt_ref_out(session, ref); WT_WITH_PAGE_INDEX(session, - ret = __evict_delete_ref(session, ref, closing)); + ret = __evict_delete_ref(session, ref, evict_flags)); WT_RET_BUSY_OK(ret); break; case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ @@ -511,20 +521,22 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) */ static int __evict_review( - WT_SESSION_IMPL *session, WT_REF *ref, bool closing, bool *inmem_splitp) + WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, + bool *inmem_splitp) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; uint32_t flags; - bool lookaside_retry, *lookaside_retryp, modified; + bool 
closing, lookaside_retry, *lookaside_retryp, modified; *inmem_splitp = false; conn = S2C(session); page = ref->page; flags = WT_REC_EVICT; + closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING); if (!WT_SESSION_BTREE_SYNC(session)) LF_SET(WT_REC_VISIBLE_ALL); @@ -644,7 +656,13 @@ __evict_review( else if (!WT_IS_METADATA(session->dhandle)) { LF_SET(WT_REC_UPDATE_RESTORE); - if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB)) + /* + * Scrub if we're supposed to or toss it in sometimes + * if we are in debugging mode. + */ + if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB) || + (F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE) && + __wt_random(&session->rnd) % 3 == 0)) LF_SET(WT_REC_SCRUB); /* @@ -653,8 +671,16 @@ __evict_review( * suggests trying the lookaside table. */ if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE) && - !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE)) + !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE)) { + if (F_ISSET(cache, + WT_CACHE_EVICT_DEBUG_MODE) && + __wt_random(&session->rnd) % 10 == 0) { + LF_CLR(WT_REC_SCRUB | + WT_REC_UPDATE_RESTORE); + LF_SET(WT_REC_LOOKASIDE); + } lookaside_retryp = &lookaside_retry; + } } } diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 46f507ebedf..9859b3b607a 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -1114,9 +1114,9 @@ struct __wt_update { /* * WT_MODIFY_MEM_FACTOR -- - * Limit update chains to a factor of the base document size. + * Limit update chains to a fraction of the base document size. 
*/ -#define WT_MODIFY_MEM_FACTOR 1 +#define WT_MODIFY_MEM_FRACTION 10 /* * WT_INSERT -- diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index f7ff274cfb8..17722a806e5 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -194,6 +194,12 @@ struct __wt_btree { uint64_t bytes_dirty_total; /* Bytes ever dirtied in cache. */ /* + * The maximum bytes allowed to be used for the table on disk. This is + * currently only used for the lookaside table. + */ + uint64_t file_max; + + /* * We flush pages from the tree (in order to make checkpoint faster), * without a high-level lock. To avoid multiple threads flushing at * the same time, lock the tree. diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 5e0f0521ded..e728790b02c 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -918,6 +918,7 @@ __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell) */ v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) | WT_CELL_FLAG; + WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries); WT_ROW_KEY_SET(rip, v); } @@ -937,6 +938,7 @@ __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) v = WT_K_ENCODE_KEY_LEN(unpack->size) | WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) | WT_K_FLAG; + WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries); WT_ROW_KEY_SET(rip, v); } @@ -975,6 +977,7 @@ __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack) WT_KV_ENCODE_VALUE_LEN(unpack->size) | WT_KV_ENCODE_KEY_OFFSET(key_offset) | WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG; + WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries); WT_ROW_KEY_SET(rip, v); } @@ -1516,7 +1519,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_IGNORE_RET( 
__wt_page_evict_urgent(session, ref)); } else { - WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); + WT_RET_BUSY_OK(__wt_page_release_evict(session, ref, + flags)); return (0); } } diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index 7966d9802b3..c4c0ee5d5d4 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -54,6 +54,7 @@ typedef enum __wt_cache_op { WT_SYNC_WRITE_LEAVES } WT_CACHE_OP; +#define WT_LAS_FILE_MIN (100 * WT_MEGABYTE) #define WT_LAS_NUM_SESSIONS 5 #define WT_LAS_SWEEP_ENTRIES (20 * WT_THOUSAND) #define WT_LAS_SWEEP_SEC 2 @@ -171,7 +172,7 @@ struct __wt_cache { * Score of how aggressive eviction should be about selecting eviction * candidates. If eviction is struggling to make progress, this score * rises (up to a maximum of 100), at which point the cache is "stuck" - * and transaction will be rolled back. + * and transactions will be rolled back. 
*/ uint32_t evict_aggressive_score; @@ -251,11 +252,12 @@ struct __wt_cache { /* AUTOMATIC FLAG VALUE GENERATION START */ #define WT_CACHE_EVICT_CLEAN 0x01u /* Evict clean pages */ #define WT_CACHE_EVICT_CLEAN_HARD 0x02u /* Clean % blocking app threads */ -#define WT_CACHE_EVICT_DIRTY 0x04u /* Evict dirty pages */ -#define WT_CACHE_EVICT_DIRTY_HARD 0x08u /* Dirty % blocking app threads */ -#define WT_CACHE_EVICT_LOOKASIDE 0x10u /* Try lookaside eviction */ -#define WT_CACHE_EVICT_SCRUB 0x20u /* Scrub dirty pages */ -#define WT_CACHE_EVICT_URGENT 0x40u /* Pages are in the urgent queue */ +#define WT_CACHE_EVICT_DEBUG_MODE 0x04u /* Aggressive debugging mode */ +#define WT_CACHE_EVICT_DIRTY 0x08u /* Evict dirty pages */ +#define WT_CACHE_EVICT_DIRTY_HARD 0x10u /* Dirty % blocking app threads */ +#define WT_CACHE_EVICT_LOOKASIDE 0x20u /* Try lookaside eviction */ +#define WT_CACHE_EVICT_SCRUB 0x40u /* Scrub dirty pages */ +#define WT_CACHE_EVICT_URGENT 0x80u /* Pages are in the urgent queue */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CACHE_EVICT_ALL (WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_DIRTY) uint32_t flags; @@ -290,3 +292,9 @@ struct __wt_cache_pool { /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint8_t flags; }; + +/* Flags used with __wt_evict */ +/* AUTOMATIC FLAG VALUE GENERATION START */ +#define WT_EVICT_CALL_CLOSING 0x1u /* Closing connection or tree */ +#define WT_EVICT_CALL_NO_SPLIT 0x2u /* Splits not allowed */ +/* AUTOMATIC FLAG VALUE GENERATION STOP */ diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h new file mode 100644 index 00000000000..5e079a613ad --- /dev/null +++ b/src/third_party/wiredtiger/src/include/cell.h @@ -0,0 +1,162 @@ +/*- + * Copyright (c) 2014-2019 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * WT_CELL -- + * Variable-length cell type. 
+ * + * Pages containing variable-length keys or values data (the WT_PAGE_ROW_INT, + * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types), have + * cells after the page header. + * + * There are 4 basic cell types: keys and data (each of which has an overflow + * form), deleted cells and off-page references. The cell is usually followed + * by additional data, varying by type: a key or data cell is followed by a set + * of bytes, an address cookie follows overflow or off-page cells. + * + * Deleted cells are place-holders for column-store files, where entries cannot + * be removed in order to preserve the record count. + * + * Here's the cell use by page type: + * + * WT_PAGE_ROW_INT (row-store internal page): + * Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL + * cell followed by a WT_CELL_ADDR_XXX cell). + * + * WT_PAGE_ROW_LEAF (row-store leaf page): + * Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell, + * normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell). + * + * WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single + * byte count immediately following the cell. + * + * WT_PAGE_COL_INT (Column-store internal page): + * Off-page references (a WT_CELL_ADDR_XXX cell). + * + * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells): + * Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted + * cells (a WT_CELL_DEL cell). + * + * Each cell starts with a descriptor byte: + * + * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell + * carrying data less than 64B, where we can store the data length in the cell + * descriptor byte): + * 0x00 Not a short key/data cell + * 0x01 Short key cell + * 0x10 Short key cell, with a following prefix-compression byte + * 0x11 Short value cell + * In these cases, the other 6 bits of the descriptor byte are the data length. 
+ * + * Bit 3 marks an 8B packed, uint64_t value following the cell description byte. + * (A run-length counter or a record number for variable-length column store.) + * + * Bit 4 is unused. + * + * Bits 5-8 are cell "types". + */ +#define WT_CELL_KEY_SHORT 0x01 /* Short key */ +#define WT_CELL_KEY_SHORT_PFX 0x02 /* Short key with prefix byte */ +#define WT_CELL_VALUE_SHORT 0x03 /* Short data */ +#define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U) + +#define WT_CELL_SHORT_MAX 63 /* Maximum short key/value */ +#define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */ + +#define WT_CELL_64V 0x04 /* Associated value */ + +/* + * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a + * backward compatible way by adding bit 4 to the type mask and adding new types + * that incorporate it. + */ +#define WT_CELL_UNUSED_BIT4 0x08 /* Unused */ + +/* + * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf + * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the + * page has no overflow items. (The goal is to speed up truncation as we don't + * have to read pages without overflow items in order to delete them. Note, + * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without + * overflow items, the only guarantee is that if set, the page has no overflow + * items.) + * + * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting + * value dictionaries: if the two values are the same, we only store them once + * and have the second and subsequent use reference the original. 
+ */ +#define WT_CELL_ADDR_DEL (0) /* Address: deleted */ +#define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */ +#define WT_CELL_ADDR_LEAF (2 << 4) /* Address: leaf */ +#define WT_CELL_ADDR_LEAF_NO (3 << 4) /* Address: leaf no overflow */ +#define WT_CELL_DEL (4 << 4) /* Deleted value */ +#define WT_CELL_KEY (5 << 4) /* Key */ +#define WT_CELL_KEY_OVFL (6 << 4) /* Overflow key */ +#define WT_CELL_KEY_OVFL_RM (12 << 4) /* Overflow key (removed) */ +#define WT_CELL_KEY_PFX (7 << 4) /* Key with prefix byte */ +#define WT_CELL_VALUE (8 << 4) /* Value */ +#define WT_CELL_VALUE_COPY (9 << 4) /* Value copy */ +#define WT_CELL_VALUE_OVFL (10 << 4) /* Overflow value */ +#define WT_CELL_VALUE_OVFL_RM (11 << 4) /* Overflow value (removed) */ + +#define WT_CELL_TYPE_MASK (0x0fU << 4) /* Maximum 16 cell types */ +#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK) + +/* + * When we aren't able to create a short key or value (and, in the case of a + * value, there's no associated RLE), the key or value is at least 64B, else + * we'd have been able to store it as a short cell. Decrement/Increment the + * size before storing it, in the hopes that relatively small key/value sizes + * will pack into a single byte instead of two bytes. + */ +#define WT_CELL_SIZE_ADJUST 64 + +/* + * WT_CELL -- + * Variable-length, on-page cell header. + */ +struct __wt_cell { + /* + * Maximum of 16 bytes: + * 1: cell descriptor byte + * 1: prefix compression count + * 9: associated 64-bit value (uint64_t encoding, max 9 bytes) + * 5: data length (uint32_t encoding, max 5 bytes) + * + * This calculation is pessimistic: the prefix compression count and + * 64V value overlap, the 64V value and data length are optional. + */ + uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE]; +}; + +/* + * WT_CELL_UNPACK -- + * Unpacked cell. + */ +struct __wt_cell_unpack { + WT_CELL *cell; /* Cell's disk image address */ + + uint64_t v; /* RLE count or recno */ + + /* + * !!! 
+ * The size and __len fields are reasonably type size_t; don't change + * the type, performance drops significantly if they're type size_t. + */ + const void *data; /* Data */ + uint32_t size; /* Data size */ + + uint32_t __len; /* Cell + data length (usually) */ + + uint8_t prefix; /* Cell prefix length */ + + uint8_t raw; /* Raw cell type (include "shorts") */ + uint8_t type; /* Cell type */ + + uint8_t ovfl; /* boolean: cell is an overflow */ +}; diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i index f518acfcbb0..c807737c494 100644 --- a/src/third_party/wiredtiger/src/include/cell.i +++ b/src/third_party/wiredtiger/src/include/cell.i @@ -7,161 +7,6 @@ */ /* - * WT_CELL -- - * Variable-length cell type. - * - * Pages containing variable-length keys or values data (the WT_PAGE_ROW_INT, - * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types), have - * cells after the page header. - * - * There are 4 basic cell types: keys and data (each of which has an overflow - * form), deleted cells and off-page references. The cell is usually followed - * by additional data, varying by type: a key or data cell is followed by a set - * of bytes, an address cookie follows overflow or off-page cells. - * - * Deleted cells are place-holders for column-store files, where entries cannot - * be removed in order to preserve the record count. - * - * Here's the cell use by page type: - * - * WT_PAGE_ROW_INT (row-store internal page): - * Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL - * cell followed by a WT_CELL_ADDR_XXX cell). - * - * WT_PAGE_ROW_LEAF (row-store leaf page): - * Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell, - * normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell). - * - * WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single - * byte count immediately following the cell. 
- * - * WT_PAGE_COL_INT (Column-store internal page): - * Off-page references (a WT_CELL_ADDR_XXX cell). - * - * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells): - * Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted - * cells (a WT_CELL_DEL cell). - * - * Each cell starts with a descriptor byte: - * - * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell - * carrying data less than 64B, where we can store the data length in the cell - * descriptor byte): - * 0x00 Not a short key/data cell - * 0x01 Short key cell - * 0x10 Short key cell, with a following prefix-compression byte - * 0x11 Short value cell - * In these cases, the other 6 bits of the descriptor byte are the data length. - * - * Bit 3 marks an 8B packed, uint64_t value following the cell description byte. - * (A run-length counter or a record number for variable-length column store.) - * - * Bit 4 is unused. - * - * Bits 5-8 are cell "types". - */ -#define WT_CELL_KEY_SHORT 0x01 /* Short key */ -#define WT_CELL_KEY_SHORT_PFX 0x02 /* Short key with prefix byte */ -#define WT_CELL_VALUE_SHORT 0x03 /* Short data */ -#define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U) - -#define WT_CELL_SHORT_MAX 63 /* Maximum short key/value */ -#define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */ - -#define WT_CELL_64V 0x04 /* Associated value */ - -/* - * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a - * backward compatible way by adding bit 4 to the type mask and adding new types - * that incorporate it. - */ -#define WT_CELL_UNUSED_BIT4 0x08 /* Unused */ - -/* - * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf - * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the - * page has no overflow items. (The goal is to speed up truncation as we don't - * have to read pages without overflow items in order to delete them. 
Note, - * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without - * overflow items, the only guarantee is that if set, the page has no overflow - * items.) - * - * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting - * value dictionaries: if the two values are the same, we only store them once - * and have the second and subsequent use reference the original. - */ -#define WT_CELL_ADDR_DEL (0) /* Address: deleted */ -#define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */ -#define WT_CELL_ADDR_LEAF (2 << 4) /* Address: leaf */ -#define WT_CELL_ADDR_LEAF_NO (3 << 4) /* Address: leaf no overflow */ -#define WT_CELL_DEL (4 << 4) /* Deleted value */ -#define WT_CELL_KEY (5 << 4) /* Key */ -#define WT_CELL_KEY_OVFL (6 << 4) /* Overflow key */ -#define WT_CELL_KEY_OVFL_RM (12 << 4) /* Overflow key (removed) */ -#define WT_CELL_KEY_PFX (7 << 4) /* Key with prefix byte */ -#define WT_CELL_VALUE (8 << 4) /* Value */ -#define WT_CELL_VALUE_COPY (9 << 4) /* Value copy */ -#define WT_CELL_VALUE_OVFL (10 << 4) /* Overflow value */ -#define WT_CELL_VALUE_OVFL_RM (11 << 4) /* Overflow value (removed) */ - -#define WT_CELL_TYPE_MASK (0x0fU << 4) /* Maximum 16 cell types */ -#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK) - -/* - * When we aren't able to create a short key or value (and, in the case of a - * value, there's no associated RLE), the key or value is at least 64B, else - * we'd have been able to store it as a short cell. Decrement/Increment the - * size before storing it, in the hopes that relatively small key/value sizes - * will pack into a single byte instead of two bytes. - */ -#define WT_CELL_SIZE_ADJUST 64 - -/* - * WT_CELL -- - * Variable-length, on-page cell header. 
- */ -struct __wt_cell { - /* - * Maximum of 16 bytes: - * 1: cell descriptor byte - * 1: prefix compression count - * 9: associated 64-bit value (uint64_t encoding, max 9 bytes) - * 5: data length (uint32_t encoding, max 5 bytes) - * - * This calculation is pessimistic: the prefix compression count and - * 64V value overlap, the 64V value and data length are optional. - */ - uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE]; -}; - -/* - * WT_CELL_UNPACK -- - * Unpacked cell. - */ -struct __wt_cell_unpack { - WT_CELL *cell; /* Cell's disk image address */ - - uint64_t v; /* RLE count or recno */ - - /* - * !!! - * The size and __len fields are reasonably type size_t; don't change - * the type, performance drops significantly if they're type size_t. - */ - const void *data; /* Data */ - uint32_t size; /* Data size */ - - uint32_t __len; /* Cell + data length (usually) */ - - uint8_t prefix; /* Cell prefix length */ - - uint8_t raw; /* Raw cell type (include "shorts") */ - uint8_t type; /* Cell type */ - - uint8_t ovfl; /* boolean: cell is an overflow */ -}; - -/* * WT_CELL_FOREACH -- * Walk the cells on a page. */ diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index a23434ea9e2..73ac6c85522 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -193,6 +193,9 @@ struct __wt_connection_impl { WT_SPINLOCK optrack_map_spinlock; /* Translation file spinlock. */ uintmax_t optrack_pid; /* Cache the process ID. */ + WT_LSN *debug_ckpt; /* Debug mode checkpoint LSNs. 
*/ + uint32_t debug_ckpt_cnt;/* Checkpoint retention number */ + void **foc; /* Free-on-close array */ size_t foc_cnt; /* Array entries */ size_t foc_size; /* Array size */ @@ -321,15 +324,16 @@ struct __wt_connection_impl { /* AUTOMATIC FLAG VALUE GENERATION START */ #define WT_CONN_LOG_ARCHIVE 0x001u /* Archive is enabled */ -#define WT_CONN_LOG_DOWNGRADED 0x002u /* Running older version */ -#define WT_CONN_LOG_ENABLED 0x004u /* Logging is enabled */ -#define WT_CONN_LOG_EXISTED 0x008u /* Log files found */ -#define WT_CONN_LOG_FORCE_DOWNGRADE 0x010u /* Force downgrade */ -#define WT_CONN_LOG_RECOVER_DIRTY 0x020u /* Recovering unclean */ -#define WT_CONN_LOG_RECOVER_DONE 0x040u /* Recovery completed */ -#define WT_CONN_LOG_RECOVER_ERR 0x080u /* Error if recovery required */ -#define WT_CONN_LOG_RECOVER_FAILED 0x100u /* Recovery failed */ -#define WT_CONN_LOG_ZERO_FILL 0x200u /* Manually zero files */ +#define WT_CONN_LOG_DEBUG_MODE 0x002u /* Debug-mode logging enabled */ +#define WT_CONN_LOG_DOWNGRADED 0x004u /* Running older version */ +#define WT_CONN_LOG_ENABLED 0x008u /* Logging is enabled */ +#define WT_CONN_LOG_EXISTED 0x010u /* Log files found */ +#define WT_CONN_LOG_FORCE_DOWNGRADE 0x020u /* Force downgrade */ +#define WT_CONN_LOG_RECOVER_DIRTY 0x040u /* Recovering unclean */ +#define WT_CONN_LOG_RECOVER_DONE 0x080u /* Recovery completed */ +#define WT_CONN_LOG_RECOVER_ERR 0x100u /* Error if recovery required */ +#define WT_CONN_LOG_RECOVER_FAILED 0x200u /* Recovery failed */ +#define WT_CONN_LOG_ZERO_FILL 0x400u /* Manually zero files */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t log_flags; /* Global logging configuration */ WT_CONDVAR *log_cond; /* Log server wait mutex */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index aa313fa2caf..1ecfaf6eef6 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -202,9 +202,10 
@@ extern int __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, WT_ extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_config(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_las_empty(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_stats_update(WT_SESSION_IMPL *session); -extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_create(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); @@ -254,6 +255,7 @@ extern int __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval, extern int __wt_conn_remove_encryptor(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_extractor_config(WT_SESSION_IMPL *session, const char *uri, const char *config, WT_EXTRACTOR **extractorp, int *ownp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_conn_remove_extractor(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_debug_mode_config(WT_SESSION_IMPL *session, const char *cfg[]) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_verbose_dump_sessions(WT_SESSION_IMPL *session, bool show_cursors) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_timing_stress_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -386,11 +388,11 @@ extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v); extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session); extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing, uint32_t previous_state) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t previous_state, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session); extern int __wt_log_printf(WT_SESSION_IMPL *session, const char *format, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); +extern void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckpt_lsn); extern int __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn); extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -446,6 +448,9 @@ extern int __wt_logop_checkpoint_start_print(WT_SESSION_IMPL *session, const uin extern int __wt_logop_prev_lsn_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *prev_lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_logop_prev_lsn_unpack(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *prev_lsnp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_logop_prev_lsn_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_logop_txn_timestamp_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec, uint64_t time_sec, uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_ts, uint64_t prepare_ts, uint64_t read_ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_logop_txn_timestamp_unpack(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint64_t *time_secp, uint64_t *time_nsecp, uint64_t *commit_tsp, uint64_t *durable_tsp, uint64_t *first_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_logop_txn_timestamp_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_op_printlog(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, 
WT_TXN_PRINTLOG_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -597,6 +602,21 @@ extern int __wt_ext_unpack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT extern int __wt_ext_unpack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t *ip) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ext_unpack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char **sp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ext_unpack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t *up) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_child_modify(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, bool *hazardp, WT_CHILD_STATE *statep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_col_fix_slvg(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_col_var(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF 
*pageref, WT_SALVAGE_COOKIE *salvage) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r); +extern void __wt_rec_dictionary_reset(WT_RECONCILE *r); +extern int __wt_rec_dictionary_lookup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *val, WT_REC_DICTIONARY **dpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_row_leaf(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page); @@ -605,14 +625,16 @@ extern int __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const ui extern void __wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, bool *upd_savedp, WT_UPDATE **updp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int 
__wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_split_page_size(int split_pct, uint32_t maxpagesize, uint32_t allocsize); +extern int __wt_rec_split_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint64_t max) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_split_crossing_bnd(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv, uint8_t type, uint64_t rle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_schema_alter(WT_SESSION_IMPL *session, const char *uri, const char *newcfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int 
__wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -840,6 +862,7 @@ extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op); extern int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_ts_log(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_checkpoint_log(WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_truncate_log(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_txn_truncate_end(WT_SESSION_IMPL *session); @@ -861,7 +884,7 @@ extern int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *c extern int __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t ts, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_parse_prepare_timestamp(WT_SESSION_IMPL *session, const char *cfg[], wt_timestamp_t *timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[], bool *set_tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session); extern void __wt_txn_clear_commit_timestamp(WT_SESSION_IMPL *session); extern void __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session); diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index 463f92a34a3..e7e49b8b0ce 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -21,6 +21,9 @@ #define WT_LOG_SYNC_ENABLED 0x10u /* AUTOMATIC FLAG VALUE GENERATION STOP */ +#define WT_LOGOP_IGNORE 0x80000000 +#define WT_LOGOP_IS_IGNORED(val) (val & WT_LOGOP_IGNORE) + /* * WT_LSN -- * A log sequence number, representing a position in the transaction log. diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h index e221cad1481..e4b369f736d 100644 --- a/src/third_party/wiredtiger/src/include/meta.h +++ b/src/third_party/wiredtiger/src/include/meta.h @@ -27,6 +27,7 @@ #define WT_METAFILE_SLVG "WiredTiger.wt.orig" /* Metadata copy */ #define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */ +#define WT_LAS_FILE "WiredTigerLAS.wt" /* Lookaside table */ #define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI*/ #define WT_SYSTEM_PREFIX "system:" /* System URI prefix */ diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h new file mode 100644 index 00000000000..fdb47f3d3d9 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -0,0 +1,294 @@ +/*- + * Copyright (c) 2014-2019 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +/* + * Reconciliation is the process of taking an in-memory page, walking each entry + * in the page, building a backing disk image in a temporary buffer representing + * that information, and writing that buffer to disk. What could be simpler? + * + * WT_RECONCILE -- + * Information tracking a single page reconciliation. + */ +typedef struct { + WT_REF *ref; /* Page being reconciled */ + WT_PAGE *page; + uint32_t flags; /* Caller's configuration */ + + /* + * Track start/stop write generation to decide if all changes to the + * page are written. + */ + uint32_t orig_write_gen; + + /* + * Track start/stop checkpoint generations to decide if lookaside table + * records are correct. + */ + uint64_t orig_btree_checkpoint_gen; + uint64_t orig_txn_checkpoint_gen; + + /* + * Track the oldest running transaction and whether to skew lookaside + * to the newest update. + */ + bool las_skew_newest; + uint64_t last_running; + + /* Track the page's min/maximum transactions. */ + uint64_t max_txn; + wt_timestamp_t max_timestamp; + + /* Lookaside boundary tracking. */ + uint64_t unstable_txn; + wt_timestamp_t unstable_timestamp; + + u_int updates_seen; /* Count of updates seen. */ + u_int updates_unstable; /* Count of updates not visible_all. */ + + bool update_uncommitted; /* An update was uncommitted */ + bool update_used; /* An update could be used */ + + /* + * When we can't mark the page clean (for example, checkpoint found some + * uncommitted updates), there's a leave-dirty flag. + */ + bool leave_dirty; + + /* + * Track if reconciliation has seen any overflow items. If a leaf page + * with no overflow items is written, the parent page's address cell is + * set to the leaf-no-overflow type. This means we can delete the leaf + * page without reading it because we don't have to discard any overflow + * items it might reference. 
+ * + * The test is per-page reconciliation, that is, once we see an + * overflow item on the page, all subsequent leaf pages written for the + * page will not be leaf-no-overflow type, regardless of whether or not + * they contain overflow items. In other words, leaf-no-overflow is not + * guaranteed to be set on every page that doesn't contain an overflow + * item, only that if it is set, the page contains no overflow items. + * XXX + * This was originally done because raw compression couldn't do better, + * now that raw compression has been removed, we should do better. + */ + bool ovfl_items; + + /* + * Track if reconciliation of a row-store leaf page has seen empty (zero + * length) values. We don't write out anything for empty values, so if + * there are empty values on a page, we have to make two passes over the + * page when it's read to figure out how many keys it has, expensive in + * the common case of no empty values and (entries / 2) keys. Likewise, + * a page with only empty values is another common data set, and keys on + * that page will be equal to the number of entries. In both cases, set + * a flag in the page's on-disk header. + * + * The test is per-page reconciliation as described above for the + * overflow-item test. + */ + bool all_empty_value, any_empty_value; + + /* + * Reconciliation gets tricky if we have to split a page, which happens + * when the disk image we create exceeds the page type's maximum disk + * image size. + * + * First, the target size of the page we're building. + */ + uint32_t page_size; /* Page size */ + + /* + * Second, the split size: if we're doing the page layout, split to a + * smaller-than-maximum page size when a split is required so we don't + * repeatedly split a packed page. + */ + uint32_t split_size; /* Split page size */ + uint32_t min_split_size; /* Minimum split page size */ + + /* + * We maintain two split chunks in the memory during reconciliation to + * be written out as pages. 
As we get to the end of the data, if the + * last one turns out to be smaller than the minimum split size, we go + * back into the penultimate chunk and split at this minimum split size + * boundary. This moves some data from the penultimate chunk to the last + * chunk, hence increasing the size of the last page written without + * decreasing the penultimate page size beyond the minimum split size. + * For this reason, we maintain an expected split percentage boundary + * and a minimum split percentage boundary. + * + * Chunks are referenced by current and previous pointers. In case of a + * split, previous references the first chunk and current switches to + * the second chunk. If reconciliation generates more split chunks, the + * previous chunk is written to the disk and current and previous + * swap. + */ + struct __wt_rec_chunk { + /* + * The recno and entries fields are the starting record number + * of the split chunk (for column-store splits), and the number + * of entries in the split chunk. + * + * The key for a row-store page; no column-store key is needed + * because the page's recno, stored in the recno field, is the + * column-store key. + */ + uint32_t entries; + uint64_t recno; + WT_ITEM key; + + uint32_t min_entries; + uint64_t min_recno; + WT_ITEM min_key; + + /* Minimum split-size boundary buffer offset. */ + size_t min_offset; + + WT_ITEM image; /* disk-image */ + } chunkA, chunkB, *cur_ptr, *prev_ptr; + + /* + * We track current information about the current record number, the + * number of entries copied into the disk image buffer, where we are + * in the buffer, and how much memory remains. Those values are + * packaged here rather than passing pointers to stack locations + * around the code. 
+ */ + uint64_t recno; /* Current record number */ + uint32_t entries; /* Current number of entries */ + uint8_t *first_free; /* Current first free byte */ + size_t space_avail; /* Remaining space in this chunk */ + /* Remaining space in this chunk to put a minimum size boundary */ + size_t min_space_avail; + + /* + * Saved update list, supporting the WT_REC_UPDATE_RESTORE and + * WT_REC_LOOKASIDE configurations. While reviewing updates for each + * page, we save WT_UPDATE lists here, and then move them to per-block + * areas as the blocks are defined. + */ + WT_SAVE_UPD *supd; /* Saved updates */ + uint32_t supd_next; + size_t supd_allocated; + size_t supd_memsize; /* Size of saved update structures */ + + /* List of pages we've written so far. */ + WT_MULTI *multi; + uint32_t multi_next; + size_t multi_allocated; + + /* + * Root pages are written when wrapping up the reconciliation, remember + * the image we're going to write. + */ + WT_ITEM *wrapup_checkpoint; + bool wrapup_checkpoint_compressed; + + /* + * We don't need to keep the 0th key around on internal pages, the + * search code ignores them as nothing can sort less by definition. + * There's some trickiness here, see the code for comments on how + * these fields work. + */ + bool cell_zero; /* Row-store internal page 0th key */ + + /* + * We calculate checksums to find previously written identical blocks, + * but once a match fails during an eviction, there's no point trying + * again. + */ + bool evict_matching_checksum_failed; + + /* + * WT_REC_DICTIONARY -- + * We optionally build a dictionary of values for leaf pages. Where + * two value cells are identical, only write the value once, the second + * and subsequent copies point to the original cell. The dictionary is + * fixed size, but organized in a skip-list to make searches faster. 
+ */ + struct __wt_rec_dictionary { + uint64_t hash; /* Hash value */ + uint32_t offset; /* Matching cell */ + + u_int depth; /* Skiplist */ + WT_REC_DICTIONARY *next[0]; + } **dictionary; /* Dictionary */ + u_int dictionary_next, dictionary_slots; /* Next, max entries */ + /* Skiplist head. */ + WT_REC_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH]; + + /* + * WT_REC_KV-- + * An on-page key/value item we're building. + */ + struct __wt_rec_kv { + WT_ITEM buf; /* Data */ + WT_CELL cell; /* Cell and cell's length */ + size_t cell_len; + size_t len; /* Total length of cell + data */ + } k, v; /* Key/Value being built */ + + WT_ITEM *cur, _cur; /* Key/Value being built */ + WT_ITEM *last, _last; /* Last key/value built */ + + bool key_pfx_compress; /* If can prefix-compress next key */ + bool key_pfx_compress_conf; /* If prefix compression configured */ + bool key_sfx_compress; /* If can suffix-compress next key */ + bool key_sfx_compress_conf; /* If suffix compression configured */ + + bool is_bulk_load; /* If it's a bulk load */ + + WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */ + + bool cache_write_lookaside; /* Used the lookaside table */ + bool cache_write_restore; /* Used update/restoration */ + + uint32_t tested_ref_state; /* Debugging information */ + + /* + * XXX + * In the case of a modified update, we may need a copy of the current + * value as a set of bytes. We call back into the btree code using a + * fake cursor to do that work. This a layering violation and fragile, + * we need a better solution. + */ + WT_CURSOR_BTREE update_modify_cbt; +} WT_RECONCILE; + +/* + * WT_CHILD_RELEASE, WT_CHILD_RELEASE_ERR -- + * Macros to clean up during internal-page reconciliation, releasing the + * hazard pointer we're holding on child pages. 
+ */ +#define WT_CHILD_RELEASE(session, hazard, ref) do { \ + if (hazard) { \ + (hazard) = false; \ + WT_TRET( \ + __wt_page_release(session, ref, WT_READ_NO_EVICT)); \ + } \ +} while (0) +#define WT_CHILD_RELEASE_ERR(session, hazard, ref) do { \ + WT_CHILD_RELEASE(session, hazard, ref); \ + WT_ERR(ret); \ +} while (0) + +typedef enum { + WT_CHILD_IGNORE, /* Ignored child */ + WT_CHILD_MODIFIED, /* Modified child */ + WT_CHILD_ORIGINAL, /* Original child */ + WT_CHILD_PROXY /* Deleted child: proxy */ +} WT_CHILD_STATE; + +/* + * Macros from fixed-length entries to/from bytes. + */ +#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \ + ((uint32_t)((((bytes) * 8) / (btree)->bitcnt))) +#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \ + ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8)) diff --git a/src/third_party/wiredtiger/src/include/reconcile.i b/src/third_party/wiredtiger/src/include/reconcile.i new file mode 100644 index 00000000000..b56b8dc1404 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/reconcile.i @@ -0,0 +1,257 @@ +/*- + * Copyright (c) 2014-2019 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +#define WT_CROSSING_MIN_BND(r, next_len) \ + ((r)->cur_ptr->min_offset == 0 && \ + (next_len) > (r)->min_space_avail) +#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail) +#define WT_CHECK_CROSSING_BND(r, next_len) \ + (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len)) + +/* + * __wt_rec_vtype -- + * Return a value cell's address type. + */ +static inline u_int +__wt_rec_vtype(WT_ADDR *addr) +{ + if (addr->type == WT_ADDR_INT) + return (WT_CELL_ADDR_INT); + if (addr->type == WT_ADDR_LEAF) + return (WT_CELL_ADDR_LEAF); + return (WT_CELL_ADDR_LEAF_NO); +} + +/* + * __wt_rec_need_split -- + * Check whether adding some bytes to the page requires a split. 
+ */ +static inline bool +__wt_rec_need_split(WT_RECONCILE *r, size_t len) +{ + /* + * In the case of a row-store leaf page, trigger a split if a threshold + * number of saved updates is reached. This allows pages to split for + * update/restore and lookaside eviction when there is no visible data + * causing the disk image to grow. + * + * In the case of small pages or large keys, we might try to split when + * a page has no updates or entries, which isn't possible. To consider + * update/restore or lookaside information, require either page entries + * or updates that will be attached to the image. The limit is one of + * either, but it doesn't make sense to create pages or images with few + * entries or updates, even where page sizes are small (especially as + * updates that will eventually become overflow items can throw off our + * calculations). Bound the combination at something reasonable. + */ + if (r->page->type == WT_PAGE_ROW_LEAF && r->entries + r->supd_next > 10) + len += r->supd_memsize; + + /* Check for the disk image crossing a boundary. */ + return (WT_CHECK_CROSSING_BND(r, len)); +} + +/* + * __wt_rec_incr -- + * Update the memory tracking structure for a set of new entries. + */ +static inline void +__wt_rec_incr( + WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) +{ + /* + * The buffer code is fragile and prone to off-by-one errors -- check + * for overflow in diagnostic mode. + */ + WT_ASSERT(session, r->space_avail >= size); + WT_ASSERT(session, WT_BLOCK_FITS(r->first_free, size, + r->cur_ptr->image.mem, r->cur_ptr->image.memsize)); + + r->entries += v; + r->space_avail -= size; + r->first_free += size; + + /* + * If offset for the minimum split size boundary is not set, we have not + * yet reached the minimum boundary, reduce the space available for it. 
+ */ + if (r->cur_ptr->min_offset == 0) { + if (r->min_space_avail >= size) + r->min_space_avail -= size; + else + r->min_space_avail = 0; + } +} + +/* + * __wt_rec_copy_incr -- + * Copy a key/value cell and buffer pair into the new image. + */ +static inline void +__wt_rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv) +{ + size_t len; + uint8_t *p, *t; + + /* + * If there's only one chunk of data to copy (because the cell and data + * are being copied from the original disk page), the cell length won't + * be set, the WT_ITEM data/length will reference the data to be copied. + * + * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do + * the copy in-line. + */ + for (p = r->first_free, + t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len) + *p++ = *t++; + + /* The data can be quite large -- call memcpy. */ + if (kv->buf.size != 0) + memcpy(p, kv->buf.data, kv->buf.size); + + WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size); + __wt_rec_incr(session, r, 1, kv->len); +} + +/* + * __wt_rec_cell_build_addr -- + * Process an address reference and return a cell structure to be stored + * on the page. + */ +static inline void +__wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, + const void *addr, size_t size, u_int cell_type, uint64_t recno) +{ + WT_REC_KV *val; + + val = &r->v; + + WT_ASSERT(session, size != 0 || cell_type == WT_CELL_ADDR_DEL); + + /* + * We don't check the address size because we can't store an address on + * an overflow page: if the address won't fit, the overflow page's + * address won't fit either. This possibility must be handled by Btree + * configuration, we have to disallow internal page sizes that are too + * small with respect to the largest address cookie the underlying block + * manager might return. + */ + + /* + * We don't copy the data into the buffer, it's not necessary; just + * re-point the buffer's data/length fields. 
+ */ + val->buf.data = addr; + val->buf.size = size; + val->cell_len = + __wt_cell_pack_addr(&val->cell, cell_type, recno, val->buf.size); + val->len = val->cell_len + val->buf.size; +} + +/* + * __wt_rec_cell_build_val -- + * Process a data item and return a WT_CELL structure and byte string to + * be stored on the page. + */ +static inline int +__wt_rec_cell_build_val(WT_SESSION_IMPL *session, + WT_RECONCILE *r, const void *data, size_t size, uint64_t rle) +{ + WT_BTREE *btree; + WT_REC_KV *val; + + btree = S2BT(session); + + val = &r->v; + + /* + * We don't copy the data into the buffer, it's not necessary; just + * re-point the buffer's data/length fields. + */ + val->buf.data = data; + val->buf.size = size; + + /* Handle zero-length cells quickly. */ + if (size != 0) { + /* Optionally compress the data using the Huffman engine. */ + if (btree->huffman_value != NULL) + WT_RET(__wt_huffman_encode( + session, btree->huffman_value, + val->buf.data, (uint32_t)val->buf.size, &val->buf)); + + /* Create an overflow object if the data won't fit. */ + if (val->buf.size > btree->maxleafvalue) { + WT_STAT_DATA_INCR(session, rec_overflow_value); + + return (__wt_rec_cell_build_ovfl( + session, r, val, WT_CELL_VALUE_OVFL, rle)); + } + } + val->cell_len = __wt_cell_pack_data(&val->cell, rle, val->buf.size); + val->len = val->cell_len + val->buf.size; + + return (0); +} + +/* + * __wt_rec_dict_replace -- + * Check for a dictionary match. + */ +static inline int +__wt_rec_dict_replace( + WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_REC_KV *val) +{ + WT_REC_DICTIONARY *dp; + uint64_t offset; + + /* + * We optionally create a dictionary of values and only write a unique + * value once per page, using a special "copy" cell for all subsequent + * copies of the value. We have to do the cell build and resolution at + * this low level because we need physical cell offsets for the page. 
+ * + * Sanity check: short-data cells can be smaller than dictionary-copy + * cells. If the data is already small, don't bother doing the work. + * This isn't just work avoidance: on-page cells can't grow as a result + * of writing a dictionary-copy cell, the reconciliation functions do a + * split-boundary test based on the size required by the value's cell; + * if we grow the cell after that test we'll potentially write off the + * end of the buffer's memory. + */ + if (val->buf.size <= WT_INTPACK32_MAXSIZE) + return (0); + WT_RET(__wt_rec_dictionary_lookup(session, r, val, &dp)); + if (dp == NULL) + return (0); + + /* + * If the dictionary offset isn't set, we're creating a new entry in the + * dictionary, set its location. + * + * If the dictionary offset is set, we have a matching value. Create a + * copy cell instead. + */ + if (dp->offset == 0) + dp->offset = WT_PTRDIFF32(r->first_free, r->cur_ptr->image.mem); + else { + /* + * The offset is the byte offset from this cell to the previous, + * matching cell, NOT the byte offset from the beginning of the + * page. 
+ */ + offset = (uint64_t)WT_PTRDIFF(r->first_free, + (uint8_t *)r->cur_ptr->image.mem + dp->offset); + val->len = val->cell_len = + __wt_cell_pack_copy(&val->cell, rle, offset); + val->buf.data = NULL; + val->buf.size = 0; + } + return (0); +} diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index a1fc065d263..b0e66d69743 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -371,6 +371,8 @@ struct __wt_connection_stats { int64_t cache_lookaside_score; int64_t cache_lookaside_entries; int64_t cache_lookaside_insert; + int64_t cache_lookaside_ondisk_max; + int64_t cache_lookaside_ondisk; int64_t cache_lookaside_remove; int64_t cache_eviction_checkpoint; int64_t cache_eviction_get_ref; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index a1f6634922a..928c3c13ad5 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -153,6 +153,8 @@ struct __wt_txn_global { WT_TXN_STATE checkpoint_state; /* Checkpoint's txn state */ wt_timestamp_t checkpoint_timestamp; /* Checkpoint's timestamp */ + volatile uint64_t debug_ops; /* Debug mode op counter */ + uint64_t debug_rollback; /* Debug mode rollback */ volatile uint64_t metadata_pinned; /* Oldest ID for metadata */ /* Named snapshot state. */ diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 1cd615fa3bd..de10e8c44b9 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -425,6 +425,42 @@ __wt_txn_op_apply_prepare_state( } /* + * __wt_txn_op_delete_commit_apply_timestamps -- + * Apply the correct start and durable timestamps to any + * updates in the page del update list. 
+ */ +static inline void +__wt_txn_op_delete_commit_apply_timestamps( + WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_TXN *txn; + WT_UPDATE **updp; + uint32_t previous_state; + + txn = &session->txn; + + /* + * Lock the ref to ensure we don't race with eviction freeing the page + * deleted update list or with a page instantiate. + */ + for (;; __wt_yield()) { + previous_state = ref->state; + WT_ASSERT(session, previous_state != WT_REF_READING); + if (previous_state != WT_REF_LOCKED && WT_REF_CAS_STATE( + session, ref, previous_state, WT_REF_LOCKED)) + break; + } + + for (updp = ref->page_del->update_list; + updp != NULL && *updp != NULL; ++updp) { + (*updp)->timestamp = txn->commit_timestamp; + } + + /* Unlock the page by setting it back to it's previous state */ + WT_REF_SET_STATE(ref, previous_state); +} + +/* * __wt_txn_op_set_timestamp -- * Decide whether to copy a commit timestamp into an update. If the op * structure doesn't have a populated update or ref field or in prepared @@ -471,6 +507,10 @@ __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op) &op->u.ref->page_del->timestamp : &op->u.op_upd->timestamp; if (*timestamp == 0) *timestamp = txn->commit_timestamp; + + if (op->type == WT_TXN_OP_REF_DELETE) + __wt_txn_op_delete_commit_apply_timestamps( + session, op->u.ref); } } @@ -1075,13 +1115,19 @@ static inline int __wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) { WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; bool ignore_prepare_set; txn = &session->txn; + txn_global = &S2C(session)->txn_global; if (txn->isolation != WT_ISO_SNAPSHOT) return (0); + if (txn_global->debug_rollback != 0 && + ++txn_global->debug_ops % txn_global->debug_rollback == 0) + return (__wt_txn_rollback_required(session, + "debug mode simulated conflict")); /* * Always include prepared transactions in this check: they are not * supposed to affect visibility for update operations. 
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 17bfb813151..2fe91e312e4 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -2196,6 +2196,16 @@ struct __wt_connection { * application thread will wait for space to be available in cache * before giving up. Default will wait forever., an integer greater * than or equal to 0; default \c 0.} + * @config{cache_overflow = (, cache overflow configuration options., a + * set of related configuration options defined below.} + * @config{ file_max, The maximum number of bytes + * that WiredTiger is allowed to use for its cache overflow mechanism. + * If the cache overflow file exceeds this size\, a panic will be + * triggered. The default value means that the cache overflow file is + * unbounded and may use as much space as the filesystem will + * accommodate. The minimum non-zero setting is 100MB., an integer + * greater than or equal to 0; default \c 0.} + * @config{ ),,} * @config{cache_overhead, assume the heap allocator overhead is the * specified percentage\, and adjust the cache usage by that amount (for * example\, if there is 10GB of data in cache\, a percentage of 10 @@ -2228,6 +2238,28 @@ struct __wt_connection { * @config{ release, compatibility release * version string., a string; default empty.} * @config{ ),,} + * @config{debug_mode = (, control the settings of various extended + * debugging features., a set of related configuration options defined + * below.} + * @config{ checkpoint_retention, adjust + * log archiving to retain the log records of this number of + * checkpoints. Zero or one means perform normal archiving., an integer + * between 0 and 1024; default \c 0.} + * @config{ eviction, if true\, modify internal + * algorithms to change skew to force lookaside eviction to happen more + * aggressively. 
This includes but is not limited to not skewing + * newest\, not favoring leaf pages\, and modifying the eviction score + * mechanism., a boolean flag; default \c false.} + * @config{ rollback_error, return a WT_ROLLBACK + * error from a transaction operation about every Nth operation to + * simulate a collision., an integer between 0 and 10M; default \c 0.} + * @config{ table_logging, if true\, write + * transaction related information to the log for all operations\, even + * operations for tables with logging turned off. This setting + * introduces a log format change that may break older versions of + * WiredTiger. These operations are informational and skipped in + * recovery., a boolean flag; default \c false.} + * @config{ ),,} * @config{error_prefix, prefix string for error messages., a string; * default empty.} * @config{eviction = (, eviction configuration options., a set of @@ -2795,6 +2827,15 @@ struct __wt_connection { * thread will wait for space to be available in cache before giving up. * Default will wait forever., an integer greater than or equal to 0; default \c * 0.} + * @config{cache_overflow = (, cache overflow configuration options., a set of + * related configuration options defined below.} + * @config{ file_max, The maximum number of bytes that + * WiredTiger is allowed to use for its cache overflow mechanism. If the cache + * overflow file exceeds this size\, a panic will be triggered. The default + * value means that the cache overflow file is unbounded and may use as much + * space as the filesystem will accommodate. 
The minimum non-zero setting is + * 100MB., an integer greater than or equal to 0; default \c 0.} + * @config{ ),,} * @config{cache_overhead, assume the heap allocator overhead is the specified * percentage\, and adjust the cache usage by that amount (for example\, if * there is 10GB of data in cache\, a percentage of 10 means WiredTiger treats @@ -2843,6 +2884,27 @@ struct __wt_connection { * true.} * @config{create, create the database if it does not exist., a boolean flag; * default \c false.} + * @config{debug_mode = (, control the settings of various extended debugging + * features., a set of related configuration options defined below.} + * @config{ checkpoint_retention, adjust log archiving to + * retain the log records of this number of checkpoints. Zero or one means + * perform normal archiving., an integer between 0 and 1024; default \c 0.} + * @config{ eviction, if true\, modify internal + * algorithms to change skew to force lookaside eviction to happen more + * aggressively. This includes but is not limited to not skewing newest\, not + * favoring leaf pages\, and modifying the eviction score mechanism., a boolean + * flag; default \c false.} + * @config{ rollback_error, + * return a WT_ROLLBACK error from a transaction operation about every Nth + * operation to simulate a collision., an integer between 0 and 10M; default \c + * 0.} + * @config{ table_logging, if true\, write + * transaction related information to the log for all operations\, even + * operations for tables with logging turned off. This setting introduces a log + * format change that may break older versions of WiredTiger. These operations + * are informational and skipped in recovery., a boolean flag; default \c + * false.} + * @config{ ),,} * @config{direct_io, Use \c O_DIRECT on POSIX systems\, and \c * FILE_FLAG_NO_BUFFERING on Windows to access files. Options are given as a * list\, such as <code>"direct_io=[data]"</code>. 
Configuring \c direct_io @@ -4921,6 +4983,12 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_LOGOP_COL_MODIFY 9 /*! row-store modify */ #define WT_LOGOP_ROW_MODIFY 10 +/* + * NOTE: Diagnostic-only log operations should have values in + * the ignore range. + */ +/*! Diagnostic: transaction timestamps */ +#define WT_LOGOP_TXN_TIMESTAMP (WT_LOGOP_IGNORE | 11) /*! @} */ /******************************************* @@ -5034,737 +5102,741 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_LOOKASIDE_ENTRIES 1045 /*! cache: cache overflow table insert calls */ #define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1046 +/*! cache: cache overflow table max on-disk size */ +#define WT_STAT_CONN_CACHE_LOOKASIDE_ONDISK_MAX 1047 +/*! cache: cache overflow table on-disk size */ +#define WT_STAT_CONN_CACHE_LOOKASIDE_ONDISK 1048 /*! cache: cache overflow table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1047 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1049 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1048 +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1050 /*! cache: eviction calls to get a page */ -#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1049 +#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1051 /*! cache: eviction calls to get a page found queue empty */ -#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1050 +#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1052 /*! cache: eviction calls to get a page found queue empty after locking */ -#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1051 +#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1053 /*! cache: eviction currently operating in aggressive mode */ -#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1052 +#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1054 /*! 
cache: eviction empty score */ -#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1053 +#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1055 /*! cache: eviction passes of a file */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK_PASSES 1054 +#define WT_STAT_CONN_CACHE_EVICTION_WALK_PASSES 1056 /*! cache: eviction server candidate queue empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1055 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1057 /*! cache: eviction server candidate queue not empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1056 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1058 /*! cache: eviction server evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1057 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1059 /*! * cache: eviction server slept, because we did not make progress with * eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1058 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1060 /*! cache: eviction server unable to reach eviction goal */ -#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1059 +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1061 /*! cache: eviction state */ -#define WT_STAT_CONN_CACHE_EVICTION_STATE 1060 +#define WT_STAT_CONN_CACHE_EVICTION_STATE 1062 /*! cache: eviction walk target pages histogram - 0-9 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1061 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1063 /*! cache: eviction walk target pages histogram - 10-31 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1062 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1064 /*! cache: eviction walk target pages histogram - 128 and higher */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1063 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1065 /*! 
cache: eviction walk target pages histogram - 32-63 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1064 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1066 /*! cache: eviction walk target pages histogram - 64-128 */ -#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1065 +#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1067 /*! cache: eviction walks abandoned */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1066 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1068 /*! cache: eviction walks gave up because they restarted their walk twice */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1067 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1069 /*! * cache: eviction walks gave up because they saw too many pages and * found no candidates */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1068 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1070 /*! * cache: eviction walks gave up because they saw too many pages and * found too few candidates */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1069 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1071 /*! cache: eviction walks reached end of tree */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1070 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1072 /*! cache: eviction walks started from root of tree */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1071 +#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1073 /*! cache: eviction walks started from saved location in tree */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1072 +#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1074 /*! cache: eviction worker thread active */ -#define WT_STAT_CONN_CACHE_EVICTION_ACTIVE_WORKERS 1073 +#define WT_STAT_CONN_CACHE_EVICTION_ACTIVE_WORKERS 1075 /*! cache: eviction worker thread created */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_CREATED 1074 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_CREATED 1076 /*! 
cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1075 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1077 /*! cache: eviction worker thread removed */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_REMOVED 1076 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_REMOVED 1078 /*! cache: eviction worker thread stable number */ -#define WT_STAT_CONN_CACHE_EVICTION_STABLE_STATE_WORKERS 1077 +#define WT_STAT_CONN_CACHE_EVICTION_STABLE_STATE_WORKERS 1079 /*! * cache: failed eviction of pages that exceeded the in-memory maximum * count */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1078 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1080 /*! * cache: failed eviction of pages that exceeded the in-memory maximum * time (usecs) */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL_TIME 1079 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL_TIME 1081 /*! cache: files with active eviction walks */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1080 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1082 /*! cache: files with new eviction walks started */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1081 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1083 /*! cache: force re-tuning of eviction workers once in a while */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_RETUNE 1082 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_RETUNE 1084 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1083 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1085 /*! cache: hazard pointer check calls */ -#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1084 +#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1086 /*! cache: hazard pointer check entries walked */ -#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1085 +#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1087 /*! cache: hazard pointer maximum array length */ -#define WT_STAT_CONN_CACHE_HAZARD_MAX 1086 +#define WT_STAT_CONN_CACHE_HAZARD_MAX 1088 /*! 
cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1087 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1089 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1088 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1090 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1089 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1091 /*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1090 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1092 /*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1091 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1093 /*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1092 +#define WT_STAT_CONN_CACHE_BYTES_MAX 1094 /*! cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1093 +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1095 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1094 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1096 /*! cache: modified pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1095 +#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1097 /*! cache: operations timed out waiting for space in cache */ -#define WT_STAT_CONN_CACHE_TIMED_OUT_OPS 1096 +#define WT_STAT_CONN_CACHE_TIMED_OUT_OPS 1098 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1097 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1099 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1098 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1100 /*! cache: page written requiring cache overflow records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1099 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1101 /*! 
cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1100 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1102 /*! cache: pages evicted because they exceeded the in-memory maximum count */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1101 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1103 /*! * cache: pages evicted because they exceeded the in-memory maximum time * (usecs) */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1102 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1104 /*! cache: pages evicted because they had chains of deleted items count */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1103 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1105 /*! * cache: pages evicted because they had chains of deleted items time * (usecs) */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1104 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1106 /*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1105 +#define WT_STAT_CONN_CACHE_EVICTION_APP 1107 /*! cache: pages queued for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1106 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1108 /*! cache: pages queued for urgent eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1107 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1109 /*! cache: pages queued for urgent eviction during walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1108 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1110 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1109 +#define WT_STAT_CONN_CACHE_READ 1111 /*! cache: pages read into cache after truncate */ -#define WT_STAT_CONN_CACHE_READ_DELETED 1110 +#define WT_STAT_CONN_CACHE_READ_DELETED 1112 /*! cache: pages read into cache after truncate in prepare state */ -#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1111 +#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1113 /*! 
cache: pages read into cache requiring cache overflow entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1112 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1114 /*! cache: pages read into cache requiring cache overflow for checkpoint */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_CHECKPOINT 1113 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_CHECKPOINT 1115 /*! cache: pages read into cache skipping older cache overflow entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1114 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1116 /*! * cache: pages read into cache with skipped cache overflow entries * needed later */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1115 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1117 /*! * cache: pages read into cache with skipped cache overflow entries * needed later by checkpoint */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY_CHECKPOINT 1116 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY_CHECKPOINT 1118 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1117 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1119 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1118 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1120 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1119 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1121 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1120 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1122 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1121 +#define WT_STAT_CONN_CACHE_WRITE 1123 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1122 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1124 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1123 +#define WT_STAT_CONN_CACHE_OVERHEAD 1125 /*! 
cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1124 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1126 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1125 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1127 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1126 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1128 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1127 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1129 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1128 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1130 /*! capacity: background fsync file handles considered */ -#define WT_STAT_CONN_FSYNC_ALL_FH_TOTAL 1129 +#define WT_STAT_CONN_FSYNC_ALL_FH_TOTAL 1131 /*! capacity: background fsync file handles synced */ -#define WT_STAT_CONN_FSYNC_ALL_FH 1130 +#define WT_STAT_CONN_FSYNC_ALL_FH 1132 /*! capacity: background fsync time (msecs) */ -#define WT_STAT_CONN_FSYNC_ALL_TIME 1131 +#define WT_STAT_CONN_FSYNC_ALL_TIME 1133 /*! capacity: threshold to call fsync */ -#define WT_STAT_CONN_CAPACITY_THRESHOLD 1132 +#define WT_STAT_CONN_CAPACITY_THRESHOLD 1134 /*! capacity: throttled bytes read */ -#define WT_STAT_CONN_CAPACITY_BYTES_READ 1133 +#define WT_STAT_CONN_CAPACITY_BYTES_READ 1135 /*! capacity: throttled bytes written for checkpoint */ -#define WT_STAT_CONN_CAPACITY_BYTES_CKPT 1134 +#define WT_STAT_CONN_CAPACITY_BYTES_CKPT 1136 /*! capacity: throttled bytes written for eviction */ -#define WT_STAT_CONN_CAPACITY_BYTES_EVICT 1135 +#define WT_STAT_CONN_CAPACITY_BYTES_EVICT 1137 /*! capacity: throttled bytes written for log */ -#define WT_STAT_CONN_CAPACITY_BYTES_LOG 1136 +#define WT_STAT_CONN_CAPACITY_BYTES_LOG 1138 /*! capacity: throttled bytes written total */ -#define WT_STAT_CONN_CAPACITY_BYTES_WRITTEN 1137 +#define WT_STAT_CONN_CAPACITY_BYTES_WRITTEN 1139 /*! 
capacity: time waiting due to total capacity (usecs) */ -#define WT_STAT_CONN_CAPACITY_TIME_TOTAL 1138 +#define WT_STAT_CONN_CAPACITY_TIME_TOTAL 1140 /*! capacity: time waiting during checkpoint (usecs) */ -#define WT_STAT_CONN_CAPACITY_TIME_CKPT 1139 +#define WT_STAT_CONN_CAPACITY_TIME_CKPT 1141 /*! capacity: time waiting during eviction (usecs) */ -#define WT_STAT_CONN_CAPACITY_TIME_EVICT 1140 +#define WT_STAT_CONN_CAPACITY_TIME_EVICT 1142 /*! capacity: time waiting during logging (usecs) */ -#define WT_STAT_CONN_CAPACITY_TIME_LOG 1141 +#define WT_STAT_CONN_CAPACITY_TIME_LOG 1143 /*! capacity: time waiting during read (usecs) */ -#define WT_STAT_CONN_CAPACITY_TIME_READ 1142 +#define WT_STAT_CONN_CAPACITY_TIME_READ 1144 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1143 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1145 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1144 +#define WT_STAT_CONN_COND_AUTO_WAIT 1146 /*! connection: detected system time went backwards */ -#define WT_STAT_CONN_TIME_TRAVEL 1145 +#define WT_STAT_CONN_TIME_TRAVEL 1147 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1146 +#define WT_STAT_CONN_FILE_OPEN 1148 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1147 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1149 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1148 +#define WT_STAT_CONN_MEMORY_FREE 1150 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1149 +#define WT_STAT_CONN_MEMORY_GROW 1151 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1150 +#define WT_STAT_CONN_COND_WAIT 1152 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1151 +#define WT_STAT_CONN_RWLOCK_READ 1153 /*! 
connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1152 +#define WT_STAT_CONN_RWLOCK_WRITE 1154 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1153 +#define WT_STAT_CONN_FSYNC_IO 1155 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1154 +#define WT_STAT_CONN_READ_IO 1156 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1155 +#define WT_STAT_CONN_WRITE_IO 1157 /*! cursor: cached cursor count */ -#define WT_STAT_CONN_CURSOR_CACHED_COUNT 1156 +#define WT_STAT_CONN_CURSOR_CACHED_COUNT 1158 /*! cursor: cursor close calls that result in cache */ -#define WT_STAT_CONN_CURSOR_CACHE 1157 +#define WT_STAT_CONN_CURSOR_CACHE 1159 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1158 +#define WT_STAT_CONN_CURSOR_CREATE 1160 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1159 +#define WT_STAT_CONN_CURSOR_INSERT 1161 /*! cursor: cursor modify calls */ -#define WT_STAT_CONN_CURSOR_MODIFY 1160 +#define WT_STAT_CONN_CURSOR_MODIFY 1162 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1161 +#define WT_STAT_CONN_CURSOR_NEXT 1163 /*! cursor: cursor operation restarted */ -#define WT_STAT_CONN_CURSOR_RESTART 1162 +#define WT_STAT_CONN_CURSOR_RESTART 1164 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1163 +#define WT_STAT_CONN_CURSOR_PREV 1165 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1164 +#define WT_STAT_CONN_CURSOR_REMOVE 1166 /*! cursor: cursor reserve calls */ -#define WT_STAT_CONN_CURSOR_RESERVE 1165 +#define WT_STAT_CONN_CURSOR_RESERVE 1167 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1166 +#define WT_STAT_CONN_CURSOR_RESET 1168 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1167 +#define WT_STAT_CONN_CURSOR_SEARCH 1169 /*! 
cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1168 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1170 /*! cursor: cursor sweep buckets */ -#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1169 +#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1171 /*! cursor: cursor sweep cursors closed */ -#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1170 +#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1172 /*! cursor: cursor sweep cursors examined */ -#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1171 +#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1173 /*! cursor: cursor sweeps */ -#define WT_STAT_CONN_CURSOR_SWEEP 1172 +#define WT_STAT_CONN_CURSOR_SWEEP 1174 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1173 +#define WT_STAT_CONN_CURSOR_UPDATE 1175 /*! cursor: cursors reused from cache */ -#define WT_STAT_CONN_CURSOR_REOPEN 1174 +#define WT_STAT_CONN_CURSOR_REOPEN 1176 /*! cursor: open cursor count */ -#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1175 +#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1177 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1176 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1178 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1177 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1179 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1178 +#define WT_STAT_CONN_DH_SWEEP_REF 1180 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1179 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1181 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1180 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1182 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1181 +#define WT_STAT_CONN_DH_SWEEP_TOD 1183 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1182 +#define WT_STAT_CONN_DH_SWEEPS 1184 /*! 
data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1183 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1185 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1184 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1186 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1185 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1187 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1186 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1188 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1187 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1189 /*! * lock: commit timestamp queue lock application thread time waiting * (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1188 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1190 /*! lock: commit timestamp queue lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1189 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1191 /*! lock: commit timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1190 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1192 /*! lock: commit timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1191 +#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1193 /*! lock: dhandle lock application thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1192 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1194 /*! lock: dhandle lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1193 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1195 /*! 
lock: dhandle read lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1194 +#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1196 /*! lock: dhandle write lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1195 +#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1197 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1196 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1198 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1197 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1199 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1198 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1200 /*! * lock: read timestamp queue lock application thread time waiting * (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1199 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1201 /*! lock: read timestamp queue lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1200 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1202 /*! lock: read timestamp queue read lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1201 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1203 /*! lock: read timestamp queue write lock acquisitions */ -#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1202 +#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1204 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1203 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1205 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1204 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1206 /*! 
lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1205 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1207 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1206 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1208 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1207 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1209 /*! lock: table read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1208 +#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1210 /*! lock: table write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1209 +#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1211 /*! lock: txn global lock application thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1210 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1212 /*! lock: txn global lock internal thread time waiting (usecs) */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1211 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1213 /*! lock: txn global read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1212 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1214 /*! lock: txn global write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1213 +#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1215 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1214 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1216 /*! log: force archive time sleeping (usecs) */ -#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1215 +#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1217 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1216 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1218 /*! 
log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1217 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1219 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1218 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1220 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1219 +#define WT_STAT_CONN_LOG_FLUSH 1221 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1220 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1222 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1221 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1223 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1222 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1224 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1223 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1225 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1224 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1226 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1225 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1227 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1226 +#define WT_STAT_CONN_LOG_SCANS 1228 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1227 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1229 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1228 +#define WT_STAT_CONN_LOG_WRITE_LSN 1230 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1229 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1231 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1230 +#define WT_STAT_CONN_LOG_SYNC 1232 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1231 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1233 /*! 
log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1232 +#define WT_STAT_CONN_LOG_SYNC_DIR 1234 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1233 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1235 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1234 +#define WT_STAT_CONN_LOG_WRITES 1236 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1235 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1237 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1236 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1238 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1237 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1239 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1238 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1240 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1239 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1241 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1240 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1242 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1241 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1243 /*! log: slot close lost race */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1242 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1244 /*! log: slot close unbuffered waits */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1243 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1245 /*! log: slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1244 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1246 /*! log: slot join atomic update races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1245 +#define WT_STAT_CONN_LOG_SLOT_RACES 1247 /*! log: slot join calls atomic updates raced */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1246 +#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1248 /*! 
log: slot join calls did not yield */ -#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1247 +#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1249 /*! log: slot join calls found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1248 +#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1250 /*! log: slot join calls slept */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1249 +#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1251 /*! log: slot join calls yielded */ -#define WT_STAT_CONN_LOG_SLOT_YIELD 1250 +#define WT_STAT_CONN_LOG_SLOT_YIELD 1252 /*! log: slot join found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1251 +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1253 /*! log: slot joins yield time (usecs) */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1252 +#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1254 /*! log: slot transitions unable to find free slot */ -#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1253 +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1255 /*! log: slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1254 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1256 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1255 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1257 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1256 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1258 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1257 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1259 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1258 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1260 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1259 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1261 /*! perf: file system read latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1260 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1262 /*! 
perf: file system read latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1261 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1263 /*! perf: file system read latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1262 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1264 /*! perf: file system read latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1263 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1265 /*! perf: file system read latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1264 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1266 /*! perf: file system read latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1265 +#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1267 /*! perf: file system write latency histogram (bucket 1) - 10-49ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1266 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1268 /*! perf: file system write latency histogram (bucket 2) - 50-99ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1267 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1269 /*! perf: file system write latency histogram (bucket 3) - 100-249ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1268 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1270 /*! perf: file system write latency histogram (bucket 4) - 250-499ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1269 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1271 /*! perf: file system write latency histogram (bucket 5) - 500-999ms */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1270 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1272 /*! 
perf: file system write latency histogram (bucket 6) - 1000ms+ */ -#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1271 +#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1273 /*! perf: operation read latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1272 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1274 /*! perf: operation read latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1273 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1275 /*! perf: operation read latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1274 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1276 /*! perf: operation read latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1275 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1277 /*! perf: operation read latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1276 +#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1278 /*! perf: operation write latency histogram (bucket 1) - 100-249us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1277 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1279 /*! perf: operation write latency histogram (bucket 2) - 250-499us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1278 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1280 /*! perf: operation write latency histogram (bucket 3) - 500-999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1279 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1281 /*! perf: operation write latency histogram (bucket 4) - 1000-9999us */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1280 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1282 /*! 
perf: operation write latency histogram (bucket 5) - 10000us+ */ -#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1281 +#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1283 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1282 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1284 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1283 +#define WT_STAT_CONN_REC_PAGES 1285 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1284 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1286 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1285 +#define WT_STAT_CONN_REC_PAGE_DELETE 1287 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1286 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1288 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1287 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1289 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1288 +#define WT_STAT_CONN_SESSION_OPEN 1290 /*! session: session query timestamp calls */ -#define WT_STAT_CONN_SESSION_QUERY_TS 1289 +#define WT_STAT_CONN_SESSION_QUERY_TS 1291 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1290 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1292 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1291 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1293 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1292 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1294 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1293 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1295 /*! 
session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1294 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1296 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1295 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1297 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1296 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1298 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1297 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1299 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1298 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1300 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1299 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1301 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1300 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1302 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1301 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1303 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1302 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1304 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1303 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1305 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1304 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1306 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1305 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1307 /*! 
session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1306 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1308 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1307 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1309 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1308 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1310 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1309 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1311 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1310 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1312 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1311 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1313 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1312 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1314 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1313 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1315 /*! * thread-yield: connection close blocked waiting for transaction state * stabilization */ -#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1314 +#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1316 /*! thread-yield: connection close yielded for lsm manager shutdown */ -#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1315 +#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1317 /*! thread-yield: data handle lock yielded */ -#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1316 +#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1318 /*! * thread-yield: get reference for page index and slot time sleeping * (usecs) */ -#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1317 +#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1319 /*! 
thread-yield: log server sync yielded for log write */ -#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1318 +#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1320 /*! thread-yield: page access yielded due to prepare state change */ -#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1319 +#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1321 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1320 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1322 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1321 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1323 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1322 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1324 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1323 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1325 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1324 +#define WT_STAT_CONN_PAGE_SLEEP 1326 /*! * thread-yield: page delete rollback time sleeping for state change * (usecs) */ -#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1325 +#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1327 /*! thread-yield: page reconciliation yielded due to child modification */ -#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1326 +#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1328 /*! transaction: Number of prepared updates */ -#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1327 +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1329 /*! transaction: Number of prepared updates added to cache overflow */ -#define WT_STAT_CONN_TXN_PREPARED_UPDATES_LOOKASIDE_INSERTS 1328 +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_LOOKASIDE_INSERTS 1330 /*! transaction: Number of prepared updates resolved */ -#define WT_STAT_CONN_TXN_PREPARED_UPDATES_RESOLVED 1329 +#define WT_STAT_CONN_TXN_PREPARED_UPDATES_RESOLVED 1331 /*! 
transaction: commit timestamp queue entries walked */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_WALKED 1330 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_WALKED 1332 /*! transaction: commit timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1331 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1333 /*! transaction: commit timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1332 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1334 /*! transaction: commit timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1333 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1335 /*! transaction: commit timestamp queue length */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1334 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1336 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1335 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1337 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1336 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1338 /*! transaction: prepared transactions */ -#define WT_STAT_CONN_TXN_PREPARE 1337 +#define WT_STAT_CONN_TXN_PREPARE 1339 /*! transaction: prepared transactions committed */ -#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1338 +#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1340 /*! transaction: prepared transactions currently active */ -#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1339 +#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1341 /*! transaction: prepared transactions rolled back */ -#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1340 +#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1342 /*! transaction: query timestamp calls */ -#define WT_STAT_CONN_TXN_QUERY_TS 1341 +#define WT_STAT_CONN_TXN_QUERY_TS 1343 /*! transaction: read timestamp queue entries walked */ -#define WT_STAT_CONN_TXN_READ_QUEUE_WALKED 1342 +#define WT_STAT_CONN_TXN_READ_QUEUE_WALKED 1344 /*! 
transaction: read timestamp queue insert to empty */ -#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1343 +#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1345 /*! transaction: read timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1344 +#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1346 /*! transaction: read timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1345 +#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1347 /*! transaction: read timestamp queue length */ -#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1346 +#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1348 /*! transaction: rollback to stable calls */ -#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1347 +#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1349 /*! transaction: rollback to stable updates aborted */ -#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1348 +#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1350 /*! transaction: rollback to stable updates removed from cache overflow */ -#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1349 +#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1351 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1350 +#define WT_STAT_CONN_TXN_SET_TS 1352 /*! transaction: set timestamp commit calls */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1351 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1353 /*! transaction: set timestamp commit updates */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1352 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1354 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1353 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1355 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1354 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1356 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1355 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1357 /*! 
transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1356 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1358 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1357 +#define WT_STAT_CONN_TXN_BEGIN 1359 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1358 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1360 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1359 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1361 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1360 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1362 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1361 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1363 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1362 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1364 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1363 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1365 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1364 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1366 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1365 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1367 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1366 +#define WT_STAT_CONN_TXN_CHECKPOINT 1368 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1367 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1369 /*! 
transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1368 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1370 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1369 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1371 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1370 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1372 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1371 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1373 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1372 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1374 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1373 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1375 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1374 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1376 /*! transaction: transaction range of timestamps pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1375 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1377 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1376 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1378 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1377 +#define WT_STAT_CONN_TXN_SYNC 1379 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1378 +#define WT_STAT_CONN_TXN_COMMIT 1380 /*! 
transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1379 +#define WT_STAT_CONN_TXN_ROLLBACK 1381 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1380 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1382 /*! * @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index d93f6a3be7f..9e31180dbb1 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -271,6 +271,12 @@ struct __wt_page_modify; typedef struct __wt_page_modify WT_PAGE_MODIFY; struct __wt_process; typedef struct __wt_process WT_PROCESS; +struct __wt_rec_chunk; + typedef struct __wt_rec_chunk WT_REC_CHUNK; +struct __wt_rec_dictionary; + typedef struct __wt_rec_dictionary WT_REC_DICTIONARY; +struct __wt_rec_kv; + typedef struct __wt_rec_kv WT_REC_KV; struct __wt_ref; typedef struct __wt_ref WT_REF; struct __wt_ref_hist; @@ -362,6 +368,7 @@ typedef uint64_t wt_timestamp_t; #include "btree.h" #include "cache.h" #include "capacity.h" +#include "cell.h" #include "compact.h" #include "config.h" #include "cursor.h" @@ -372,6 +379,7 @@ typedef uint64_t wt_timestamp_t; #include "meta.h" #include "optrack.h" #include "os.h" +#include "reconcile.h" #include "schema.h" #include "thread_group.h" #include "txn.h" @@ -407,6 +415,7 @@ typedef uint64_t wt_timestamp_t; #include "os_fs.i" #include "os_fstream.i" #include "packing.i" +#include "reconcile.i" #include "serial.i" #if defined(__cplusplus) diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 9e27a996251..1963a3770fc 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -236,16 +236,26 @@ __log_fs_write(WT_SESSION_IMPL *session, * thread as needed. 
*/ void -__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) +__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckpt_lsn) { WT_CONNECTION_IMPL *conn; WT_LOG *log; + int i; conn = S2C(session); log = conn->log; - log->ckpt_lsn = *ckp_lsn; + log->ckpt_lsn = *ckpt_lsn; if (conn->log_cond != NULL) __wt_cond_signal(session, conn->log_cond); + /* + * If we are storing debugging LSNs to retain additional log files + * from archiving, then rotate the newest LSN into the array. + */ + if (conn->debug_ckpt_cnt != 0) { + for (i = (int)conn->debug_ckpt_cnt - 1; i > 0; --i) + conn->debug_ckpt[i] = conn->debug_ckpt[i - 1]; + conn->debug_ckpt[0] = *ckpt_lsn; + } } /* diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c index f6d7afed0c2..d7f59fd920e 100644 --- a/src/third_party/wiredtiger/src/log/log_auto.c +++ b/src/third_party/wiredtiger/src/log/log_auto.c @@ -149,7 +149,7 @@ __wt_logop_col_modify_print(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(session, args->fs, " \"optype\": \"col_modify\",\n")); WT_ERR(__wt_fprintf(session, args->fs, - " \"fileid\": %" PRIu32 ",\n", fileid)); + " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid)); WT_ERR(__wt_fprintf(session, args->fs, " \"recno\": %" PRIu64 ",\n", recno)); WT_ERR(__logrec_make_json_str(session, &escaped, &value)); @@ -224,7 +224,7 @@ __wt_logop_col_put_print(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(session, args->fs, " \"optype\": \"col_put\",\n")); WT_ERR(__wt_fprintf(session, args->fs, - " \"fileid\": %" PRIu32 ",\n", fileid)); + " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid)); WT_ERR(__wt_fprintf(session, args->fs, " \"recno\": %" PRIu64 ",\n", recno)); WT_ERR(__logrec_make_json_str(session, &escaped, &value)); @@ -295,7 +295,7 @@ __wt_logop_col_remove_print(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(session, args->fs, " \"optype\": \"col_remove\",\n")); WT_RET(__wt_fprintf(session, args->fs, - " \"fileid\": %" PRIu32 ",\n", 
fileid)); + " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid)); WT_RET(__wt_fprintf(session, args->fs, " \"recno\": %" PRIu64 "", recno)); return (0); @@ -357,7 +357,7 @@ __wt_logop_col_truncate_print(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(session, args->fs, " \"optype\": \"col_truncate\",\n")); WT_RET(__wt_fprintf(session, args->fs, - " \"fileid\": %" PRIu32 ",\n", fileid)); + " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid)); WT_RET(__wt_fprintf(session, args->fs, " \"start\": %" PRIu64 ",\n", start)); WT_RET(__wt_fprintf(session, args->fs, @@ -424,7 +424,7 @@ __wt_logop_row_modify_print(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(session, args->fs, " \"optype\": \"row_modify\",\n")); WT_ERR(__wt_fprintf(session, args->fs, - " \"fileid\": %" PRIu32 ",\n", fileid)); + " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid)); WT_ERR(__logrec_make_json_str(session, &escaped, &key)); WT_ERR(__wt_fprintf(session, args->fs, " \"key\": \"%s\",\n", escaped)); @@ -505,7 +505,7 @@ __wt_logop_row_put_print(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(session, args->fs, " \"optype\": \"row_put\",\n")); WT_ERR(__wt_fprintf(session, args->fs, - " \"fileid\": %" PRIu32 ",\n", fileid)); + " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid)); WT_ERR(__logrec_make_json_str(session, &escaped, &key)); WT_ERR(__wt_fprintf(session, args->fs, " \"key\": \"%s\",\n", escaped)); @@ -585,7 +585,7 @@ __wt_logop_row_remove_print(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(session, args->fs, " \"optype\": \"row_remove\",\n")); WT_ERR(__wt_fprintf(session, args->fs, - " \"fileid\": %" PRIu32 ",\n", fileid)); + " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid)); WT_ERR(__logrec_make_json_str(session, &escaped, &key)); WT_ERR(__wt_fprintf(session, args->fs, " \"key\": \"%s\"", escaped)); @@ -659,7 +659,7 @@ __wt_logop_row_truncate_print(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(session, args->fs, " \"optype\": 
\"row_truncate\",\n")); WT_ERR(__wt_fprintf(session, args->fs, - " \"fileid\": %" PRIu32 ",\n", fileid)); + " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid)); WT_ERR(__logrec_make_json_str(session, &escaped, &start)); WT_ERR(__wt_fprintf(session, args->fs, " \"start\": \"%s\",\n", escaped)); @@ -798,6 +798,82 @@ __wt_logop_prev_lsn_print(WT_SESSION_IMPL *session, } int +__wt_logop_txn_timestamp_pack( + WT_SESSION_IMPL *session, WT_ITEM *logrec, + uint64_t time_sec, uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_ts, uint64_t prepare_ts, uint64_t read_ts) +{ + const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQ); + size_t size; + uint32_t optype, recsize; + + optype = WT_LOGOP_TXN_TIMESTAMP; + WT_RET(__wt_struct_size(session, &size, fmt, + optype, 0, time_sec, time_nsec, commit_ts, durable_ts, first_ts, prepare_ts, read_ts)); + + __wt_struct_size_adjust(session, &size); + WT_RET(__wt_buf_extend(session, logrec, logrec->size + size)); + recsize = (uint32_t)size; + WT_RET(__wt_struct_pack(session, + (uint8_t *)logrec->data + logrec->size, size, fmt, + optype, recsize, time_sec, time_nsec, commit_ts, durable_ts, first_ts, prepare_ts, read_ts)); + + logrec->size += (uint32_t)size; + return (0); +} + +int +__wt_logop_txn_timestamp_unpack( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + uint64_t *time_secp, uint64_t *time_nsecp, uint64_t *commit_tsp, uint64_t *durable_tsp, uint64_t *first_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp) +{ + WT_DECL_RET; + const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQ); + uint32_t optype, size; + + if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, + &optype, &size, time_secp, time_nsecp, commit_tsp, durable_tsp, first_tsp, prepare_tsp, read_tsp)) != 0) + WT_RET_MSG(session, ret, "logop_txn_timestamp: unpack failure"); + WT_ASSERT(session, optype == WT_LOGOP_TXN_TIMESTAMP); + + *pp += size; + return (0); +} + +int +__wt_logop_txn_timestamp_print(WT_SESSION_IMPL 
*session, + const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) +{ + uint64_t time_sec; + uint64_t time_nsec; + uint64_t commit_ts; + uint64_t durable_ts; + uint64_t first_ts; + uint64_t prepare_ts; + uint64_t read_ts; + + WT_RET(__wt_logop_txn_timestamp_unpack( + session, pp, end, &time_sec, &time_nsec, &commit_ts, &durable_ts, &first_ts, &prepare_ts, &read_ts)); + + WT_RET(__wt_fprintf(session, args->fs, + " \"optype\": \"txn_timestamp\",\n")); + WT_RET(__wt_fprintf(session, args->fs, + " \"time_sec\": %" PRIu64 ",\n", time_sec)); + WT_RET(__wt_fprintf(session, args->fs, + " \"time_nsec\": %" PRIu64 ",\n", time_nsec)); + WT_RET(__wt_fprintf(session, args->fs, + " \"commit_ts\": %" PRIu64 ",\n", commit_ts)); + WT_RET(__wt_fprintf(session, args->fs, + " \"durable_ts\": %" PRIu64 ",\n", durable_ts)); + WT_RET(__wt_fprintf(session, args->fs, + " \"first_ts\": %" PRIu64 ",\n", first_ts)); + WT_RET(__wt_fprintf(session, args->fs, + " \"prepare_ts\": %" PRIu64 ",\n", prepare_ts)); + WT_RET(__wt_fprintf(session, args->fs, + " \"read_ts\": %" PRIu64 "", read_ts)); + return (0); +} + +int __wt_txn_op_printlog(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) { @@ -848,6 +924,10 @@ __wt_txn_op_printlog(WT_SESSION_IMPL *session, WT_RET(__wt_logop_prev_lsn_print(session, pp, end, args)); break; + case WT_LOGOP_TXN_TIMESTAMP: + WT_RET(__wt_logop_txn_timestamp_print(session, pp, end, args)); + break; + WT_ILLEGAL_VALUE(session, optype); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c new file mode 100644 index 00000000000..f1d261c8f42 --- /dev/null +++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c @@ -0,0 +1,329 @@ +/*- + * Copyright (c) 2014-2019 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +/* + * __rec_child_deleted -- + * Handle pages with leaf pages in the WT_REF_DELETED state. + */ +static int +__rec_child_deleted(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_REF *ref, WT_CHILD_STATE *statep) +{ + WT_PAGE_DELETED *page_del; + + page_del = ref->page_del; + + /* + * Internal pages with child leaf pages in the WT_REF_DELETED state are + * a special case during reconciliation. First, if the deletion was a + * result of a session truncate call, the deletion may not be visible to + * us. In that case, we proceed as with any change not visible during + * reconciliation by ignoring the change for the purposes of writing the + * internal page. + * + * In this case, there must be an associated page-deleted structure, and + * it holds the transaction ID we care about. + * + * In some cases, there had better not be any updates we can't see. + * + * A visible update to be in READY state (i.e. not in LOCKED or + * PREPARED state), for truly visible to others. + */ + if (F_ISSET(r, WT_REC_VISIBILITY_ERR) && page_del != NULL && + __wt_page_del_active(session, ref, false)) + WT_PANIC_RET(session, EINVAL, + "reconciliation illegally skipped an update"); + + /* + * Deal with any underlying disk blocks. + * + * First, check to see if there is an address associated with this leaf: + * if there isn't, we're done, the underlying page is already gone. If + * the page still exists, check for any transactions in the system that + * might want to see the page's state before it's deleted. + * + * If any such transactions exist, we cannot discard the underlying leaf + * page to the block manager because the transaction may eventually read + * it. However, this write might be part of a checkpoint, and should we + * recover to that checkpoint, we'll need to delete the leaf page, else + * we'd leak it. The solution is to write a proxy cell on the internal + * page ensuring the leaf page is eventually discarded. 
+ * + * If no such transactions exist, we can discard the leaf page to the + * block manager and no cell needs to be written at all. We do this + * outside of the underlying tracking routines because this action is + * permanent and irrevocable. (Clearing the address means we've lost + * track of the disk address in a permanent way. This is safe because + * there's no path to reading the leaf page again: if there's ever a + * read into this part of the name space again, the cache read function + * instantiates an entirely new page.) + */ + if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) { + /* + * Minor memory cleanup: if a truncate call deleted this page + * and we were ever forced to instantiate the page in memory, + * we would have built a list of updates in the page reference + * in order to be able to commit/rollback the truncate. We just + * passed a visibility test, discard the update list. + */ + if (page_del != NULL) { + __wt_free(session, ref->page_del->update_list); + __wt_free(session, ref->page_del); + } + + WT_RET(__wt_ref_block_free(session, ref)); + } + + /* + * If the original page is gone, we can skip the slot on the internal + * page. + */ + if (ref->addr == NULL) { + *statep = WT_CHILD_IGNORE; + return (0); + } + + /* + * Internal pages with deletes that aren't stable cannot be evicted, we + * don't have sufficient information to restore the page's information + * if subsequently read (we wouldn't know which transactions should see + * the original page and which should see the deleted page). + */ + if (F_ISSET(r, WT_REC_EVICT)) + return (__wt_set_return(session, EBUSY)); + + /* + * If there are deleted child pages we can't discard immediately, keep + * the page dirty so they are eventually freed. + */ + r->leave_dirty = true; + + /* + * If the original page cannot be freed, we need to keep a slot on the + * page to reference it from the parent page. 
+ * + * If the delete is not visible in this checkpoint, write the original + * address normally. Otherwise, we have to write a proxy record. + * If the delete state is not ready, then delete is not visible as it + * is in prepared state. + */ + if (!__wt_page_del_active(session, ref, false)) + *statep = WT_CHILD_PROXY; + + return (0); +} + +/* + * __wt_rec_child_modify -- + * Return if the internal page's child references any modifications. + */ +int +__wt_rec_child_modify(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_REF *ref, bool *hazardp, WT_CHILD_STATE *statep) +{ + WT_DECL_RET; + WT_PAGE_MODIFY *mod; + + /* We may acquire a hazard pointer our caller must release. */ + *hazardp = false; + + /* Default to using the original child address. */ + *statep = WT_CHILD_ORIGINAL; + + /* + * This function is called when walking an internal page to decide how + * to handle child pages referenced by the internal page. + * + * Internal pages are reconciled for two reasons: first, when evicting + * an internal page, second by the checkpoint code when writing internal + * pages. During eviction, all pages should be in the WT_REF_DISK or + * WT_REF_DELETED state. During checkpoint, eviction that might affect + * review of an internal page is prohibited, however, as the subtree is + * not reserved for our exclusive use, there are other page states that + * must be considered. + */ + for (;; __wt_yield()) { + switch (r->tested_ref_state = ref->state) { + case WT_REF_DISK: + /* On disk, not modified by definition. */ + goto done; + + case WT_REF_DELETED: + /* + * The child is in a deleted state. + * + * It's possible the state could change underneath us as + * the page is read in, and we can race between checking + * for a deleted state and looking at the transaction ID + * to see if the delete is visible to us. Lock down the + * structure. 
+ */ + if (!WT_REF_CAS_STATE( + session, ref, WT_REF_DELETED, WT_REF_LOCKED)) + break; + ret = __rec_child_deleted(session, r, ref, statep); + WT_REF_SET_STATE(ref, WT_REF_DELETED); + goto done; + + case WT_REF_LOCKED: + /* + * Locked. + * + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. + */ + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); + if (F_ISSET(r, WT_REC_EVICT)) + return (__wt_set_return(session, EBUSY)); + + /* + * If called during checkpoint, the child is being + * considered by the eviction server or the child is a + * truncated page being read. The eviction may have + * started before the checkpoint and so we must wait + * for the eviction to be resolved. I suspect we could + * handle reads of truncated pages, but we can't + * distinguish between the two and reads of truncated + * pages aren't expected to be common. + */ + break; + + case WT_REF_LIMBO: + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); + /* FALLTHROUGH */ + case WT_REF_LOOKASIDE: + /* + * On disk or in cache with lookaside updates. + * + * We should never be here during eviction: active + * child pages in an evicted page's subtree fails the + * eviction attempt. + */ + if (F_ISSET(r, WT_REC_EVICT) && + __wt_page_las_active(session, ref)) { + WT_ASSERT(session, false); + return (__wt_set_return(session, EBUSY)); + } + + /* + * A page evicted with lookaside entries may not have + * an address, if no updates were visible to + * reconciliation. Any child pages in that state + * should be ignored. + */ + if (ref->addr == NULL) { + *statep = WT_CHILD_IGNORE; + WT_CHILD_RELEASE(session, *hazardp, ref); + } + goto done; + + case WT_REF_MEM: + /* + * In memory. + * + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. 
+ */ + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); + if (F_ISSET(r, WT_REC_EVICT)) + return (__wt_set_return(session, EBUSY)); + + /* + * If called during checkpoint, acquire a hazard pointer + * so the child isn't evicted, it's an in-memory case. + * + * This call cannot return split/restart, we have a lock + * on the parent which prevents a child page split. + * + * Set WT_READ_NO_WAIT because we're only interested in + * the WT_REF's final state. Pages in transition might + * change WT_REF state during our read, and then return + * WT_NOTFOUND to us. In that case, loop and look again. + */ + ret = __wt_page_in(session, ref, + WT_READ_CACHE | WT_READ_NO_EVICT | + WT_READ_NO_GEN | WT_READ_NO_WAIT); + if (ret == WT_NOTFOUND) { + ret = 0; + break; + } + WT_RET(ret); + *hazardp = true; + goto in_memory; + + case WT_REF_READING: + /* + * Being read, not modified by definition. + * + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. + */ + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); + if (F_ISSET(r, WT_REC_EVICT)) + return (__wt_set_return(session, EBUSY)); + goto done; + + case WT_REF_SPLIT: + /* + * The page was split out from under us. + * + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. + * + * We should never be here during checkpoint, dirty page + * eviction is shutout during checkpoint, all splits in + * process will have completed before we walk any pages + * for checkpoint. + */ + WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT); + return (__wt_set_return(session, EBUSY)); + + WT_ILLEGAL_VALUE(session, r->tested_ref_state); + } + WT_STAT_CONN_INCR(session, child_modify_blocked_page); + } + +in_memory: + /* + * In-memory states: the child is potentially modified if the page's + * modify structure has been instantiated. 
If the modify structure + * exists and the page has actually been modified, set that state. + * If that's not the case, we would normally use the original cell's + * disk address as our reference, however there are two special cases, + * both flagged by a missing block address. + * + * First, if forced to instantiate a deleted child page and it's never + * modified, we end up here with a page that has a modify structure, no + * modifications, and no disk address. Ignore those pages, they're not + * modified and there is no reason to write the cell. + * + * Second, insert splits are permitted during checkpoint. When doing the + * final checkpoint pass, we first walk the internal page's page-index + * and write out any dirty pages we find, then we write out the internal + * page in post-order traversal. If we found the split page in the first + * step, it will have an address; if we didn't find the split page in + * the first step, it won't have an address and we ignore it, it's not + * part of the checkpoint. + */ + mod = ref->page->modify; + if (mod != NULL && mod->rec_result != 0) + *statep = WT_CHILD_MODIFIED; + else if (ref->addr == NULL) { + *statep = WT_CHILD_IGNORE; + WT_CHILD_RELEASE(session, *hazardp, ref); + } + +done: WT_DIAGNOSTIC_YIELD; + return (ret); +} diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c new file mode 100644 index 00000000000..6a57a9c26d6 --- /dev/null +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -0,0 +1,1077 @@ +/*- + * Copyright (c) 2014-2019 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __rec_col_fix_bulk_insert_split_check -- + * Check if a bulk-loaded fixed-length column store page needs to split. 
+ */ +static inline int +__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_RECONCILE *r; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; + r = cbulk->reconcile; + btree = S2BT(session); + + if (cbulk->entry == cbulk->nrecs) { + if (cbulk->entry != 0) { + /* + * If everything didn't fit, update the counters and + * split. + * + * Boundary: split or write the page. + * + * No need to have a minimum split size boundary, all + * pages are filled 100% except the last, allowing it to + * grow in the future. + */ + __wt_rec_incr(session, r, cbulk->entry, + __bitstr_size( + (size_t)cbulk->entry * btree->bitcnt)); + WT_RET(__wt_rec_split(session, r, 0)); + } + cbulk->entry = 0; + cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); + } + return (0); +} + +/* + * __wt_bulk_insert_fix -- + * Fixed-length column-store bulk insert. + */ +int +__wt_bulk_insert_fix( + WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_RECONCILE *r; + + r = cbulk->reconcile; + btree = S2BT(session); + cursor = &cbulk->cbt.iface; + + WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); + __bit_setv(r->first_free, cbulk->entry, + btree->bitcnt, deleted ? 0 : ((uint8_t *)cursor->value.data)[0]); + ++cbulk->entry; + ++r->recno; + + return (0); +} + +/* + * __wt_bulk_insert_fix_bitmap -- + * Fixed-length column-store bulk insert. 
+ */ +int +__wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_RECONCILE *r; + uint32_t entries, offset, page_entries, page_size; + const uint8_t *data; + + r = cbulk->reconcile; + btree = S2BT(session); + cursor = &cbulk->cbt.iface; + + if (((r->recno - 1) * btree->bitcnt) & 0x7) + WT_RET_MSG(session, EINVAL, + "Bulk bitmap load not aligned on a byte boundary"); + for (data = cursor->value.data, + entries = (uint32_t)cursor->value.size; + entries > 0; + entries -= page_entries, data += page_size) { + WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); + + page_entries = WT_MIN(entries, cbulk->nrecs - cbulk->entry); + page_size = __bitstr_size(page_entries * btree->bitcnt); + offset = __bitstr_size(cbulk->entry * btree->bitcnt); + memcpy(r->first_free + offset, data, page_size); + cbulk->entry += page_entries; + r->recno += page_entries; + } + return (0); +} + +/* + * __wt_bulk_insert_var -- + * Variable-length column-store bulk insert. + */ +int +__wt_bulk_insert_var( + WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) +{ + WT_BTREE *btree; + WT_RECONCILE *r; + WT_REC_KV *val; + + r = cbulk->reconcile; + btree = S2BT(session); + + val = &r->v; + if (deleted) { + val->cell_len = __wt_cell_pack_del(&val->cell, cbulk->rle); + val->buf.data = NULL; + val->buf.size = 0; + val->len = val->cell_len; + } else + /* + * Store the bulk cursor's last buffer, not the current value, + * we're tracking duplicates, which means we want the previous + * value seen, not the current value. + */ + WT_RET(__wt_rec_cell_build_val(session, + r, cbulk->last.data, cbulk->last.size, cbulk->rle)); + + /* Boundary: split or write the page. */ + if (WT_CROSSING_SPLIT_BND(r, val->len)) + WT_RET(__wt_rec_split_crossing_bnd(session, r, val->len)); + + /* Copy the value onto the page. 
*/ + if (btree->dictionary) + WT_RET(__wt_rec_dict_replace(session, r, cbulk->rle, val)); + __wt_rec_copy_incr(session, r, val); + + /* Update the starting record number in case we split. */ + r->recno += cbulk->rle; + + return (0); +} + +/* + * __rec_col_merge -- + * Merge in a split page. + */ +static int +__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_ADDR *addr; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + WT_REC_KV *val; + uint32_t i; + + mod = page->modify; + + val = &r->v; + + /* For each entry in the split array... */ + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + /* Update the starting record number in case we split. */ + r->recno = multi->key.recno; + + /* Build the value cell. */ + addr = &multi->addr; + __wt_rec_cell_build_addr(session, r, + addr->addr, addr->size, __wt_rec_vtype(addr), r->recno); + + /* Boundary: split or write the page. */ + if (__wt_rec_need_split(r, val->len)) + WT_RET(__wt_rec_split_crossing_bnd( + session, r, val->len)); + + /* Copy the value onto the page. */ + __wt_rec_copy_incr(session, r, val); + } + return (0); +} + +/* + * __wt_rec_col_int -- + * Reconcile a column-store internal page. + */ +int +__wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) +{ + WT_ADDR *addr; + WT_BTREE *btree; + WT_CELL_UNPACK *vpack, _vpack; + WT_CHILD_STATE state; + WT_DECL_RET; + WT_PAGE *child, *page; + WT_REC_KV *val; + WT_REF *ref; + bool hazard; + + btree = S2BT(session); + page = pageref->page; + child = NULL; + hazard = false; + + val = &r->v; + vpack = &_vpack; + + WT_RET(__wt_rec_split_init(session, + r, page, pageref->ref_recno, btree->maxintlpage_precomp)); + + /* For each entry in the in-memory page... */ + WT_INTL_FOREACH_BEGIN(session, page, ref) { + /* Update the starting record number in case we split. */ + r->recno = ref->ref_recno; + + /* + * Modified child. + * The page may be emptied or internally created during a split. 
+ * Deleted/split pages are merged into the parent and discarded. + */ + WT_ERR(__wt_rec_child_modify(session, r, ref, &hazard, &state)); + addr = NULL; + child = ref->page; + + switch (state) { + case WT_CHILD_IGNORE: + /* Ignored child. */ + WT_CHILD_RELEASE_ERR(session, hazard, ref); + continue; + + case WT_CHILD_MODIFIED: + /* + * Modified child. Empty pages are merged into the + * parent and discarded. + */ + switch (child->modify->rec_result) { + case WT_PM_REC_EMPTY: + /* + * Column-store pages are almost never empty, as + * discarding a page would remove a chunk of the + * name space. The exceptions are pages created + * when the tree is created, and never filled. + */ + WT_CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_MULTIBLOCK: + WT_ERR(__rec_col_merge(session, r, child)); + WT_CHILD_RELEASE_ERR(session, hazard, ref); + continue; + case WT_PM_REC_REPLACE: + addr = &child->modify->mod_replace; + break; + WT_ILLEGAL_VALUE_ERR( + session, child->modify->rec_result); + } + break; + case WT_CHILD_ORIGINAL: + /* Original child. */ + break; + case WT_CHILD_PROXY: + /* + * Deleted child where we write a proxy cell, not yet + * supported for column-store. + */ + WT_ERR(__wt_illegal_value(session, state)); + } + + /* + * Build the value cell. The child page address is in one of 3 + * places: if the page was replaced, the page's modify structure + * references it and we built the value cell just above in the + * switch statement. Else, the WT_REF->addr reference points to + * an on-page cell or an off-page WT_ADDR structure: if it's an + * on-page cell and we copy it from the page, else build a new + * cell. 
 */
	if (addr == NULL && __wt_off_page(page, ref->addr))
		addr = ref->addr;
	if (addr == NULL) {
		/*
		 * No off-page address: the child's address is still the
		 * original on-page cell, copy that cell verbatim instead of
		 * building a new address cell (cell_len == 0 marks the buffer
		 * as a raw cell copy).
		 */
		__wt_cell_unpack(ref->addr, vpack);
		val->buf.data = ref->addr;
		val->buf.size = __wt_cell_total_len(vpack);
		val->cell_len = 0;
		val->len = val->buf.size;
	} else
		__wt_rec_cell_build_addr(session, r,
		    addr->addr, addr->size,
		    __wt_rec_vtype(addr), ref->ref_recno);
	WT_CHILD_RELEASE_ERR(session, hazard, ref);

	/* Boundary: split or write the page. */
	if (__wt_rec_need_split(r, val->len))
		WT_ERR(__wt_rec_split_crossing_bnd(
		    session, r, val->len));

	/* Copy the value onto the page. */
	__wt_rec_copy_incr(session, r, val);
	} WT_INTL_FOREACH_END;

	/* Write the remnant page. */
	return (__wt_rec_split_finish(session, r));

	/* On error, release any hazard pointer still held on a child. */
err:	WT_CHILD_RELEASE(session, hazard, ref);
	return (ret);
}

/*
 * __wt_rec_col_fix --
 *	Reconcile a fixed-width, column-store leaf page.
 *
 *	The page's disk image is a bitstring of fixed-width entries; we copy
 *	the original image, overlay any updated values, then append records
 *	from the page's append list.
 */
int
__wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
{
	WT_BTREE *btree;
	WT_INSERT *ins;
	WT_PAGE *page;
	WT_UPDATE *upd;
	uint64_t recno;
	uint32_t entry, nrecs;

	btree = S2BT(session);
	page = pageref->page;

	WT_RET(__wt_rec_split_init(
	    session, r, page, pageref->ref_recno, btree->maxleafpage));

	/* Copy the original, disk-image bytes into place. */
	memcpy(r->first_free, page->pg_fix_bitf,
	    __bitstr_size((size_t)page->entries * btree->bitcnt));

	/* Update any changes to the original on-page data items. */
	WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
		WT_RET(__wt_rec_txn_read(
		    session, r, ins, NULL, NULL, NULL, &upd));
		if (upd != NULL)
			__bit_setv(r->first_free,
			    /* Bit offset of the record within this page. */
			    WT_INSERT_RECNO(ins) - pageref->ref_recno,
			    btree->bitcnt, *upd->data);
	}

	/* Calculate the number of entries per page remainder. */
	entry = page->entries;
	nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail) - page->entries;
	/* Account for the records taken from the original disk image. */
	r->recno += entry;

	/* Walk any append list. */
	for (ins =
	    WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) {
		if (ins == NULL) {
			/*
			 * If the page split, instantiate any missing records in
			 * the page's name space. (Imagine record 98 is
			 * transactionally visible, 99 wasn't created or is not
			 * yet visible, 100 is visible. Then the page splits and
			 * record 100 moves to another page. When we reconcile
			 * the original page, we write record 98, then we don't
			 * see record 99 for whatever reason. If we've moved
			 * record 100, we don't know to write a deleted record
			 * 99 on the page.)
			 *
			 * The record number recorded during the split is the
			 * first key on the split page, that is, one larger than
			 * the last key on this page, we have to decrement it.
			 */
			if ((recno =
			    page->modify->mod_col_split_recno) == WT_RECNO_OOB)
				break;
			recno -= 1;

			/*
			 * The following loop assumes records to write, and the
			 * previous key might have been visible.
			 */
			if (r->recno > recno)
				break;
			upd = NULL;
		} else {
			WT_RET(__wt_rec_txn_read(
			    session, r, ins, NULL, NULL, NULL, &upd));
			recno = WT_INSERT_RECNO(ins);
		}
		for (;;) {
			/*
			 * The application may have inserted records which left
			 * gaps in the name space.
			 */
			for (;
			    nrecs > 0 && r->recno < recno;
			    --nrecs, ++entry, ++r->recno)
				__bit_setv(
				    r->first_free, entry, btree->bitcnt, 0);

			if (nrecs > 0) {
				__bit_setv(r->first_free, entry, btree->bitcnt,
				    upd == NULL ? 0 : *upd->data);
				--nrecs;
				++entry;
				++r->recno;
				break;
			}

			/*
			 * If everything didn't fit, update the counters and
			 * split.
			 *
			 * Boundary: split or write the page.
			 *
			 * No need to have a minimum split size boundary, all
			 * pages are filled 100% except the last, allowing it to
			 * grow in the future.
			 */
			__wt_rec_incr(session, r, entry,
			    __bitstr_size((size_t)entry * btree->bitcnt));
			WT_RET(__wt_rec_split(session, r, 0));

			/* Calculate the number of entries per page. */
			entry = 0;
			nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
		}

		/*
		 * Execute this loop once without an insert item to catch any
		 * missing records due to a split, then quit.
		 */
		if (ins == NULL)
			break;
	}

	/* Update the counters. */
	__wt_rec_incr(
	    session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt));

	/* Write the remnant page. */
	return (__wt_rec_split_finish(session, r));
}

/*
 * __wt_rec_col_fix_slvg --
 *	Reconcile a fixed-width, column-store leaf page created during salvage.
 */
int
__wt_rec_col_fix_slvg(WT_SESSION_IMPL *session,
    WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage)
{
	WT_BTREE *btree;
	WT_PAGE *page;
	uint64_t page_start, page_take;
	uint32_t entry, nrecs;

	btree = S2BT(session);
	page = pageref->page;

	/*
	 * !!!
	 * It's vanishingly unlikely and probably impossible for fixed-length
	 * column-store files to have overlapping key ranges. It's possible
	 * for an entire key range to go missing (if a page is corrupted and
	 * lost), but because pages can't split, it shouldn't be possible to
	 * find pages where the key ranges overlap. That said, we check for
	 * it during salvage and clean up after it here because it doesn't
	 * cost much and future column-store formats or operations might allow
	 * for fixed-length format ranges to overlap during salvage, and I
	 * don't want to have to retrofit the code later.
	 */
	WT_RET(__wt_rec_split_init(
	    session, r, page, pageref->ref_recno, btree->maxleafpage));

	/* We may not be taking all of the entries on the original page. */
	page_take = salvage->take == 0 ? page->entries : salvage->take;
	page_start = salvage->skip == 0 ? 0 : salvage->skip;

	/* Calculate the number of entries per page. */
	entry = 0;
	nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);

	/* Pad any records salvage reported missing with zero bits. */
	for (; nrecs > 0 && salvage->missing > 0;
	    --nrecs, --salvage->missing, ++entry)
		__bit_setv(r->first_free, entry, btree->bitcnt, 0);

	/* Copy the surviving records from the original page's bitstring. */
	for (; nrecs > 0 && page_take > 0;
	    --nrecs, --page_take, ++page_start, ++entry)
		__bit_setv(r->first_free, entry, btree->bitcnt,
		    __bit_getv(page->pg_fix_bitf,
		    (uint32_t)page_start, btree->bitcnt));

	r->recno += entry;
	__wt_rec_incr(session, r, entry,
	    __bitstr_size((size_t)entry * btree->bitcnt));

	/*
	 * We can't split during salvage -- if everything didn't fit, it's
	 * all gone wrong.
	 */
	if (salvage->missing != 0 || page_take != 0)
		WT_PANIC_RET(session, WT_PANIC,
		    "%s page too large, attempted split during salvage",
		    __wt_page_type_string(page->type));

	/* Write the page. */
	return (__wt_rec_split_finish(session, r));
}

/*
 * __rec_col_var_helper --
 *	Create a column-store variable length record cell and write it onto a
 * page.
 */
static int
__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
    WT_SALVAGE_COOKIE *salvage,
    WT_ITEM *value, bool deleted, uint8_t overflow_type, uint64_t rle)
{
	WT_BTREE *btree;
	WT_REC_KV *val;

	btree = S2BT(session);

	val = &r->v;

	/*
	 * Occasionally, salvage needs to discard records from the beginning or
	 * end of the page, and because the items may be part of a RLE cell, do
	 * the adjustments here. It's not a mistake we don't bother telling
	 * our caller we've handled all the records from the page we care about,
	 * and can quit processing the page: salvage is a rare operation and I
	 * don't want to complicate our caller's loop.
	 */
	if (salvage != NULL) {
		if (salvage->done)
			return (0);
		if (salvage->skip != 0) {
			if (rle <= salvage->skip) {
				salvage->skip -= rle;
				return (0);
			}
			rle -= salvage->skip;
			salvage->skip = 0;
		}
		if (salvage->take != 0) {
			if (rle <= salvage->take)
				salvage->take -= rle;
			else {
				rle = salvage->take;
				salvage->take = 0;
			}
			if (salvage->take == 0)
				salvage->done = true;
		}
	}

	/*
	 * overflow_type doubles as a flag: non-zero means write an overflow
	 * cell of that type rather than a plain value cell.
	 */
	if (deleted) {
		val->cell_len = __wt_cell_pack_del(&val->cell, rle);
		val->buf.data = NULL;
		val->buf.size = 0;
		val->len = val->cell_len;
	} else if (overflow_type) {
		val->cell_len = __wt_cell_pack_ovfl(
		    &val->cell, overflow_type, rle, value->size);
		val->buf.data = value->data;
		val->buf.size = value->size;
		val->len = val->cell_len + value->size;
	} else
		WT_RET(__wt_rec_cell_build_val(
		    session, r, value->data, value->size, rle));

	/* Boundary: split or write the page. */
	if (__wt_rec_need_split(r, val->len))
		WT_RET(__wt_rec_split_crossing_bnd(session, r, val->len));

	/* Copy the value onto the page. */
	if (!deleted && !overflow_type && btree->dictionary)
		WT_RET(__wt_rec_dict_replace(session, r, rle, val));
	__wt_rec_copy_incr(session, r, val);

	/* Update the starting record number in case we split. */
	r->recno += rle;

	return (0);
}

/*
 * __wt_rec_col_var --
 *	Reconcile a variable-width column-store leaf page.
 */
int
__wt_rec_col_var(WT_SESSION_IMPL *session,
    WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage)
{
	/* Per-cell overflow handling state; see the OVFL_* cases below. */
	enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *vpack, _vpack;
	WT_COL *cip;
	WT_CURSOR_BTREE *cbt;
	WT_DECL_ITEM(orig);
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_ITEM *last;
	WT_PAGE *page;
	WT_UPDATE *upd;
	uint64_t n, nrepeat, repeat_count, rle, skip, src_recno;
	uint32_t i, size;
	bool deleted, last_deleted, orig_deleted, update_no_copy;
	const void *data;

	btree = S2BT(session);
	page = pageref->page;
	/* Buffer tracking the previous value, used for RLE matching below. */
	last = r->last;
	vpack = &_vpack;
	cbt = &r->update_modify_cbt;

	WT_RET(__wt_rec_split_init(session,
	    r, page, pageref->ref_recno, btree->maxleafpage_precomp));

	WT_RET(__wt_scr_alloc(session, 0, &orig));
	data = NULL;
	size = 0;
	upd = NULL;

	/*
	 * The salvage code may be calling us to reconcile a page where there
	 * were missing records in the column-store name space. If taking the
	 * first record from on the page, it might be a deleted record, so we
	 * have to give the RLE code a chance to figure that out. Else, if
	 * not taking the first record from the page, write a single element
	 * representing the missing records onto a new page. (Don't pass the
	 * salvage cookie to our helper function in this case, we're handling
	 * one of the salvage cookie fields on our own, and we don't need the
	 * helper function's assistance.)
	 */
	rle = 0;
	last_deleted = false;
	if (salvage != NULL && salvage->missing != 0) {
		if (salvage->skip == 0) {
			rle = salvage->missing;
			last_deleted = true;

			/*
			 * Correct the number of records we're going to "take",
			 * pretending the missing records were on the page.
			 */
			salvage->take += salvage->missing;
		} else
			WT_ERR(__rec_col_var_helper(session,
			    r, NULL, NULL, true, false, salvage->missing));
	}

	/*
	 * We track two data items through this loop: the previous (last) item
	 * and the current item: if the last item is the same as the current
	 * item, we increment the RLE count for the last item; if the last item
	 * is different from the current item, we write the last item onto the
	 * page, and replace it with the current item. The r->recno counter
	 * tracks records written to the page, and is incremented by the helper
	 * function immediately after writing records to the page. The record
	 * number of our source record, that is, the current item, is maintained
	 * in src_recno.
	 */
	src_recno = r->recno + rle;

	/* For each entry in the in-memory page... */
	WT_COL_FOREACH(page, cip, i) {
		ovfl_state = OVFL_IGNORE;
		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
			nrepeat = 1;
			ins = NULL;
			orig_deleted = true;
		} else {
			__wt_cell_unpack(cell, vpack);
			nrepeat = __wt_cell_rle(vpack);
			ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));

			/*
			 * If the original value is "deleted", there's no value
			 * to compare, we're done.
			 */
			orig_deleted = vpack->type == WT_CELL_DEL;
			if (orig_deleted)
				goto record_loop;

			/*
			 * Overflow items are tricky: we don't know until we're
			 * finished processing the set of values if we need the
			 * overflow value or not. If we don't use the overflow
			 * item at all, we have to discard it from the backing
			 * file, otherwise we'll leak blocks on the checkpoint.
			 * That's safe because if the backing overflow value is
			 * still needed by any running transaction, we'll cache
			 * a copy in the update list.
			 *
			 * Regardless, we avoid copying in overflow records: if
			 * there's a WT_INSERT entry that modifies a reference
			 * counted overflow record, we may have to write copies
			 * of the overflow record, and in that case we'll do the
			 * comparisons, but we don't read overflow items just to
			 * see if they match records on either side.
			 */
			if (vpack->ovfl) {
				ovfl_state = OVFL_UNUSED;
				goto record_loop;
			}

			/*
			 * If data is Huffman encoded, we have to decode it in
			 * order to compare it with the last item we saw, which
			 * may have been an update string. This guarantees we
			 * find every single pair of objects we can RLE encode,
			 * including applications updating an existing record
			 * where the new value happens (?) to match a Huffman-
			 * encoded value in a previous or next record.
			 */
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, WT_PAGE_COL_VAR, vpack, orig));
		}

record_loop:	/*
		 * Generate on-page entries: loop repeat records, looking for
		 * WT_INSERT entries matching the record number. The WT_INSERT
		 * lists are in sorted order, so only need check the next one.
		 */
		for (n = 0;
		    n < nrepeat; n += repeat_count, src_recno += repeat_count) {
			upd = NULL;
			if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
				WT_ERR(__wt_rec_txn_read(
				    session, r, ins, cip, vpack, NULL, &upd));
				ins = WT_SKIP_NEXT(ins);
			}

			update_no_copy = true;	/* No data copy */
			repeat_count = 1;	/* Single record */
			deleted = false;

			if (upd != NULL) {
				switch (upd->type) {
				case WT_UPDATE_MODIFY:
					cbt->slot = WT_COL_SLOT(page, cip);
					WT_ERR(__wt_value_return_upd(
					    session, cbt, upd,
					    F_ISSET(r, WT_REC_VISIBLE_ALL)));
					data = cbt->iface.value.data;
					size = (uint32_t)cbt->iface.value.size;
					update_no_copy = false;
					break;
				case WT_UPDATE_STANDARD:
					data = upd->data;
					size = upd->size;
					break;
				case WT_UPDATE_TOMBSTONE:
					deleted = true;
					break;
				WT_ILLEGAL_VALUE_ERR(session, upd->type);
				}
			} else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
				/*
				 * If doing an update save and restore, and the
				 * underlying value is a removed overflow value,
				 * we end up here.
				 *
				 * If necessary, when the overflow value was
				 * originally removed, reconciliation appended
				 * a globally visible copy of the value to the
				 * key's update list, meaning the on-page item
				 * isn't accessed after page re-instantiation.
				 *
				 * Assert the case.
				 */
				WT_ASSERT(session,
				    F_ISSET(r, WT_REC_UPDATE_RESTORE));

				/*
				 * The on-page value will never be accessed,
				 * write a placeholder record.
				 */
				data = "ovfl-unused";
				size = WT_STORE_SIZE(strlen("ovfl-unused"));
			} else {
				update_no_copy = false;	/* Maybe data copy */

				/*
				 * The repeat count is the number of records up
				 * to the next WT_INSERT record, or up to the
				 * end of the entry if we have no more WT_INSERT
				 * records.
				 */
				if (ins == NULL)
					repeat_count = nrepeat - n;
				else
					repeat_count =
					    WT_INSERT_RECNO(ins) - src_recno;

				deleted = orig_deleted;
				if (deleted)
					goto compare;

				/*
				 * If we are handling overflow items, use the
				 * overflow item itself exactly once, after
				 * which we have to copy it into a buffer and
				 * from then on use a complete copy because we
				 * are re-creating a new overflow record each
				 * time.
				 */
				switch (ovfl_state) {
				case OVFL_UNUSED:
					/*
					 * An as-yet-unused overflow item.
					 *
					 * We're going to copy the on-page cell,
					 * write out any record we're tracking.
					 */
					if (rle != 0) {
						WT_ERR(__rec_col_var_helper(
						    session, r, salvage, last,
						    last_deleted, 0, rle));
						rle = 0;
					}

					last->data = vpack->data;
					last->size = vpack->size;
					WT_ERR(__rec_col_var_helper(
					    session, r, salvage, last, false,
					    WT_CELL_VALUE_OVFL, repeat_count));

					/* Track if page has overflow items. */
					r->ovfl_items = true;

					ovfl_state = OVFL_USED;
					continue;
				case OVFL_USED:
					/*
					 * Original is an overflow item; we used
					 * it for a key and now we need another
					 * copy; read it into memory.
					 */
					WT_ERR(__wt_dsk_cell_data_ref(session,
					    WT_PAGE_COL_VAR, vpack, orig));

					ovfl_state = OVFL_IGNORE;
					/* FALLTHROUGH */
				case OVFL_IGNORE:
					/*
					 * Original is an overflow item and we
					 * were forced to copy it into memory,
					 * or the original wasn't an overflow
					 * item; use the data copied into orig.
					 */
					data = orig->data;
					size = (uint32_t)orig->size;
					break;
				}
			}

compare:		/*
			 * If we have a record against which to compare, and
			 * the records compare equal, increment the rle counter
			 * and continue. If the records don't compare equal,
			 * output the last record and swap the last and current
			 * buffers: do NOT update the starting record number,
			 * we've been doing that all along.
			 */
			if (rle != 0) {
				if ((deleted && last_deleted) ||
				    (!last_deleted && !deleted &&
				    last->size == size &&
				    memcmp(last->data, data, size) == 0)) {
					rle += repeat_count;
					continue;
				}
				WT_ERR(__rec_col_var_helper(session, r,
				    salvage, last, last_deleted, 0, rle));
			}

			/*
			 * Swap the current/last state.
			 *
			 * Reset RLE counter and turn on comparisons.
			 */
			if (!deleted) {
				/*
				 * We can't simply assign the data values into
				 * the last buffer because they may have come
				 * from a copy built from an encoded/overflow
				 * cell and creating the next record is going
				 * to overwrite that memory. Check, because
				 * encoded/overflow cells aren't that common
				 * and we'd like to avoid the copy. If data
				 * was taken from the current unpack structure
				 * (which points into the page), or was taken
				 * from an update structure, we can just use
				 * the pointers, they're not moving.
				 */
				if (data == vpack->data || update_no_copy) {
					last->data = data;
					last->size = size;
				} else
					WT_ERR(__wt_buf_set(
					    session, last, data, size));
			}
			last_deleted = deleted;
			rle = repeat_count;
		}

		/*
		 * The first time we find an overflow record we never used,
		 * discard the underlying blocks, they're no longer useful.
		 */
		if (ovfl_state == OVFL_UNUSED &&
		    vpack->raw != WT_CELL_VALUE_OVFL_RM)
			WT_ERR(__wt_ovfl_remove(
			    session, page, vpack, F_ISSET(r, WT_REC_EVICT)));
	}

	/* Walk any append list. */
	for (ins =
	    WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) {
		if (ins == NULL) {
			/*
			 * If the page split, instantiate any missing records in
			 * the page's name space. (Imagine record 98 is
			 * transactionally visible, 99 wasn't created or is not
			 * yet visible, 100 is visible. Then the page splits and
			 * record 100 moves to another page. When we reconcile
			 * the original page, we write record 98, then we don't
			 * see record 99 for whatever reason. If we've moved
			 * record 100, we don't know to write a deleted record
			 * 99 on the page.)
			 *
			 * Assert the recorded record number is past the end of
			 * the page.
			 *
			 * The record number recorded during the split is the
			 * first key on the split page, that is, one larger than
			 * the last key on this page, we have to decrement it.
			 */
			if ((n = page->
			    modify->mod_col_split_recno) == WT_RECNO_OOB)
				break;
			WT_ASSERT(session, n >= src_recno);
			n -= 1;

			upd = NULL;
		} else {
			WT_ERR(__wt_rec_txn_read(
			    session, r, ins, NULL, NULL, NULL, &upd));
			n = WT_INSERT_RECNO(ins);
		}
		while (src_recno <= n) {
			deleted = false;
			update_no_copy = true;

			/*
			 * The application may have inserted records which left
			 * gaps in the name space, and these gaps can be huge.
			 * If we're in a set of deleted records, skip the boring
			 * part.
			 */
			if (src_recno < n) {
				deleted = true;
				if (last_deleted) {
					/*
					 * The record adjustment is decremented
					 * by one so we can naturally fall into
					 * the RLE accounting below, where we
					 * increment rle by one, then continue
					 * in the outer loop, where we increment
					 * src_recno by one.
					 */
					skip = (n - src_recno) - 1;
					rle += skip;
					src_recno += skip;
				}
			} else if (upd == NULL)
				deleted = true;
			else
				switch (upd->type) {
				case WT_UPDATE_MODIFY:
					/*
					 * Impossible slot, there's no backing
					 * on-page item.
					 */
					cbt->slot = UINT32_MAX;
					WT_ERR(__wt_value_return_upd(
					    session, cbt, upd,
					    F_ISSET(r, WT_REC_VISIBLE_ALL)));
					data = cbt->iface.value.data;
					size = (uint32_t)cbt->iface.value.size;
					update_no_copy = false;
					break;
				case WT_UPDATE_STANDARD:
					data = upd->data;
					size = upd->size;
					break;
				case WT_UPDATE_TOMBSTONE:
					deleted = true;
					break;
				WT_ILLEGAL_VALUE_ERR(session, upd->type);
				}

			/*
			 * Handle RLE accounting and comparisons -- see comment
			 * above, this code fragment does the same thing.
			 */
			if (rle != 0) {
				if ((deleted && last_deleted) ||
				    (!last_deleted && !deleted &&
				    last->size == size &&
				    memcmp(last->data, data, size) == 0)) {
					++rle;
					goto next;
				}
				WT_ERR(__rec_col_var_helper(session, r,
				    salvage, last, last_deleted, 0, rle));
			}

			/*
			 * Swap the current/last state. We can't simply assign
			 * the data values into the last buffer because they may
			 * be a temporary copy built from a chain of modified
			 * updates and creating the next record will overwrite
			 * that memory. Check, we'd like to avoid the copy. If
			 * data was taken from an update structure, we can just
			 * use the pointers, they're not moving.
			 */
			if (!deleted) {
				if (update_no_copy) {
					last->data = data;
					last->size = size;
				} else
					WT_ERR(__wt_buf_set(
					    session, last, data, size));
			}

			/* Ready for the next loop, reset the RLE counter. */
			last_deleted = deleted;
			rle = 1;

			/*
			 * Move to the next record. It's not a simple increment
			 * because if it's the maximum record, incrementing it
			 * wraps to 0 and this turns into an infinite loop.
			 */
next:			if (src_recno == UINT64_MAX)
				break;
			++src_recno;
		}

		/*
		 * Execute this loop once without an insert item to catch any
		 * missing records due to a split, then quit.
		 */
		if (ins == NULL)
			break;
	}

	/* If we were tracking a record, write it. */
	if (rle != 0)
		WT_ERR(__rec_col_var_helper(
		    session, r, salvage, last, last_deleted, 0, rle));

	/* Write the remnant page. */
	ret = __wt_rec_split_finish(session, r);

err:	__wt_scr_free(session, &orig);
	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_dictionary.c b/src/third_party/wiredtiger/src/reconcile/rec_dictionary.c
new file mode 100644
index 00000000000..11707f77620
--- /dev/null
+++ b/src/third_party/wiredtiger/src/reconcile/rec_dictionary.c
@@ -0,0 +1,200 @@
/*-
 * Copyright (c) 2014-2019 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __rec_dictionary_skip_search --
 *	Search a dictionary skiplist.
 */
static WT_REC_DICTIONARY *
__rec_dictionary_skip_search(WT_REC_DICTIONARY **head, uint64_t hash)
{
	WT_REC_DICTIONARY **e;
	int i;

	/*
	 * Start at the highest skip level, then go as far as possible at each
	 * level before stepping down to the next.
	 *
	 * The entries at each node (and the head) are stored in per-level
	 * arrays, so stepping back one array slot (--e) moves to the next
	 * lower level of the same node.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
		if (*e == NULL) {		/* Empty levels */
			--i;
			--e;
			continue;
		}

		/*
		 * Return any exact matches: we don't care in what search level
		 * we found a match.
		 */
		if ((*e)->hash == hash)		/* Exact match */
			return (*e);
		if ((*e)->hash > hash) {	/* Drop down a level */
			--i;
			--e;
		} else				/* Keep going at this level */
			e = &(*e)->next[i];
	}
	return (NULL);
}

/*
 * __rec_dictionary_skip_search_stack --
 *	Search a dictionary skiplist, returning an insert/remove stack.
 */
static void
__rec_dictionary_skip_search_stack(
    WT_REC_DICTIONARY **head, WT_REC_DICTIONARY ***stack, uint64_t hash)
{
	WT_REC_DICTIONARY **e;
	int i;

	/*
	 * Start at the highest skip level, then go as far as possible at each
	 * level before stepping down to the next.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
		if (*e == NULL || (*e)->hash > hash)
			stack[i--] = e--;	/* Drop down a level */
		else
			e = &(*e)->next[i];	/* Keep going at this level */
}

/*
 * __rec_dictionary_skip_insert --
 *	Insert an entry into the dictionary skip-list.
 */
static void
__rec_dictionary_skip_insert(
    WT_REC_DICTIONARY **head, WT_REC_DICTIONARY *e, uint64_t hash)
{
	WT_REC_DICTIONARY **stack[WT_SKIP_MAXDEPTH];
	u_int i;

	/* Insert the new entry into the skiplist. */
	__rec_dictionary_skip_search_stack(head, stack, hash);
	/* Link the entry in at each of its levels, up to its chosen depth. */
	for (i = 0; i < e->depth; ++i) {
		e->next[i] = *stack[i];
		*stack[i] = e;
	}
}

/*
 * __wt_rec_dictionary_init --
 *	Allocate and initialize the dictionary.
 */
int
__wt_rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots)
{
	u_int depth, i;

	/* Free any previous dictionary. */
	__wt_rec_dictionary_free(session, r);

	r->dictionary_slots = slots;
	WT_RET(__wt_calloc(session,
	    r->dictionary_slots, sizeof(WT_REC_DICTIONARY *), &r->dictionary));
	/*
	 * Each slot is allocated separately with its own randomly chosen skip
	 * depth; the next[] array is sized to that depth. On allocation
	 * failure, cleanup is left to __wt_rec_dictionary_free.
	 */
	for (i = 0; i < r->dictionary_slots; ++i) {
		depth = __wt_skip_choose_depth(session);
		WT_RET(__wt_calloc(session, 1,
		    sizeof(WT_REC_DICTIONARY) +
		    depth * sizeof(WT_REC_DICTIONARY *), &r->dictionary[i]));
		r->dictionary[i]->depth = depth;
	}
	return (0);
}

/*
 * __wt_rec_dictionary_free --
 *	Free the dictionary.
 */
void
__wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
	u_int i;

	if (r->dictionary == NULL)
		return;

	/*
	 * We don't correct dictionary_slots when we fail during allocation,
	 * but that's OK, the value is either NULL or a memory reference to
	 * be free'd.
	 */
	for (i = 0; i < r->dictionary_slots; ++i)
		__wt_free(session, r->dictionary[i]);
	__wt_free(session, r->dictionary);
}

/*
 * __wt_rec_dictionary_reset --
 *	Reset the dictionary when reconciliation restarts and when crossing a
 * page boundary (a potential split).
 */
void
__wt_rec_dictionary_reset(WT_RECONCILE *r)
{
	if (r->dictionary_slots) {
		r->dictionary_next = 0;
		memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
	}
}

/*
 * __wt_rec_dictionary_lookup --
 *	Check the dictionary for a matching value on this page.
 */
int
__wt_rec_dictionary_lookup(WT_SESSION_IMPL *session,
    WT_RECONCILE *r, WT_REC_KV *val, WT_REC_DICTIONARY **dpp)
{
	WT_REC_DICTIONARY *dp, *next;
	uint64_t hash;
	bool match;

	*dpp = NULL;

	/*
	 * Search the dictionary, and return any match we find. A hash match
	 * alone isn't sufficient, compare the cell bytes as well; dp->offset
	 * is the matching cell's offset into the current disk image.
	 */
	hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
	for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
	    dp != NULL && dp->hash == hash; dp = dp->next[0]) {
		WT_RET(__wt_cell_pack_data_match(
		    (WT_CELL *)((uint8_t *)r->cur_ptr->image.mem + dp->offset),
		    &val->cell, val->buf.data, &match));
		if (match) {
			WT_STAT_DATA_INCR(session, rec_dictionary);
			*dpp = dp;
			return (0);
		}
	}

	/*
	 * We're not doing value replacement in the dictionary. We stop adding
	 * new entries if we run out of empty dictionary slots (but continue to
	 * use the existing entries). I can't think of any reason a leaf page
	 * value is more likely to be seen because it was seen more recently
	 * than some other value: if we find working sets where that's not the
	 * case, it shouldn't be too difficult to maintain a pointer which is
	 * the next dictionary slot to re-use.
	 */
	if (r->dictionary_next >= r->dictionary_slots)
		return (0);

	/*
	 * Set the hash value, we'll add this entry into the dictionary when we
	 * write it into the page's disk image buffer (because that's when we
	 * know where on the page it will be written).
	 */
	next = r->dictionary[r->dictionary_next++];
	next->offset = 0;		/* Not necessary, just cautious. */
	next->hash = hash;
	__rec_dictionary_skip_insert(r->dictionary_head, next, hash);
	*dpp = next;
	return (0);
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
new file mode 100644
index 00000000000..dc249f6a22f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -0,0 +1,1025 @@
/*-
 * Copyright (c) 2014-2019 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __rec_key_state_update --
 *	Update prefix and suffix compression based on the last key.
 */
static inline void
__rec_key_state_update(WT_RECONCILE *r, bool ovfl_key)
{
	WT_ITEM *a;

	/*
	 * If writing an overflow key onto the page, don't update the "last key"
	 * value, and leave the state of prefix compression alone. (If we are
	 * currently doing prefix compression, we have a key state which will
	 * continue to work, we're just skipping the key just created because
	 * it's an overflow key and doesn't participate in prefix compression.
	 * If we are not currently doing prefix compression, we can't start, an
	 * overflow key doesn't give us any state.)
	 *
	 * Additionally, if we wrote an overflow key onto the page, turn off the
	 * suffix compression of row-store internal node keys. (When we split,
	 * "last key" is the largest key on the previous page, and "cur key" is
	 * the first key on the next page, which is being promoted. In some
	 * cases we can discard bytes from the "cur key" that are not needed to
	 * distinguish between the "last key" and "cur key", compressing the
	 * size of keys on internal nodes. If we just built an overflow key,
	 * we're not going to update the "last key", making suffix compression
	 * impossible for the next key. Alternatively, we could remember where
	 * the last key was on the page, detect it's an overflow key, read it
	 * from disk and do suffix compression, but that's too much work for an
	 * unlikely event.)
	 *
	 * If we're not writing an overflow key on the page, update the last-key
	 * value and turn on both prefix and suffix compression.
	 */
	if (ovfl_key)
		r->key_sfx_compress = false;
	else {
		/* Swap the buffer pointers rather than copying the key. */
		a = r->cur;
		r->cur = r->last;
		r->last = a;

		r->key_pfx_compress = r->key_pfx_compress_conf;
		r->key_sfx_compress = r->key_sfx_compress_conf;
	}
}

/*
 * __rec_cell_build_int_key --
 *	Process a key and return a WT_CELL structure and byte string to be
 * stored on a row-store internal page.
 */
static int
__rec_cell_build_int_key(WT_SESSION_IMPL *session,
    WT_RECONCILE *r, const void *data, size_t size, bool *is_ovflp)
{
	WT_BTREE *btree;
	WT_REC_KV *key;

	*is_ovflp = false;

	btree = S2BT(session);

	key = &r->k;

	/* Copy the bytes into the "current" and key buffers. */
	WT_RET(__wt_buf_set(session, r->cur, data, size));
	WT_RET(__wt_buf_set(session, &key->buf, data, size));

	/* Create an overflow object if the data won't fit. */
	if (size > btree->maxintlkey) {
		WT_STAT_DATA_INCR(session, rec_overflow_key_internal);

		*is_ovflp = true;
		return (__wt_rec_cell_build_ovfl(
		    session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
	}

	key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size);
	key->len = key->cell_len + key->buf.size;

	return (0);
}

/*
 * __rec_cell_build_leaf_key --
 *	Process a key and return a WT_CELL structure and byte string to be
 * stored on a row-store leaf page.
 */
static int
__rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
    WT_RECONCILE *r, const void *data, size_t size, bool *is_ovflp)
{
	WT_BTREE *btree;
	WT_REC_KV *key;
	size_t pfx_max;
	const uint8_t *a, *b;
	uint8_t pfx;

	*is_ovflp = false;

	btree = S2BT(session);

	key = &r->k;

	pfx = 0;
	if (data == NULL)
		/*
		 * When data is NULL, our caller has a prefix compressed key
		 * they can't use (probably because they just crossed a split
		 * point). Use the full key saved when last called, instead.
		 */
		WT_RET(__wt_buf_set(
		    session, &key->buf, r->cur->data, r->cur->size));
	else {
		/*
		 * Save a copy of the key for later reference: we use the full
		 * key for prefix-compression comparisons, and if we are, for
		 * any reason, unable to use the compressed key we generate.
		 */
		WT_RET(__wt_buf_set(session, r->cur, data, size));

		/*
		 * Do prefix compression on the key. We know by definition the
		 * previous key sorts before the current key, which means the
		 * keys must differ and we just need to compare up to the
		 * shorter of the two keys.
		 */
		if (r->key_pfx_compress) {
			/*
			 * We can't compress out more than 256 bytes, limit the
			 * comparison to that.
			 */
			pfx_max = UINT8_MAX;
			if (size < pfx_max)
				pfx_max = size;
			if (r->last->size < pfx_max)
				pfx_max = r->last->size;
			for (a = data, b = r->last->data; pfx < pfx_max; ++pfx)
				if (*a++ != *b++)
					break;

			/*
			 * Prefix compression may cost us CPU and memory when
			 * the page is re-loaded, don't do it unless there's
			 * reasonable gain.
			 */
			if (pfx < btree->prefix_compression_min)
				pfx = 0;
			else
				WT_STAT_DATA_INCRV(
				    session, rec_prefix_compression, pfx);
		}

		/* Copy the non-prefix bytes into the key buffer. */
		WT_RET(__wt_buf_set(
		    session, &key->buf, (uint8_t *)data + pfx, size - pfx));
	}

	/* Optionally compress the key using the Huffman engine. */
	if (btree->huffman_key != NULL)
		WT_RET(__wt_huffman_encode(session, btree->huffman_key,
		    key->buf.data, (uint32_t)key->buf.size, &key->buf));

	/* Create an overflow object if the data won't fit. */
	if (key->buf.size > btree->maxleafkey) {
		/*
		 * Overflow objects aren't prefix compressed -- rebuild any
		 * object that was prefix compressed.
		 *
		 * The rebuild recurses with data == NULL, which restores the
		 * full key from r->cur (saved above).
		 */
		if (pfx == 0) {
			WT_STAT_DATA_INCR(session, rec_overflow_key_leaf);

			*is_ovflp = true;
			return (__wt_rec_cell_build_ovfl(
			    session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
		}
		return (
		    __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
	}

	key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size);
	key->len = key->cell_len + key->buf.size;

	return (0);
}

/*
 * __wt_bulk_insert_row --
 *	Row-store bulk insert.
 */
int
__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_RECONCILE *r;
	WT_REC_KV *key, *val;
	bool ovfl_key;

	r = cbulk->reconcile;
	btree = S2BT(session);
	cursor = &cbulk->cbt.iface;

	key = &r->k;
	val = &r->v;
	WT_RET(__rec_cell_build_leaf_key(session, r,	/* Build key cell */
	    cursor->key.data, cursor->key.size, &ovfl_key));
	WT_RET(__wt_rec_cell_build_val(session, r,	/* Build value cell */
	    cursor->value.data, cursor->value.size, (uint64_t)0));

	/* Boundary: split or write the page. */
	if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) {
		/*
		 * Turn off prefix compression until a full key written to the
		 * new page, and (unless already working with an overflow key),
		 * rebuild the key without compression.
		 */
		if (r->key_pfx_compress_conf) {
			r->key_pfx_compress = false;
			if (!ovfl_key)
				WT_RET(__rec_cell_build_leaf_key(
				    session, r, NULL, 0, &ovfl_key));
		}
		WT_RET(__wt_rec_split_crossing_bnd(
		    session, r, key->len + val->len));
	}

	/* Copy the key/value pair onto the page. */
	__wt_rec_copy_incr(session, r, key);
	/*
	 * Track whether the page has any empty values; NOTE(review): the
	 * flags are consumed elsewhere in reconciliation, not visible here.
	 */
	if (val->len == 0)
		r->any_empty_value = true;
	else {
		r->all_empty_value = false;
		if (btree->dictionary)
			WT_RET(__wt_rec_dict_replace(session, r, 0, val));
		__wt_rec_copy_incr(session, r, val);
	}

	/* Update compression state. */
	__rec_key_state_update(r, ovfl_key);

	return (0);
}

/*
 * __rec_row_merge --
 *	Merge in a split page.
 */
static int
__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_MULTI *multi;
	WT_PAGE_MODIFY *mod;
	WT_REC_KV *key, *val;
	uint32_t i;
	bool ovfl_key;

	mod = page->modify;

	key = &r->k;
	val = &r->v;

	/* For each entry in the split array... */
	for (multi = mod->mod_multi,
	    i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
		/*
		 * Build the key and value cells. The 0th key on an internal
		 * page is never used in tree search, truncate it to one byte.
		 */
		WT_RET(__rec_cell_build_int_key(session, r,
		    WT_IKEY_DATA(multi->key.ikey),
		    r->cell_zero ? 1 : multi->key.ikey->size, &ovfl_key));
		r->cell_zero = false;

		addr = &multi->addr;
		__wt_rec_cell_build_addr(session, r,
		    addr->addr, addr->size, __wt_rec_vtype(addr), WT_RECNO_OOB);

		/* Boundary: split or write the page. */
		if (__wt_rec_need_split(r, key->len + val->len))
			WT_RET(__wt_rec_split_crossing_bnd(
			    session, r, key->len + val->len));

		/* Copy the key and value onto the page. */
		__wt_rec_copy_incr(session, r, key);
		__wt_rec_copy_incr(session, r, val);

		/* Update compression state. */
		__rec_key_state_update(r, ovfl_key);
	}
	return (0);
}

/*
 * __wt_rec_row_int --
 *	Reconcile a row-store internal page.
 */
int
__wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
	WT_CHILD_STATE state;
	WT_DECL_RET;
	WT_IKEY *ikey;
	WT_PAGE *child;
	WT_REC_KV *key, *val;
	WT_REF *ref;
	size_t size;
	u_int vtype;
	bool hazard, key_onpage_ovfl, ovfl_key;
	const void *p;

	btree = S2BT(session);
	child = NULL;
	hazard = false;

	key = &r->k;
	kpack = &_kpack;
	WT_CLEAR(*kpack);	/* -Wuninitialized */
	val = &r->v;
	vpack = &_vpack;
	WT_CLEAR(*vpack);	/* -Wuninitialized */

	ikey = NULL;		/* -Wuninitialized */
	cell = NULL;
	key_onpage_ovfl = false;

	WT_RET(__wt_rec_split_init(
	    session, r, page, 0, btree->maxintlpage_precomp));

	/*
	 * Ideally, we'd never store the 0th key on row-store internal pages
	 * because it's never used during tree search and there's no reason
	 * to waste the space. The problem is how we do splits: when we split,
	 * we've potentially picked out several "split points" in the buffer
	 * which is overflowing the maximum page size, and when the overflow
	 * happens, we go back and physically split the buffer, at those split
	 * points, into new pages. It would be both difficult and expensive
	 * to re-process the 0th key at each split point to be an empty key,
	 * so we don't do that. However, we are reconciling an internal page
	 * for whatever reason, and the 0th key is known to be useless. We
	 * truncate the key to a single byte, instead of removing it entirely,
	 * it simplifies various things in other parts of the code (we don't
	 * have to special case transforming the page from its disk image to
	 * its in-memory version, for example).
	 */
	r->cell_zero = true;

	/* For each entry in the in-memory page... */
	WT_INTL_FOREACH_BEGIN(session, page, ref) {
		/*
		 * There are different paths if the key is an overflow item vs.
		 * a straight-forward on-page value. If an overflow item, we
		 * would have instantiated it, and we can use that fact to set
		 * things up.
		 *
		 * Note the cell reference and unpacked key cell are available
		 * only in the case of an instantiated, off-page key, we don't
		 * bother setting them if that's not possible.
		 */
		if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) {
			cell = NULL;
			key_onpage_ovfl = false;
			ikey = __wt_ref_key_instantiated(ref);
			if (ikey != NULL && ikey->cell_offset != 0) {
				cell =
				    WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
				__wt_cell_unpack(cell, kpack);
				key_onpage_ovfl = kpack->ovfl &&
				    kpack->raw != WT_CELL_KEY_OVFL_RM;
			}
		}

		WT_ERR(__wt_rec_child_modify(session, r, ref, &hazard, &state));
		addr = ref->addr;
		child = ref->page;

		switch (state) {
		case WT_CHILD_IGNORE:
			/*
			 * Ignored child.
			 *
			 * Overflow keys referencing pages we're not writing are
			 * no longer useful, schedule them for discard. Don't
			 * worry about instantiation, internal page keys are
			 * always instantiated. Don't worry about reuse,
			 * reusing this key in this reconciliation is unlikely.
			 */
			if (key_onpage_ovfl)
				WT_ERR(__wt_ovfl_discard_add(
				    session, page, kpack->cell));
			WT_CHILD_RELEASE_ERR(session, hazard, ref);
			continue;

		case WT_CHILD_MODIFIED:
			/*
			 * Modified child. Empty pages are merged into the
			 * parent and discarded.
			 */
			switch (child->modify->rec_result) {
			case WT_PM_REC_EMPTY:
				/*
				 * Overflow keys referencing empty pages are no
				 * longer useful, schedule them for discard.
				 * Don't worry about instantiation, internal
				 * page keys are always instantiated. Don't
				 * worry about reuse, reusing this key in this
				 * reconciliation is unlikely.
				 */
				if (key_onpage_ovfl)
					WT_ERR(__wt_ovfl_discard_add(
					    session, page, kpack->cell));
				WT_CHILD_RELEASE_ERR(session, hazard, ref);
				continue;
			case WT_PM_REC_MULTIBLOCK:
				/*
				 * Overflow keys referencing split pages are no
				 * longer useful (the split page's key is the
				 * interesting key); schedule them for discard.
				 * Don't worry about instantiation, internal
				 * page keys are always instantiated. Don't
				 * worry about reuse, reusing this key in this
				 * reconciliation is unlikely.
				 */
				if (key_onpage_ovfl)
					WT_ERR(__wt_ovfl_discard_add(
					    session, page, kpack->cell));

				WT_ERR(__rec_row_merge(session, r, child));
				WT_CHILD_RELEASE_ERR(session, hazard, ref);
				continue;
			case WT_PM_REC_REPLACE:
				/*
				 * If the page is replaced, the page's modify
				 * structure has the page's address.
				 */
				addr = &child->modify->mod_replace;
				break;
			WT_ILLEGAL_VALUE_ERR(
			    session, child->modify->rec_result);
			}
			break;
		case WT_CHILD_ORIGINAL:
			/* Original child. */
			break;
		case WT_CHILD_PROXY:
			/* Deleted child where we write a proxy cell. */
			break;
		}

		/*
		 * Build the value cell, the child page's address. Addr points
		 * to an on-page cell or an off-page WT_ADDR structure. There's
		 * a special cell type in the case of page deletion requiring
		 * a proxy cell, otherwise use the information from the addr or
		 * original cell.
		 */
		if (__wt_off_page(page, addr)) {
			p = addr->addr;
			size = addr->size;
			vtype = state == WT_CHILD_PROXY ?
			    WT_CELL_ADDR_DEL : __wt_rec_vtype(addr);
		} else {
			__wt_cell_unpack(ref->addr, vpack);
			p = vpack->data;
			size = vpack->size;
			vtype = state == WT_CHILD_PROXY ?
			    WT_CELL_ADDR_DEL : (u_int)vpack->raw;
		}
		__wt_rec_cell_build_addr(
		    session, r, p, size, vtype, WT_RECNO_OOB);
		WT_CHILD_RELEASE_ERR(session, hazard, ref);

		/*
		 * Build key cell.
		 * Truncate any 0th key, internal pages don't need 0th keys.
		 */
		if (key_onpage_ovfl) {
			/* Reuse the original overflow key cell unchanged. */
			key->buf.data = cell;
			key->buf.size = __wt_cell_total_len(kpack);
			key->cell_len = 0;
			key->len = key->buf.size;
			ovfl_key = true;
		} else {
			__wt_ref_key(page, ref, &p, &size);
			WT_ERR(__rec_cell_build_int_key(
			    session, r, p, r->cell_zero ? 1 : size, &ovfl_key));
		}
		r->cell_zero = false;

		/* Boundary: split or write the page. */
		if (__wt_rec_need_split(r, key->len + val->len)) {
			/*
			 * In one path above, we copied address blocks from the
			 * page rather than building the actual key. In that
			 * case, we have to build the key now because we are
			 * about to promote it.
			 */
			if (key_onpage_ovfl) {
				WT_ERR(__wt_buf_set(session, r->cur,
				    WT_IKEY_DATA(ikey), ikey->size));
				key_onpage_ovfl = false;
			}

			WT_ERR(__wt_rec_split_crossing_bnd(
			    session, r, key->len + val->len));
		}

		/* Copy the key and value onto the page. */
		__wt_rec_copy_incr(session, r, key);
		__wt_rec_copy_incr(session, r, val);

		/* Update compression state. */
		__rec_key_state_update(r, ovfl_key);
	} WT_INTL_FOREACH_END;

	/* Write the remnant page. */
	return (__wt_rec_split_finish(session, r));

	/* On error, release any hazard pointer held on the current child. */
err:	WT_CHILD_RELEASE(session, hazard, ref);
	return (ret);
}

/*
 * __rec_row_leaf_insert --
 *	Walk an insert chain, writing K/V pairs.
 */
static int
__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
{
	WT_BTREE *btree;
	WT_CURSOR_BTREE *cbt;
	WT_REC_KV *key, *val;
	WT_UPDATE *upd;
	bool ovfl_key, upd_saved;

	btree = S2BT(session);
	cbt = &r->update_modify_cbt;

	key = &r->k;
	val = &r->v;

	for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
		WT_RET(__wt_rec_txn_read(
		    session, r, ins, NULL, NULL, &upd_saved, &upd));

		if (upd == NULL) {
			/*
			 * If no update is visible but some were saved, check
			 * for splits.
			 */
			if (!upd_saved)
				continue;
			if (!__wt_rec_need_split(r, WT_INSERT_KEY_SIZE(ins)))
				continue;

			/* Copy the current key into place and then split.
			 */
			WT_RET(__wt_buf_set(session, r->cur,
			    WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins)));
			WT_RET(__wt_rec_split_crossing_bnd(
			    session, r, WT_INSERT_KEY_SIZE(ins)));

			/*
			 * Turn off prefix and suffix compression until a full
			 * key is written into the new page.
			 */
			r->key_pfx_compress = r->key_sfx_compress = false;
			continue;
		}

		switch (upd->type) {
		case WT_UPDATE_MODIFY:
			/*
			 * Impossible slot, there's no backing on-page
			 * item.
			 */
			cbt->slot = UINT32_MAX;
			WT_RET(__wt_value_return_upd(
			    session, cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL)));
			WT_RET(__wt_rec_cell_build_val(session, r,
			    cbt->iface.value.data,
			    cbt->iface.value.size, (uint64_t)0));
			break;
		case WT_UPDATE_STANDARD:
			if (upd->size == 0)
				val->len = 0;
			else
				WT_RET(__wt_rec_cell_build_val(session,
				    r, upd->data, upd->size,
				    (uint64_t)0));
			break;
		case WT_UPDATE_TOMBSTONE:
			/* Deleted item: write nothing. */
			continue;
		WT_ILLEGAL_VALUE(session, upd->type);
		}

		/* Build key cell. */
		WT_RET(__rec_cell_build_leaf_key(session, r,
		    WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));

		/* Boundary: split or write the page. */
		if (__wt_rec_need_split(r, key->len + val->len)) {
			/*
			 * Turn off prefix compression until a full key written
			 * to the new page, and (unless already working with an
			 * overflow key), rebuild the key without compression.
			 */
			if (r->key_pfx_compress_conf) {
				r->key_pfx_compress = false;
				if (!ovfl_key)
					WT_RET(__rec_cell_build_leaf_key(
					    session, r, NULL, 0, &ovfl_key));
			}

			WT_RET(__wt_rec_split_crossing_bnd(
			    session, r, key->len + val->len));
		}

		/* Copy the key/value pair onto the page. */
		__wt_rec_copy_incr(session, r, key);
		if (val->len == 0)
			r->any_empty_value = true;
		else {
			r->all_empty_value = false;
			if (btree->dictionary)
				WT_RET(__wt_rec_dict_replace(
				    session, r, 0, val));
			__wt_rec_copy_incr(session, r, val);
		}

		/* Update compression state. */
		__rec_key_state_update(r, ovfl_key);
	}

	return (0);
}

/*
 * __wt_rec_row_leaf --
 *	Reconcile a row-store leaf page.
 */
int
__wt_rec_row_leaf(WT_SESSION_IMPL *session,
    WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
	WT_CURSOR_BTREE *cbt;
	WT_DECL_ITEM(tmpkey);
	WT_DECL_ITEM(tmpval);
	WT_DECL_RET;
	WT_IKEY *ikey;
	WT_INSERT *ins;
	WT_REC_KV *key, *val;
	WT_ROW *rip;
	WT_UPDATE *upd;
	size_t size;
	uint64_t slvg_skip;
	uint32_t i;
	bool dictionary, key_onpage_ovfl, ovfl_key;
	void *copy;
	const void *p;

	btree = S2BT(session);
	cbt = &r->update_modify_cbt;
	slvg_skip = salvage == NULL ? 0 : salvage->skip;

	key = &r->k;
	val = &r->v;
	vpack = &_vpack;

	WT_RET(__wt_rec_split_init(
	    session, r, page, 0, btree->maxleafpage_precomp));

	/*
	 * Write any K/V pairs inserted into the page before the first from-disk
	 * key on the page.
	 */
	if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL)
		WT_RET(__rec_row_leaf_insert(session, r, ins));

	/*
	 * Temporary buffers in which to instantiate any uninstantiated keys
	 * or value items we need.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &tmpkey));
	WT_ERR(__wt_scr_alloc(session, 0, &tmpval));

	/* For each entry in the page... */
	WT_ROW_FOREACH(page, rip, i) {
		/*
		 * The salvage code, on some rare occasions, wants to reconcile
		 * a page but skip some leading records on the page. Because
		 * the row-store leaf reconciliation function copies keys from
		 * the original disk page, this is non-trivial -- just changing
		 * the in-memory pointers isn't sufficient, we have to change
		 * the WT_CELL structures on the disk page, too. It's ugly, but
		 * we pass in a value that tells us how many records to skip in
		 * this case.
		 */
		if (slvg_skip != 0) {
			--slvg_skip;
			continue;
		}

		/*
		 * Figure out the key: set any cell reference (and unpack it),
		 * set any instantiated key reference.
		 */
		copy = WT_ROW_KEY_COPY(rip);
		(void)__wt_row_leaf_key_info(
		    page, copy, &ikey, &cell, NULL, NULL);
		if (cell == NULL)
			kpack = NULL;
		else {
			kpack = &_kpack;
			__wt_cell_unpack(cell, kpack);
		}

		/* Unpack the on-page value cell, and look for an update. */
		__wt_row_leaf_value_cell(page, rip, NULL, vpack);
		WT_ERR(__wt_rec_txn_read(
		    session, r, NULL, rip, vpack, NULL, &upd));

		/* Build value cell. */
		dictionary = false;
		if (upd == NULL) {
			/*
			 * When the page was read into memory, there may not
			 * have been a value item.
			 *
			 * If there was a value item, check if it's a dictionary
			 * cell (a copy of another item on the page). If it's a
			 * copy, we have to create a new value item as the old
			 * item might have been discarded from the page.
			 */
			if (vpack->raw == WT_CELL_VALUE_COPY) {
				/* If the item is Huffman encoded, decode it. */
				if (btree->huffman_value == NULL) {
					p = vpack->data;
					size = vpack->size;
				} else {
					WT_ERR(__wt_huffman_decode(session,
					    btree->huffman_value,
					    vpack->data, vpack->size,
					    tmpval));
					p = tmpval->data;
					size = tmpval->size;
				}
				WT_ERR(__wt_rec_cell_build_val(
				    session, r, p, size, (uint64_t)0));
				dictionary = true;
			} else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
				/*
				 * If doing an update save and restore, and the
				 * underlying value is a removed overflow value,
				 * we end up here.
				 *
				 * If necessary, when the overflow value was
				 * originally removed, reconciliation appended
				 * a globally visible copy of the value to the
				 * key's update list, meaning the on-page item
				 * isn't accessed after page re-instantiation.
				 *
				 * Assert the case.
				 */
				WT_ASSERT(session,
				    F_ISSET(r, WT_REC_UPDATE_RESTORE));

				/*
				 * If the key is also a removed overflow item,
				 * don't write anything at all.
				 *
				 * We don't have to write anything because the
				 * code re-instantiating the page gets the key
				 * to match the saved list of updates from the
				 * original page. By not putting the key on
				 * the page, we'll move the key/value set from
				 * a row-store leaf page slot to an insert list,
				 * but that shouldn't matter.
				 *
				 * The reason we bother with the test is because
				 * overflows are expensive to write. It's hard
				 * to imagine a real workload where this test is
				 * worth the effort, but it's a simple test.
				 */
				if (kpack != NULL &&
				    kpack->raw == WT_CELL_KEY_OVFL_RM)
					goto leaf_insert;

				/*
				 * The on-page value will never be accessed,
				 * write a placeholder record.
				 */
				WT_ERR(__wt_rec_cell_build_val(session, r,
				    "ovfl-unused", strlen("ovfl-unused"),
				    (uint64_t)0));
			} else {
				/* Copy the on-page cell through unchanged. */
				val->buf.data = vpack->cell;
				val->buf.size = __wt_cell_total_len(vpack);
				val->cell_len = 0;
				val->len = val->buf.size;

				/* Track if page has overflow items. */
				if (vpack->ovfl)
					r->ovfl_items = true;
			}
		} else {
			/*
			 * The first time we find an overflow record we're not
			 * going to use, discard the underlying blocks.
			 */
			if (vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
				WT_ERR(__wt_ovfl_remove(session,
				    page, vpack, F_ISSET(r, WT_REC_EVICT)));

			switch (upd->type) {
			case WT_UPDATE_MODIFY:
				cbt->slot = WT_ROW_SLOT(page, rip);
				WT_ERR(__wt_value_return_upd(session, cbt, upd,
				    F_ISSET(r, WT_REC_VISIBLE_ALL)));
				WT_ERR(__wt_rec_cell_build_val(session, r,
				    cbt->iface.value.data,
				    cbt->iface.value.size, (uint64_t)0));
				dictionary = true;
				break;
			case WT_UPDATE_STANDARD:
				/*
				 * If no value, nothing needs to be copied.
				 * Otherwise, build the value's chunk from the
				 * update value.
				 */
				if (upd->size == 0) {
					val->buf.data = NULL;
					val->cell_len =
					    val->len = val->buf.size = 0;
				} else {
					WT_ERR(__wt_rec_cell_build_val(
					    session, r,
					    upd->data, upd->size, (uint64_t)0));
					dictionary = true;
				}
				break;
			case WT_UPDATE_TOMBSTONE:
				/*
				 * If this key/value pair was deleted, we're
				 * done.
				 *
				 * Overflow keys referencing discarded values
				 * are no longer useful, discard the backing
				 * blocks. Don't worry about reuse, reusing
				 * keys from a row-store page reconciliation
				 * seems unlikely enough to ignore.
				 */
				if (kpack != NULL && kpack->ovfl &&
				    kpack->raw != WT_CELL_KEY_OVFL_RM) {
					/*
					 * Keys are part of the name-space, we
					 * can't remove them from the in-memory
					 * tree; if an overflow key was deleted
					 * without being instantiated (for
					 * example, cursor-based truncation), do
					 * it now.
					 */
					if (ikey == NULL)
						WT_ERR(__wt_row_leaf_key(
						    session,
						    page, rip, tmpkey, true));

					WT_ERR(__wt_ovfl_discard_add(
					    session, page, kpack->cell));
				}

				/*
				 * We aren't actually creating the key so we
				 * can't use bytes from this key to provide
				 * prefix information for a subsequent key.
				 */
				tmpkey->size = 0;

				/* Proceed with appended key/value pairs. */
				goto leaf_insert;
			WT_ILLEGAL_VALUE_ERR(session, upd->type);
			}
		}

		/*
		 * Build key cell.
		 *
		 * If the key is an overflow key that hasn't been removed, use
		 * the original backing blocks.
		 */
		key_onpage_ovfl = kpack != NULL &&
		    kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
		if (key_onpage_ovfl) {
			key->buf.data = cell;
			key->buf.size = __wt_cell_total_len(kpack);
			key->cell_len = 0;
			key->len = key->buf.size;
			ovfl_key = true;

			/*
			 * We aren't creating a key so we can't use this key as
			 * a prefix for a subsequent key.
			 */
			tmpkey->size = 0;

			/* Track if page has overflow items. */
			r->ovfl_items = true;
		} else {
			/*
			 * Get the key from the page or an instantiated key, or
			 * inline building the key from a previous key (it's a
			 * fast path for simple, prefix-compressed keys), or by
			 * by building the key from scratch.
			 */
			if (__wt_row_leaf_key_info(page, copy,
			    NULL, &cell, &tmpkey->data, &tmpkey->size))
				goto build;

			kpack = &_kpack;
			__wt_cell_unpack(cell, kpack);
			if (btree->huffman_key == NULL &&
			    kpack->type == WT_CELL_KEY &&
			    tmpkey->size >= kpack->prefix) {
				/*
				 * The previous clause checked for a prefix of
				 * zero, which means the temporary buffer must
				 * have a non-zero size, and it references a
				 * valid key.
				 */
				WT_ASSERT(session, tmpkey->size != 0);

				/*
				 * Grow the buffer as necessary, ensuring data
				 * data has been copied into local buffer space,
				 * then append the suffix to the prefix already
				 * in the buffer.
				 *
				 * Don't grow the buffer unnecessarily or copy
				 * data we don't need, truncate the item's data
				 * length to the prefix bytes.
				 */
				tmpkey->size = kpack->prefix;
				WT_ERR(__wt_buf_grow(session,
				    tmpkey, tmpkey->size + kpack->size));
				memcpy((uint8_t *)tmpkey->mem + tmpkey->size,
				    kpack->data, kpack->size);
				tmpkey->size += kpack->size;
			} else
				WT_ERR(__wt_row_leaf_key_copy(
				    session, page, rip, tmpkey));
build:
			WT_ERR(__rec_cell_build_leaf_key(session, r,
			    tmpkey->data, tmpkey->size, &ovfl_key));
		}

		/* Boundary: split or write the page. */
		if (__wt_rec_need_split(r, key->len + val->len)) {
			/*
			 * If we copied address blocks from the page rather than
			 * building the actual key, we have to build the key now
			 * because we are about to promote it.
			 */
			if (key_onpage_ovfl) {
				WT_ERR(__wt_dsk_cell_data_ref(session,
				    WT_PAGE_ROW_LEAF, kpack, r->cur));
				WT_NOT_READ(key_onpage_ovfl, false);
			}

			/*
			 * Turn off prefix compression until a full key written
			 * to the new page, and (unless already working with an
			 * overflow key), rebuild the key without compression.
			 */
			if (r->key_pfx_compress_conf) {
				r->key_pfx_compress = false;
				if (!ovfl_key)
					WT_ERR(__rec_cell_build_leaf_key(
					    session, r, NULL, 0, &ovfl_key));
			}

			WT_ERR(__wt_rec_split_crossing_bnd(
			    session, r, key->len + val->len));
		}

		/* Copy the key/value pair onto the page. */
		__wt_rec_copy_incr(session, r, key);
		if (val->len == 0)
			r->any_empty_value = true;
		else {
			r->all_empty_value = false;
			if (dictionary && btree->dictionary)
				WT_ERR(__wt_rec_dict_replace(
				    session, r, 0, val));
			__wt_rec_copy_incr(session, r, val);
		}

		/* Update compression state. */
		__rec_key_state_update(r, ovfl_key);

leaf_insert:	/* Write any K/V pairs inserted into the page after this key. */
		if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL)
			WT_ERR(__rec_row_leaf_insert(session, r, ins));
	}

	/* Write the remnant page. */
	ret = __wt_rec_split_finish(session, r);

err:	__wt_scr_free(session, &tmpkey);
	__wt_scr_free(session, &tmpval);
	return (ret);
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
new file mode 100644
index 00000000000..97903db9e9e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -0,0 +1,405 @@
/*-
 * Copyright (c) 2014-2019 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __rec_update_save --
 *	Save a WT_UPDATE list for later restoration.
 */
static int
__rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r,
    WT_INSERT *ins, void *ripcip, WT_UPDATE *onpage_upd, size_t upd_memsize)
{
	/* Grow the saved-update array as needed, then record the entry. */
	WT_RET(__wt_realloc_def(
	    session, &r->supd_allocated, r->supd_next + 1, &r->supd));
	r->supd[r->supd_next].ins = ins;
	r->supd[r->supd_next].ripcip = ripcip;
	r->supd[r->supd_next].onpage_upd = onpage_upd;
	++r->supd_next;
	r->supd_memsize += upd_memsize;
	return (0);
}

/*
 * __rec_append_orig_value --
 *	Append the key's original value to its update list.
 */
static int
__rec_append_orig_value(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_UPDATE *append;
	size_t size;

	/* Done if at least one self-contained update is globally visible. */
	for (;; upd = upd->next) {
		if (WT_UPDATE_DATA_VALUE(upd) &&
		    __wt_txn_upd_visible_all(session, upd))
			return (0);

		/* Add the original value after birthmarks. */
		if (upd->type == WT_UPDATE_BIRTHMARK) {
			WT_ASSERT(session, unpack != NULL &&
			    unpack->type != WT_CELL_DEL);
			break;
		}

		/* Leave reference at the last item in the chain. */
		if (upd->next == NULL)
			break;
	}

	/*
	 * We need the original on-page value for some reader: get a copy and
	 * append it to the end of the update list with a transaction ID that
	 * guarantees its visibility.
	 *
	 * If we don't have a value cell, it's an insert/append list key/value
	 * pair which simply doesn't exist for some reader; place a deleted
	 * record at the end of the update list.
	 */
	append = NULL;			/* -Wconditional-uninitialized */
	size = 0;			/* -Wconditional-uninitialized */
	if (unpack == NULL || unpack->type == WT_CELL_DEL)
		WT_RET(__wt_update_alloc(session,
		    NULL, &append, &size, WT_UPDATE_TOMBSTONE));
	else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp));
		WT_ERR(__wt_update_alloc(
		    session, tmp, &append, &size, WT_UPDATE_STANDARD));
	}

	/*
	 * If we're saving the original value for a birthmark, transfer over
	 * the transaction ID and clear out the birthmark update.
	 *
	 * Else, set the entry's transaction information to the lowest possible
	 * value. Cleared memory matches the lowest possible transaction ID and
	 * timestamp, do nothing.
	 */
	if (upd->type == WT_UPDATE_BIRTHMARK) {
		append->txnid = upd->txnid;
		append->timestamp = upd->timestamp;
		append->next = upd->next;
	}

	/* Append the new entry into the update list. */
	WT_PUBLISH(upd->next, append);
	__wt_cache_page_inmem_incr(session, page, size);

	/* The birthmark becomes an ordinary, aborted update. */
	if (upd->type == WT_UPDATE_BIRTHMARK) {
		upd->type = WT_UPDATE_STANDARD;
		upd->txnid = WT_TXN_ABORTED;
	}

err:	__wt_scr_free(session, &tmp);
	return (ret);
}

/*
 * __wt_rec_txn_read --
 *	Return the update in a list that should be written (or NULL if none can
 *	be written).
 */
int
__wt_rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
    WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack,
    bool *upd_savedp, WT_UPDATE **updp)
{
	WT_PAGE *page;
	WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd;
	wt_timestamp_t timestamp;
	size_t upd_memsize;
	uint64_t max_txn, txnid;
	bool all_visible, prepared, skipped_birthmark, uncommitted, upd_saved;

	if (upd_savedp != NULL)
		*upd_savedp = false;
	*updp = NULL;

	page = r->page;
	first_ts_upd = first_txn_upd = NULL;
	upd_memsize = 0;
	max_txn = WT_TXN_NONE;
	prepared = skipped_birthmark = uncommitted = upd_saved = false;

	/*
	 * If called with a WT_INSERT item, use its WT_UPDATE list (which must
	 * exist), otherwise check for an on-page row-store WT_UPDATE list
	 * (which may not exist). Return immediately if the item has no updates.
	 */
	if (ins != NULL)
		first_upd = ins->upd;
	else if ((first_upd = WT_ROW_UPDATE(page, ripcip)) == NULL)
		return (0);

	for (upd = first_upd; upd != NULL; upd = upd->next) {
		if ((txnid = upd->txnid) == WT_TXN_ABORTED)
			continue;

		++r->updates_seen;
		upd_memsize += WT_UPDATE_MEMSIZE(upd);

		/*
		 * Track the first update in the chain that is not aborted and
		 * the maximum transaction ID.
		 */
		if (first_txn_upd == NULL)
			first_txn_upd = upd;

		/* Track the largest transaction ID seen. */
		if (WT_TXNID_LT(max_txn, txnid))
			max_txn = txnid;

		/*
		 * Check whether the update was committed before reconciliation
		 * started. The global commit point can move forward during
		 * reconciliation so we use a cached copy to avoid races when a
		 * concurrent transaction commits or rolls back while we are
		 * examining its updates. As prepared transaction id's are
		 * globally visible, need to check the update state as well.
		 */
		if (F_ISSET(r, WT_REC_EVICT)) {
			if (upd->prepare_state == WT_PREPARE_LOCKED ||
			    upd->prepare_state == WT_PREPARE_INPROGRESS)
				prepared = true;

			if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
			    WT_TXNID_LE(r->last_running, txnid) :
			    !__txn_visible_id(session, txnid))
				uncommitted = r->update_uncommitted = true;

			if (prepared || uncommitted)
				continue;
		}

		/* Track the first update with non-zero timestamp. */
		if (first_ts_upd == NULL && upd->timestamp != 0)
			first_ts_upd = upd;

		/*
		 * Find the first update we can use.
		 *
		 * Update/restore eviction can handle any update (including
		 * uncommitted updates). Lookaside eviction can save any
		 * committed update. Regular eviction checks that the maximum
		 * transaction ID and timestamp seen are stable.
		 *
		 * Lookaside and update/restore eviction try to choose the same
		 * version as a subsequent checkpoint, so that checkpoint can
		 * skip over pages with lookaside entries. If the application
		 * has supplied a stable timestamp, we assume (a) that it is
		 * old, and (b) that the next checkpoint will use it, so we wait
		 * to see a stable update. If there is no stable timestamp, we
		 * assume the next checkpoint will write the most recent version
		 * (but we save enough information that checkpoint can fix
		 * things up if we choose an update that is too new).
		 */
		if (*updp == NULL && r->las_skew_newest)
			*updp = upd;

		if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
		    !__wt_txn_upd_visible_all(session, upd) :
		    !__wt_txn_upd_visible(session, upd)) {
			if (F_ISSET(r, WT_REC_EVICT))
				++r->updates_unstable;

			/*
			 * Rare case: when applications run at low isolation
			 * levels, update/restore eviction may see a stable
			 * update followed by an uncommitted update. Give up
			 * in that case: we need to discard updates from the
			 * stable update and older for correctness and we can't
			 * discard an uncommitted update.
			 */
			if (F_ISSET(r, WT_REC_UPDATE_RESTORE) &&
			    *updp != NULL && (uncommitted || prepared)) {
				r->leave_dirty = true;
				return (__wt_set_return(session, EBUSY));
			}

			if (upd->type == WT_UPDATE_BIRTHMARK)
				skipped_birthmark = true;

			continue;
		}

		/*
		 * Lookaside without stable timestamp was taken care of above
		 * (set to the first uncommitted transaction). Lookaside with
		 * stable timestamp always takes the first stable update.
		 */
		if (*updp == NULL)
			*updp = upd;

		if (!F_ISSET(r, WT_REC_EVICT))
			break;
	}

	/* Keep track of the selected update. */
	upd = *updp;

	/* Reconciliation should never see an aborted or reserved update. */
	WT_ASSERT(session, upd == NULL ||
	    (upd->txnid != WT_TXN_ABORTED && upd->type != WT_UPDATE_RESERVE));

	/* If all of the updates were aborted, quit. */
	if (first_txn_upd == NULL) {
		WT_ASSERT(session, upd == NULL);
		return (0);
	}

	/* If no updates were skipped, record that we're making progress. */
	if (upd == first_txn_upd)
		r->update_used = true;

	/*
	 * The checkpoint transaction is special. Make sure we never write
	 * metadata updates from a checkpoint in a concurrent session.
	 */
	WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) ||
	    upd == NULL || upd->txnid == WT_TXN_NONE ||
	    upd->txnid != S2C(session)->txn_global.checkpoint_state.id ||
	    WT_SESSION_IS_CHECKPOINT(session));

	/*
	 * Track the most recent transaction in the page. We store this in the
	 * tree at the end of reconciliation in the service of checkpoints, it
	 * is used to avoid discarding trees from memory when they have changes
	 * required to satisfy a snapshot read.
	 */
	if (WT_TXNID_LT(r->max_txn, max_txn))
		r->max_txn = max_txn;

	/* Update the maximum timestamp. */
	if (first_ts_upd != NULL && r->max_timestamp < first_ts_upd->timestamp)
		r->max_timestamp = first_ts_upd->timestamp;

	/*
	 * If the update we chose was a birthmark, or we are doing
	 * update-restore and we skipped a birthmark, the original on-page
	 * value must be retained.
	 */
	if (upd != NULL &&
	    (upd->type == WT_UPDATE_BIRTHMARK ||
	    (F_ISSET(r, WT_REC_UPDATE_RESTORE) && skipped_birthmark)))
		*updp = NULL;

	/*
	 * Check if all updates on the page are visible. If not, it must stay
	 * dirty unless we are saving updates to the lookaside table.
	 *
	 * Updates can be out of transaction ID order (but not out of timestamp
	 * order), so we track the maximum transaction ID and the newest update
	 * with a timestamp (if any).
	 */
	timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->timestamp;
	all_visible = upd == first_txn_upd && !(uncommitted || prepared) &&
	    (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
	    __wt_txn_visible_all(session, max_txn, timestamp) :
	    __wt_txn_visible(session, max_txn, timestamp));

	if (all_visible)
		goto check_original_value;

	r->leave_dirty = true;

	if (F_ISSET(r, WT_REC_VISIBILITY_ERR))
		WT_PANIC_RET(session, EINVAL,
		    "reconciliation error, update not visible");

	/*
	 * If not trying to evict the page, we know what we'll write and we're
	 * done.
	 */
	if (!F_ISSET(r, WT_REC_EVICT))
		goto check_original_value;

	/*
	 * We are attempting eviction with changes that are not yet stable
	 * (i.e. globally visible). There are two ways to continue, the
	 * save/restore eviction path or the lookaside table eviction path.
	 * Both cannot be configured because the paths track different
	 * information. The update/restore path can handle uncommitted changes,
	 * by evicting most of the page and then creating a new, smaller page
	 * to which we re-attach those changes. Lookaside eviction writes
	 * changes into the lookaside table and restores them on demand if and
	 * when the page is read back into memory.
	 *
	 * Both paths are configured outside of reconciliation: the save/restore
	 * path is the WT_REC_UPDATE_RESTORE flag, the lookaside table path is
	 * the WT_REC_LOOKASIDE flag.
	 */
	if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE))
		return (__wt_set_return(session, EBUSY));
	if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE))
		return (__wt_set_return(session, EBUSY));

	WT_ASSERT(session, r->max_txn != WT_TXN_NONE);

	/*
	 * The order of the updates on the list matters, we can't move only the
	 * unresolved updates, move the entire update list.
	 */
	WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize));
	upd_saved = true;
	if (upd_savedp != NULL)
		*upd_savedp = true;

	/*
	 * Track the first off-page update when saving history in the lookaside
	 * table. When skewing newest, we want the first (non-aborted) update
	 * after the one stored on the page. Otherwise, we want the update
	 * before the on-page update.
	 */
	if (F_ISSET(r, WT_REC_LOOKASIDE) && r->las_skew_newest) {
		if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid))
			r->unstable_txn = first_upd->txnid;
		if (first_ts_upd != NULL &&
		    r->unstable_timestamp < first_ts_upd->timestamp)
			r->unstable_timestamp = first_ts_upd->timestamp;
	} else if (F_ISSET(r, WT_REC_LOOKASIDE)) {
		for (upd = first_upd; upd != *updp; upd = upd->next) {
			if (upd->txnid == WT_TXN_ABORTED)
				continue;

			if (upd->txnid != WT_TXN_NONE &&
			    WT_TXNID_LT(upd->txnid, r->unstable_txn))
				r->unstable_txn = upd->txnid;
			if (upd->timestamp < r->unstable_timestamp)
				r->unstable_timestamp = upd->timestamp;
		}
	}

check_original_value:
	/*
	 * Paranoia: check that we didn't choose an update that has since been
	 * rolled back.
	 */
	WT_ASSERT(session, *updp == NULL || (*updp)->txnid != WT_TXN_ABORTED);

	/*
	 * Returning an update means the original on-page value might be lost,
	 * and that's a problem if there's a reader that needs it. This call
	 * makes a copy of the on-page value and if there is a birthmark in the
	 * update list, replaces it. We do that any time there are saved
	 * updates and during reconciliation of a backing overflow record that
	 * will be physically removed once it's no longer needed.
	 */
	if (*updp != NULL && (upd_saved ||
	    (vpack != NULL && vpack->ovfl &&
	    vpack->raw != WT_CELL_VALUE_OVFL_RM)))
		WT_RET(
		    __rec_append_orig_value(session, page, first_upd, vpack));

	return (0);
}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 87ce7ca1cc3..1c873fc3d8a 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -8,307 +8,18 @@
 #include "wt_internal.h"
-struct __rec_chunk; typedef struct __rec_chunk WT_CHUNK;
-struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY;
-struct __rec_kv; typedef struct __rec_kv WT_KV;
-
-/*
- * Reconciliation is the process of taking an in-memory page, walking each entry
- * in the page, building a backing disk image in a temporary buffer representing
- * that information, and writing that buffer to disk. What could be simpler?
- *
- * WT_RECONCILE --
- *	Information tracking a single page reconciliation.
- */
-typedef struct {
-	WT_REF  *ref;			/* Page being reconciled */
-	WT_PAGE *page;
-	uint32_t flags;			/* Caller's configuration */
-
-	/*
-	 * Track start/stop write generation to decide if all changes to the
-	 * page are written.
-	 */
-	uint32_t orig_write_gen;
-
-	/*
-	 * Track start/stop checkpoint generations to decide if lookaside table
-	 * records are correct.
-	 */
-	uint64_t orig_btree_checkpoint_gen;
-	uint64_t orig_txn_checkpoint_gen;
-
-	/*
-	 * Track the oldest running transaction and whether to skew lookaside
-	 * to the newest update.
-	 */
-	bool las_skew_newest;
-	uint64_t last_running;
-
-	/* Track the page's min/maximum transactions.
*/ - uint64_t max_txn; - wt_timestamp_t max_timestamp; - - /* Lookaside boundary tracking. */ - uint64_t unstable_txn; - wt_timestamp_t unstable_timestamp; - - u_int updates_seen; /* Count of updates seen. */ - u_int updates_unstable; /* Count of updates not visible_all. */ - - bool update_uncommitted; /* An update was uncommitted */ - bool update_used; /* An update could be used */ - - /* - * When we can't mark the page clean (for example, checkpoint found some - * uncommitted updates), there's a leave-dirty flag. - */ - bool leave_dirty; - - /* - * Track if reconciliation has seen any overflow items. If a leaf page - * with no overflow items is written, the parent page's address cell is - * set to the leaf-no-overflow type. This means we can delete the leaf - * page without reading it because we don't have to discard any overflow - * items it might reference. - * - * The test test is per-page reconciliation, that is, once we see an - * overflow item on the page, all subsequent leaf pages written for the - * page will not be leaf-no-overflow type, regardless of whether or not - * they contain overflow items. In other words, leaf-no-overflow is not - * guaranteed to be set on every page that doesn't contain an overflow - * item, only that if it is set, the page contains no overflow items. - * XXX - * This was originally done because raw compression couldn't do better, - * now that raw compression has been removed, we should do better. - */ - bool ovfl_items; - - /* - * Track if reconciliation of a row-store leaf page has seen empty (zero - * length) values. We don't write out anything for empty values, so if - * there are empty values on a page, we have to make two passes over the - * page when it's read to figure out how many keys it has, expensive in - * the common case of no empty values and (entries / 2) keys. Likewise, - * a page with only empty values is another common data set, and keys on - * that page will be equal to the number of entries. 
In both cases, set - * a flag in the page's on-disk header. - * - * The test is per-page reconciliation as described above for the - * overflow-item test. - */ - bool all_empty_value, any_empty_value; - - /* - * Reconciliation gets tricky if we have to split a page, which happens - * when the disk image we create exceeds the page type's maximum disk - * image size. - * - * First, the target size of the page we're building. - */ - uint32_t page_size; /* Page size */ - - /* - * Second, the split size: if we're doing the page layout, split to a - * smaller-than-maximum page size when a split is required so we don't - * repeatedly split a packed page. - */ - uint32_t split_size; /* Split page size */ - uint32_t min_split_size; /* Minimum split page size */ - - /* - * We maintain two split chunks in the memory during reconciliation to - * be written out as pages. As we get to the end of the data, if the - * last one turns out to be smaller than the minimum split size, we go - * back into the penultimate chunk and split at this minimum split size - * boundary. This moves some data from the penultimate chunk to the last - * chunk, hence increasing the size of the last page written without - * decreasing the penultimate page size beyond the minimum split size. - * For this reason, we maintain an expected split percentage boundary - * and a minimum split percentage boundary. - * - * Chunks are referenced by current and previous pointers. In case of a - * split, previous references the first chunk and current switches to - * the second chunk. If reconciliation generates more split chunks, the - * the previous chunk is written to the disk and current and previous - * swap. - */ - struct __rec_chunk { - /* - * The recno and entries fields are the starting record number - * of the split chunk (for column-store splits), and the number - * of entries in the split chunk. 
- * - * The key for a row-store page; no column-store key is needed - * because the page's recno, stored in the recno field, is the - * column-store key. - */ - uint32_t entries; - uint64_t recno; - WT_ITEM key; - - uint32_t min_entries; - uint64_t min_recno; - WT_ITEM min_key; - - /* Minimum split-size boundary buffer offset. */ - size_t min_offset; - - WT_ITEM image; /* disk-image */ - } chunkA, chunkB, *cur_ptr, *prev_ptr; - - /* - * We track current information about the current record number, the - * number of entries copied into the disk image buffer, where we are - * in the buffer, and how much memory remains. Those values are - * packaged here rather than passing pointers to stack locations - * around the code. - */ - uint64_t recno; /* Current record number */ - uint32_t entries; /* Current number of entries */ - uint8_t *first_free; /* Current first free byte */ - size_t space_avail; /* Remaining space in this chunk */ - /* Remaining space in this chunk to put a minimum size boundary */ - size_t min_space_avail; - - /* - * Saved update list, supporting the WT_REC_UPDATE_RESTORE and - * WT_REC_LOOKASIDE configurations. While reviewing updates for each - * page, we save WT_UPDATE lists here, and then move them to per-block - * areas as the blocks are defined. - */ - WT_SAVE_UPD *supd; /* Saved updates */ - uint32_t supd_next; - size_t supd_allocated; - size_t supd_memsize; /* Size of saved update structures */ - - /* List of pages we've written so far. */ - WT_MULTI *multi; - uint32_t multi_next; - size_t multi_allocated; - - /* - * Root pages are written when wrapping up the reconciliation, remember - * the image we're going to write. - */ - WT_ITEM *wrapup_checkpoint; - bool wrapup_checkpoint_compressed; - - /* - * We don't need to keep the 0th key around on internal pages, the - * search code ignores them as nothing can sort less by definition. - * There's some trickiness here, see the code for comments on how - * these fields work. 
- */ - bool cell_zero; /* Row-store internal page 0th key */ - - /* - * We calculate checksums to find previously written identical blocks, - * but once a match fails during an eviction, there's no point trying - * again. - */ - bool evict_matching_checksum_failed; - - /* - * WT_DICTIONARY -- - * We optionally build a dictionary of values for leaf pages. Where - * two value cells are identical, only write the value once, the second - * and subsequent copies point to the original cell. The dictionary is - * fixed size, but organized in a skip-list to make searches faster. - */ - struct __rec_dictionary { - uint64_t hash; /* Hash value */ - uint32_t offset; /* Matching cell */ - - u_int depth; /* Skiplist */ - WT_DICTIONARY *next[0]; - } **dictionary; /* Dictionary */ - u_int dictionary_next, dictionary_slots; /* Next, max entries */ - /* Skiplist head. */ - WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH]; - - /* - * WT_KV-- - * An on-page key/value item we're building. - */ - struct __rec_kv { - WT_ITEM buf; /* Data */ - WT_CELL cell; /* Cell and cell's length */ - size_t cell_len; - size_t len; /* Total length of cell + data */ - } k, v; /* Key/Value being built */ - - WT_ITEM *cur, _cur; /* Key/Value being built */ - WT_ITEM *last, _last; /* Last key/value built */ - - bool key_pfx_compress; /* If can prefix-compress next key */ - bool key_pfx_compress_conf; /* If prefix compression configured */ - bool key_sfx_compress; /* If can suffix-compress next key */ - bool key_sfx_compress_conf; /* If suffix compression configured */ - - bool is_bulk_load; /* If it's a bulk load */ - - WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */ - - bool cache_write_lookaside; /* Used the lookaside table */ - bool cache_write_restore; /* Used update/restoration */ - - uint32_t tested_ref_state; /* Debugging information */ - - /* - * XXX - * In the case of a modified update, we may need a copy of the current - * value as a set of bytes. 
We call back into the btree code using a - * fake cursor to do that work. This a layering violation and fragile, - * we need a better solution. - */ - WT_CURSOR_BTREE update_modify_cbt; -} WT_RECONCILE; - -#define WT_CROSSING_MIN_BND(r, next_len) \ - ((r)->cur_ptr->min_offset == 0 && \ - (next_len) > (r)->min_space_avail) -#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail) -#define WT_CHECK_CROSSING_BND(r, next_len) \ - (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len)) - -static void __rec_cell_build_addr(WT_SESSION_IMPL *, - WT_RECONCILE *, const void *, size_t, u_int, uint64_t); -static int __rec_cell_build_int_key(WT_SESSION_IMPL *, - WT_RECONCILE *, const void *, size_t, bool *); -static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *, - WT_RECONCILE *, const void *, size_t, bool *); -static int __rec_cell_build_ovfl(WT_SESSION_IMPL *, - WT_RECONCILE *, WT_KV *, uint8_t, uint64_t); -static int __rec_cell_build_val(WT_SESSION_IMPL *, - WT_RECONCILE *, const void *, size_t, uint64_t); static void __rec_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *); -static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *); -static int __rec_col_fix_slvg(WT_SESSION_IMPL *, - WT_RECONCILE *, WT_REF *, WT_SALVAGE_COOKIE *); -static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *); -static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); -static int __rec_col_var(WT_SESSION_IMPL *, - WT_RECONCILE *, WT_REF *, WT_SALVAGE_COOKIE *); -static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *, - WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t); static int __rec_destroy_session(WT_SESSION_IMPL *); static int __rec_init(WT_SESSION_IMPL *, WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); static int __rec_las_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_las_wrapup_err(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); 
-static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); -static int __rec_row_leaf(WT_SESSION_IMPL *, - WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *); -static int __rec_row_leaf_insert( - WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *); -static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *); static int __rec_split_row_promote( WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t); -static int __rec_split_write( - WT_SESSION_IMPL *, WT_RECONCILE *, WT_CHUNK *, WT_ITEM *, bool); +static int __rec_split_write(WT_SESSION_IMPL *, + WT_RECONCILE *, WT_REC_CHUNK *, WT_ITEM *, bool); static int __rec_write_check_complete( WT_SESSION_IMPL *, WT_RECONCILE *, int, bool *); static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *); @@ -316,12 +27,6 @@ static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup_err( WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); -static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *); -static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int); -static int __rec_dictionary_lookup( - WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **); -static void __rec_dictionary_reset(WT_RECONCILE *); - /* * __wt_reconcile -- * Reconcile an in-memory page into its on-disk format, and write it. 
@@ -435,23 +140,23 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, switch (page->type) { case WT_PAGE_COL_FIX: if (salvage != NULL) - ret = __rec_col_fix_slvg(session, r, ref, salvage); + ret = __wt_rec_col_fix_slvg(session, r, ref, salvage); else - ret = __rec_col_fix(session, r, ref); + ret = __wt_rec_col_fix(session, r, ref); break; case WT_PAGE_COL_INT: WT_WITH_PAGE_INDEX(session, - ret = __rec_col_int(session, r, ref)); + ret = __wt_rec_col_int(session, r, ref)); break; case WT_PAGE_COL_VAR: - ret = __rec_col_var(session, r, ref, salvage); + ret = __wt_rec_col_var(session, r, ref, salvage); break; case WT_PAGE_ROW_INT: WT_WITH_PAGE_INDEX(session, - ret = __rec_row_int(session, r, page)); + ret = __wt_rec_row_int(session, r, page)); break; case WT_PAGE_ROW_LEAF: - ret = __rec_row_leaf(session, r, page, salvage); + ret = __wt_rec_row_leaf(session, r, page, salvage); break; default: ret = __wt_illegal_value(session, page->type); @@ -870,6 +575,12 @@ __rec_init(WT_SESSION_IMPL *session, * history, or the stable timestamp hasn't changed since last time this * page was successfully, skew oldest instead. */ + if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DEBUG_MODE) && + __wt_random(&session->rnd) % 3 == 0) + r->las_skew_newest = false; + else + r->las_skew_newest = + LF_ISSET(WT_REC_LOOKASIDE) && LF_ISSET(WT_REC_VISIBLE_ALL); r->las_skew_newest = LF_ISSET(WT_REC_LOOKASIDE) && LF_ISSET(WT_REC_VISIBLE_ALL); if (r->las_skew_newest && @@ -965,9 +676,9 @@ __rec_init(WT_SESSION_IMPL *session, * Sanity check the size: 100 slots is the smallest dictionary we use. */ if (btree->dictionary != 0 && btree->dictionary > r->dictionary_slots) - WT_RET(__rec_dictionary_init(session, + WT_RET(__wt_rec_dictionary_init(session, r, btree->dictionary < 100 ? 
100 : btree->dictionary)); - __rec_dictionary_reset(r); + __wt_rec_dictionary_reset(r); /* * Prefix compression discards repeated prefix bytes from row-store leaf @@ -1059,7 +770,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) __wt_free(session, r->supd); - __rec_dictionary_free(session, r); + __wt_rec_dictionary_free(session, r); __wt_buf_free(session, &r->k.buf); __wt_buf_free(session, &r->v.buf); @@ -1083,914 +794,6 @@ __rec_destroy_session(WT_SESSION_IMPL *session) } /* - * __rec_update_save -- - * Save a WT_UPDATE list for later restoration. - */ -static int -__rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, - WT_INSERT *ins, void *ripcip, WT_UPDATE *onpage_upd, size_t upd_memsize) -{ - WT_RET(__wt_realloc_def( - session, &r->supd_allocated, r->supd_next + 1, &r->supd)); - r->supd[r->supd_next].ins = ins; - r->supd[r->supd_next].ripcip = ripcip; - r->supd[r->supd_next].onpage_upd = onpage_upd; - ++r->supd_next; - r->supd_memsize += upd_memsize; - return (0); -} - -/* - * __rec_append_orig_value -- - * Append the key's original value to its update list. - */ -static int -__rec_append_orig_value(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_UPDATE *upd, WT_CELL_UNPACK *unpack) -{ - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_UPDATE *append; - size_t size; - - /* Done if at least one self-contained update is globally visible. */ - for (;; upd = upd->next) { - if (WT_UPDATE_DATA_VALUE(upd) && - __wt_txn_upd_visible_all(session, upd)) - return (0); - - /* Add the original value after birthmarks. */ - if (upd->type == WT_UPDATE_BIRTHMARK) { - WT_ASSERT(session, unpack != NULL && - unpack->type != WT_CELL_DEL); - break; - } - - /* Leave reference at the last item in the chain. */ - if (upd->next == NULL) - break; - } - - /* - * We need the original on-page value for some reader: get a copy and - * append it to the end of the update list with a transaction ID that - * guarantees its visibility. 
- * - * If we don't have a value cell, it's an insert/append list key/value - * pair which simply doesn't exist for some reader; place a deleted - * record at the end of the update list. - */ - append = NULL; /* -Wconditional-uninitialized */ - size = 0; /* -Wconditional-uninitialized */ - if (unpack == NULL || unpack->type == WT_CELL_DEL) - WT_RET(__wt_update_alloc(session, - NULL, &append, &size, WT_UPDATE_TOMBSTONE)); - else { - WT_RET(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); - WT_ERR(__wt_update_alloc( - session, tmp, &append, &size, WT_UPDATE_STANDARD)); - } - - /* - * If we're saving the original value for a birthmark, transfer over - * the transaction ID and clear out the birthmark update. - * - * Else, set the entry's transaction information to the lowest possible - * value. Cleared memory matches the lowest possible transaction ID and - * timestamp, do nothing. - */ - if (upd->type == WT_UPDATE_BIRTHMARK) { - append->txnid = upd->txnid; - append->timestamp = upd->timestamp; - append->next = upd->next; - } - - /* Append the new entry into the update list. */ - WT_PUBLISH(upd->next, append); - __wt_cache_page_inmem_incr(session, page, size); - - if (upd->type == WT_UPDATE_BIRTHMARK) { - upd->type = WT_UPDATE_STANDARD; - upd->txnid = WT_TXN_ABORTED; - } - -err: __wt_scr_free(session, &tmp); - return (ret); -} - -/* - * __rec_txn_read -- - * Return the update in a list that should be written (or NULL if none can - * be written). 
- */ -static int -__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, - WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, - bool *upd_savedp, WT_UPDATE **updp) -{ - WT_PAGE *page; - WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd; - wt_timestamp_t timestamp; - size_t upd_memsize; - uint64_t max_txn, txnid; - bool all_visible, prepared, skipped_birthmark, uncommitted, upd_saved; - - if (upd_savedp != NULL) - *upd_savedp = false; - *updp = NULL; - - page = r->page; - first_ts_upd = first_txn_upd = NULL; - upd_memsize = 0; - max_txn = WT_TXN_NONE; - prepared = skipped_birthmark = uncommitted = upd_saved = false; - - /* - * If called with a WT_INSERT item, use its WT_UPDATE list (which must - * exist), otherwise check for an on-page row-store WT_UPDATE list - * (which may not exist). Return immediately if the item has no updates. - */ - if (ins != NULL) - first_upd = ins->upd; - else if ((first_upd = WT_ROW_UPDATE(page, ripcip)) == NULL) - return (0); - - for (upd = first_upd; upd != NULL; upd = upd->next) { - if ((txnid = upd->txnid) == WT_TXN_ABORTED) - continue; - - ++r->updates_seen; - upd_memsize += WT_UPDATE_MEMSIZE(upd); - - /* - * Track the first update in the chain that is not aborted and - * the maximum transaction ID. - */ - if (first_txn_upd == NULL) - first_txn_upd = upd; - - /* Track the largest transaction ID seen. */ - if (WT_TXNID_LT(max_txn, txnid)) - max_txn = txnid; - - /* - * Check whether the update was committed before reconciliation - * started. The global commit point can move forward during - * reconciliation so we use a cached copy to avoid races when a - * concurrent transaction commits or rolls back while we are - * examining its updates. As prepared transaction id's are - * globally visible, need to check the update state as well. 
- */ - if (F_ISSET(r, WT_REC_EVICT)) { - if (upd->prepare_state == WT_PREPARE_LOCKED || - upd->prepare_state == WT_PREPARE_INPROGRESS) - prepared = true; - - if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? - WT_TXNID_LE(r->last_running, txnid) : - !__txn_visible_id(session, txnid)) - uncommitted = r->update_uncommitted = true; - - if (prepared || uncommitted) - continue; - } - - /* Track the first update with non-zero timestamp. */ - if (first_ts_upd == NULL && upd->timestamp != 0) - first_ts_upd = upd; - - /* - * Find the first update we can use. - * - * Update/restore eviction can handle any update (including - * uncommitted updates). Lookaside eviction can save any - * committed update. Regular eviction checks that the maximum - * transaction ID and timestamp seen are stable. - * - * Lookaside and update/restore eviction try to choose the same - * version as a subsequent checkpoint, so that checkpoint can - * skip over pages with lookaside entries. If the application - * has supplied a stable timestamp, we assume (a) that it is - * old, and (b) that the next checkpoint will use it, so we wait - * to see a stable update. If there is no stable timestamp, we - * assume the next checkpoint will write the most recent version - * (but we save enough information that checkpoint can fix - * things up if we choose an update that is too new). - */ - if (*updp == NULL && r->las_skew_newest) - *updp = upd; - - if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? - !__wt_txn_upd_visible_all(session, upd) : - !__wt_txn_upd_visible(session, upd)) { - if (F_ISSET(r, WT_REC_EVICT)) - ++r->updates_unstable; - - /* - * Rare case: when applications run at low isolation - * levels, update/restore eviction may see a stable - * update followed by an uncommitted update. Give up - * in that case: we need to discard updates from the - * stable update and older for correctness and we can't - * discard an uncommitted update. 
- */ - if (F_ISSET(r, WT_REC_UPDATE_RESTORE) && - *updp != NULL && (uncommitted || prepared)) { - r->leave_dirty = true; - return (__wt_set_return(session, EBUSY)); - } - - if (upd->type == WT_UPDATE_BIRTHMARK) - skipped_birthmark = true; - - continue; - } - - /* - * Lookaside without stable timestamp was taken care of above - * (set to the first uncommitted transaction). Lookaside with - * stable timestamp always takes the first stable update. - */ - if (*updp == NULL) - *updp = upd; - } - - /* Keep track of the selected update. */ - upd = *updp; - - /* Reconciliation should never see an aborted or reserved update. */ - WT_ASSERT(session, upd == NULL || - (upd->txnid != WT_TXN_ABORTED && upd->type != WT_UPDATE_RESERVE)); - - /* If all of the updates were aborted, quit. */ - if (first_txn_upd == NULL) { - WT_ASSERT(session, upd == NULL); - return (0); - } - - /* If no updates were skipped, record that we're making progress. */ - if (upd == first_txn_upd) - r->update_used = true; - - /* - * The checkpoint transaction is special. Make sure we never write - * metadata updates from a checkpoint in a concurrent session. - */ - WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || - upd == NULL || upd->txnid == WT_TXN_NONE || - upd->txnid != S2C(session)->txn_global.checkpoint_state.id || - WT_SESSION_IS_CHECKPOINT(session)); - - /* - * Track the most recent transaction in the page. We store this in the - * tree at the end of reconciliation in the service of checkpoints, it - * is used to avoid discarding trees from memory when they have changes - * required to satisfy a snapshot read. - */ - if (WT_TXNID_LT(r->max_txn, max_txn)) - r->max_txn = max_txn; - - /* Update the maximum timestamp. 
*/ - if (first_ts_upd != NULL && r->max_timestamp < first_ts_upd->timestamp) - r->max_timestamp = first_ts_upd->timestamp; - - /* - * If the update we chose was a birthmark, or we are doing - * update-restore and we skipped a birthmark, the original on-page - * value must be retained. - */ - if (upd != NULL && - (upd->type == WT_UPDATE_BIRTHMARK || - (F_ISSET(r, WT_REC_UPDATE_RESTORE) && skipped_birthmark))) - *updp = NULL; - - /* - * Check if all updates on the page are visible. If not, it must stay - * dirty unless we are saving updates to the lookaside table. - * - * Updates can be out of transaction ID order (but not out of timestamp - * order), so we track the maximum transaction ID and the newest update - * with a timestamp (if any). - */ - timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->timestamp; - all_visible = upd == first_txn_upd && !(uncommitted || prepared) && - (F_ISSET(r, WT_REC_VISIBLE_ALL) ? - __wt_txn_visible_all(session, max_txn, timestamp) : - __wt_txn_visible(session, max_txn, timestamp)); - - if (all_visible) - goto check_original_value; - - r->leave_dirty = true; - - if (F_ISSET(r, WT_REC_VISIBILITY_ERR)) - WT_PANIC_RET(session, EINVAL, - "reconciliation error, update not visible"); - - /* - * If not trying to evict the page, we know what we'll write and we're - * done. - */ - if (!F_ISSET(r, WT_REC_EVICT)) - goto check_original_value; - - /* - * We are attempting eviction with changes that are not yet stable - * (i.e. globally visible). There are two ways to continue, the - * save/restore eviction path or the lookaside table eviction path. - * Both cannot be configured because the paths track different - * information. The update/restore path can handle uncommitted changes, - * by evicting most of the page and then creating a new, smaller page - * to which we re-attach those changes. Lookaside eviction writes - * changes into the lookaside table and restores them on demand if and - * when the page is read back into memory. 
- * - * Both paths are configured outside of reconciliation: the save/restore - * path is the WT_REC_UPDATE_RESTORE flag, the lookaside table path is - * the WT_REC_LOOKASIDE flag. - */ - if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE)) - return (__wt_set_return(session, EBUSY)); - if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE)) - return (__wt_set_return(session, EBUSY)); - - WT_ASSERT(session, r->max_txn != WT_TXN_NONE); - - /* - * The order of the updates on the list matters, we can't move only the - * unresolved updates, move the entire update list. - */ - WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize)); - upd_saved = true; - if (upd_savedp != NULL) - *upd_savedp = true; - - /* - * Track the first off-page update when saving history in the lookaside - * table. When skewing newest, we want the first (non-aborted) update - * after the one stored on the page. Otherwise, we want the update - * before the on-page update. - */ - if (F_ISSET(r, WT_REC_LOOKASIDE) && r->las_skew_newest) { - if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid)) - r->unstable_txn = first_upd->txnid; - if (first_ts_upd != NULL && - r->unstable_timestamp < first_ts_upd->timestamp) - r->unstable_timestamp = first_ts_upd->timestamp; - } else if (F_ISSET(r, WT_REC_LOOKASIDE)) { - for (upd = first_upd; upd != *updp; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (upd->txnid != WT_TXN_NONE && - WT_TXNID_LT(upd->txnid, r->unstable_txn)) - r->unstable_txn = upd->txnid; - if (upd->timestamp < r->unstable_timestamp) - r->unstable_timestamp = upd->timestamp; - } - } - -check_original_value: - /* - * Paranoia: check that we didn't choose an update that has since been - * rolled back. - */ - WT_ASSERT(session, *updp == NULL || (*updp)->txnid != WT_TXN_ABORTED); - - /* - * Returning an update means the original on-page value might be lost, - * and that's a problem if there's a reader that needs it. 
This call - * makes a copy of the on-page value and if there is a birthmark in the - * update list, replaces it. We do that any time there are saved - * updates and during reconciliation of a backing overflow record that - * will be physically removed once it's no longer needed. - */ - if (*updp != NULL && (upd_saved || - (vpack != NULL && vpack->ovfl && - vpack->raw != WT_CELL_VALUE_OVFL_RM))) - WT_RET( - __rec_append_orig_value(session, page, first_upd, vpack)); - - return (0); -} - -/* - * WT_CHILD_RELEASE, WT_CHILD_RELEASE_ERR -- - * Macros to clean up during internal-page reconciliation, releasing the - * hazard pointer we're holding on child pages. - */ -#define WT_CHILD_RELEASE(session, hazard, ref) do { \ - if (hazard) { \ - (hazard) = false; \ - WT_TRET( \ - __wt_page_release(session, ref, WT_READ_NO_EVICT)); \ - } \ -} while (0) -#define WT_CHILD_RELEASE_ERR(session, hazard, ref) do { \ - WT_CHILD_RELEASE(session, hazard, ref); \ - WT_ERR(ret); \ -} while (0) - -typedef enum { - WT_CHILD_IGNORE, /* Ignored child */ - WT_CHILD_MODIFIED, /* Modified child */ - WT_CHILD_ORIGINAL, /* Original child */ - WT_CHILD_PROXY /* Deleted child: proxy */ -} WT_CHILD_STATE; - -/* - * __rec_child_deleted -- - * Handle pages with leaf pages in the WT_REF_DELETED state. - */ -static int -__rec_child_deleted(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_REF *ref, WT_CHILD_STATE *statep) -{ - WT_PAGE_DELETED *page_del; - - page_del = ref->page_del; - - /* - * Internal pages with child leaf pages in the WT_REF_DELETED state are - * a special case during reconciliation. First, if the deletion was a - * result of a session truncate call, the deletion may not be visible to - * us. In that case, we proceed as with any change not visible during - * reconciliation by ignoring the change for the purposes of writing the - * internal page. - * - * In this case, there must be an associated page-deleted structure, and - * it holds the transaction ID we care about. 
- * - * In some cases, there had better not be any updates we can't see. - * - * A visible update to be in READY state (i.e. not in LOCKED or - * PREPARED state), for truly visible to others. - */ - if (F_ISSET(r, WT_REC_VISIBILITY_ERR) && page_del != NULL && - __wt_page_del_active(session, ref, false)) - WT_PANIC_RET(session, EINVAL, - "reconciliation illegally skipped an update"); - - /* - * Deal with any underlying disk blocks. - * - * First, check to see if there is an address associated with this leaf: - * if there isn't, we're done, the underlying page is already gone. If - * the page still exists, check for any transactions in the system that - * might want to see the page's state before it's deleted. - * - * If any such transactions exist, we cannot discard the underlying leaf - * page to the block manager because the transaction may eventually read - * it. However, this write might be part of a checkpoint, and should we - * recover to that checkpoint, we'll need to delete the leaf page, else - * we'd leak it. The solution is to write a proxy cell on the internal - * page ensuring the leaf page is eventually discarded. - * - * If no such transactions exist, we can discard the leaf page to the - * block manager and no cell needs to be written at all. We do this - * outside of the underlying tracking routines because this action is - * permanent and irrevocable. (Clearing the address means we've lost - * track of the disk address in a permanent way. This is safe because - * there's no path to reading the leaf page again: if there's ever a - * read into this part of the name space again, the cache read function - * instantiates an entirely new page.) 
- */ - if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) { - /* - * Minor memory cleanup: if a truncate call deleted this page - * and we were ever forced to instantiate the page in memory, - * we would have built a list of updates in the page reference - * in order to be able to commit/rollback the truncate. We just - * passed a visibility test, discard the update list. - */ - if (page_del != NULL) { - __wt_free(session, ref->page_del->update_list); - __wt_free(session, ref->page_del); - } - - WT_RET(__wt_ref_block_free(session, ref)); - } - - /* - * If the original page is gone, we can skip the slot on the internal - * page. - */ - if (ref->addr == NULL) { - *statep = WT_CHILD_IGNORE; - return (0); - } - - /* - * Internal pages with deletes that aren't stable cannot be evicted, we - * don't have sufficient information to restore the page's information - * if subsequently read (we wouldn't know which transactions should see - * the original page and which should see the deleted page). - */ - if (F_ISSET(r, WT_REC_EVICT)) - return (__wt_set_return(session, EBUSY)); - - /* - * If there are deleted child pages we can't discard immediately, keep - * the page dirty so they are eventually freed. - */ - r->leave_dirty = true; - - /* - * If the original page cannot be freed, we need to keep a slot on the - * page to reference it from the parent page. - * - * If the delete is not visible in this checkpoint, write the original - * address normally. Otherwise, we have to write a proxy record. - * If the delete state is not ready, then delete is not visible as it - * is in prepared state. - */ - if (!__wt_page_del_active(session, ref, false)) - *statep = WT_CHILD_PROXY; - - return (0); -} - -/* - * __rec_child_modify -- - * Return if the internal page's child references any modifications. 
- */ -static int -__rec_child_modify(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_REF *ref, bool *hazardp, WT_CHILD_STATE *statep) -{ - WT_DECL_RET; - WT_PAGE_MODIFY *mod; - - /* We may acquire a hazard pointer our caller must release. */ - *hazardp = false; - - /* Default to using the original child address. */ - *statep = WT_CHILD_ORIGINAL; - - /* - * This function is called when walking an internal page to decide how - * to handle child pages referenced by the internal page. - * - * Internal pages are reconciled for two reasons: first, when evicting - * an internal page, second by the checkpoint code when writing internal - * pages. During eviction, all pages should be in the WT_REF_DISK or - * WT_REF_DELETED state. During checkpoint, eviction that might affect - * review of an internal page is prohibited, however, as the subtree is - * not reserved for our exclusive use, there are other page states that - * must be considered. - */ - for (;; __wt_yield()) { - switch (r->tested_ref_state = ref->state) { - case WT_REF_DISK: - /* On disk, not modified by definition. */ - goto done; - - case WT_REF_DELETED: - /* - * The child is in a deleted state. - * - * It's possible the state could change underneath us as - * the page is read in, and we can race between checking - * for a deleted state and looking at the transaction ID - * to see if the delete is visible to us. Lock down the - * structure. - */ - if (!WT_REF_CAS_STATE( - session, ref, WT_REF_DELETED, WT_REF_LOCKED)) - break; - ret = __rec_child_deleted(session, r, ref, statep); - WT_REF_SET_STATE(ref, WT_REF_DELETED); - goto done; - - case WT_REF_LOCKED: - /* - * Locked. - * - * We should never be here during eviction, active child - * pages in an evicted page's subtree fails the eviction - * attempt. 
- */ - WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); - if (F_ISSET(r, WT_REC_EVICT)) - return (__wt_set_return(session, EBUSY)); - - /* - * If called during checkpoint, the child is being - * considered by the eviction server or the child is a - * truncated page being read. The eviction may have - * started before the checkpoint and so we must wait - * for the eviction to be resolved. I suspect we could - * handle reads of truncated pages, but we can't - * distinguish between the two and reads of truncated - * pages aren't expected to be common. - */ - break; - - case WT_REF_LIMBO: - WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); - /* FALLTHROUGH */ - case WT_REF_LOOKASIDE: - /* - * On disk or in cache with lookaside updates. - * - * We should never be here during eviction: active - * child pages in an evicted page's subtree fails the - * eviction attempt. - */ - if (F_ISSET(r, WT_REC_EVICT) && - __wt_page_las_active(session, ref)) { - WT_ASSERT(session, false); - return (__wt_set_return(session, EBUSY)); - } - - /* - * A page evicted with lookaside entries may not have - * an address, if no updates were visible to - * reconciliation. Any child pages in that state - * should be ignored. - */ - if (ref->addr == NULL) { - *statep = WT_CHILD_IGNORE; - WT_CHILD_RELEASE(session, *hazardp, ref); - } - goto done; - - case WT_REF_MEM: - /* - * In memory. - * - * We should never be here during eviction, active child - * pages in an evicted page's subtree fails the eviction - * attempt. - */ - WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); - if (F_ISSET(r, WT_REC_EVICT)) - return (__wt_set_return(session, EBUSY)); - - /* - * If called during checkpoint, acquire a hazard pointer - * so the child isn't evicted, it's an in-memory case. - * - * This call cannot return split/restart, we have a lock - * on the parent which prevents a child page split. - * - * Set WT_READ_NO_WAIT because we're only interested in - * the WT_REF's final state. 
Pages in transition might - * change WT_REF state during our read, and then return - * WT_NOTFOUND to us. In that case, loop and look again. - */ - ret = __wt_page_in(session, ref, - WT_READ_CACHE | WT_READ_NO_EVICT | - WT_READ_NO_GEN | WT_READ_NO_WAIT); - if (ret == WT_NOTFOUND) { - ret = 0; - break; - } - WT_RET(ret); - *hazardp = true; - goto in_memory; - - case WT_REF_READING: - /* - * Being read, not modified by definition. - * - * We should never be here during eviction, active child - * pages in an evicted page's subtree fails the eviction - * attempt. - */ - WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); - if (F_ISSET(r, WT_REC_EVICT)) - return (__wt_set_return(session, EBUSY)); - goto done; - - case WT_REF_SPLIT: - /* - * The page was split out from under us. - * - * We should never be here during eviction, active child - * pages in an evicted page's subtree fails the eviction - * attempt. - * - * We should never be here during checkpoint, dirty page - * eviction is shutout during checkpoint, all splits in - * process will have completed before we walk any pages - * for checkpoint. - */ - WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT); - return (__wt_set_return(session, EBUSY)); - - WT_ILLEGAL_VALUE(session, r->tested_ref_state); - } - WT_STAT_CONN_INCR(session, child_modify_blocked_page); - } - -in_memory: - /* - * In-memory states: the child is potentially modified if the page's - * modify structure has been instantiated. If the modify structure - * exists and the page has actually been modified, set that state. - * If that's not the case, we would normally use the original cell's - * disk address as our reference, however there are two special cases, - * both flagged by a missing block address. - * - * First, if forced to instantiate a deleted child page and it's never - * modified, we end up here with a page that has a modify structure, no - * modifications, and no disk address. 
Ignore those pages, they're not - * modified and there is no reason to write the cell. - * - * Second, insert splits are permitted during checkpoint. When doing the - * final checkpoint pass, we first walk the internal page's page-index - * and write out any dirty pages we find, then we write out the internal - * page in post-order traversal. If we found the split page in the first - * step, it will have an address; if we didn't find the split page in - * the first step, it won't have an address and we ignore it, it's not - * part of the checkpoint. - */ - mod = ref->page->modify; - if (mod != NULL && mod->rec_result != 0) - *statep = WT_CHILD_MODIFIED; - else if (ref->addr == NULL) { - *statep = WT_CHILD_IGNORE; - WT_CHILD_RELEASE(session, *hazardp, ref); - } - -done: WT_DIAGNOSTIC_YIELD; - return (ret); -} - -/* - * __rec_incr -- - * Update the memory tracking structure for a set of new entries. - */ -static inline void -__rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) -{ - /* - * The buffer code is fragile and prone to off-by-one errors -- check - * for overflow in diagnostic mode. - */ - WT_ASSERT(session, r->space_avail >= size); - WT_ASSERT(session, WT_BLOCK_FITS(r->first_free, size, - r->cur_ptr->image.mem, r->cur_ptr->image.memsize)); - - r->entries += v; - r->space_avail -= size; - r->first_free += size; - - /* - * If offset for the minimum split size boundary is not set, we have not - * yet reached the minimum boundary, reduce the space available for it. - */ - if (r->cur_ptr->min_offset == 0) { - if (r->min_space_avail >= size) - r->min_space_avail -= size; - else - r->min_space_avail = 0; - } -} - -/* - * __rec_copy_incr -- - * Copy a key/value cell and buffer pair into the new image. 
- */ -static inline void -__rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv) -{ - size_t len; - uint8_t *p, *t; - - /* - * If there's only one chunk of data to copy (because the cell and data - * are being copied from the original disk page), the cell length won't - * be set, the WT_ITEM data/length will reference the data to be copied. - * - * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do - * the copy in-line. - */ - for (p = r->first_free, - t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len) - *p++ = *t++; - - /* The data can be quite large -- call memcpy. */ - if (kv->buf.size != 0) - memcpy(p, kv->buf.data, kv->buf.size); - - WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size); - __rec_incr(session, r, 1, kv->len); -} - -/* - * __rec_dict_replace -- - * Check for a dictionary match. - */ -static int -__rec_dict_replace( - WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_KV *val) -{ - WT_DICTIONARY *dp; - uint64_t offset; - - /* - * We optionally create a dictionary of values and only write a unique - * value once per page, using a special "copy" cell for all subsequent - * copies of the value. We have to do the cell build and resolution at - * this low level because we need physical cell offsets for the page. - * - * Sanity check: short-data cells can be smaller than dictionary-copy - * cells. If the data is already small, don't bother doing the work. - * This isn't just work avoidance: on-page cells can't grow as a result - * of writing a dictionary-copy cell, the reconciliation functions do a - * split-boundary test based on the size required by the value's cell; - * if we grow the cell after that test we'll potentially write off the - * end of the buffer's memory. 
- */ - if (val->buf.size <= WT_INTPACK32_MAXSIZE) - return (0); - WT_RET(__rec_dictionary_lookup(session, r, val, &dp)); - if (dp == NULL) - return (0); - - /* - * If the dictionary offset isn't set, we're creating a new entry in the - * dictionary, set its location. - * - * If the dictionary offset is set, we have a matching value. Create a - * copy cell instead. - */ - if (dp->offset == 0) - dp->offset = WT_PTRDIFF32(r->first_free, r->cur_ptr->image.mem); - else { - /* - * The offset is the byte offset from this cell to the previous, - * matching cell, NOT the byte offset from the beginning of the - * page. - */ - offset = (uint64_t)WT_PTRDIFF(r->first_free, - (uint8_t *)r->cur_ptr->image.mem + dp->offset); - val->len = val->cell_len = - __wt_cell_pack_copy(&val->cell, rle, offset); - val->buf.data = NULL; - val->buf.size = 0; - } - return (0); -} - -/* - * __rec_key_state_update -- - * Update prefix and suffix compression based on the last key. - */ -static inline void -__rec_key_state_update(WT_RECONCILE *r, bool ovfl_key) -{ - WT_ITEM *a; - - /* - * If writing an overflow key onto the page, don't update the "last key" - * value, and leave the state of prefix compression alone. (If we are - * currently doing prefix compression, we have a key state which will - * continue to work, we're just skipping the key just created because - * it's an overflow key and doesn't participate in prefix compression. - * If we are not currently doing prefix compression, we can't start, an - * overflow key doesn't give us any state.) - * - * Additionally, if we wrote an overflow key onto the page, turn off the - * suffix compression of row-store internal node keys. (When we split, - * "last key" is the largest key on the previous page, and "cur key" is - * the first key on the next page, which is being promoted. 
In some - * cases we can discard bytes from the "cur key" that are not needed to - * distinguish between the "last key" and "cur key", compressing the - * size of keys on internal nodes. If we just built an overflow key, - * we're not going to update the "last key", making suffix compression - * impossible for the next key. Alternatively, we could remember where - * the last key was on the page, detect it's an overflow key, read it - * from disk and do suffix compression, but that's too much work for an - * unlikely event.) - * - * If we're not writing an overflow key on the page, update the last-key - * value and turn on both prefix and suffix compression. - */ - if (ovfl_key) - r->key_sfx_compress = false; - else { - a = r->cur; - r->cur = r->last; - r->last = a; - - r->key_pfx_compress = r->key_pfx_compress_conf; - r->key_sfx_compress = r->key_sfx_compress_conf; - } -} - -/* - * Macros from fixed-length entries to/from bytes. - */ -#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \ - ((uint32_t)((((bytes) * 8) / (btree)->bitcnt))) -#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \ - ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8)) - -/* * __rec_leaf_page_max -- * Figure out the maximum leaf page size for the reconciliation. */ @@ -2057,35 +860,6 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __rec_need_split -- - * Check whether adding some bytes to the page requires a split. - */ -static bool -__rec_need_split(WT_RECONCILE *r, size_t len) -{ - /* - * In the case of a row-store leaf page, trigger a split if a threshold - * number of saved updates is reached. This allows pages to split for - * update/restore and lookaside eviction when there is no visible data - * causing the disk image to grow. - * - * In the case of small pages or large keys, we might try to split when - * a page has no updates or entries, which isn't possible. 
To consider - * update/restore or lookaside information, require either page entries - * or updates that will be attached to the image. The limit is one of - * either, but it doesn't make sense to create pages or images with few - * entries or updates, even where page sizes are small (especially as - * updates that will eventually become overflow items can throw off our - * calculations). Bound the combination at something reasonable. - */ - if (r->page->type == WT_PAGE_ROW_LEAF && r->entries + r->supd_next > 10) - len += r->supd_memsize; - - /* Check for the disk image crossing a boundary. */ - return (WT_CHECK_CROSSING_BND(r, len)); -} - -/* * __wt_split_page_size -- * Given a split percentage, calculate split page size in bytes. */ @@ -2123,8 +897,8 @@ __wt_split_page_size(int split_pct, uint32_t maxpagesize, uint32_t allocsize) * Initialize a single chunk structure. */ static int -__rec_split_chunk_init( - WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_CHUNK *chunk, size_t memsize) +__rec_split_chunk_init(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_REC_CHUNK *chunk, size_t memsize) { chunk->min_recno = WT_RECNO_OOB; chunk->min_entries = 0; @@ -2158,16 +932,16 @@ __rec_split_chunk_init( } /* - * __rec_split_init -- + * __wt_rec_split_init -- * Initialization for the reconciliation split functions. */ -static int -__rec_split_init(WT_SESSION_IMPL *session, +int +__wt_rec_split_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint64_t max) { WT_BM *bm; WT_BTREE *btree; - WT_CHUNK *chunk; + WT_REC_CHUNK *chunk; WT_REF *ref; size_t corrected_page_size, disk_img_buf_size; @@ -2463,21 +1237,21 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) } /* - * __rec_split -- + * __wt_rec_split -- * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper" * has 3 doubled letters in a row? Sweet-tooth does, too.) 
*/ -static int -__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) +int +__wt_rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) { WT_BTREE *btree; - WT_CHUNK *tmp; + WT_REC_CHUNK *tmp; size_t inuse; btree = S2BT(session); /* Fixed length col store can call with next_len 0 */ - WT_ASSERT(session, next_len == 0 || __rec_need_split(r, next_len)); + WT_ASSERT(session, next_len == 0 || __wt_rec_need_split(r, next_len)); /* * We should never split during salvage, and we're about to drop core @@ -2495,11 +1269,11 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) * Additionally, grow the buffer to contain the current item if we * haven't already consumed a reasonable portion of a split chunk. */ - if (inuse < r->split_size / 2 && !__rec_need_split(r, 0)) + if (inuse < r->split_size / 2 && !__wt_rec_need_split(r, 0)) goto done; /* All page boundaries reset the dictionary. */ - __rec_dictionary_reset(r); + __wt_rec_dictionary_reset(r); /* Set the number of entries and size for the just finished chunk. */ r->cur_ptr->entries = r->entries; @@ -2567,18 +1341,18 @@ done: /* } /* - * __rec_split_crossing_bnd -- + * __wt_rec_split_crossing_bnd -- * Save the details for the minimum split size boundary or call for a * split. */ -static inline int -__rec_split_crossing_bnd( +int +__wt_rec_split_crossing_bnd( WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) { WT_BTREE *btree; size_t min_offset; - WT_ASSERT(session, __rec_need_split(r, next_len)); + WT_ASSERT(session, __wt_rec_need_split(r, next_len)); /* * If crossing the minimum split size boundary, store the boundary @@ -2587,7 +1361,7 @@ __rec_split_crossing_bnd( * large enough, just split at this point. 
*/ if (WT_CROSSING_MIN_BND(r, next_len) && - !WT_CROSSING_SPLIT_BND(r, next_len) && !__rec_need_split(r, 0)) { + !WT_CROSSING_SPLIT_BND(r, next_len) && !__wt_rec_need_split(r, 0)) { btree = S2BT(session); WT_ASSERT(session, r->cur_ptr->min_offset == 0); @@ -2609,13 +1383,13 @@ __rec_split_crossing_bnd( session, r, &r->cur_ptr->min_key, r->page->type)); /* All page boundaries reset the dictionary. */ - __rec_dictionary_reset(r); + __wt_rec_dictionary_reset(r); return (0); } /* We are crossing a split boundary */ - return (__rec_split(session, r, next_len)); + return (__wt_rec_split(session, r, next_len)); } /* @@ -2632,8 +1406,8 @@ static int __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r) { WT_BTREE *btree; - WT_CHUNK *cur_ptr, *prev_ptr, *tmp; WT_PAGE_HEADER *dsk; + WT_REC_CHUNK *cur_ptr, *prev_ptr, *tmp; size_t combined_size, len_to_move; uint8_t *cur_dsk_start; @@ -2714,11 +1488,11 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __rec_split_finish -- + * __wt_rec_split_finish -- * Finish processing a page. */ -static int -__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) +int +__wt_rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) { /* * We're done reconciling, write the final page. 
We may arrive here with @@ -2771,13 +1545,13 @@ __rec_supd_move( */ static int __rec_split_write_supd(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_CHUNK *chunk, WT_MULTI *multi, bool last_block) + WT_RECONCILE *r, WT_REC_CHUNK *chunk, WT_MULTI *multi, bool last_block) { WT_BTREE *btree; - WT_CHUNK *next; WT_DECL_ITEM(key); WT_DECL_RET; WT_PAGE *page; + WT_REC_CHUNK *next; WT_SAVE_UPD *supd; WT_UPDATE *upd; uint32_t i, j; @@ -2876,7 +1650,7 @@ err: __wt_scr_free(session, &key); */ static void __rec_split_write_header(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_CHUNK *chunk, WT_MULTI *multi, WT_PAGE_HEADER *dsk) + WT_RECONCILE *r, WT_REC_CHUNK *chunk, WT_MULTI *multi, WT_PAGE_HEADER *dsk) { WT_BTREE *btree; WT_PAGE *page; @@ -3088,7 +1862,7 @@ __rec_compression_adjust(WT_SESSION_IMPL *session, */ static int __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, - WT_CHUNK *chunk, WT_ITEM *compressed_image, bool last_block) + WT_REC_CHUNK *chunk, WT_ITEM *compressed_image, bool last_block) { WT_BTREE *btree; WT_MULTI *multi; @@ -3304,7 +2078,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) recno = btree->type == BTREE_ROW ? 
WT_RECNO_OOB : 1; - return (__rec_split_init(session, + return (__wt_rec_split_init(session, r, cbulk->leaf, recno, btree->maxleafpage_precomp)); } @@ -3326,7 +2100,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) switch (btree->type) { case BTREE_COL_FIX: if (cbulk->entry != 0) - __rec_incr(session, r, cbulk->entry, + __wt_rec_incr(session, r, cbulk->entry, __bitstr_size( (size_t)cbulk->entry * btree->bitcnt)); break; @@ -3338,7 +2112,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) break; } - WT_RET(__rec_split_finish(session, r)); + WT_RET(__wt_rec_split_finish(session, r)); WT_RET(__rec_write_wrapup(session, r, r->page)); __rec_write_page_status(session, r); @@ -3354,1912 +2128,6 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) } /* - * __wt_bulk_insert_row -- - * Row-store bulk insert. - */ -int -__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) -{ - WT_BTREE *btree; - WT_CURSOR *cursor; - WT_KV *key, *val; - WT_RECONCILE *r; - bool ovfl_key; - - r = cbulk->reconcile; - btree = S2BT(session); - cursor = &cbulk->cbt.iface; - - key = &r->k; - val = &r->v; - WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */ - cursor->key.data, cursor->key.size, &ovfl_key)); - WT_RET(__rec_cell_build_val(session, r, /* Build value cell */ - cursor->value.data, cursor->value.size, (uint64_t)0)); - - /* Boundary: split or write the page. */ - if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) { - /* - * Turn off prefix compression until a full key written to the - * new page, and (unless already working with an overflow key), - * rebuild the key without compression. - */ - if (r->key_pfx_compress_conf) { - r->key_pfx_compress = false; - if (!ovfl_key) - WT_RET(__rec_cell_build_leaf_key( - session, r, NULL, 0, &ovfl_key)); - } - WT_RET(__rec_split_crossing_bnd( - session, r, key->len + val->len)); - } - - /* Copy the key/value pair onto the page. 
*/ - __rec_copy_incr(session, r, key); - if (val->len == 0) - r->any_empty_value = true; - else { - r->all_empty_value = false; - if (btree->dictionary) - WT_RET(__rec_dict_replace(session, r, 0, val)); - __rec_copy_incr(session, r, val); - } - - /* Update compression state. */ - __rec_key_state_update(r, ovfl_key); - - return (0); -} - -/* - * __rec_col_fix_bulk_insert_split_check -- - * Check if a bulk-loaded fixed-length column store page needs to split. - */ -static inline int -__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) -{ - WT_BTREE *btree; - WT_RECONCILE *r; - WT_SESSION_IMPL *session; - - session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; - r = cbulk->reconcile; - btree = S2BT(session); - - if (cbulk->entry == cbulk->nrecs) { - if (cbulk->entry != 0) { - /* - * If everything didn't fit, update the counters and - * split. - * - * Boundary: split or write the page. - * - * No need to have a minimum split size boundary, all - * pages are filled 100% except the last, allowing it to - * grow in the future. - */ - __rec_incr(session, r, cbulk->entry, - __bitstr_size( - (size_t)cbulk->entry * btree->bitcnt)); - WT_RET(__rec_split(session, r, 0)); - } - cbulk->entry = 0; - cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); - } - return (0); -} - -/* - * __wt_bulk_insert_fix -- - * Fixed-length column-store bulk insert. - */ -int -__wt_bulk_insert_fix( - WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) -{ - WT_BTREE *btree; - WT_CURSOR *cursor; - WT_RECONCILE *r; - - r = cbulk->reconcile; - btree = S2BT(session); - cursor = &cbulk->cbt.iface; - - WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); - __bit_setv(r->first_free, cbulk->entry, - btree->bitcnt, deleted ? 0 : ((uint8_t *)cursor->value.data)[0]); - ++cbulk->entry; - ++r->recno; - - return (0); -} - -/* - * __wt_bulk_insert_fix_bitmap -- - * Fixed-length column-store bulk insert. 
- */ -int -__wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) -{ - WT_BTREE *btree; - WT_CURSOR *cursor; - WT_RECONCILE *r; - uint32_t entries, offset, page_entries, page_size; - const uint8_t *data; - - r = cbulk->reconcile; - btree = S2BT(session); - cursor = &cbulk->cbt.iface; - - if (((r->recno - 1) * btree->bitcnt) & 0x7) - WT_RET_MSG(session, EINVAL, - "Bulk bitmap load not aligned on a byte boundary"); - for (data = cursor->value.data, - entries = (uint32_t)cursor->value.size; - entries > 0; - entries -= page_entries, data += page_size) { - WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); - - page_entries = WT_MIN(entries, cbulk->nrecs - cbulk->entry); - page_size = __bitstr_size(page_entries * btree->bitcnt); - offset = __bitstr_size(cbulk->entry * btree->bitcnt); - memcpy(r->first_free + offset, data, page_size); - cbulk->entry += page_entries; - r->recno += page_entries; - } - return (0); -} - -/* - * __wt_bulk_insert_var -- - * Variable-length column-store bulk insert. - */ -int -__wt_bulk_insert_var( - WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) -{ - WT_BTREE *btree; - WT_KV *val; - WT_RECONCILE *r; - - r = cbulk->reconcile; - btree = S2BT(session); - - val = &r->v; - if (deleted) { - val->cell_len = __wt_cell_pack_del(&val->cell, cbulk->rle); - val->buf.data = NULL; - val->buf.size = 0; - val->len = val->cell_len; - } else - /* - * Store the bulk cursor's last buffer, not the current value, - * we're tracking duplicates, which means we want the previous - * value seen, not the current value. - */ - WT_RET(__rec_cell_build_val(session, - r, cbulk->last.data, cbulk->last.size, cbulk->rle)); - - /* Boundary: split or write the page. */ - if (WT_CROSSING_SPLIT_BND(r, val->len)) - WT_RET(__rec_split_crossing_bnd(session, r, val->len)); - - /* Copy the value onto the page. 
*/ - if (btree->dictionary) - WT_RET(__rec_dict_replace(session, r, cbulk->rle, val)); - __rec_copy_incr(session, r, val); - - /* Update the starting record number in case we split. */ - r->recno += cbulk->rle; - - return (0); -} - -/* - * __rec_vtype -- - * Return a value cell's address type. - */ -static inline u_int -__rec_vtype(WT_ADDR *addr) -{ - if (addr->type == WT_ADDR_INT) - return (WT_CELL_ADDR_INT); - if (addr->type == WT_ADDR_LEAF) - return (WT_CELL_ADDR_LEAF); - return (WT_CELL_ADDR_LEAF_NO); -} - -/* - * __rec_col_int -- - * Reconcile a column-store internal page. - */ -static int -__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) -{ - WT_ADDR *addr; - WT_BTREE *btree; - WT_CELL_UNPACK *vpack, _vpack; - WT_CHILD_STATE state; - WT_DECL_RET; - WT_KV *val; - WT_PAGE *child, *page; - WT_REF *ref; - bool hazard; - - btree = S2BT(session); - page = pageref->page; - child = NULL; - hazard = false; - - val = &r->v; - vpack = &_vpack; - - WT_RET(__rec_split_init(session, - r, page, pageref->ref_recno, btree->maxintlpage_precomp)); - - /* For each entry in the in-memory page... */ - WT_INTL_FOREACH_BEGIN(session, page, ref) { - /* Update the starting record number in case we split. */ - r->recno = ref->ref_recno; - - /* - * Modified child. - * The page may be emptied or internally created during a split. - * Deleted/split pages are merged into the parent and discarded. - */ - WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); - addr = NULL; - child = ref->page; - - switch (state) { - case WT_CHILD_IGNORE: - /* Ignored child. */ - WT_CHILD_RELEASE_ERR(session, hazard, ref); - continue; - - case WT_CHILD_MODIFIED: - /* - * Modified child. Empty pages are merged into the - * parent and discarded. - */ - switch (child->modify->rec_result) { - case WT_PM_REC_EMPTY: - /* - * Column-store pages are almost never empty, as - * discarding a page would remove a chunk of the - * name space. 
The exceptions are pages created - * when the tree is created, and never filled. - */ - WT_CHILD_RELEASE_ERR(session, hazard, ref); - continue; - case WT_PM_REC_MULTIBLOCK: - WT_ERR(__rec_col_merge(session, r, child)); - WT_CHILD_RELEASE_ERR(session, hazard, ref); - continue; - case WT_PM_REC_REPLACE: - addr = &child->modify->mod_replace; - break; - WT_ILLEGAL_VALUE_ERR( - session, child->modify->rec_result); - } - break; - case WT_CHILD_ORIGINAL: - /* Original child. */ - break; - case WT_CHILD_PROXY: - /* - * Deleted child where we write a proxy cell, not yet - * supported for column-store. - */ - WT_ERR(__wt_illegal_value(session, state)); - } - - /* - * Build the value cell. The child page address is in one of 3 - * places: if the page was replaced, the page's modify structure - * references it and we built the value cell just above in the - * switch statement. Else, the WT_REF->addr reference points to - * an on-page cell or an off-page WT_ADDR structure: if it's an - * on-page cell and we copy it from the page, else build a new - * cell. - */ - if (addr == NULL && __wt_off_page(page, ref->addr)) - addr = ref->addr; - if (addr == NULL) { - __wt_cell_unpack(ref->addr, vpack); - val->buf.data = ref->addr; - val->buf.size = __wt_cell_total_len(vpack); - val->cell_len = 0; - val->len = val->buf.size; - } else - __rec_cell_build_addr(session, r, - addr->addr, addr->size, - __rec_vtype(addr), ref->ref_recno); - WT_CHILD_RELEASE_ERR(session, hazard, ref); - - /* Boundary: split or write the page. */ - if (__rec_need_split(r, val->len)) - WT_ERR(__rec_split_crossing_bnd(session, r, val->len)); - - /* Copy the value onto the page. */ - __rec_copy_incr(session, r, val); - } WT_INTL_FOREACH_END; - - /* Write the remnant page. */ - return (__rec_split_finish(session, r)); - -err: WT_CHILD_RELEASE(session, hazard, ref); - return (ret); -} - -/* - * __rec_col_merge -- - * Merge in a split page. 
- */ -static int -__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) -{ - WT_ADDR *addr; - WT_KV *val; - WT_MULTI *multi; - WT_PAGE_MODIFY *mod; - uint32_t i; - - mod = page->modify; - - val = &r->v; - - /* For each entry in the split array... */ - for (multi = mod->mod_multi, - i = 0; i < mod->mod_multi_entries; ++multi, ++i) { - /* Update the starting record number in case we split. */ - r->recno = multi->key.recno; - - /* Build the value cell. */ - addr = &multi->addr; - __rec_cell_build_addr(session, r, - addr->addr, addr->size, __rec_vtype(addr), r->recno); - - /* Boundary: split or write the page. */ - if (__rec_need_split(r, val->len)) - WT_RET(__rec_split_crossing_bnd(session, r, val->len)); - - /* Copy the value onto the page. */ - __rec_copy_incr(session, r, val); - } - return (0); -} - -/* - * __rec_col_fix -- - * Reconcile a fixed-width, column-store leaf page. - */ -static int -__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) -{ - WT_BTREE *btree; - WT_INSERT *ins; - WT_PAGE *page; - WT_UPDATE *upd; - uint64_t recno; - uint32_t entry, nrecs; - - btree = S2BT(session); - page = pageref->page; - - WT_RET(__rec_split_init( - session, r, page, pageref->ref_recno, btree->maxleafpage)); - - /* Copy the original, disk-image bytes into place. */ - memcpy(r->first_free, page->pg_fix_bitf, - __bitstr_size((size_t)page->entries * btree->bitcnt)); - - /* Update any changes to the original on-page data items. */ - WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) { - WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, NULL, &upd)); - if (upd != NULL) - __bit_setv(r->first_free, - WT_INSERT_RECNO(ins) - pageref->ref_recno, - btree->bitcnt, *upd->data); - } - - /* Calculate the number of entries per page remainder. */ - entry = page->entries; - nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail) - page->entries; - r->recno += entry; - - /* Walk any append list. 
*/ - for (ins = - WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) { - if (ins == NULL) { - /* - * If the page split, instantiate any missing records in - * the page's name space. (Imagine record 98 is - * transactionally visible, 99 wasn't created or is not - * yet visible, 100 is visible. Then the page splits and - * record 100 moves to another page. When we reconcile - * the original page, we write record 98, then we don't - * see record 99 for whatever reason. If we've moved - * record 100, we don't know to write a deleted record - * 99 on the page.) - * - * The record number recorded during the split is the - * first key on the split page, that is, one larger than - * the last key on this page, we have to decrement it. - */ - if ((recno = - page->modify->mod_col_split_recno) == WT_RECNO_OOB) - break; - recno -= 1; - - /* - * The following loop assumes records to write, and the - * previous key might have been visible. - */ - if (r->recno > recno) - break; - upd = NULL; - } else { - WT_RET(__rec_txn_read( - session, r, ins, NULL, NULL, NULL, &upd)); - recno = WT_INSERT_RECNO(ins); - } - for (;;) { - /* - * The application may have inserted records which left - * gaps in the name space. - */ - for (; - nrecs > 0 && r->recno < recno; - --nrecs, ++entry, ++r->recno) - __bit_setv( - r->first_free, entry, btree->bitcnt, 0); - - if (nrecs > 0) { - __bit_setv(r->first_free, entry, btree->bitcnt, - upd == NULL ? 0 : *upd->data); - --nrecs; - ++entry; - ++r->recno; - break; - } - - /* - * If everything didn't fit, update the counters and - * split. - * - * Boundary: split or write the page. - * - * No need to have a minimum split size boundary, all - * pages are filled 100% except the last, allowing it to - * grow in the future. - */ - __rec_incr(session, r, entry, - __bitstr_size((size_t)entry * btree->bitcnt)); - WT_RET(__rec_split(session, r, 0)); - - /* Calculate the number of entries per page. 
*/ - entry = 0; - nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); - } - - /* - * Execute this loop once without an insert item to catch any - * missing records due to a split, then quit. - */ - if (ins == NULL) - break; - } - - /* Update the counters. */ - __rec_incr( - session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); - - /* Write the remnant page. */ - return (__rec_split_finish(session, r)); -} - -/* - * __rec_col_fix_slvg -- - * Reconcile a fixed-width, column-store leaf page created during salvage. - */ -static int -__rec_col_fix_slvg(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage) -{ - WT_BTREE *btree; - WT_PAGE *page; - uint64_t page_start, page_take; - uint32_t entry, nrecs; - - btree = S2BT(session); - page = pageref->page; - - /* - * !!! - * It's vanishingly unlikely and probably impossible for fixed-length - * column-store files to have overlapping key ranges. It's possible - * for an entire key range to go missing (if a page is corrupted and - * lost), but because pages can't split, it shouldn't be possible to - * find pages where the key ranges overlap. That said, we check for - * it during salvage and clean up after it here because it doesn't - * cost much and future column-store formats or operations might allow - * for fixed-length format ranges to overlap during salvage, and I - * don't want to have to retrofit the code later. - */ - WT_RET(__rec_split_init( - session, r, page, pageref->ref_recno, btree->maxleafpage)); - - /* We may not be taking all of the entries on the original page. */ - page_take = salvage->take == 0 ? page->entries : salvage->take; - page_start = salvage->skip == 0 ? 0 : salvage->skip; - - /* Calculate the number of entries per page. 
*/ - entry = 0; - nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); - - for (; nrecs > 0 && salvage->missing > 0; - --nrecs, --salvage->missing, ++entry) - __bit_setv(r->first_free, entry, btree->bitcnt, 0); - - for (; nrecs > 0 && page_take > 0; - --nrecs, --page_take, ++page_start, ++entry) - __bit_setv(r->first_free, entry, btree->bitcnt, - __bit_getv(page->pg_fix_bitf, - (uint32_t)page_start, btree->bitcnt)); - - r->recno += entry; - __rec_incr(session, r, entry, - __bitstr_size((size_t)entry * btree->bitcnt)); - - /* - * We can't split during salvage -- if everything didn't fit, it's - * all gone wrong. - */ - if (salvage->missing != 0 || page_take != 0) - WT_PANIC_RET(session, WT_PANIC, - "%s page too large, attempted split during salvage", - __wt_page_type_string(page->type)); - - /* Write the page. */ - return (__rec_split_finish(session, r)); -} - -/* - * __rec_col_var_helper -- - * Create a column-store variable length record cell and write it onto a - * page. - */ -static int -__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, - WT_SALVAGE_COOKIE *salvage, - WT_ITEM *value, bool deleted, uint8_t overflow_type, uint64_t rle) -{ - WT_BTREE *btree; - WT_KV *val; - - btree = S2BT(session); - - val = &r->v; - - /* - * Occasionally, salvage needs to discard records from the beginning or - * end of the page, and because the items may be part of a RLE cell, do - * the adjustments here. It's not a mistake we don't bother telling - * our caller we've handled all the records from the page we care about, - * and can quit processing the page: salvage is a rare operation and I - * don't want to complicate our caller's loop. 
- */ - if (salvage != NULL) { - if (salvage->done) - return (0); - if (salvage->skip != 0) { - if (rle <= salvage->skip) { - salvage->skip -= rle; - return (0); - } - rle -= salvage->skip; - salvage->skip = 0; - } - if (salvage->take != 0) { - if (rle <= salvage->take) - salvage->take -= rle; - else { - rle = salvage->take; - salvage->take = 0; - } - if (salvage->take == 0) - salvage->done = true; - } - } - - if (deleted) { - val->cell_len = __wt_cell_pack_del(&val->cell, rle); - val->buf.data = NULL; - val->buf.size = 0; - val->len = val->cell_len; - } else if (overflow_type) { - val->cell_len = __wt_cell_pack_ovfl( - &val->cell, overflow_type, rle, value->size); - val->buf.data = value->data; - val->buf.size = value->size; - val->len = val->cell_len + value->size; - } else - WT_RET(__rec_cell_build_val( - session, r, value->data, value->size, rle)); - - /* Boundary: split or write the page. */ - if (__rec_need_split(r, val->len)) - WT_RET(__rec_split_crossing_bnd(session, r, val->len)); - - /* Copy the value onto the page. */ - if (!deleted && !overflow_type && btree->dictionary) - WT_RET(__rec_dict_replace(session, r, rle, val)); - __rec_copy_incr(session, r, val); - - /* Update the starting record number in case we split. */ - r->recno += rle; - - return (0); -} - -/* - * __rec_col_var -- - * Reconcile a variable-width column-store leaf page. 
- */ -static int -__rec_col_var(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage) -{ - enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state; - WT_BTREE *btree; - WT_CELL *cell; - WT_CELL_UNPACK *vpack, _vpack; - WT_COL *cip; - WT_CURSOR_BTREE *cbt; - WT_DECL_ITEM(orig); - WT_DECL_RET; - WT_INSERT *ins; - WT_ITEM *last; - WT_PAGE *page; - WT_UPDATE *upd; - uint64_t n, nrepeat, repeat_count, rle, skip, src_recno; - uint32_t i, size; - bool deleted, last_deleted, orig_deleted, update_no_copy; - const void *data; - - btree = S2BT(session); - page = pageref->page; - last = r->last; - vpack = &_vpack; - cbt = &r->update_modify_cbt; - - WT_RET(__rec_split_init(session, - r, page, pageref->ref_recno, btree->maxleafpage_precomp)); - - WT_RET(__wt_scr_alloc(session, 0, &orig)); - data = NULL; - size = 0; - upd = NULL; - - /* - * The salvage code may be calling us to reconcile a page where there - * were missing records in the column-store name space. If taking the - * first record from on the page, it might be a deleted record, so we - * have to give the RLE code a chance to figure that out. Else, if - * not taking the first record from the page, write a single element - * representing the missing records onto a new page. (Don't pass the - * salvage cookie to our helper function in this case, we're handling - * one of the salvage cookie fields on our own, and we don't need the - * helper function's assistance.) - */ - rle = 0; - last_deleted = false; - if (salvage != NULL && salvage->missing != 0) { - if (salvage->skip == 0) { - rle = salvage->missing; - last_deleted = true; - - /* - * Correct the number of records we're going to "take", - * pretending the missing records were on the page. 
- */ - salvage->take += salvage->missing; - } else - WT_ERR(__rec_col_var_helper(session, - r, NULL, NULL, true, false, salvage->missing)); - } - - /* - * We track two data items through this loop: the previous (last) item - * and the current item: if the last item is the same as the current - * item, we increment the RLE count for the last item; if the last item - * is different from the current item, we write the last item onto the - * page, and replace it with the current item. The r->recno counter - * tracks records written to the page, and is incremented by the helper - * function immediately after writing records to the page. The record - * number of our source record, that is, the current item, is maintained - * in src_recno. - */ - src_recno = r->recno + rle; - - /* For each entry in the in-memory page... */ - WT_COL_FOREACH(page, cip, i) { - ovfl_state = OVFL_IGNORE; - if ((cell = WT_COL_PTR(page, cip)) == NULL) { - nrepeat = 1; - ins = NULL; - orig_deleted = true; - } else { - __wt_cell_unpack(cell, vpack); - nrepeat = __wt_cell_rle(vpack); - ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip)); - - /* - * If the original value is "deleted", there's no value - * to compare, we're done. - */ - orig_deleted = vpack->type == WT_CELL_DEL; - if (orig_deleted) - goto record_loop; - - /* - * Overflow items are tricky: we don't know until we're - * finished processing the set of values if we need the - * overflow value or not. If we don't use the overflow - * item at all, we have to discard it from the backing - * file, otherwise we'll leak blocks on the checkpoint. - * That's safe because if the backing overflow value is - * still needed by any running transaction, we'll cache - * a copy in the update list. 
- * - * Regardless, we avoid copying in overflow records: if - * there's a WT_INSERT entry that modifies a reference - * counted overflow record, we may have to write copies - * of the overflow record, and in that case we'll do the - * comparisons, but we don't read overflow items just to - * see if they match records on either side. - */ - if (vpack->ovfl) { - ovfl_state = OVFL_UNUSED; - goto record_loop; - } - - /* - * If data is Huffman encoded, we have to decode it in - * order to compare it with the last item we saw, which - * may have been an update string. This guarantees we - * find every single pair of objects we can RLE encode, - * including applications updating an existing record - * where the new value happens (?) to match a Huffman- - * encoded value in a previous or next record. - */ - WT_ERR(__wt_dsk_cell_data_ref( - session, WT_PAGE_COL_VAR, vpack, orig)); - } - -record_loop: /* - * Generate on-page entries: loop repeat records, looking for - * WT_INSERT entries matching the record number. The WT_INSERT - * lists are in sorted order, so only need check the next one. 
- */ - for (n = 0; - n < nrepeat; n += repeat_count, src_recno += repeat_count) { - upd = NULL; - if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) { - WT_ERR(__rec_txn_read( - session, r, ins, cip, vpack, NULL, &upd)); - ins = WT_SKIP_NEXT(ins); - } - - update_no_copy = true; /* No data copy */ - repeat_count = 1; /* Single record */ - deleted = false; - - if (upd != NULL) { - switch (upd->type) { - case WT_UPDATE_MODIFY: - cbt->slot = WT_COL_SLOT(page, cip); - WT_ERR(__wt_value_return_upd( - session, cbt, upd, - F_ISSET(r, WT_REC_VISIBLE_ALL))); - data = cbt->iface.value.data; - size = (uint32_t)cbt->iface.value.size; - update_no_copy = false; - break; - case WT_UPDATE_STANDARD: - data = upd->data; - size = upd->size; - break; - case WT_UPDATE_TOMBSTONE: - deleted = true; - break; - WT_ILLEGAL_VALUE_ERR(session, upd->type); - } - } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) { - /* - * If doing an update save and restore, and the - * underlying value is a removed overflow value, - * we end up here. - * - * If necessary, when the overflow value was - * originally removed, reconciliation appended - * a globally visible copy of the value to the - * key's update list, meaning the on-page item - * isn't accessed after page re-instantiation. - * - * Assert the case. - */ - WT_ASSERT(session, - F_ISSET(r, WT_REC_UPDATE_RESTORE)); - - /* - * The on-page value will never be accessed, - * write a placeholder record. - */ - data = "ovfl-unused"; - size = WT_STORE_SIZE(strlen("ovfl-unused")); - } else { - update_no_copy = false; /* Maybe data copy */ - - /* - * The repeat count is the number of records up - * to the next WT_INSERT record, or up to the - * end of the entry if we have no more WT_INSERT - * records. 
- */ - if (ins == NULL) - repeat_count = nrepeat - n; - else - repeat_count = - WT_INSERT_RECNO(ins) - src_recno; - - deleted = orig_deleted; - if (deleted) - goto compare; - - /* - * If we are handling overflow items, use the - * overflow item itself exactly once, after - * which we have to copy it into a buffer and - * from then on use a complete copy because we - * are re-creating a new overflow record each - * time. - */ - switch (ovfl_state) { - case OVFL_UNUSED: - /* - * An as-yet-unused overflow item. - * - * We're going to copy the on-page cell, - * write out any record we're tracking. - */ - if (rle != 0) { - WT_ERR(__rec_col_var_helper( - session, r, salvage, last, - last_deleted, 0, rle)); - rle = 0; - } - - last->data = vpack->data; - last->size = vpack->size; - WT_ERR(__rec_col_var_helper( - session, r, salvage, last, false, - WT_CELL_VALUE_OVFL, repeat_count)); - - /* Track if page has overflow items. */ - r->ovfl_items = true; - - ovfl_state = OVFL_USED; - continue; - case OVFL_USED: - /* - * Original is an overflow item; we used - * it for a key and now we need another - * copy; read it into memory. - */ - WT_ERR(__wt_dsk_cell_data_ref(session, - WT_PAGE_COL_VAR, vpack, orig)); - - ovfl_state = OVFL_IGNORE; - /* FALLTHROUGH */ - case OVFL_IGNORE: - /* - * Original is an overflow item and we - * were forced to copy it into memory, - * or the original wasn't an overflow - * item; use the data copied into orig. - */ - data = orig->data; - size = (uint32_t)orig->size; - break; - } - } - -compare: /* - * If we have a record against which to compare, and - * the records compare equal, increment the rle counter - * and continue. If the records don't compare equal, - * output the last record and swap the last and current - * buffers: do NOT update the starting record number, - * we've been doing that all along. 
- */ - if (rle != 0) { - if ((deleted && last_deleted) || - (!last_deleted && !deleted && - last->size == size && - memcmp(last->data, data, size) == 0)) { - rle += repeat_count; - continue; - } - WT_ERR(__rec_col_var_helper(session, r, - salvage, last, last_deleted, 0, rle)); - } - - /* - * Swap the current/last state. - * - * Reset RLE counter and turn on comparisons. - */ - if (!deleted) { - /* - * We can't simply assign the data values into - * the last buffer because they may have come - * from a copy built from an encoded/overflow - * cell and creating the next record is going - * to overwrite that memory. Check, because - * encoded/overflow cells aren't that common - * and we'd like to avoid the copy. If data - * was taken from the current unpack structure - * (which points into the page), or was taken - * from an update structure, we can just use - * the pointers, they're not moving. - */ - if (data == vpack->data || update_no_copy) { - last->data = data; - last->size = size; - } else - WT_ERR(__wt_buf_set( - session, last, data, size)); - } - last_deleted = deleted; - rle = repeat_count; - } - - /* - * The first time we find an overflow record we never used, - * discard the underlying blocks, they're no longer useful. - */ - if (ovfl_state == OVFL_UNUSED && - vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove( - session, page, vpack, F_ISSET(r, WT_REC_EVICT))); - } - - /* Walk any append list. */ - for (ins = - WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) { - if (ins == NULL) { - /* - * If the page split, instantiate any missing records in - * the page's name space. (Imagine record 98 is - * transactionally visible, 99 wasn't created or is not - * yet visible, 100 is visible. Then the page splits and - * record 100 moves to another page. When we reconcile - * the original page, we write record 98, then we don't - * see record 99 for whatever reason. 
If we've moved - * record 100, we don't know to write a deleted record - * 99 on the page.) - * - * Assert the recorded record number is past the end of - * the page. - * - * The record number recorded during the split is the - * first key on the split page, that is, one larger than - * the last key on this page, we have to decrement it. - */ - if ((n = page-> - modify->mod_col_split_recno) == WT_RECNO_OOB) - break; - WT_ASSERT(session, n >= src_recno); - n -= 1; - - upd = NULL; - } else { - WT_ERR(__rec_txn_read( - session, r, ins, NULL, NULL, NULL, &upd)); - n = WT_INSERT_RECNO(ins); - } - while (src_recno <= n) { - deleted = false; - update_no_copy = true; - - /* - * The application may have inserted records which left - * gaps in the name space, and these gaps can be huge. - * If we're in a set of deleted records, skip the boring - * part. - */ - if (src_recno < n) { - deleted = true; - if (last_deleted) { - /* - * The record adjustment is decremented - * by one so we can naturally fall into - * the RLE accounting below, where we - * increment rle by one, then continue - * in the outer loop, where we increment - * src_recno by one. - */ - skip = (n - src_recno) - 1; - rle += skip; - src_recno += skip; - } - } else if (upd == NULL) - deleted = true; - else - switch (upd->type) { - case WT_UPDATE_MODIFY: - /* - * Impossible slot, there's no backing - * on-page item. - */ - cbt->slot = UINT32_MAX; - WT_ERR(__wt_value_return_upd( - session, cbt, upd, - F_ISSET(r, WT_REC_VISIBLE_ALL))); - data = cbt->iface.value.data; - size = (uint32_t)cbt->iface.value.size; - update_no_copy = false; - break; - case WT_UPDATE_STANDARD: - data = upd->data; - size = upd->size; - break; - case WT_UPDATE_TOMBSTONE: - deleted = true; - break; - WT_ILLEGAL_VALUE_ERR(session, upd->type); - } - - /* - * Handle RLE accounting and comparisons -- see comment - * above, this code fragment does the same thing. 
- */ - if (rle != 0) { - if ((deleted && last_deleted) || - (!last_deleted && !deleted && - last->size == size && - memcmp(last->data, data, size) == 0)) { - ++rle; - goto next; - } - WT_ERR(__rec_col_var_helper(session, r, - salvage, last, last_deleted, 0, rle)); - } - - /* - * Swap the current/last state. We can't simply assign - * the data values into the last buffer because they may - * be a temporary copy built from a chain of modified - * updates and creating the next record will overwrite - * that memory. Check, we'd like to avoid the copy. If - * data was taken from an update structure, we can just - * use the pointers, they're not moving. - */ - if (!deleted) { - if (update_no_copy) { - last->data = data; - last->size = size; - } else - WT_ERR(__wt_buf_set( - session, last, data, size)); - } - - /* Ready for the next loop, reset the RLE counter. */ - last_deleted = deleted; - rle = 1; - - /* - * Move to the next record. It's not a simple increment - * because if it's the maximum record, incrementing it - * wraps to 0 and this turns into an infinite loop. - */ -next: if (src_recno == UINT64_MAX) - break; - ++src_recno; - } - - /* - * Execute this loop once without an insert item to catch any - * missing records due to a split, then quit. - */ - if (ins == NULL) - break; - } - - /* If we were tracking a record, write it. */ - if (rle != 0) - WT_ERR(__rec_col_var_helper( - session, r, salvage, last, last_deleted, 0, rle)); - - /* Write the remnant page. */ - ret = __rec_split_finish(session, r); - -err: __wt_scr_free(session, &orig); - return (ret); -} - -/* - * __rec_row_int -- - * Reconcile a row-store internal page. 
- */ -static int -__rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) -{ - WT_ADDR *addr; - WT_BTREE *btree; - WT_CELL *cell; - WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack; - WT_CHILD_STATE state; - WT_DECL_RET; - WT_IKEY *ikey; - WT_KV *key, *val; - WT_PAGE *child; - WT_REF *ref; - size_t size; - u_int vtype; - bool hazard, key_onpage_ovfl, ovfl_key; - const void *p; - - btree = S2BT(session); - child = NULL; - hazard = false; - - key = &r->k; - kpack = &_kpack; - WT_CLEAR(*kpack); /* -Wuninitialized */ - val = &r->v; - vpack = &_vpack; - WT_CLEAR(*vpack); /* -Wuninitialized */ - - ikey = NULL; /* -Wuninitialized */ - cell = NULL; - key_onpage_ovfl = false; - - WT_RET(__rec_split_init( - session, r, page, 0, btree->maxintlpage_precomp)); - - /* - * Ideally, we'd never store the 0th key on row-store internal pages - * because it's never used during tree search and there's no reason - * to waste the space. The problem is how we do splits: when we split, - * we've potentially picked out several "split points" in the buffer - * which is overflowing the maximum page size, and when the overflow - * happens, we go back and physically split the buffer, at those split - * points, into new pages. It would be both difficult and expensive - * to re-process the 0th key at each split point to be an empty key, - * so we don't do that. However, we are reconciling an internal page - * for whatever reason, and the 0th key is known to be useless. We - * truncate the key to a single byte, instead of removing it entirely, - * it simplifies various things in other parts of the code (we don't - * have to special case transforming the page from its disk image to - * its in-memory version, for example). - */ - r->cell_zero = true; - - /* For each entry in the in-memory page... */ - WT_INTL_FOREACH_BEGIN(session, page, ref) { - /* - * There are different paths if the key is an overflow item vs. - * a straight-forward on-page value. 
If an overflow item, we - * would have instantiated it, and we can use that fact to set - * things up. - * - * Note the cell reference and unpacked key cell are available - * only in the case of an instantiated, off-page key, we don't - * bother setting them if that's not possible. - */ - if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) { - cell = NULL; - key_onpage_ovfl = false; - ikey = __wt_ref_key_instantiated(ref); - if (ikey != NULL && ikey->cell_offset != 0) { - cell = - WT_PAGE_REF_OFFSET(page, ikey->cell_offset); - __wt_cell_unpack(cell, kpack); - key_onpage_ovfl = kpack->ovfl && - kpack->raw != WT_CELL_KEY_OVFL_RM; - } - } - - WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); - addr = ref->addr; - child = ref->page; - - switch (state) { - case WT_CHILD_IGNORE: - /* - * Ignored child. - * - * Overflow keys referencing pages we're not writing are - * no longer useful, schedule them for discard. Don't - * worry about instantiation, internal page keys are - * always instantiated. Don't worry about reuse, - * reusing this key in this reconciliation is unlikely. - */ - if (key_onpage_ovfl) - WT_ERR(__wt_ovfl_discard_add( - session, page, kpack->cell)); - WT_CHILD_RELEASE_ERR(session, hazard, ref); - continue; - - case WT_CHILD_MODIFIED: - /* - * Modified child. Empty pages are merged into the - * parent and discarded. - */ - switch (child->modify->rec_result) { - case WT_PM_REC_EMPTY: - /* - * Overflow keys referencing empty pages are no - * longer useful, schedule them for discard. - * Don't worry about instantiation, internal - * page keys are always instantiated. Don't - * worry about reuse, reusing this key in this - * reconciliation is unlikely. 
- */ - if (key_onpage_ovfl) - WT_ERR(__wt_ovfl_discard_add( - session, page, kpack->cell)); - WT_CHILD_RELEASE_ERR(session, hazard, ref); - continue; - case WT_PM_REC_MULTIBLOCK: - /* - * Overflow keys referencing split pages are no - * longer useful (the split page's key is the - * interesting key); schedule them for discard. - * Don't worry about instantiation, internal - * page keys are always instantiated. Don't - * worry about reuse, reusing this key in this - * reconciliation is unlikely. - */ - if (key_onpage_ovfl) - WT_ERR(__wt_ovfl_discard_add( - session, page, kpack->cell)); - - WT_ERR(__rec_row_merge(session, r, child)); - WT_CHILD_RELEASE_ERR(session, hazard, ref); - continue; - case WT_PM_REC_REPLACE: - /* - * If the page is replaced, the page's modify - * structure has the page's address. - */ - addr = &child->modify->mod_replace; - break; - WT_ILLEGAL_VALUE_ERR( - session, child->modify->rec_result); - } - break; - case WT_CHILD_ORIGINAL: - /* Original child. */ - break; - case WT_CHILD_PROXY: - /* Deleted child where we write a proxy cell. */ - break; - } - - /* - * Build the value cell, the child page's address. Addr points - * to an on-page cell or an off-page WT_ADDR structure. There's - * a special cell type in the case of page deletion requiring - * a proxy cell, otherwise use the information from the addr or - * original cell. - */ - if (__wt_off_page(page, addr)) { - p = addr->addr; - size = addr->size; - vtype = state == WT_CHILD_PROXY ? - WT_CELL_ADDR_DEL : __rec_vtype(addr); - } else { - __wt_cell_unpack(ref->addr, vpack); - p = vpack->data; - size = vpack->size; - vtype = state == WT_CHILD_PROXY ? - WT_CELL_ADDR_DEL : (u_int)vpack->raw; - } - __rec_cell_build_addr(session, r, p, size, vtype, WT_RECNO_OOB); - WT_CHILD_RELEASE_ERR(session, hazard, ref); - - /* - * Build key cell. - * Truncate any 0th key, internal pages don't need 0th keys. 
- */ - if (key_onpage_ovfl) { - key->buf.data = cell; - key->buf.size = __wt_cell_total_len(kpack); - key->cell_len = 0; - key->len = key->buf.size; - ovfl_key = true; - } else { - __wt_ref_key(page, ref, &p, &size); - WT_ERR(__rec_cell_build_int_key( - session, r, p, r->cell_zero ? 1 : size, &ovfl_key)); - } - r->cell_zero = false; - - /* Boundary: split or write the page. */ - if (__rec_need_split(r, key->len + val->len)) { - /* - * In one path above, we copied address blocks from the - * page rather than building the actual key. In that - * case, we have to build the key now because we are - * about to promote it. - */ - if (key_onpage_ovfl) { - WT_ERR(__wt_buf_set(session, r->cur, - WT_IKEY_DATA(ikey), ikey->size)); - key_onpage_ovfl = false; - } - - WT_ERR(__rec_split_crossing_bnd( - session, r, key->len + val->len)); - } - - /* Copy the key and value onto the page. */ - __rec_copy_incr(session, r, key); - __rec_copy_incr(session, r, val); - - /* Update compression state. */ - __rec_key_state_update(r, ovfl_key); - } WT_INTL_FOREACH_END; - - /* Write the remnant page. */ - return (__rec_split_finish(session, r)); - -err: WT_CHILD_RELEASE(session, hazard, ref); - return (ret); -} - -/* - * __rec_row_merge -- - * Merge in a split page. - */ -static int -__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) -{ - WT_ADDR *addr; - WT_KV *key, *val; - WT_MULTI *multi; - WT_PAGE_MODIFY *mod; - uint32_t i; - bool ovfl_key; - - mod = page->modify; - - key = &r->k; - val = &r->v; - - /* For each entry in the split array... */ - for (multi = mod->mod_multi, - i = 0; i < mod->mod_multi_entries; ++multi, ++i) { - /* Build the key and value cells. */ - WT_RET(__rec_cell_build_int_key(session, r, - WT_IKEY_DATA(multi->key.ikey), - r->cell_zero ? 
1 : multi->key.ikey->size, &ovfl_key)); - r->cell_zero = false; - - addr = &multi->addr; - __rec_cell_build_addr(session, r, - addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); - - /* Boundary: split or write the page. */ - if (__rec_need_split(r, key->len + val->len)) - WT_RET(__rec_split_crossing_bnd( - session, r, key->len + val->len)); - - /* Copy the key and value onto the page. */ - __rec_copy_incr(session, r, key); - __rec_copy_incr(session, r, val); - - /* Update compression state. */ - __rec_key_state_update(r, ovfl_key); - } - return (0); -} - -/* - * __rec_row_leaf -- - * Reconcile a row-store leaf page. - */ -static int -__rec_row_leaf(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) -{ - WT_BTREE *btree; - WT_CELL *cell; - WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack; - WT_CURSOR_BTREE *cbt; - WT_DECL_ITEM(tmpkey); - WT_DECL_ITEM(tmpval); - WT_DECL_RET; - WT_IKEY *ikey; - WT_INSERT *ins; - WT_KV *key, *val; - WT_ROW *rip; - WT_UPDATE *upd; - size_t size; - uint64_t slvg_skip; - uint32_t i; - bool dictionary, key_onpage_ovfl, ovfl_key; - void *copy; - const void *p; - - btree = S2BT(session); - cbt = &r->update_modify_cbt; - slvg_skip = salvage == NULL ? 0 : salvage->skip; - - key = &r->k; - val = &r->v; - vpack = &_vpack; - - WT_RET(__rec_split_init( - session, r, page, 0, btree->maxleafpage_precomp)); - - /* - * Write any K/V pairs inserted into the page before the first from-disk - * key on the page. - */ - if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) - WT_RET(__rec_row_leaf_insert(session, r, ins)); - - /* - * Temporary buffers in which to instantiate any uninstantiated keys - * or value items we need. - */ - WT_ERR(__wt_scr_alloc(session, 0, &tmpkey)); - WT_ERR(__wt_scr_alloc(session, 0, &tmpval)); - - /* For each entry in the page... 
*/ - WT_ROW_FOREACH(page, rip, i) { - /* - * The salvage code, on some rare occasions, wants to reconcile - * a page but skip some leading records on the page. Because - * the row-store leaf reconciliation function copies keys from - * the original disk page, this is non-trivial -- just changing - * the in-memory pointers isn't sufficient, we have to change - * the WT_CELL structures on the disk page, too. It's ugly, but - * we pass in a value that tells us how many records to skip in - * this case. - */ - if (slvg_skip != 0) { - --slvg_skip; - continue; - } - - /* - * Figure out the key: set any cell reference (and unpack it), - * set any instantiated key reference. - */ - copy = WT_ROW_KEY_COPY(rip); - (void)__wt_row_leaf_key_info( - page, copy, &ikey, &cell, NULL, NULL); - if (cell == NULL) - kpack = NULL; - else { - kpack = &_kpack; - __wt_cell_unpack(cell, kpack); - } - - /* Unpack the on-page value cell, and look for an update. */ - __wt_row_leaf_value_cell(page, rip, NULL, vpack); - WT_ERR(__rec_txn_read( - session, r, NULL, rip, vpack, NULL, &upd)); - - /* Build value cell. */ - dictionary = false; - if (upd == NULL) { - /* - * When the page was read into memory, there may not - * have been a value item. - * - * If there was a value item, check if it's a dictionary - * cell (a copy of another item on the page). If it's a - * copy, we have to create a new value item as the old - * item might have been discarded from the page. - */ - if (vpack->raw == WT_CELL_VALUE_COPY) { - /* If the item is Huffman encoded, decode it. 
*/ - if (btree->huffman_value == NULL) { - p = vpack->data; - size = vpack->size; - } else { - WT_ERR(__wt_huffman_decode(session, - btree->huffman_value, - vpack->data, vpack->size, - tmpval)); - p = tmpval->data; - size = tmpval->size; - } - WT_ERR(__rec_cell_build_val( - session, r, p, size, (uint64_t)0)); - dictionary = true; - } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) { - /* - * If doing an update save and restore, and the - * underlying value is a removed overflow value, - * we end up here. - * - * If necessary, when the overflow value was - * originally removed, reconciliation appended - * a globally visible copy of the value to the - * key's update list, meaning the on-page item - * isn't accessed after page re-instantiation. - * - * Assert the case. - */ - WT_ASSERT(session, - F_ISSET(r, WT_REC_UPDATE_RESTORE)); - - /* - * If the key is also a removed overflow item, - * don't write anything at all. - * - * We don't have to write anything because the - * code re-instantiating the page gets the key - * to match the saved list of updates from the - * original page. By not putting the key on - * the page, we'll move the key/value set from - * a row-store leaf page slot to an insert list, - * but that shouldn't matter. - * - * The reason we bother with the test is because - * overflows are expensive to write. It's hard - * to imagine a real workload where this test is - * worth the effort, but it's a simple test. - */ - if (kpack != NULL && - kpack->raw == WT_CELL_KEY_OVFL_RM) - goto leaf_insert; - - /* - * The on-page value will never be accessed, - * write a placeholder record. - */ - WT_ERR(__rec_cell_build_val(session, r, - "ovfl-unused", strlen("ovfl-unused"), - (uint64_t)0)); - } else { - val->buf.data = vpack->cell; - val->buf.size = __wt_cell_total_len(vpack); - val->cell_len = 0; - val->len = val->buf.size; - - /* Track if page has overflow items. 
*/ - if (vpack->ovfl) - r->ovfl_items = true; - } - } else { - /* - * The first time we find an overflow record we're not - * going to use, discard the underlying blocks. - */ - if (vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove(session, - page, vpack, F_ISSET(r, WT_REC_EVICT))); - - switch (upd->type) { - case WT_UPDATE_MODIFY: - cbt->slot = WT_ROW_SLOT(page, rip); - WT_ERR(__wt_value_return_upd(session, cbt, upd, - F_ISSET(r, WT_REC_VISIBLE_ALL))); - WT_ERR(__rec_cell_build_val(session, r, - cbt->iface.value.data, - cbt->iface.value.size, (uint64_t)0)); - dictionary = true; - break; - case WT_UPDATE_STANDARD: - /* - * If no value, nothing needs to be copied. - * Otherwise, build the value's chunk from the - * update value. - */ - if (upd->size == 0) { - val->buf.data = NULL; - val->cell_len = - val->len = val->buf.size = 0; - } else { - WT_ERR(__rec_cell_build_val(session, r, - upd->data, upd->size, - (uint64_t)0)); - dictionary = true; - } - break; - case WT_UPDATE_TOMBSTONE: - /* - * If this key/value pair was deleted, we're - * done. - * - * Overflow keys referencing discarded values - * are no longer useful, discard the backing - * blocks. Don't worry about reuse, reusing - * keys from a row-store page reconciliation - * seems unlikely enough to ignore. - */ - if (kpack != NULL && kpack->ovfl && - kpack->raw != WT_CELL_KEY_OVFL_RM) { - /* - * Keys are part of the name-space, we - * can't remove them from the in-memory - * tree; if an overflow key was deleted - * without being instantiated (for - * example, cursor-based truncation), do - * it now. - */ - if (ikey == NULL) - WT_ERR(__wt_row_leaf_key( - session, - page, rip, tmpkey, true)); - - WT_ERR(__wt_ovfl_discard_add( - session, page, kpack->cell)); - } - - /* - * We aren't actually creating the key so we - * can't use bytes from this key to provide - * prefix information for a subsequent key. - */ - tmpkey->size = 0; - - /* Proceed with appended key/value pairs. 
*/ - goto leaf_insert; - WT_ILLEGAL_VALUE_ERR(session, upd->type); - } - } - - /* - * Build key cell. - * - * If the key is an overflow key that hasn't been removed, use - * the original backing blocks. - */ - key_onpage_ovfl = kpack != NULL && - kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM; - if (key_onpage_ovfl) { - key->buf.data = cell; - key->buf.size = __wt_cell_total_len(kpack); - key->cell_len = 0; - key->len = key->buf.size; - ovfl_key = true; - - /* - * We aren't creating a key so we can't use this key as - * a prefix for a subsequent key. - */ - tmpkey->size = 0; - - /* Track if page has overflow items. */ - r->ovfl_items = true; - } else { - /* - * Get the key from the page or an instantiated key, or - * inline building the key from a previous key (it's a - * fast path for simple, prefix-compressed keys), or by - * by building the key from scratch. - */ - if (__wt_row_leaf_key_info(page, copy, - NULL, &cell, &tmpkey->data, &tmpkey->size)) - goto build; - - kpack = &_kpack; - __wt_cell_unpack(cell, kpack); - if (btree->huffman_key == NULL && - kpack->type == WT_CELL_KEY && - tmpkey->size >= kpack->prefix) { - /* - * The previous clause checked for a prefix of - * zero, which means the temporary buffer must - * have a non-zero size, and it references a - * valid key. - */ - WT_ASSERT(session, tmpkey->size != 0); - - /* - * Grow the buffer as necessary, ensuring data - * data has been copied into local buffer space, - * then append the suffix to the prefix already - * in the buffer. - * - * Don't grow the buffer unnecessarily or copy - * data we don't need, truncate the item's data - * length to the prefix bytes. 
- */ - tmpkey->size = kpack->prefix; - WT_ERR(__wt_buf_grow(session, - tmpkey, tmpkey->size + kpack->size)); - memcpy((uint8_t *)tmpkey->mem + tmpkey->size, - kpack->data, kpack->size); - tmpkey->size += kpack->size; - } else - WT_ERR(__wt_row_leaf_key_copy( - session, page, rip, tmpkey)); -build: - WT_ERR(__rec_cell_build_leaf_key(session, r, - tmpkey->data, tmpkey->size, &ovfl_key)); - } - - /* Boundary: split or write the page. */ - if (__rec_need_split(r, key->len + val->len)) { - /* - * If we copied address blocks from the page rather than - * building the actual key, we have to build the key now - * because we are about to promote it. - */ - if (key_onpage_ovfl) { - WT_ERR(__wt_dsk_cell_data_ref(session, - WT_PAGE_ROW_LEAF, kpack, r->cur)); - WT_NOT_READ(key_onpage_ovfl, false); - } - - /* - * Turn off prefix compression until a full key written - * to the new page, and (unless already working with an - * overflow key), rebuild the key without compression. - */ - if (r->key_pfx_compress_conf) { - r->key_pfx_compress = false; - if (!ovfl_key) - WT_ERR(__rec_cell_build_leaf_key( - session, r, NULL, 0, &ovfl_key)); - } - - WT_ERR(__rec_split_crossing_bnd( - session, r, key->len + val->len)); - } - - /* Copy the key/value pair onto the page. */ - __rec_copy_incr(session, r, key); - if (val->len == 0) - r->any_empty_value = true; - else { - r->all_empty_value = false; - if (dictionary && btree->dictionary) - WT_ERR(__rec_dict_replace(session, r, 0, val)); - __rec_copy_incr(session, r, val); - } - - /* Update compression state. */ - __rec_key_state_update(r, ovfl_key); - -leaf_insert: /* Write any K/V pairs inserted into the page after this key. */ - if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL) - WT_ERR(__rec_row_leaf_insert(session, r, ins)); - } - - /* Write the remnant page. 
*/ - ret = __rec_split_finish(session, r); - -err: __wt_scr_free(session, &tmpkey); - __wt_scr_free(session, &tmpval); - return (ret); -} - -/* - * __rec_row_leaf_insert -- - * Walk an insert chain, writing K/V pairs. - */ -static int -__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) -{ - WT_BTREE *btree; - WT_CURSOR_BTREE *cbt; - WT_KV *key, *val; - WT_UPDATE *upd; - bool ovfl_key, upd_saved; - - btree = S2BT(session); - cbt = &r->update_modify_cbt; - - key = &r->k; - val = &r->v; - - for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) { - WT_RET(__rec_txn_read( - session, r, ins, NULL, NULL, &upd_saved, &upd)); - - if (upd == NULL) { - /* - * If no update is visible but some were saved, check - * for splits. - */ - if (!upd_saved) - continue; - if (!__rec_need_split(r, WT_INSERT_KEY_SIZE(ins))) - continue; - - /* Copy the current key into place and then split. */ - WT_RET(__wt_buf_set(session, r->cur, - WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins))); - WT_RET(__rec_split_crossing_bnd( - session, r, WT_INSERT_KEY_SIZE(ins))); - - /* - * Turn off prefix and suffix compression until a full - * key is written into the new page. - */ - r->key_pfx_compress = r->key_sfx_compress = false; - continue; - } - - switch (upd->type) { - case WT_UPDATE_MODIFY: - /* - * Impossible slot, there's no backing on-page - * item. - */ - cbt->slot = UINT32_MAX; - WT_RET(__wt_value_return_upd( - session, cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL))); - WT_RET(__rec_cell_build_val(session, r, - cbt->iface.value.data, - cbt->iface.value.size, (uint64_t)0)); - break; - case WT_UPDATE_STANDARD: - if (upd->size == 0) - val->len = 0; - else - WT_RET(__rec_cell_build_val(session, - r, upd->data, upd->size, - (uint64_t)0)); - break; - case WT_UPDATE_TOMBSTONE: - continue; - WT_ILLEGAL_VALUE(session, upd->type); - } - - /* Build key cell. 
*/ - WT_RET(__rec_cell_build_leaf_key(session, r, - WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); - - /* Boundary: split or write the page. */ - if (__rec_need_split(r, key->len + val->len)) { - /* - * Turn off prefix compression until a full key written - * to the new page, and (unless already working with an - * overflow key), rebuild the key without compression. - */ - if (r->key_pfx_compress_conf) { - r->key_pfx_compress = false; - if (!ovfl_key) - WT_RET(__rec_cell_build_leaf_key( - session, r, NULL, 0, &ovfl_key)); - } - - WT_RET(__rec_split_crossing_bnd( - session, r, key->len + val->len)); - } - - /* Copy the key/value pair onto the page. */ - __rec_copy_incr(session, r, key); - if (val->len == 0) - r->any_empty_value = true; - else { - r->all_empty_value = false; - if (btree->dictionary) - WT_RET(__rec_dict_replace(session, r, 0, val)); - __rec_copy_incr(session, r, val); - } - - /* Update compression state. */ - __rec_key_state_update(r, ovfl_key); - } - - return (0); -} - -/* * __rec_split_discard -- * Discard the pages resulting from a previous split. */ @@ -5649,232 +2517,12 @@ __rec_las_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __rec_cell_build_int_key -- - * Process a key and return a WT_CELL structure and byte string to be - * stored on a row-store internal page. - */ -static int -__rec_cell_build_int_key(WT_SESSION_IMPL *session, - WT_RECONCILE *r, const void *data, size_t size, bool *is_ovflp) -{ - WT_BTREE *btree; - WT_KV *key; - - *is_ovflp = false; - - btree = S2BT(session); - - key = &r->k; - - /* Copy the bytes into the "current" and key buffers. */ - WT_RET(__wt_buf_set(session, r->cur, data, size)); - WT_RET(__wt_buf_set(session, &key->buf, data, size)); - - /* Create an overflow object if the data won't fit. 
*/ - if (size > btree->maxintlkey) { - WT_STAT_DATA_INCR(session, rec_overflow_key_internal); - - *is_ovflp = true; - return (__rec_cell_build_ovfl( - session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0)); - } - - key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size); - key->len = key->cell_len + key->buf.size; - - return (0); -} - -/* - * __rec_cell_build_leaf_key -- - * Process a key and return a WT_CELL structure and byte string to be - * stored on a row-store leaf page. - */ -static int -__rec_cell_build_leaf_key(WT_SESSION_IMPL *session, - WT_RECONCILE *r, const void *data, size_t size, bool *is_ovflp) -{ - WT_BTREE *btree; - WT_KV *key; - size_t pfx_max; - const uint8_t *a, *b; - uint8_t pfx; - - *is_ovflp = false; - - btree = S2BT(session); - - key = &r->k; - - pfx = 0; - if (data == NULL) - /* - * When data is NULL, our caller has a prefix compressed key - * they can't use (probably because they just crossed a split - * point). Use the full key saved when last called, instead. - */ - WT_RET(__wt_buf_set( - session, &key->buf, r->cur->data, r->cur->size)); - else { - /* - * Save a copy of the key for later reference: we use the full - * key for prefix-compression comparisons, and if we are, for - * any reason, unable to use the compressed key we generate. - */ - WT_RET(__wt_buf_set(session, r->cur, data, size)); - - /* - * Do prefix compression on the key. We know by definition the - * previous key sorts before the current key, which means the - * keys must differ and we just need to compare up to the - * shorter of the two keys. - */ - if (r->key_pfx_compress) { - /* - * We can't compress out more than 256 bytes, limit the - * comparison to that. 
- */ - pfx_max = UINT8_MAX; - if (size < pfx_max) - pfx_max = size; - if (r->last->size < pfx_max) - pfx_max = r->last->size; - for (a = data, b = r->last->data; pfx < pfx_max; ++pfx) - if (*a++ != *b++) - break; - - /* - * Prefix compression may cost us CPU and memory when - * the page is re-loaded, don't do it unless there's - * reasonable gain. - */ - if (pfx < btree->prefix_compression_min) - pfx = 0; - else - WT_STAT_DATA_INCRV( - session, rec_prefix_compression, pfx); - } - - /* Copy the non-prefix bytes into the key buffer. */ - WT_RET(__wt_buf_set( - session, &key->buf, (uint8_t *)data + pfx, size - pfx)); - } - - /* Optionally compress the key using the Huffman engine. */ - if (btree->huffman_key != NULL) - WT_RET(__wt_huffman_encode(session, btree->huffman_key, - key->buf.data, (uint32_t)key->buf.size, &key->buf)); - - /* Create an overflow object if the data won't fit. */ - if (key->buf.size > btree->maxleafkey) { - /* - * Overflow objects aren't prefix compressed -- rebuild any - * object that was prefix compressed. - */ - if (pfx == 0) { - WT_STAT_DATA_INCR(session, rec_overflow_key_leaf); - - *is_ovflp = true; - return (__rec_cell_build_ovfl( - session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0)); - } - return ( - __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp)); - } - - key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size); - key->len = key->cell_len + key->buf.size; - - return (0); -} - -/* - * __rec_cell_build_addr -- - * Process an address reference and return a cell structure to be stored - * on the page. 
- */ -static void -__rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, - const void *addr, size_t size, u_int cell_type, uint64_t recno) -{ - WT_KV *val; - - val = &r->v; - - WT_ASSERT(session, size != 0 || cell_type == WT_CELL_ADDR_DEL); - - /* - * We don't check the address size because we can't store an address on - * an overflow page: if the address won't fit, the overflow page's - * address won't fit either. This possibility must be handled by Btree - * configuration, we have to disallow internal page sizes that are too - * small with respect to the largest address cookie the underlying block - * manager might return. - */ - - /* - * We don't copy the data into the buffer, it's not necessary; just - * re-point the buffer's data/length fields. - */ - val->buf.data = addr; - val->buf.size = size; - val->cell_len = - __wt_cell_pack_addr(&val->cell, cell_type, recno, val->buf.size); - val->len = val->cell_len + val->buf.size; -} - -/* - * __rec_cell_build_val -- - * Process a data item and return a WT_CELL structure and byte string to - * be stored on the page. - */ -static int -__rec_cell_build_val(WT_SESSION_IMPL *session, - WT_RECONCILE *r, const void *data, size_t size, uint64_t rle) -{ - WT_BTREE *btree; - WT_KV *val; - - btree = S2BT(session); - - val = &r->v; - - /* - * We don't copy the data into the buffer, it's not necessary; just - * re-point the buffer's data/length fields. - */ - val->buf.data = data; - val->buf.size = size; - - /* Handle zero-length cells quickly. */ - if (size != 0) { - /* Optionally compress the data using the Huffman engine. */ - if (btree->huffman_value != NULL) - WT_RET(__wt_huffman_encode( - session, btree->huffman_value, - val->buf.data, (uint32_t)val->buf.size, &val->buf)); - - /* Create an overflow object if the data won't fit. 
*/ - if (val->buf.size > btree->maxleafvalue) { - WT_STAT_DATA_INCR(session, rec_overflow_value); - - return (__rec_cell_build_ovfl( - session, r, val, WT_CELL_VALUE_OVFL, rle)); - } - } - val->cell_len = __wt_cell_pack_data(&val->cell, rle, val->buf.size); - val->len = val->cell_len + val->buf.size; - - return (0); -} - -/* - * __rec_cell_build_ovfl -- + * __wt_rec_cell_build_ovfl -- * Store overflow items in the file, returning the address cookie. */ -static int -__rec_cell_build_ovfl(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_KV *kv, uint8_t type, uint64_t rle) +int +__wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_REC_KV *kv, uint8_t type, uint64_t rle) { WT_BM *bm; WT_BTREE *btree; @@ -5939,194 +2587,3 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session, err: __wt_scr_free(session, &tmp); return (ret); } - -/* - * __rec_dictionary_skip_search -- - * Search a dictionary skiplist. - */ -static WT_DICTIONARY * -__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash) -{ - WT_DICTIONARY **e; - int i; - - /* - * Start at the highest skip level, then go as far as possible at each - * level before stepping down to the next. - */ - for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { - if (*e == NULL) { /* Empty levels */ - --i; - --e; - continue; - } - - /* - * Return any exact matches: we don't care in what search level - * we found a match. - */ - if ((*e)->hash == hash) /* Exact match */ - return (*e); - if ((*e)->hash > hash) { /* Drop down a level */ - --i; - --e; - } else /* Keep going at this level */ - e = &(*e)->next[i]; - } - return (NULL); -} - -/* - * __rec_dictionary_skip_search_stack -- - * Search a dictionary skiplist, returning an insert/remove stack. 
- */ -static void -__rec_dictionary_skip_search_stack( - WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash) -{ - WT_DICTIONARY **e; - int i; - - /* - * Start at the highest skip level, then go as far as possible at each - * level before stepping down to the next. - */ - for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) - if (*e == NULL || (*e)->hash > hash) - stack[i--] = e--; /* Drop down a level */ - else - e = &(*e)->next[i]; /* Keep going at this level */ -} - -/* - * __rec_dictionary_skip_insert -- - * Insert an entry into the dictionary skip-list. - */ -static void -__rec_dictionary_skip_insert( - WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash) -{ - WT_DICTIONARY **stack[WT_SKIP_MAXDEPTH]; - u_int i; - - /* Insert the new entry into the skiplist. */ - __rec_dictionary_skip_search_stack(head, stack, hash); - for (i = 0; i < e->depth; ++i) { - e->next[i] = *stack[i]; - *stack[i] = e; - } -} - -/* - * __rec_dictionary_init -- - * Allocate and initialize the dictionary. - */ -static int -__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots) -{ - u_int depth, i; - - /* Free any previous dictionary. */ - __rec_dictionary_free(session, r); - - r->dictionary_slots = slots; - WT_RET(__wt_calloc(session, - r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary)); - for (i = 0; i < r->dictionary_slots; ++i) { - depth = __wt_skip_choose_depth(session); - WT_RET(__wt_calloc(session, 1, - sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *), - &r->dictionary[i])); - r->dictionary[i]->depth = depth; - } - return (0); -} - -/* - * __rec_dictionary_free -- - * Free the dictionary. - */ -static void -__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r) -{ - u_int i; - - if (r->dictionary == NULL) - return; - - /* - * We don't correct dictionary_slots when we fail during allocation, - * but that's OK, the value is either NULL or a memory reference to - * be free'd. 
- */ - for (i = 0; i < r->dictionary_slots; ++i) - __wt_free(session, r->dictionary[i]); - __wt_free(session, r->dictionary); -} - -/* - * __rec_dictionary_reset -- - * Reset the dictionary when reconciliation restarts and when crossing a - * page boundary (a potential split). - */ -static void -__rec_dictionary_reset(WT_RECONCILE *r) -{ - if (r->dictionary_slots) { - r->dictionary_next = 0; - memset(r->dictionary_head, 0, sizeof(r->dictionary_head)); - } -} - -/* - * __rec_dictionary_lookup -- - * Check the dictionary for a matching value on this page. - */ -static int -__rec_dictionary_lookup( - WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val, WT_DICTIONARY **dpp) -{ - WT_DICTIONARY *dp, *next; - uint64_t hash; - bool match; - - *dpp = NULL; - - /* Search the dictionary, and return any match we find. */ - hash = __wt_hash_fnv64(val->buf.data, val->buf.size); - for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash); - dp != NULL && dp->hash == hash; dp = dp->next[0]) { - WT_RET(__wt_cell_pack_data_match( - (WT_CELL *)((uint8_t *)r->cur_ptr->image.mem + dp->offset), - &val->cell, val->buf.data, &match)); - if (match) { - WT_STAT_DATA_INCR(session, rec_dictionary); - *dpp = dp; - return (0); - } - } - - /* - * We're not doing value replacement in the dictionary. We stop adding - * new entries if we run out of empty dictionary slots (but continue to - * use the existing entries). I can't think of any reason a leaf page - * value is more likely to be seen because it was seen more recently - * than some other value: if we find working sets where that's not the - * case, it shouldn't be too difficult to maintain a pointer which is - * the next dictionary slot to re-use. - */ - if (r->dictionary_next >= r->dictionary_slots) - return (0); - - /* - * Set the hash value, we'll add this entry into the dictionary when we - * write it into the page's disk image buffer (because that's when we - * know where on the page it will be written). 
- */ - next = r->dictionary[r->dictionary_next++]; - next->offset = 0; /* Not necessary, just cautious. */ - next->hash = hash; - __rec_dictionary_skip_insert(r->dictionary_head, next, hash); - *dpp = next; - return (0); -} diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 534d598b3f3..8160ef92bbd 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -781,6 +781,8 @@ static const char * const __stats_connection_desc[] = { "cache: cache overflow score", "cache: cache overflow table entries", "cache: cache overflow table insert calls", + "cache: cache overflow table max on-disk size", + "cache: cache overflow table on-disk size", "cache: cache overflow table remove calls", "cache: checkpoint blocked page eviction", "cache: eviction calls to get a page", @@ -1204,6 +1206,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing cache_lookaside_score */ /* not clearing cache_lookaside_entries */ stats->cache_lookaside_insert = 0; + /* not clearing cache_lookaside_ondisk_max */ + /* not clearing cache_lookaside_ondisk */ stats->cache_lookaside_remove = 0; stats->cache_eviction_checkpoint = 0; stats->cache_eviction_get_ref = 0; @@ -1616,6 +1620,10 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, cache_lookaside_entries); to->cache_lookaside_insert += WT_STAT_READ(from, cache_lookaside_insert); + to->cache_lookaside_ondisk_max += + WT_STAT_READ(from, cache_lookaside_ondisk_max); + to->cache_lookaside_ondisk += + WT_STAT_READ(from, cache_lookaside_ondisk); to->cache_lookaside_remove += WT_STAT_READ(from, cache_lookaside_remove); to->cache_eviction_checkpoint += diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 81bf2bdea4f..b21ccd355ce 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -485,7 +485,7 @@ 
__wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) if (cval.val) F_SET(txn, WT_TXN_IGNORE_PREPARE); - WT_RET(__wt_txn_parse_read_timestamp(session, cfg)); + WT_RET(__wt_txn_parse_read_timestamp(session, cfg, NULL)); return (0); } @@ -933,8 +933,12 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); - /* Transaction should not have updated any of the logged tables. */ - WT_ASSERT(session, txn->logrec == NULL); + /* + * A transaction should not have updated any of the logged tables, + * if debug mode logging is not turned on. + */ + if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE)) + WT_ASSERT(session, txn->logrec == NULL); WT_RET(__wt_txn_context_check(session, true)); diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index f55715eb91b..1b5beff581f 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -38,6 +38,7 @@ __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) if (cbt->ins == NULL) { session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; + WT_ASSERT(session, cbt->slot < page->entries); rip = &page->pg_row[cbt->slot]; WT_ASSERT(session, __wt_row_leaf_key(session, page, rip, &key, false) == 0); @@ -59,19 +60,15 @@ __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) * Log an operation for the current transaction. 
*/ static int -__txn_op_log(WT_SESSION_IMPL *session, - WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt) +__txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec, + WT_TXN_OP *op, WT_CURSOR_BTREE *cbt, uint32_t fileid) { WT_CURSOR *cursor; WT_ITEM value; WT_UPDATE *upd; uint64_t recno; - uint32_t fileid; cursor = &cbt->iface; - - fileid = op->btree->id; - upd = op->u.op_upd; value.data = upd->data; value.size = upd->size; @@ -210,7 +207,16 @@ __txn_logrec_init(WT_SESSION_IMPL *session) if (txn->logrec != NULL) return (0); - WT_ASSERT(session, txn->id != WT_TXN_NONE); + /* + * The only way we should ever get in here without a txn id is if we + * are recording diagnostic information. In that case, allocate an id. + */ + if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE) && + txn->id == WT_TXN_NONE) + WT_RET(__wt_txn_id_check(session)); + else + WT_ASSERT(session, txn->id != WT_TXN_NONE); + WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id)); WT_RET(__wt_logrec_alloc(session, header_size, &logrec)); @@ -233,6 +239,7 @@ err: __wt_logrec_free(session, &logrec); int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_ITEM *logrec; WT_TXN *txn; @@ -240,11 +247,13 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) uint32_t fileid; + conn = S2C(session); txn = &session->txn; - if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) || + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || F_ISSET(session, WT_SESSION_NO_LOGGING) || - F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING)) + (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) && + !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE))) return (0); /* We'd better have a transaction. 
*/ @@ -255,6 +264,14 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) op = txn->mod + txn->mod_count - 1; fileid = op->btree->id; + /* + * If this operation is diagnostic only, set the ignore bit on the + * fileid so that recovery can skip it. + */ + if (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) && + FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE)) + FLD_SET(fileid, WT_LOGOP_IGNORE); + WT_RET(__txn_logrec_init(session)); logrec = txn->logrec; @@ -267,7 +284,7 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) break; case WT_TXN_OP_BASIC_COL: case WT_TXN_OP_BASIC_ROW: - ret = __txn_op_log(session, logrec, op, cbt); + ret = __txn_op_log(session, logrec, op, cbt, fileid); break; case WT_TXN_OP_TRUNCATE_COL: ret = __wt_logop_col_truncate_pack(session, logrec, fileid, @@ -366,6 +383,47 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session, } /* + * __wt_txn_ts_log -- + * Write a log record recording timestamps in the transaction. + */ +int +__wt_txn_ts_log(WT_SESSION_IMPL *session) +{ + struct timespec t; + WT_CONNECTION_IMPL *conn; + WT_ITEM *logrec; + WT_TXN *txn; + wt_timestamp_t commit, durable, first, prepare, read; + + conn = S2C(session); + txn = &session->txn; + + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || + F_ISSET(session, WT_SESSION_NO_LOGGING) || + !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE)) + return (0); + + /* We'd better have a transaction running. 
*/ + WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); + + WT_RET(__txn_logrec_init(session)); + logrec = txn->logrec; + commit = durable = first = prepare = read = WT_TS_NONE; + if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { + commit = txn->commit_timestamp; + first = txn->first_commit_timestamp; + } + prepare = txn->prepare_timestamp; + if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) + read = txn->read_timestamp; + + __wt_epoch(session, &t); + return (__wt_logop_txn_timestamp_pack(session, logrec, + (uint64_t)t.tv_sec, (uint64_t)t.tv_nsec, + commit, durable, first, prepare, read)); +} + +/* * __wt_txn_checkpoint_log -- * Write a log record for a checkpoint operation. */ diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 702196d17ee..b0960deb9c3 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -51,6 +51,11 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, c = NULL; /* + * File ids with the bit set to ignore this operation are skipped. + */ + if (WT_LOGOP_IS_IGNORED(id)) + return (0); + /* * Metadata operations have an id of 0. Match operations based * on the id and the current pass of recovery for metadata. * @@ -115,7 +120,8 @@ __txn_op_apply( WT_DECL_RET; WT_ITEM key, start_key, stop_key, value; WT_SESSION_IMPL *session; - uint64_t recno, start_recno, stop_recno; + wt_timestamp_t commit, durable, first, prepare, read; + uint64_t recno, start_recno, stop_recno, t_nsec, t_sec; uint32_t fileid, mode, optype, opsize; session = r->session; @@ -125,6 +131,16 @@ __txn_op_apply( WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize)); end = *pp + opsize; + /* + * If it is an operation type that should be ignored, we're done. + * Note that file ids within known operations also use the same + * macros to indicate that operation should be ignored. 
+ */ + if (WT_LOGOP_IS_IGNORED(optype)) { + *pp += opsize; + goto done; + } + switch (optype) { case WT_LOGOP_COL_MODIFY: WT_ERR(__wt_logop_col_modify_unpack(session, pp, end, @@ -266,10 +282,20 @@ __txn_op_apply( WT_TRET(stop->close(stop)); WT_ERR(ret); break; + case WT_LOGOP_TXN_TIMESTAMP: + /* + * Timestamp records are informational only. We have to + * unpack it to properly move forward in the log record + * to the next operation, but otherwise ignore. + */ + WT_ERR(__wt_logop_txn_timestamp_unpack(session, pp, end, &t_sec, + &t_nsec, &commit, &durable, &first, &prepare, &read)); + break; WT_ILLEGAL_VALUE_ERR(session, optype); } +done: /* Reset the cursor so it doesn't block eviction. */ if (cursor != NULL) WT_ERR(cursor->reset(cursor)); diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index b50da548f71..5ae391127b5 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -667,8 +667,10 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_DECL_RET; WT_TXN *txn; wt_timestamp_t ts; + bool set_ts; txn = &session->txn; + set_ts = false; /* Look for a commit timestamp. */ ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval); @@ -678,6 +680,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_txn_parse_timestamp(session, "commit", &ts, &cval)); WT_RET(__wt_timestamp_validate(session, "commit", ts, &cval)); txn->commit_timestamp = ts; + set_ts = true; __wt_txn_set_commit_timestamp(session); } else /* @@ -687,7 +690,10 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_txn_context_prepare_check(session)); /* Look for a read timestamp. 
*/ - WT_RET(__wt_txn_parse_read_timestamp(session, cfg)); + WT_RET(__wt_txn_parse_read_timestamp(session, cfg, &set_ts)); + + if (set_ts) + WT_RET(__wt_txn_ts_log(session)); return (0); } @@ -775,7 +781,8 @@ __wt_txn_parse_prepare_timestamp( * Parse a request to set a transaction's read_timestamp. */ int -__wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) +__wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, + const char *cfg[], bool *set_tsp) { WT_CONFIG_ITEM cval; WT_TXN *txn; @@ -844,6 +851,8 @@ __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) round_to_oldest = false; } + if (set_tsp != NULL) + *set_tsp = true; __wt_txn_set_read_timestamp(session); __wt_readunlock(session, &txn_global->rwlock); if (round_to_oldest) { diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am index 362d0775a88..cb3e24b5177 100644 --- a/src/third_party/wiredtiger/test/csuite/Makefile.am +++ b/src/third_party/wiredtiger/test/csuite/Makefile.am @@ -127,6 +127,10 @@ test_wt4333_handle_locks_SOURCES = wt4333_handle_locks/main.c noinst_PROGRAMS += test_wt4333_handle_locks all_TESTS += test_wt4333_handle_locks +test_wt4803_cache_overflow_abort_SOURCES = wt4803_cache_overflow_abort/main.c +noinst_PROGRAMS += test_wt4803_cache_overflow_abort +all_TESTS += test_wt4803_cache_overflow_abort + # Run this during a "make check" smoke test. 
TESTS = $(all_TESTS) LOG_COMPILER = $(TEST_WRAPPER) diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c index bf59fe674a7..23e2ca3ebd9 100644 --- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c @@ -71,9 +71,10 @@ static char home[1024]; /* Program working dir */ #define SESSION_MAX (MAX_TH + 3 + MAX_TH * PREPARE_PCT) static const char * table_pfx = "table"; +static const char * const uri_collection = "collection"; static const char * const uri_local = "local"; static const char * const uri_oplog = "oplog"; -static const char * const uri_collection = "collection"; +static const char * const uri_shadow = "shadow"; static const char * const ckpt_file = "checkpoint_done"; @@ -82,9 +83,13 @@ static volatile uint64_t global_ts = 1; #define ENV_CONFIG_COMPAT ",compatibility=(release=\"2.9\")" #define ENV_CONFIG_DEF \ - "create,log=(archive=false,file_max=10M,enabled),session_max=%" PRIu32 + "cache_size=20M,create,log=(archive=true,file_max=10M,enabled)," \ + "debug_mode=(table_logging=true,checkpoint_retention=5)," \ + "statistics=(fast),statistics_log=(wait=1,json=true),session_max=%" PRIu32 #define ENV_CONFIG_TXNSYNC \ - "create,log=(archive=false,file_max=10M,enabled)," \ + "cache_size=20M,create,log=(archive=true,file_max=10M,enabled)," \ + "debug_mode=(table_logging=true,checkpoint_retention=5)," \ + "statistics=(fast),statistics_log=(wait=1,json=true)," \ "transaction_sync=(enabled,method=none),session_max=%" PRIu32 #define ENV_CONFIG_REC "log=(archive=false,recover=on)" @@ -225,7 +230,7 @@ static WT_THREAD_RET thread_run(void *arg) { FILE *fp; - WT_CURSOR *cur_coll, *cur_local, *cur_oplog; + WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_shadow; WT_ITEM data; WT_RAND_STATE rnd; WT_SESSION *prepared_session, *session; @@ -286,6 +291,15 @@ thread_run(void *arg) 
testutil_check(session->open_cursor(session, uri, NULL, NULL, &cur_coll)); testutil_check(__wt_snprintf( + uri, sizeof(uri), "%s:%s", table_pfx, uri_shadow)); + if (use_prep) + testutil_check(prepared_session->open_cursor(prepared_session, + uri, NULL, NULL, &cur_shadow)); + else + testutil_check(session->open_cursor(session, + uri, NULL, NULL, &cur_shadow)); + + testutil_check(__wt_snprintf( uri, sizeof(uri), "%s:%s", table_pfx, uri_local)); if (use_prep) testutil_check(prepared_session->open_cursor(prepared_session, @@ -315,7 +329,7 @@ thread_run(void *arg) if (use_ts) { testutil_check(pthread_rwlock_rdlock(&ts_lock)); - active_ts = __wt_atomic_addv64(&global_ts, 1); + active_ts = __wt_atomic_addv64(&global_ts, 2); testutil_check(__wt_snprintf(tscfg, sizeof(tscfg), "commit_timestamp=%" PRIx64, active_ts)); @@ -334,6 +348,7 @@ thread_run(void *arg) cur_coll->set_key(cur_coll, kname); cur_local->set_key(cur_local, kname); cur_oplog->set_key(cur_oplog, kname); + cur_shadow->set_key(cur_shadow, kname); /* * Put an informative string into the value so that it * can be viewed well in a binary dump. @@ -351,6 +366,20 @@ thread_run(void *arg) data.data = cbuf; cur_coll->set_value(cur_coll, &data); testutil_check(cur_coll->insert(cur_coll)); + cur_shadow->set_value(cur_shadow, &data); + if (use_ts) { + /* + * Change the timestamp in the middle of the + * transaction so that we simulate a secondary. 
+ */ + ++active_ts; + testutil_check(__wt_snprintf(tscfg, + sizeof(tscfg), "commit_timestamp=%" PRIx64, + active_ts)); + testutil_check(session->timestamp_transaction( + session, tscfg)); + } + testutil_check(cur_shadow->insert(cur_shadow)); data.size = __wt_random(&rnd) % MAX_VAL; data.data = obuf; cur_oplog->set_value(cur_oplog, &data); @@ -437,6 +466,10 @@ run_workload(uint32_t nth) testutil_check(session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)")); testutil_check(__wt_snprintf( + uri, sizeof(uri), "%s:%s", table_pfx, uri_shadow)); + testutil_check(session->create(session, uri, + "key_format=S,value_format=u,log=(enabled=false)")); + testutil_check(__wt_snprintf( uri, sizeof(uri), "%s:%s", table_pfx, uri_local)); testutil_check(session->create(session, uri, "key_format=S,value_format=u")); @@ -548,7 +581,7 @@ main(int argc, char *argv[]) FILE *fp; REPORT c_rep[MAX_TH], l_rep[MAX_TH], o_rep[MAX_TH]; WT_CONNECTION *conn; - WT_CURSOR *cur_coll, *cur_local, *cur_oplog; + WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_shadow; WT_RAND_STATE rnd; WT_SESSION *session; pid_t pid; @@ -725,6 +758,10 @@ main(int argc, char *argv[]) testutil_check(session->open_cursor(session, buf, NULL, NULL, &cur_coll)); testutil_check(__wt_snprintf( + buf, sizeof(buf), "%s:%s", table_pfx, uri_shadow)); + testutil_check(session->open_cursor(session, + buf, NULL, NULL, &cur_shadow)); + testutil_check(__wt_snprintf( buf, sizeof(buf), "%s:%s", table_pfx, uri_local)); testutil_check(session->open_cursor(session, buf, NULL, NULL, &cur_local)); @@ -798,13 +835,20 @@ main(int argc, char *argv[]) cur_coll->set_key(cur_coll, kname); cur_local->set_key(cur_local, kname); cur_oplog->set_key(cur_oplog, kname); + cur_shadow->set_key(cur_shadow, kname); /* * The collection table should always only have the - * data as of the checkpoint. + * data as of the checkpoint. The shadow table should + * always have the exact same data (or not) as the + * collection table. 
*/ if ((ret = cur_coll->search(cur_coll)) != 0) { if (ret != WT_NOTFOUND) testutil_die(ret, "search"); + if ((ret = cur_shadow->search(cur_shadow)) == 0) + testutil_die(ret, + "shadow search success"); + /* * If we don't find a record, the stable * timestamp written to our file better be @@ -841,7 +885,10 @@ main(int argc, char *argv[]) " > stable ts %" PRIu64 "\n", fname, key, stable_fp, stable_val); fatal = true; - } + } else if ((ret = cur_shadow->search(cur_shadow)) != 0) + /* Collection and shadow both have the data. */ + testutil_die(ret, "shadow search failure"); + /* * The local table should always have all data. */ diff --git a/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c b/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c new file mode 100644 index 00000000000..7d9b0baf132 --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c @@ -0,0 +1,239 @@ +/*- + * Public Domain 2014-2019 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "test_util.h" + +#include <signal.h> +#include <sys/wait.h> + +/* + * JIRA ticket reference: WT-4803 + * Test case description: This test is checking the functionality of the + * lookaside file_max configuration. When the size of the lookaside file exceeds + * this value, we expect to panic. + * Failure mode: If we receive a panic in the test cases we weren't expecting to + * and vice versa. + */ + +#define NUM_KEYS 2000 + +/* + * This is a global flag that should be set before running test_las_workload. + * It lets the child process know whether it should be expecting a panic or not + * so that it can adjust its exit code as needed. + */ +static bool expect_panic; + +static int +handle_message(WT_EVENT_HANDLER *handler, + WT_SESSION *session, int error, const char *message) +{ + WT_UNUSED(handler); + WT_UNUSED(session); + + (void)fprintf( + stderr, "%s: %s\n", message, session->strerror(session, error)); + + if (error == WT_PANIC && + strstr(message, "exceeds maximum size") != NULL) { + fprintf(stderr, "Got cache overflow error (expect_panic=%s)\n", + expect_panic ? "true" : "false"); + + /* + * If we're expecting a panic, exit with zero to indicate to the + * parent that this test was successful. + * + * If not, don't intercept. We'll naturally exit with non-zero + * if we're terminating due to panic. 
+ */ + if (expect_panic) + exit(EXIT_SUCCESS); + } + + return (0); +} + +static WT_EVENT_HANDLER event_handler = { + handle_message, + NULL, + NULL, + NULL +}; + +static void +las_workload(TEST_OPTS *opts, const char *las_file_max) +{ + WT_CURSOR *cursor; + WT_SESSION *other_session, *session; + int i; + char buf[WT_MEGABYTE], open_config[128]; + + testutil_check(__wt_snprintf(open_config, sizeof(open_config), + "create,cache_size=50MB,cache_overflow=(file_max=%s)", + las_file_max)); + + testutil_check(wiredtiger_open( + opts->home, &event_handler, open_config, &opts->conn)); + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &session)); + testutil_check( + session->create(session, opts->uri, "key_format=i,value_format=S")); + testutil_check( + session->open_cursor(session, opts->uri, NULL, NULL, &cursor)); + + memset(buf, 0xA, WT_MEGABYTE); + buf[WT_MEGABYTE - 1] = '\0'; + + /* Populate the table. */ + for (i = 0; i < NUM_KEYS; ++i) { + cursor->set_key(cursor, i); + cursor->set_value(cursor, buf); + testutil_check(cursor->insert(cursor)); + } + + /* + * Open a snapshot isolation transaction in another session. This forces + * the cache to retain all previous values. Then update all keys with a + * new value in the original session while keeping that snapshot + * transaction open. With the large value buffer, small cache and lots + * of keys, this will force a lot of lookaside usage. + * + * When the file_max setting is small, the maximum size should easily be + * reached and we should panic. When the maximum size is large or not + * set, then we should succeed. 
+ */ + testutil_check( + opts->conn->open_session(opts->conn, NULL, NULL, &other_session)); + testutil_check(other_session->begin_transaction( + other_session, "isolation=snapshot")); + + memset(buf, 0xB, WT_MEGABYTE); + buf[WT_MEGABYTE - 1] = '\0'; + + for (i = 0; i < NUM_KEYS; ++i) { + cursor->set_key(cursor, i); + cursor->set_value(cursor, buf); + testutil_check(cursor->update(cursor)); + } + + /* + * Cleanup. + * We do not get here when the file_max size is small because we will + * have already hit the maximum and exited. This code only executes on + * the successful path. + */ + testutil_check( + other_session->rollback_transaction(other_session, NULL)); + testutil_check(other_session->close(other_session, NULL)); + + testutil_check(cursor->close(cursor)); + testutil_check(session->close(session, NULL)); +} + +static int +test_las_workload(TEST_OPTS *opts, const char *las_file_max) +{ + pid_t pid; + int status; + + /* + * We're going to run this workload for different configurations of + * file_max. So clean out the work directory each time. + */ + testutil_make_work_dir(opts->home); + + /* + * Since it's possible that the workload will panic and abort, we will + * fork the process and execute the workload in the child process. + * + * This way, we can safely check the exit code of the child process and + * confirm that it is what we expected. + */ + pid = fork(); + if (pid < 0) + /* Failed fork. */ + testutil_die(errno, "fork"); + else if (pid == 0) { + /* Child process from here. */ + las_workload(opts, las_file_max); + + /* + * If we're expecting a panic during the workload, we shouldn't + * get to this point. Exit with non-zero to indicate to parent + * that we should fail this test. + */ + fprintf(stderr, + "Successfully completed workload (expect_panic=%s)\n", + expect_panic ? "true" : "false"); + + if (expect_panic) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); + } + + /* Parent process from here. 
*/ + if (waitpid(pid, &status, 0) == -1) + testutil_die(errno, "waitpid"); + + return (status); +} + +int +main(int argc, char **argv) +{ + TEST_OPTS opts; + + memset(&opts, 0x0, sizeof(opts)); + testutil_check(testutil_parse_opts(argc, argv, &opts)); + + /* + * The lookaside is unbounded. + * We don't expect any failure since we can use as much as needed. + */ + expect_panic = false; + testutil_check(test_las_workload(&opts, "0")); + + /* + * The lookaside is limited to 5GB. + * This is more than enough for this workload so we don't expect any + * failure. + */ + expect_panic = false; + testutil_check(test_las_workload(&opts, "5GB")); + + /* + * The lookaside is limited to 100MB. + * This is insufficient for this workload so we're expecting a failure. + */ + expect_panic = true; + testutil_check(test_las_workload(&opts, "100MB")); + + testutil_cleanup(&opts); + + return (0); +} diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index be111c6432c..01aff272320 100644 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -8,7 +8,7 @@ functions: command: git.get_project params: directory: wiredtiger - "fetch artifacts" : + "fetch artifacts" : - command: s3.get params: aws_key: ${aws_key} @@ -19,7 +19,7 @@ functions: "fetch mongo-tests repo" : command: shell.exec params: - script: | + script: | git clone https://github.com/wiredtiger/mongo-tests "compile wiredtiger": command: shell.exec @@ -35,7 +35,7 @@ functions: ./build_posix/reconf ${configure_env_vars|} ./configure --enable-diagnostic --enable-python --enable-zlib --enable-strict --enable-static --prefix=$(pwd)/LOCAL_INSTALL ${make_command|make} ${smp_command|} 2>&1 - + # On macOS, change the binary location with install_name_tool since DYLD_LIBRARY_PATH # appears not to work for dynamic modules loaded by python. For wt, the libtool generated # script has the wrong path for running on test machines. 
@@ -609,6 +609,20 @@ tasks: ${test_env_vars|} $(pwd)/test_wt4156_metadata_salvage 2>&1 + - name: csuite-wt4803-cache-overflow-abort-test + depends_on: + - name: compile + commands: + - func: "fetch artifacts" + - command: shell.exec + params: + working_dir: "wiredtiger/build_posix/test/csuite" + script: | + set -o errexit + set -o verbose + + ${test_env_vars|} $(pwd)/test_wt4803_cache_overflow_abort 2>&1 + - name: csuite-rwlock-test depends_on: - name: compile @@ -760,9 +774,9 @@ tasks: # Break out Python unit tests into multiple buckets/tasks based on test name and runtime # The test/suite/run.py script can work out test names by casting each command argument - # with "test_" prefix and "*.py" postfix. + # with "test_" prefix and "*.py" postfix. # - # One example: + # One example: # "test/suite/run.py [ab]" will be translated to testing "test_a*.py" and "test_b*.py" - name: unit-test-bucket00 @@ -934,18 +948,18 @@ tasks: - name: million-collection-test depends_on: [] - run_on: + run_on: - rhel62-large - commands: + commands: - func: "fetch mongo-tests repo" - command: shell.exec params: working_dir: mongo-tests - script: | + script: | set -o errexit set -o verbose ulimit -n 1000000 - ulimit -c unlimited + ulimit -c unlimited largescale/run-million-collection-test.sh . buildvariants: @@ -1051,4 +1065,3 @@ buildvariants: - name: make-check-test - name: unit-test - name: fops - diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode01.py b/src/third_party/wiredtiger/test/suite/test_debug_mode01.py new file mode 100644 index 00000000000..88ba81f9c1c --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_debug_mode01.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2019 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. 
+# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest + +# test_debug_mode01.py +# Test the debug mode settings. Test rollback_error in this one. +class test_debug_mode01(wttest.WiredTigerTestCase): + conn_config = 'log=(enabled=true),debug_mode=(rollback_error=5)' + session_config = 'isolation=snapshot' + uri = 'file:test_debug' + + entries = 22 + min_error = entries // 5 + + def rollback_error(self, val, insert=True): + keys = range(1, self.entries) + c = self.session.open_cursor(self.uri, None) + # We expect some operations to return an exception so we cannot + # use the simple 'c[k] = 1'. But we must explicitly set the key + # and value and then use the insert or update primitives. + # + # Look for a generic 'WT_ROLLBACK' string not the specific + # simulated reason string. 
+        msg = '/WT_ROLLBACK/'
+        rollback = 0
+        for k in keys:
+            self.session.begin_transaction()
+            c.set_key(k)
+            c.set_value(val)
+            # Execute the insert or update. It will return true if the simulated
+            # conflict exception is raised, false if no exception occurred.
+            if insert:
+                conflict = self.assertRaisesException(wiredtiger.WiredTigerError, \
+                    lambda:c.insert(), msg, True)
+            else:
+                conflict = self.assertRaisesException(wiredtiger.WiredTigerError, \
+                    lambda:c.update(), msg, True)
+
+            if conflict:
+                rollback += 1
+                self.pr("Key: " + str(k) + " Rolled back")
+                self.session.rollback_transaction()
+            else:
+                self.session.commit_transaction()
+        c.close()
+        return rollback
+
+    def test_rollback_error(self):
+        self.session.create(self.uri, 'key_format=i,value_format=i')
+        rollback = self.rollback_error(1)
+        rollback += self.rollback_error(2, False)
+        self.pr("Rollback: " + str(rollback))
+        self.pr("Minimum: " + str(self.min_error))
+        self.assertTrue(rollback >= self.min_error)
+
+    def test_rollback_error_off(self):
+        # The setting is added in to wiredtiger_open via the config above.
+        # Test that we can properly turn the setting off via reconfigure.
+        # There should then be no rollback errors.
+        self.conn.reconfigure("debug_mode=(rollback_error=0)")
+
+        self.session.create(self.uri, 'key_format=i,value_format=i')
+        rollback = self.rollback_error(1)
+        rollback += self.rollback_error(2)
+        self.assertTrue(rollback == 0)
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode02.py b/src/third_party/wiredtiger/test/suite/test_debug_mode02.py
new file mode 100644
index 00000000000..0452e60fbd1
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_debug_mode02.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import fnmatch, os, time, wiredtiger, wttest + +# test_debug_mode02.py +# Test the debug mode settings. Test checkpoint_retention use. +class test_debug_mode02(wttest.WiredTigerTestCase): + uri = 'file:test_debug' + + entries = 100 + loop = 0 + retain = 5 + log1 = 'WiredTigerLog.0000000001' + log2 = 'WiredTigerLog.0000000002' + + def conn_config(self): + return 'log=(enabled=true,file_max=100K),debug_mode=(checkpoint_retention=%d)' % self.retain + + def log_set(self): + logs = fnmatch.filter(os.listdir(self.home), "*gerLog*") + return set(logs) + + def check_archive(self, logfile): + archived = False + for i in range(1,90): + # Sleep and then see if archive ran. We do this in a loop + # for slow machines. Max out at 90 seconds. 
+            time.sleep(1.0)
+            if not os.path.exists(logfile):
+                archived = True
+                break
+        self.assertTrue(archived)
+
+    def advance_log_checkpoint(self):
+        # Advance the log file to the next file and write a checkpoint.
+        keys = range(1, self.entries)
+        cur_set = self.log_set()
+        c = self.session.open_cursor(self.uri, None)
+        new_set = cur_set
+        # Write data in small chunks until we switch log files.
+        while cur_set == new_set:
+            for k in keys:
+                c[k + (self.loop * self.entries)] = 1
+            self.loop += 1
+            new_set = self.log_set()
+        c.close()
+        # Write a checkpoint into the new log file.
+        self.session.checkpoint()
+
+    def test_checkpoint_retain(self):
+        self.session.create(self.uri, 'key_format=i,value_format=i')
+        # No log files should be archived while we have fewer than the
+        # retention number of logs. Make sure each iteration the new
+        # logs are a proper superset of the previous time.
+        for i in range(1, self.retain):
+            cur_set = self.log_set()
+            self.advance_log_checkpoint()
+            # We don't accommodate slow machines here because we don't expect
+            # the files to change and there is no way to know if archive ran
+            # otherwise.
+            time.sleep(1.0)
+            new_set = self.log_set()
+            self.assertTrue(new_set.issuperset(cur_set))
+
+        self.assertTrue(os.path.exists(self.log1))
+        self.advance_log_checkpoint()
+        self.check_archive(self.log1)
+
+    # Test that both zero and one archive as usual. And test reconfigure.
+    def test_checkpoint_retain_off(self):
+        self.conn.reconfigure("debug_mode=(checkpoint_retention=0)")
+        self.session.create(self.uri, 'key_format=i,value_format=i')
+
+        self.advance_log_checkpoint()
+        self.check_archive(self.log1)
+
+        self.conn.reconfigure("debug_mode=(checkpoint_retention=1)")
+        self.advance_log_checkpoint()
+        self.check_archive(self.log2)
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode03.py b/src/third_party/wiredtiger/test/suite/test_debug_mode03.py
new file mode 100644
index 00000000000..feb5c0d904a
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_debug_mode03.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+
+# test_debug_mode03.py
+# Test the debug mode settings. Test table_logging use.
+class test_debug_mode03(wttest.WiredTigerTestCase):
+    conn_config = 'log=(enabled=true,file_max=100K),debug_mode=(table_logging=true)'
+    uri = 'file:test_debug'
+    entries = 100
+    value = b'\x01\x02abcd\x03\x04'
+
+    def add_data(self):
+        # Add a binary value we can search for in the log.
+        keys = range(0, self.entries)
+        c = self.session.open_cursor(self.uri, None)
+        for k in keys:
+            c[k] = self.value
+        c.close()
+
+    def find_log_recs(self):
+        # Open a log cursor. We should find log records that have
+        # the value we inserted.
+        c = self.session.open_cursor("log:", None)
+        count = 0
+        while c.next() == 0:
+            # lsn.file, lsn.offset, opcount
+            keys = c.get_key()
+            # txnid, rectype, optype, fileid, logrec_key, logrec_value
+            values = c.get_value()
+            # Look for log records that have a key/value pair.
+            if values[4] != b'':
+                if self.value in values[5]:  # logrec_value
+                    count += 1
+        c.close()
+        return count
+
+    def test_table_logging(self):
+        self.session.create(self.uri, 'key_format=i,value_format=u,log=(enabled=false)')
+        self.add_data()
+        count = self.find_log_recs()
+        self.assertEqual(count, self.entries)
+
+    # Test that turning table_logging off via reconfigure writes no log records.
+ def test_table_logging_off(self): + self.conn.reconfigure("debug_mode=(table_logging=false)") + self.session.create(self.uri, 'key_format=i,value_format=u,log=(enabled=false)') + self.add_data() + count = self.find_log_recs() + self.assertEqual(count, 0) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode04.py b/src/third_party/wiredtiger/test/suite/test_debug_mode04.py new file mode 100644 index 00000000000..1f5429495e8 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_debug_mode04.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +# +# Public Domain 2034-2039 MongoDB, Inc. +# Public Domain 2008-2034 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest + +# test_debug_mode04.py +# Test the debug mode settings. 
Test eviction use. +class test_debug_mode04(wttest.WiredTigerTestCase): + conn_config = 'log=(enabled=true,file_max=100K),debug_mode=(eviction=true)' + uri = 'file:test_debug' + entries = 100 + value = b'\x01\x02abcd\x03\x04' + + def add_data(self): + keys = range(0, self.entries) + c = self.session.open_cursor(self.uri, None) + for k in keys: + c[k] = self.value + c.close() + + # Just test turning it on and off. There really isn't something + # specific to verify. + def test_table_logging(self): + self.session.create(self.uri, 'key_format=i,value_format=u') + self.add_data() + + def test_table_logging_off(self): + self.conn.reconfigure("debug_mode=(eviction=false)") + self.session.create(self.uri, 'key_format=i,value_format=u') + self.add_data() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_las04.py b/src/third_party/wiredtiger/test/suite/test_las04.py new file mode 100644 index 00000000000..9d35d3c17f3 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_las04.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2019 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_las04.py +# Test file_max configuration and reconfiguration for the lookaside table. +# + +import wiredtiger, wttest +from wtscenario import make_scenarios + +# Taken from src/include/misc.h. +WT_MB = 1048576 + +class test_las04(wttest.WiredTigerTestCase): + uri = 'table:las_04' + in_memory_values = [ + ('false', dict(in_memory=False)), + ('none', dict(in_memory=None)), + ('true', dict(in_memory=True)) + ] + init_file_max_values = [ + ('default', dict(init_file_max=None, init_stat_val=0)), + ('non-zero', dict(init_file_max='100MB', init_stat_val=(WT_MB * 100))), + ('zero', dict(init_file_max='0', init_stat_val=0)) + ] + reconfig_file_max_values = [ + ('non-zero', dict(reconfig_file_max='100MB', + reconfig_stat_val=(WT_MB * 100))), + ('too-low', dict(reconfig_file_max='99MB', reconfig_stat_val=None)), + ('zero', dict(reconfig_file_max='0', reconfig_stat_val=0)) + ] + scenarios = make_scenarios(init_file_max_values, reconfig_file_max_values, + in_memory_values) + + def conn_config(self): + config = 'statistics=(fast)' + if self.init_file_max is not None: + config += ',cache_overflow=(file_max={})'.format(self.init_file_max) + if self.in_memory is not None: + config += ',in_memory=' + ('true' if self.in_memory else 'false') + return config + + def get_stat(self, stat): + stat_cursor = self.session.open_cursor('statistics:') + val = stat_cursor[stat][2] + stat_cursor.close() + return val + + def test_las(self): + self.session.create(self.uri, 'key_format=S,value_format=S') + + if self.in_memory: + # For 
in-memory configurations, we simply ignore any lookaside + # related configuration. + self.assertEqual( + self.get_stat(wiredtiger.stat.conn.cache_lookaside_ondisk_max), + 0) + else: + self.assertEqual( + self.get_stat(wiredtiger.stat.conn.cache_lookaside_ondisk_max), + self.init_stat_val) + + reconfigure = lambda: self.conn.reconfigure( + 'cache_overflow=(file_max={})'.format(self.reconfig_file_max)) + + # We expect an error when the statistic value is None because the value + # is out of range. + if self.reconfig_stat_val is None: + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, reconfigure, '/below minimum/') + return + + reconfigure() + + if self.in_memory: + self.assertEqual( + self.get_stat(wiredtiger.stat.conn.cache_lookaside_ondisk_max), + 0) + else: + self.assertEqual( + self.get_stat(wiredtiger.stat.conn.cache_lookaside_ondisk_max), + self.reconfig_stat_val) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp17.py b/src/third_party/wiredtiger/test/suite/test_timestamp17.py new file mode 100644 index 00000000000..f03b002c0ed --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_timestamp17.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2019 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. 
We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_timestamp17.py +# Test unintended timestamp usage on an update and ensure behavior +# matches expectations. Additionally, move the timestamp to ensure +# that values read are still consistent after those timestamps are +# moved. +# + +import random +from suite_subprocess import suite_subprocess +import wiredtiger, wttest +from wtscenario import make_scenarios + +def timestamp_str(t): + return '%x' % t + +class test_timestamp17(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_timestamp17' + uri = 'table:' + tablename + session_config = 'isolation=snapshot' + + def test_inconsistent_timestamping(self): + self.session.create(self.uri, 'key_format=i,value_format=i') + self.session.begin_transaction() + cur1 = self.session.open_cursor(self.uri) + cur1[1] = 1 + self.session.commit_transaction('commit_timestamp=25') + + self.session.begin_transaction() + cur1[1] = 2 + self.session.commit_transaction('commit_timestamp=50') + + self.session.begin_transaction() + cur1[1] = 3 + self.session.commit_transaction('commit_timestamp=200') + + self.session.begin_transaction() + cur1.set_key(1) + cur1.remove() + self.session.commit_transaction('commit_timestamp=100') + + # Read before any updates and ensure we cannot find the key or value. 
+ self.session.begin_transaction('read_timestamp=20') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, wiredtiger.WT_NOTFOUND) + self.session.commit_transaction() + + # Read at 25 and we should see 1. + self.session.begin_transaction('read_timestamp=25') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, 0) + value1 = cur1.get_value() + self.session.commit_transaction() + self.assertEqual(1, value1) + + # Read at 50 and we should see 2. + self.session.begin_transaction('read_timestamp=50') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, 0) + value1 = cur1.get_value() + self.session.commit_transaction() + self.assertEqual(2, value1) + + # Read at 100 and we should not find anything. + self.session.begin_transaction('read_timestamp=100') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, wiredtiger.WT_NOTFOUND) + self.session.commit_transaction() + + # Read at 200 and we should still not find anything. + self.session.begin_transaction('read_timestamp=200') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, wiredtiger.WT_NOTFOUND) + self.session.commit_transaction() + + # Read at 300 for further validation. + self.session.begin_transaction('read_timestamp=300') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, wiredtiger.WT_NOTFOUND) + self.session.commit_transaction() + + # Move oldest timestamp forward and + # confirm we see the correct numbers. + self.conn.set_timestamp('oldest_timestamp=49') + + # Read at 49 and we should see 1. + self.session.begin_transaction('read_timestamp=49') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, 0) + value1 = cur1.get_value() + self.session.commit_transaction() + self.assertEqual(1, value1) + + self.conn.set_timestamp('oldest_timestamp=99') + + # Read at 99 and we should see 2. 
+ self.session.begin_transaction('read_timestamp=99') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, 0) + value1 = cur1.get_value() + self.session.commit_transaction() + self.assertEqual(2, value1) + + # Move oldest to the point at which we deleted. + self.conn.set_timestamp('oldest_timestamp=100') + + # Read at 100 and we should not find anything. + self.session.begin_transaction('read_timestamp=100') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, wiredtiger.WT_NOTFOUND) + self.session.commit_transaction() + + # Read at 200 and we should not find anything. + self.session.begin_transaction('read_timestamp=200') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, wiredtiger.WT_NOTFOUND) + self.session.commit_transaction() + + # Move oldest timestamp to 200 to ensure history + # works as expected and we do not see the value 3. + self.conn.set_timestamp('oldest_timestamp=200') + + self.session.begin_transaction('read_timestamp=200') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, wiredtiger.WT_NOTFOUND) + self.session.commit_transaction() + + self.session.begin_transaction('read_timestamp=250') + cur1.set_key(1) + search_success = cur1.search() + self.assertEqual(search_success, wiredtiger.WT_NOTFOUND) + self.session.commit_transaction() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/wttest.py b/src/third_party/wiredtiger/test/suite/wttest.py index c0b755d2230..045cac26552 100644 --- a/src/third_party/wiredtiger/test/suite/wttest.py +++ b/src/third_party/wiredtiger/test/suite/wttest.py @@ -508,8 +508,9 @@ class WiredTigerTestCase(unittest.TestCase): """ Like TestCase.assertRaises(), with some additional options. If the exceptionString argument is used, the exception's string - must match it. If optional is set, then no assertion occurs - if the exception doesn't occur. 
+ must match it, or its pattern if the string starts and ends with + a slash. If optional is set, then no assertion occurs if the + exception doesn't occur. Returns true if the assertion is raised. """ raised = False @@ -519,9 +520,19 @@ class WiredTigerTestCase(unittest.TestCase): if not isinstance(err, exceptionType): self.fail('Exception of incorrect type raised, got type: ' + \ str(type(err))) - if exceptionString != None and exceptionString != str(err): - self.fail('Exception with incorrect string raised, got: "' + \ - str(err) + '"') + if exceptionString != None: + # Match either a pattern or an exact string. + fail = False + self.pr('Expecting string msg: ' + exceptionString) + if len(exceptionString) > 2 and \ + exceptionString[0] == '/' and exceptionString[-1] == '/' : + if re.search(exceptionString[1:-1], str(err)) == None: + fail = True + elif exceptionString != str(err): + fail = True + if fail: + self.fail('Exception with incorrect string raised, got: "' + \ + str(err) + '" Expected: ' + exceptionString) raised = True if not raised and not optional: self.fail('no assertion raised') |