diff options
author | Luke Chen <luke.chen@mongodb.com> | 2019-04-16 14:36:51 +1000 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2019-04-16 14:45:56 +1000 |
commit | a654dcf592ea7ed65426a0de96b4079ff4fc6716 (patch) | |
tree | a5256edad1bb219e6af72fd7e7525f58e235a307 /src/third_party/wiredtiger/src | |
parent | 19b622ebfb42a525f38e278c09f440eb47b12f1e (diff) | |
download | mongo-a654dcf592ea7ed65426a0de96b4079ff4fc6716.tar.gz |
Import wiredtiger: 9416282c42d40328dfb7ff0f28831f639f98d3cb from branch mongodb-4.2
ref: 1768d66613..9416282c42
for: 4.1.11
WT-4317 Read checksum error in test_wt4156_metadata_salvage
WT-4579 Track the newest durable timestamp for each page
WT-4585 Add WT_WITH_HOTBACKUP_LOCK macro
WT-4598 Enable the assertion that the durable_timestamp is newer than or equals the commit timestamp.
WT-4640 Remove round_to_oldest in favour of roundup_timestamps
WT-4695 Python3: allow most tests to run with Python3 with small changes
WT-4696 Python3: change dist scripts to run under Python3
WT-4698 Python3: fix modify related tests
WT-4699 Python3: fix test_jsondump02.py
WT-4700 Python3: run with same source as Python2
WT-4703 Extend test/checkpoint to do removes and online checking
WT-4704 Add statistic tracking oldest active read timestamp
WT-4705 column-store no longer needs to handle WT_COL page offsets of 0
WT-4707 Failure in verifying cells with copied values
WT-4708 Coverity reported copy-paste error in WiredTiger error message
WT-4711 Python formatting errors reported while running "s_all"
WT-4714 Use the durable timestamp to determine if a page should stay dirty
WT-4724 Syntax error in wtperf_ckpt.sh when running 'dash' as default shell
Diffstat (limited to 'src/third_party/wiredtiger/src')
39 files changed, 807 insertions, 645 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c index 9edc4e0108b..55f9d4ca57c 100644 --- a/src/third_party/wiredtiger/src/block/block_write.c +++ b/src/third_party/wiredtiger/src/block/block_write.c @@ -43,10 +43,8 @@ __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len) * more targeted solution at some point. */ if (!conn->hot_backup) { - __wt_readlock(session, &conn->hot_backup_lock); - if (!conn->hot_backup) - ret = __wt_ftruncate(session, block->fh, len); - __wt_readunlock(session, &conn->hot_backup_lock); + WT_WITH_HOTBACKUP_READ_LOCK(session, + ret = __wt_ftruncate(session, block->fh, len), NULL); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index f504bdeddf4..6a85ccf6c17 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -244,8 +244,7 @@ restart_read: /* Find the matching WT_COL slot. */ * information. */ if (cbt->cip_saved != cip) { - if ((cell = WT_COL_PTR(page, cip)) == NULL) - continue; + cell = WT_COL_PTR(page, cip); __wt_cell_unpack(session, page, cell, &unpack); if (unpack.type == WT_CELL_DEL) { if ((rle = __wt_cell_rle(&unpack)) == 1) diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 22effc47553..1b8df0008b9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -390,8 +390,7 @@ restart_read: /* Find the matching WT_COL slot. */ * information. 
*/ if (cbt->cip_saved != cip) { - if ((cell = WT_COL_PTR(page, cip)) == NULL) - continue; + cell = WT_COL_PTR(page, cip); __wt_cell_unpack(session, page, cell, &unpack); if (unpack.type == WT_CELL_DEL) { if (__wt_cell_rle(&unpack) == 1) diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index a6645608150..e75432f7836 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -328,8 +328,8 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *valid) * when read. */ cip = &page->pg_var[cbt->slot]; - if ((cell = WT_COL_PTR(page, cip)) == NULL || - __wt_cell_type(cell) == WT_CELL_DEL) + cell = WT_COL_PTR(page, cip); + if (__wt_cell_type(cell) == WT_CELL_DEL) return (0); break; case BTREE_ROW: diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 8d1ed01377c..86d00c18300 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -993,13 +993,9 @@ __debug_page_col_var(WT_DBG *ds, WT_REF *ref) recno = ref->ref_recno; WT_COL_FOREACH(page, cip, i) { - if ((cell = WT_COL_PTR(page, cip)) == NULL) { - unpack = NULL; - rle = 1; - } else { - __wt_cell_unpack(ds->session, page, cell, unpack); - rle = __wt_cell_rle(unpack); - } + cell = WT_COL_PTR(page, cip); + __wt_cell_unpack(ds->session, page, cell, unpack); + rle = __wt_cell_rle(unpack); WT_RET(__wt_snprintf( tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle)); WT_RET( @@ -1339,7 +1335,8 @@ __debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack) case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: __wt_timestamp_to_string(unpack->oldest_start_ts, ts_string[0]); - __wt_timestamp_to_string(unpack->newest_start_ts, ts_string[1]); + __wt_timestamp_to_string( + unpack->newest_durable_ts, ts_string[1]); __wt_timestamp_to_string(unpack->newest_stop_ts, 
ts_string[2]); WT_RET(ds->f(ds, ", ts %s,%s,%s", ts_string[0], ts_string[1], ts_string[2])); diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c index 46dc96aedce..c04135ee82d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c +++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c @@ -57,9 +57,9 @@ __rebalance_discard(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs) * Add a new entry to the list of leaf pages. */ static int -__rebalance_leaf_append(WT_SESSION_IMPL *session, - const uint8_t *key, size_t key_len, - WT_CELL_UNPACK *unpack, WT_REBALANCE_STUFF *rs) +__rebalance_leaf_append(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, + const uint8_t *key, size_t key_len, WT_CELL_UNPACK *unpack, + WT_REBALANCE_STUFF *rs) { WT_ADDR *copy_addr; WT_REF *copy; @@ -80,7 +80,7 @@ __rebalance_leaf_append(WT_SESSION_IMPL *session, WT_RET(__wt_calloc_one(session, &copy_addr)); copy->addr = copy_addr; copy_addr->oldest_start_ts = unpack->oldest_start_ts; - copy_addr->newest_start_ts = unpack->newest_start_ts; + copy_addr->newest_durable_ts = durable_ts; copy_addr->newest_stop_ts = unpack->newest_stop_ts; WT_RET(__wt_memdup( session, unpack->data, unpack->size, &copy_addr->addr)); @@ -194,8 +194,8 @@ __rebalance_free_original(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs) * Walk a column-store page and its descendants. */ static int -__rebalance_col_walk( - WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs) +__rebalance_col_walk(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, + const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs) { WT_BTREE *btree; WT_CELL_UNPACK unpack; @@ -221,7 +221,8 @@ __rebalance_col_walk( /* An internal page: read it and recursively walk it. 
*/ WT_ERR(__wt_bt_read( session, buf, unpack.data, unpack.size)); - WT_ERR(__rebalance_col_walk(session, buf->data, rs)); + WT_ERR(__rebalance_col_walk( + session, unpack.newest_durable_ts, buf->data, rs)); __wt_verbose(session, WT_VERB_REBALANCE, "free-list append internal page: %s", __wt_addr_string( @@ -232,7 +233,7 @@ __rebalance_col_walk( case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: WT_ERR(__rebalance_leaf_append( - session, NULL, 0, &unpack, rs)); + session, durable_ts, NULL, 0, &unpack, rs)); break; WT_ILLEGAL_VALUE_ERR(session, unpack.type); } @@ -273,8 +274,8 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session, * Walk a row-store page and its descendants. */ static int -__rebalance_row_walk( - WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs) +__rebalance_row_walk(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts, + const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs) { WT_BTREE *btree; WT_CELL_UNPACK key, unpack; @@ -347,7 +348,8 @@ __rebalance_row_walk( /* Read and recursively walk the page. */ WT_ERR(__wt_bt_read( session, buf, unpack.data, unpack.size)); - WT_ERR(__rebalance_row_walk(session, buf->data, rs)); + WT_ERR(__rebalance_row_walk( + session, unpack.newest_durable_ts, buf->data, rs)); break; case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: @@ -376,7 +378,7 @@ __rebalance_row_walk( len = key.size; } WT_ERR(__rebalance_leaf_append( - session, p, len, &unpack, rs)); + session, durable_ts, p, len, &unpack, rs)); first_cell = false; break; @@ -399,17 +401,19 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_BTREE *btree; WT_DECL_RET; WT_REBALANCE_STUFF *rs, _rstuff; + WT_REF *ref; WT_UNUSED(cfg); btree = S2BT(session); + ref = &btree->root; /* * If the tree has never been written to disk, we're done, rebalance * walks disk images, not in-memory pages. For the same reason, the * tree has to be clean. 
*/ - if (btree->root.page->dsk == NULL) + if (ref->page->dsk == NULL) return (0); if (btree->modified) WT_RET_MSG(session, EINVAL, @@ -422,17 +426,22 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_scr_alloc(session, 0, &rs->tmp2)); /* Set the internal page tree type. */ - rs->type = btree->root.page->type; + rs->type = ref->page->type; - /* Recursively walk the tree. */ + /* + * Recursively walk the tree. We start with a durable timestamp, but + * it should never be used (we'll accumulate durable timestamps from + * all the internal pages in our final write), so set it to something + * impossible. + */ switch (rs->type) { case WT_PAGE_ROW_INT: - WT_ERR( - __rebalance_row_walk(session, btree->root.page->dsk, rs)); + WT_ERR(__rebalance_row_walk( + session, WT_TS_MAX, ref->page->dsk, rs)); break; case WT_PAGE_COL_INT: - WT_ERR( - __rebalance_col_walk(session, btree->root.page->dsk, rs)); + WT_ERR(__rebalance_col_walk( + session, WT_TS_MAX, ref->page->dsk, rs)); break; WT_ILLEGAL_VALUE_ERR(session, rs->type); } @@ -450,8 +459,8 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) * Swap the old root page for our newly built root page, writing the new * root page as part of a checkpoint will finish the rebalance. */ - __wt_page_out(session, &btree->root.page); - btree->root.page = rs->root; + __wt_page_out(session, &ref->page); + ref->page = rs->root; rs->root = NULL; err: /* Discard any leftover root page we created. 
*/ diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index a03cfb6405d..08f7c424d6c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -75,7 +75,6 @@ struct __wt_track { #define trk_addr shared->addr.addr #define trk_addr_size shared->addr.size #define trk_oldest_start_ts shared->addr.oldest_start_ts -#define trk_newest_start_ts shared->addr.newest_start_ts #define trk_newest_stop_ts shared->addr.newest_stop_ts #define trk_gen shared->gen #define trk_ovfl_addr shared->ovfl_addr @@ -505,10 +504,9 @@ __slvg_trk_init(WT_SESSION_IMPL *session, trk->trk_addr_size = (uint8_t)addr_size; trk->trk_size = dsk->mem_size; trk->trk_oldest_start_ts = WT_TS_MAX; - trk->trk_newest_start_ts = trk->trk_newest_stop_ts = WT_TS_NONE; + trk->trk_newest_stop_ts = WT_TS_NONE; if (!__wt_process.page_version_ts || dsk->type == WT_PAGE_COL_FIX) { - trk->trk_oldest_start_ts = - trk->trk_newest_start_ts = WT_TS_NONE; + trk->trk_oldest_start_ts = WT_TS_NONE; trk->trk_newest_stop_ts = WT_TS_MAX; } trk->trk_gen = dsk->write_gen; @@ -665,8 +663,6 @@ __slvg_trk_leaf_ts(WT_TRACK *trk, WT_CELL_UNPACK *unpack) { trk->trk_oldest_start_ts = WT_MIN(unpack->start_ts, trk->trk_oldest_start_ts); - trk->trk_newest_start_ts = - WT_MAX(unpack->start_ts, trk->trk_newest_start_ts); trk->trk_newest_stop_ts = WT_MAX(unpack->stop_ts, trk->trk_newest_stop_ts); } @@ -1070,8 +1066,6 @@ merge: */ a_trk->trk_oldest_start_ts = b_trk->trk_oldest_start_ts = WT_MIN(a_trk->trk_oldest_start_ts, b_trk->trk_oldest_start_ts); - a_trk->trk_newest_start_ts = b_trk->trk_newest_start_ts = - WT_MAX(a_trk->trk_newest_start_ts, b_trk->trk_newest_start_ts); a_trk->trk_newest_stop_ts = b_trk->trk_newest_stop_ts = WT_MAX(a_trk->trk_newest_stop_ts, b_trk->trk_newest_stop_ts); __wt_verbose(session, WT_VERB_SALVAGE, @@ -1203,9 +1197,13 @@ __slvg_col_build_internal( ref->home = page; ref->page = NULL; + /* + * Salvage 
doesn't read tree internal pages, so all pages are + * immediately durable, regardless of the leaf page timestamps. + */ WT_ERR(__wt_calloc_one(session, &addr)); addr->oldest_start_ts = trk->trk_oldest_start_ts; - addr->newest_start_ts = trk->trk_newest_start_ts; + addr->newest_durable_ts = WT_TS_NONE; addr->newest_stop_ts = trk->trk_newest_stop_ts; WT_ERR(__wt_memdup( session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); @@ -1726,8 +1724,6 @@ merge: */ a_trk->trk_oldest_start_ts = b_trk->trk_oldest_start_ts = WT_MIN(a_trk->trk_oldest_start_ts, b_trk->trk_oldest_start_ts); - a_trk->trk_newest_start_ts = b_trk->trk_newest_start_ts = - WT_MAX(a_trk->trk_newest_start_ts, b_trk->trk_newest_start_ts); a_trk->trk_newest_stop_ts = b_trk->trk_newest_stop_ts = WT_MAX(a_trk->trk_newest_stop_ts, b_trk->trk_newest_stop_ts); __wt_verbose(session, WT_VERB_SALVAGE, @@ -1875,9 +1871,13 @@ __slvg_row_build_internal( ref->home = page; ref->page = NULL; + /* + * Salvage doesn't read tree internal pages, so all pages are + * immediately durable, regardless of the leaf page timestamps. 
+ */ WT_ERR(__wt_calloc_one(session, &addr)); addr->oldest_start_ts = trk->trk_oldest_start_ts; - addr->newest_start_ts = trk->trk_newest_start_ts; + addr->newest_durable_ts = WT_TS_NONE; addr->newest_stop_ts = trk->trk_newest_stop_ts; WT_ERR(__wt_memdup( session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index f0407ce71b1..127c307b9ab 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -264,7 +264,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, session, from_home, (WT_CELL *)ref_addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); addr->oldest_start_ts = unpack.oldest_start_ts; - addr->newest_start_ts = unpack.newest_start_ts; + addr->newest_durable_ts = unpack.newest_durable_ts; addr->newest_stop_ts = unpack.newest_stop_ts; WT_ERR(__wt_memdup( session, unpack.data, unpack.size, &addr->addr)); @@ -1675,7 +1675,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_RET(__wt_calloc_one(session, &addr)); ref->addr = addr; addr->oldest_start_ts = multi->addr.oldest_start_ts; - addr->newest_start_ts = multi->addr.newest_start_ts; + addr->newest_durable_ts = multi->addr.newest_durable_ts; addr->newest_stop_ts = multi->addr.newest_stop_ts; WT_RET(__wt_memdup(session, multi->addr.addr, multi->addr.size, &addr->addr)); diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index c201d9af73a..976a771a233 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -160,21 +160,18 @@ __stat_page_col_var( * we see. 
*/ WT_COL_FOREACH(page, cip, i) { - if ((cell = WT_COL_PTR(page, cip)) == NULL) { + cell = WT_COL_PTR(page, cip); + __wt_cell_unpack(session, page, cell, unpack); + if (unpack->type == WT_CELL_DEL) { orig_deleted = true; - ++deleted_cnt; + deleted_cnt += __wt_cell_rle(unpack); } else { orig_deleted = false; - __wt_cell_unpack(session, page, cell, unpack); - if (unpack->type == WT_CELL_DEL) - orig_deleted = true; - else { - entry_cnt += __wt_cell_rle(unpack); - rle_cnt += __wt_cell_rle(unpack) - 1; - } - if (unpack->ovfl) - ++ovfl_cnt; + entry_cnt += __wt_cell_rle(unpack); } + rle_cnt += __wt_cell_rle(unpack) - 1; + if (unpack->ovfl) + ++ovfl_cnt; /* * Walk the insert list, checking for changes. For each insert diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 1a412ace8f9..f85389bbe81 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -237,7 +237,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) */ memset(&addr_unpack, 0, sizeof(addr_unpack)); addr_unpack.oldest_start_ts = ckpt->oldest_start_ts; - addr_unpack.newest_start_ts = ckpt->newest_start_ts; + addr_unpack.newest_durable_ts = ckpt->newest_durable_ts; addr_unpack.newest_stop_ts = ckpt->newest_stop_ts; addr_unpack.raw = WT_CELL_ADDR_INT; @@ -331,14 +331,9 @@ __verify_addr_ts(WT_SESSION_IMPL *session, "internal page reference at %s has a newest stop " "timestamp of 0", __wt_page_addr_string(session, ref, vs->tmp1)); - if (unpack->oldest_start_ts > unpack->newest_start_ts) + if (unpack->oldest_start_ts > unpack->newest_stop_ts) WT_RET_MSG(session, WT_ERROR, "internal page reference at %s has an oldest start " - "timestamp newer than its newest start timestamp", - __wt_page_addr_string(session, ref, vs->tmp1)); - if (unpack->newest_start_ts > unpack->newest_stop_ts) - WT_RET_MSG(session, WT_ERROR, - "internal page reference at %s has a newest start " "timestamp newer than 
its newest stop timestamp", __wt_page_addr_string(session, ref, vs->tmp1)); return (0); @@ -448,13 +443,11 @@ recno_chk: if (recno != vs->record_total + 1) break; case WT_PAGE_COL_VAR: recno = 0; - WT_COL_FOREACH(page, cip, i) - if ((cell = WT_COL_PTR(page, cip)) == NULL) - ++recno; - else { - __wt_cell_unpack(session, page, cell, unpack); - recno += __wt_cell_rle(unpack); - } + WT_COL_FOREACH(page, cip, i) { + cell = WT_COL_PTR(page, cip); + __wt_cell_unpack(session, page, cell, unpack); + recno += __wt_cell_rle(unpack); + } vs->record_total += recno; break; } @@ -745,7 +738,7 @@ __verify_ts_addr_cmp(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t cell_num, bool gt, WT_VSTUFF *vs) { const char *ts1_bp, *ts2_bp; - char ts1_buf[32], ts2_buf[32]; + char ts_string[2][WT_TS_INT_STRING_SIZE]; if (gt && ts1 >= ts2) return (0); @@ -760,9 +753,8 @@ __verify_ts_addr_cmp(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t cell_num, ts1_bp = "WT_TS_NONE"; break; default: - WT_RET( - __wt_snprintf(ts1_buf, sizeof(ts1_buf), "%" PRIu64, ts1)); - ts1_bp = ts1_buf; + __wt_timestamp_to_string(ts1, ts_string[0]); + ts1_bp = ts_string[0]; break; } switch (ts2) { @@ -773,14 +765,13 @@ __verify_ts_addr_cmp(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t cell_num, ts2_bp = "WT_TS_NONE"; break; default: - WT_RET( - __wt_snprintf(ts2_buf, sizeof(ts2_buf), "%" PRIu64, ts2)); - ts2_bp = ts2_buf; + __wt_timestamp_to_string(ts2, ts_string[1]); + ts2_bp = ts_string[1]; break; } WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s failed verification with %s " - "time of %s, %s the parent's %s time of %s", + "timestamp of %s, %s the parent's %s timestamp of %s", cell_num, __wt_page_addr_string(session, ref, vs->tmp1), ts1_name, ts1_bp, @@ -801,6 +792,7 @@ __verify_page_cell(WT_SESSION_IMPL *session, WT_DECL_RET; const WT_PAGE_HEADER *dsk; uint32_t cell_num; + char ts_string[2][WT_TS_INT_STRING_SIZE]; bool found_ovfl; /* @@ -851,30 +843,27 @@ __verify_page_cell(WT_SESSION_IMPL *session, 
cell_num - 1, __wt_page_addr_string( session, ref, vs->tmp1)); - if (unpack.oldest_start_ts > unpack.newest_start_ts) + if (unpack.oldest_start_ts > unpack.newest_stop_ts) { + __wt_timestamp_to_string( + unpack.oldest_start_ts, ts_string[0]); + __wt_timestamp_to_string( + unpack.newest_stop_ts, ts_string[1]); WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has an " - "oldest start timestamp newer than its " - "newest start timestamp", - cell_num - 1, - __wt_page_addr_string( - session, ref, vs->tmp1)); - if (unpack.newest_start_ts > unpack.newest_stop_ts) - WT_RET_MSG(session, WT_ERROR, - "cell %" PRIu32 " on page at %s has a " - "newest start timestamp newer than its " - "newest stop timestamp", + "oldest start timestamp %s newer than " + "its newest stop timestamp %s", cell_num - 1, - __wt_page_addr_string( - session, ref, vs->tmp1)); + __wt_page_addr_string(session, + ref, vs->tmp1), ts_string[0], ts_string[1]); + } WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "oldest start", unpack.oldest_start_ts, "oldest start", addr_unpack->oldest_start_ts, true, vs)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, - "newest start", unpack.newest_start_ts, - "newest start", addr_unpack->newest_start_ts, + "newest durable", unpack.newest_durable_ts, + "newest durable", addr_unpack->newest_durable_ts, false, vs)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "newest stop", unpack.newest_stop_ts, @@ -893,24 +882,25 @@ __verify_page_cell(WT_SESSION_IMPL *session, cell_num - 1, __wt_page_addr_string( session, ref, vs->tmp1)); - if (unpack.start_ts > unpack.stop_ts) + if (unpack.start_ts > unpack.stop_ts) { + __wt_timestamp_to_string( + unpack.start_ts, ts_string[0]); + __wt_timestamp_to_string( + unpack.stop_ts, ts_string[1]); WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s has a " - "start timestamp newer than its stop " - "timestamp ", + "start timestamp %s newer than its stop " + "timestamp %s", cell_num - 1, - 
__wt_page_addr_string( - session, ref, vs->tmp1)); + __wt_page_addr_string(session, + ref, vs->tmp1), ts_string[0], ts_string[1]); + } WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "start", unpack.start_ts, "oldest start", addr_unpack->oldest_start_ts, true, vs)); WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, - "start", unpack.start_ts, - "newest start", addr_unpack->newest_start_ts, - false, vs)); - WT_RET(__verify_ts_addr_cmp(session, ref, cell_num - 1, "stop", unpack.stop_ts, "newest stop", addr_unpack->newest_stop_ts, false, vs)); diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c index ee6cd904aec..24d6d22f1ef 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c @@ -222,7 +222,7 @@ __verify_dsk_ts_addr_cmp(WT_SESSION_IMPL *session, uint32_t cell_num, bool gt, const char *tag) { const char *ts1_bp, *ts2_bp; - char ts1_buf[32], ts2_buf[32]; + char ts_string[2][WT_TS_INT_STRING_SIZE]; if (gt && ts1 >= ts2) return (0); @@ -237,9 +237,8 @@ __verify_dsk_ts_addr_cmp(WT_SESSION_IMPL *session, uint32_t cell_num, ts1_bp = "WT_TS_NONE"; break; default: - WT_RET( - __wt_snprintf(ts1_buf, sizeof(ts1_buf), "%" PRIu64, ts1)); - ts1_bp = ts1_buf; + __wt_timestamp_to_string(ts1, ts_string[0]); + ts1_bp = ts_string[0]; break; } switch (ts2) { @@ -250,14 +249,13 @@ __verify_dsk_ts_addr_cmp(WT_SESSION_IMPL *session, uint32_t cell_num, ts2_bp = "WT_TS_NONE"; break; default: - WT_RET( - __wt_snprintf(ts2_buf, sizeof(ts2_buf), "%" PRIu64, ts2)); - ts2_bp = ts2_buf; + __wt_timestamp_to_string(ts2, ts_string[1]); + ts2_bp = ts_string[1]; break; } WT_RET_MSG(session, WT_ERROR, "cell %" PRIu32 " on page at %s failed verification with %s " - "time of %s, %s the parent's %s time of %s", + "timestamp of %s, %s the parent's %s timestamp of %s", cell_num, tag, ts1_name, ts1_bp, gt ? 
"less than" : "greater than", @@ -272,6 +270,8 @@ static int __verify_dsk_ts(WT_SESSION_IMPL *session, WT_CELL_UNPACK *unpack, uint32_t cell_num, WT_ADDR *addr, const char *tag) { + char ts_string[2][WT_TS_INT_STRING_SIZE]; + /* * Check timestamp order, and optionally, against a parent address. * Timestamps in the parent address aren't necessarily an exact match, @@ -291,17 +291,17 @@ __verify_dsk_ts(WT_SESSION_IMPL *session, "cell %" PRIu32 " on page at %s has a newest stop " "timestamp of 0", cell_num - 1, tag); - if (unpack->oldest_start_ts > unpack->newest_start_ts) + if (unpack->oldest_start_ts > unpack->newest_stop_ts) { + __wt_timestamp_to_string( + unpack->oldest_start_ts, ts_string[0]); + __wt_timestamp_to_string( + unpack->newest_stop_ts, ts_string[1]); WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has an oldest " - "start timestamp newer than its newest start " - "timestamp", - cell_num - 1, tag); - if (unpack->newest_start_ts > unpack->newest_stop_ts) - WT_RET_VRFY(session, - "cell %" PRIu32 " on page at %s has a newest start " - "timestamp newer than its newest stop timestamp", - cell_num - 1, tag); + "start timestamp %s newer than its newest stop " + "timestamp %s", + cell_num - 1, tag, ts_string[0], ts_string[1]); + } if (addr == NULL) break; @@ -310,8 +310,8 @@ __verify_dsk_ts(WT_SESSION_IMPL *session, "oldest start", addr->oldest_start_ts, true, tag)); WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, - "newest start", unpack->newest_start_ts, - "newest start", addr->newest_start_ts, + "newest durable", unpack->newest_durable_ts, + "newest durable", addr->newest_durable_ts, false, tag)); WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "newest stop", unpack->newest_stop_ts, @@ -329,11 +329,16 @@ __verify_dsk_ts(WT_SESSION_IMPL *session, "cell %" PRIu32 " on page at %s has a stop " "timestamp of 0", cell_num - 1, tag); - if (unpack->start_ts > unpack->stop_ts) + if (unpack->start_ts > unpack->stop_ts) { + __wt_timestamp_to_string( 
+ unpack->start_ts, ts_string[0]); + __wt_timestamp_to_string( + unpack->stop_ts, ts_string[0]); WT_RET_VRFY(session, "cell %" PRIu32 " on page at %s has a start " - "timestamp newer than its stop timestamp ", - cell_num - 1, tag); + "timestamp %s newer than its stop timestamp %s", + cell_num - 1, tag, ts_string[0], ts_string[1]); + } if (addr == NULL) break; @@ -342,10 +347,6 @@ __verify_dsk_ts(WT_SESSION_IMPL *session, "oldest start", addr->oldest_start_ts, true, tag)); WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, - "start", unpack->start_ts, - "newest start", addr->newest_start_ts, - false, tag)); - WT_RET(__verify_dsk_ts_addr_cmp(session, cell_num - 1, "stop", unpack->stop_ts, "newest stop", addr->newest_stop_ts, false, tag)); diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index 905c24dceae..06e0056613c 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -1156,12 +1156,9 @@ __wt_las_sweep(WT_SESSION_IMPL *session) (prepare_state != WT_PREPARE_INPROGRESS || durable_timestamp == 0)); - /* - * FIXME Disable this assertion until fixed by WT-4598. 
- * WT_ASSERT(session, - * (prepare_state == WT_PREPARE_INPROGRESS || - * durable_timestamp >= las_timestamp)); - */ + WT_ASSERT(session, + (prepare_state == WT_PREPARE_INPROGRESS || + durable_timestamp >= las_timestamp)); /* * There are several conditions that need to be met diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index 09a52300f3b..e56e7f29004 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -287,7 +287,6 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_begin_transaction[] = { { "name", "string", NULL, NULL, NULL, 0 }, { "priority", "int", NULL, "min=-100,max=100", NULL, 0 }, { "read_timestamp", "string", NULL, NULL, NULL, 0 }, - { "round_to_oldest", "boolean", NULL, NULL, NULL, 0 }, { "roundup_timestamps", "category", NULL, NULL, confchk_WT_SESSION_begin_transaction_roundup_timestamps_subconfigs, 2 }, @@ -533,7 +532,6 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_timestamp_transaction[] = { { "durable_timestamp", "string", NULL, NULL, NULL, 0 }, { "prepare_timestamp", "string", NULL, NULL, NULL, 0 }, { "read_timestamp", "string", NULL, NULL, NULL, 0 }, - { "round_to_oldest", "boolean", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -1371,9 +1369,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { "WT_SESSION.begin_transaction", "ignore_prepare=false,isolation=,name=,priority=0,read_timestamp=" - ",round_to_oldest=false,roundup_timestamps=(prepared=false," - "read=false),snapshot=,sync=", - confchk_WT_SESSION_begin_transaction, 9 + ",roundup_timestamps=(prepared=false,read=false),snapshot=,sync=", + confchk_WT_SESSION_begin_transaction, 8 }, { "WT_SESSION.checkpoint", "drop=,force=false,name=,target=,use_timestamp=true", @@ -1482,8 +1479,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { "WT_SESSION.timestamp_transaction", 
"commit_timestamp=,durable_timestamp=,prepare_timestamp=," - "read_timestamp=,round_to_oldest=false", - confchk_WT_SESSION_timestamp_transaction, 5 + "read_timestamp=", + confchk_WT_SESSION_timestamp_transaction, 4 }, { "WT_SESSION.transaction_sync", "timeout_ms=1200000", diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 8bc111346c5..81f5724663f 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -343,6 +343,28 @@ __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg) } /* + * __log_archive_once_int -- + * Helper for __log_archive_once. Intended to be called while holding the + * hot backup read lock. + */ +static int +__log_archive_once_int(WT_SESSION_IMPL *session, + char **logfiles, u_int logcount, uint32_t min_lognum) +{ + uint32_t lognum; + u_int i; + + for (i = 0; i < logcount; i++) { + WT_RET(__wt_log_extract_lognum(session, logfiles[i], &lognum)); + if (lognum < min_lognum) + WT_RET(__wt_log_remove( + session, WT_LOG_FILENAME, lognum)); + } + + return (0); +} + +/* * __log_archive_once -- * Perform one iteration of log archiving. Must be called with the * log archive lock held. @@ -353,15 +375,13 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; - uint32_t lognum, min_lognum; - u_int i, logcount; + uint32_t min_lognum; + u_int logcount; char **logfiles; - bool locked; conn = S2C(session); log = conn->log; logcount = 0; - locked = false; logfiles = NULL; /* @@ -386,22 +406,18 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) session, conn->log_path, WT_LOG_FILENAME, &logfiles, &logcount)); /* - * We can only archive files if a hot backup is not in progress or - * if we are the backup. + * If backup_file is non-zero we know we're coming from an incremental + * backup cursor. 
In that case just perform the archive operation + * without the lock. */ - __wt_readlock(session, &conn->hot_backup_lock); - locked = true; - if (!conn->hot_backup || backup_file != 0) { - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum( - session, logfiles[i], &lognum)); - if (lognum < min_lognum) - WT_ERR(__wt_log_remove( - session, WT_LOG_FILENAME, lognum)); - } - } - __wt_readunlock(session, &conn->hot_backup_lock); - locked = false; + if (backup_file != 0) + ret = __log_archive_once_int( + session, logfiles, logcount, min_lognum); + else + WT_WITH_HOTBACKUP_READ_LOCK(session, + ret = __log_archive_once_int( + session, logfiles, logcount, min_lognum), NULL); + WT_ERR(ret); /* * Indicate what is our new earliest LSN. It is the start @@ -411,8 +427,6 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) if (0) err: __wt_err(session, ret, "log archive server error"); - if (locked) - __wt_readunlock(session, &conn->hot_backup_lock); WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); return (ret); } @@ -594,18 +608,15 @@ __log_file_server(void *arg) * truncate: both are OK, it's just more work * during cursor traversal. */ - if (!conn->hot_backup) { - __wt_readlock( - session, &conn->hot_backup_lock); - if (!conn->hot_backup && - conn->log_cursors == 0) - WT_ERR_ERROR_OK( - __wt_ftruncate(session, + if (!conn->hot_backup && + conn->log_cursors == 0) { + WT_WITH_HOTBACKUP_READ_LOCK(session, + WT_ERR_ERROR_OK( + __wt_ftruncate( + session, close_fh, close_end_lsn.l.offset), - ENOTSUP); - __wt_readunlock( - session, &conn->hot_backup_lock); + ENOTSUP), NULL); } WT_SET_LSN(&close_end_lsn, close_end_lsn.l.file + 1, 0); @@ -976,11 +987,8 @@ __log_server(void *arg) * agreed not to rename or remove any files in * the database directory. 
*/ - __wt_readlock(session, &conn->hot_backup_lock); - if (!conn->hot_backup) - ret = __log_prealloc_once(session); - __wt_readunlock( - session, &conn->hot_backup_lock); + WT_WITH_HOTBACKUP_READ_LOCK(session, + ret = __log_prealloc_once(session), NULL); WT_ERR(ret); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c index 04882e527ce..9a279ca3970 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_backup.c +++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c @@ -265,10 +265,8 @@ __backup_start(WT_SESSION_IMPL *session, * operations will not see the backup file list until it is * complete and valid. */ - __wt_writelock(session, &conn->hot_backup_lock); - conn->hot_backup = true; - conn->hot_backup_list = NULL; - __wt_writeunlock(session, &conn->hot_backup_lock); + WT_WITH_HOTBACKUP_WRITE_LOCK(session, + WT_CONN_HOTBACKUP_START(conn)); /* We're the lock holder, we own cleanup. */ F_SET(cb, WT_CURBACKUP_LOCKER); @@ -368,9 +366,8 @@ err: /* Close the hot backup file. */ ret = __wt_sync_and_rename(session, &cb->bfs, WT_BACKUP_TMP, dest); if (ret == 0) { - __wt_writelock(session, &conn->hot_backup_lock); - conn->hot_backup_list = cb->list; - __wt_writeunlock(session, &conn->hot_backup_lock); + WT_WITH_HOTBACKUP_WRITE_LOCK(session, + conn->hot_backup_list = cb->list); F_SET(session, WT_SESSION_BACKUP_CURSOR); } /* @@ -399,18 +396,14 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) WT_ASSERT(session, !F_ISSET(cb, WT_CURBACKUP_DUP)); /* If it's not a dup backup cursor, make sure one isn't open. */ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_BACKUP_DUP)); - __wt_writelock(session, &conn->hot_backup_lock); - conn->hot_backup_list = NULL; - __wt_writeunlock(session, &conn->hot_backup_lock); + WT_WITH_HOTBACKUP_WRITE_LOCK(session, conn->hot_backup_list = NULL); __backup_free(session, cb); /* Remove any backup specific file. 
*/ WT_TRET(__wt_backup_file_remove(session)); /* Checkpoint deletion and next hot backup can proceed. */ - __wt_writelock(session, &conn->hot_backup_lock); - conn->hot_backup = false; - __wt_writeunlock(session, &conn->hot_backup_lock); + WT_WITH_HOTBACKUP_WRITE_LOCK(session, conn->hot_backup = false); F_CLR(session, WT_SESSION_BACKUP_CURSOR); return (ret); diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c index baabcd0182c..ee0d57037eb 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_index.c +++ b/src/third_party/wiredtiger/src/cursor/cur_index.c @@ -544,8 +544,8 @@ __wt_curindex_open(WT_SESSION_IMPL *session, WT_ERR(__curindex_open_colgroups(session, cindex, cfg)); if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) - __wt_json_column_init(cursor, uri, table->key_format, - &idx->colconf, &table->colconf); + WT_ERR(__wt_json_column_init(cursor, uri, table->key_format, + &idx->colconf, &table->colconf)); if (0) { err: WT_TRET(__curindex_close(cursor)); diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c index 21716005c27..f540775180e 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_json.c +++ b/src/third_party/wiredtiger/src/cursor/cur_json.c @@ -309,6 +309,8 @@ __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor) if ((json = (WT_CURSOR_JSON *)cursor->json_private) != NULL) { __wt_free(session, json->key_buf); __wt_free(session, json->value_buf); + __wt_free(session, json->key_names.str); + __wt_free(session, json->value_names.str); __wt_free(session, json); } } @@ -373,21 +375,26 @@ __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) * Set json_key_names, json_value_names to comma separated lists * of column names. 
*/ -void +int __wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) { WT_CURSOR_JSON *json; + WT_SESSION_IMPL *session; + size_t len; uint32_t keycnt, nkeys; const char *beginkey, *end, *lparen, *p; json = (WT_CURSOR_JSON *)cursor->json_private; + session = (WT_SESSION_IMPL *)cursor->session; beginkey = colconf->str; end = beginkey + colconf->len; if (idxconf != NULL) { - json->key_names.str = idxconf->str; - json->key_names.len = idxconf->len; + len = idxconf->len; + WT_RET(__wt_strndup(session, idxconf->str, len, + &json->key_names.str)); + json->key_names.len = len; } else if (colconf->len > 0 && *beginkey == '(') { beginkey++; if (end[-1] == ')') @@ -407,20 +414,25 @@ __wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, } if ((lparen = strchr(uri, '(')) != NULL) { /* This cursor is a projection. */ - json->value_names.str = lparen; - json->value_names.len = strlen(lparen) - 1; - WT_ASSERT((WT_SESSION_IMPL *)cursor->session, - json->value_names.str[json->value_names.len] == ')'); + len = strlen(lparen) - 1; + WT_ASSERT(session, lparen[len] == ')'); + WT_RET(__wt_strndup(session, lparen, len, + &json->value_names.str)); + json->value_names.len = len; } else { - json->value_names.str = p; - json->value_names.len = WT_PTRDIFF(end, p); + len = WT_PTRDIFF(end, p); + WT_RET(__wt_strndup(session, p, len, &json->value_names.str)); + json->value_names.len = len; } if (idxconf == NULL) { if (p > beginkey) p--; - json->key_names.str = beginkey; - json->key_names.len = WT_PTRDIFF(p, beginkey); + len = WT_PTRDIFF(p, beginkey); + WT_RET(__wt_strndup(session, beginkey, len, + &json->key_names.str)); + json->key_names.len = len; } + return (0); } #define MATCH_KEYWORD(session, in, result, keyword, matchval) do { \ diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index 77c6018778c..3198a15bd13 
100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -1050,8 +1050,8 @@ __wt_curtable_open(WT_SESSION_IMPL *session, cursor, cursor->internal_uri, owner, cfg, cursorp)); if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) - __wt_json_column_init( - cursor, uri, table->key_format, NULL, &table->colconf); + WT_ERR(__wt_json_column_init( + cursor, uri, table->key_format, NULL, &table->colconf)); /* * Open the colgroup cursors immediately: we're going to need them for diff --git a/src/third_party/wiredtiger/src/docs/tools/doxypy.py b/src/third_party/wiredtiger/src/docs/tools/doxypy.py index 54fef5f03a5..f05a597ed6e 100755 --- a/src/third_party/wiredtiger/src/docs/tools/doxypy.py +++ b/src/third_party/wiredtiger/src/docs/tools/doxypy.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +from __future__ import print_function + __applicationName__ = "doxypy" __blurb__ = """ doxypy is an input filter for Doxygen. It preprocesses python @@ -86,7 +88,7 @@ class FSM(object): self.current_input = input self.current_transition = transition if options.debug: - print >>sys.stderr, "# FSM: executing (%s -> %s) for line '%s'" % (from_state, to_state, input) + print("# FSM: executing (%s -> %s) for line '%s'" % (from_state, to_state, input), file=sys.stderr) callback(match) return @@ -208,8 +210,8 @@ class Doxypy(object): if self.output: try: if options.debug: - print >>sys.stderr, "# OUTPUT: ", self.output - print >>self.outstream, "\n".join(self.output) + print("# OUTPUT: ", self.output, file=sys.stderr) + print("\n".join(self.output), file=self.outstream) self.outstream.flush() except IOError: # Fix for FS#33. Catches "broken pipe" when doxygen closes @@ -228,7 +230,7 @@ class Doxypy(object): Closes the current commentblock and starts a new comment search. 
""" if options.debug: - print >>sys.stderr, "# CALLBACK: resetCommentSearch" + print("# CALLBACK: resetCommentSearch" , file=sys.stderr) self.__closeComment() self.startCommentSearch(match) @@ -239,7 +241,7 @@ class Doxypy(object): the current indentation. """ if options.debug: - print >>sys.stderr, "# CALLBACK: startCommentSearch" + print("# CALLBACK: startCommentSearch", file=sys.stderr) self.defclass = [self.fsm.current_input] self.comment = [] self.indent = match.group(1) @@ -251,7 +253,7 @@ class Doxypy(object): appends the current line to the output. """ if options.debug: - print >>sys.stderr, "# CALLBACK: stopCommentSearch" + print("# CALLBACK: stopCommentSearch" , file=sys.stderr) self.__closeComment() self.defclass = [] @@ -263,7 +265,7 @@ class Doxypy(object): Closes the open comment block, resets it and appends the current line. """ if options.debug: - print >>sys.stderr, "# CALLBACK: appendFileheadLine" + print("# CALLBACK: appendFileheadLine" , file=sys.stderr) self.__closeComment() self.comment = [] self.output.append(self.fsm.current_input) @@ -275,7 +277,7 @@ class Doxypy(object): well as singleline comments. 
""" if options.debug: - print >>sys.stderr, "# CALLBACK: appendCommentLine" + print("# CALLBACK: appendCommentLine" , file=sys.stderr) (from_state, to_state, condition, callback) = self.fsm.current_transition # single line comment @@ -312,13 +314,13 @@ class Doxypy(object): def appendNormalLine(self, match): """Appends a line to the output.""" if options.debug: - print >>sys.stderr, "# CALLBACK: appendNormalLine" + print("# CALLBACK: appendNormalLine" , file=sys.stderr) self.output.append(self.fsm.current_input) def appendDefclassLine(self, match): """Appends a line to the triggering block.""" if options.debug: - print >>sys.stderr, "# CALLBACK: appendDefclassLine" + print("# CALLBACK: appendDefclassLine" , file=sys.stderr) self.defclass.append(self.fsm.current_input) def makeCommentBlock(self): @@ -397,7 +399,7 @@ def optParse(): (options, filename) = parser.parse_args() if not filename: - print >>sys.stderr, "No filename given." + print("No filename given.", file=sys.stderr) sys.exit(-1) return filename[0] diff --git a/src/third_party/wiredtiger/src/docs/tools/fixlinks.py b/src/third_party/wiredtiger/src/docs/tools/fixlinks.py index 59f8494ada3..1532118cd21 100755 --- a/src/third_party/wiredtiger/src/docs/tools/fixlinks.py +++ b/src/third_party/wiredtiger/src/docs/tools/fixlinks.py @@ -59,8 +59,8 @@ def process(source): (m.group(0), m.group(1), m.group(1), m.group(2))), source) # Replace "self, handle" with "self" -- these are typedef'ed away - source = re.sub(r'(\s+#.*self), - (?:connection|cursor|session)', r'\1', source) + source = re.sub(r'(\s+#.*self),' + + r'(?:connection|cursor|session)', r'\1', source) return source if __name__ == '__main__': diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index dc1bdc07419..c8f2221daa2 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -133,7 +133,7 @@ __wt_page_header_byteswap(WT_PAGE_HEADER 
*dsk) */ struct __wt_addr { wt_timestamp_t oldest_start_ts; /* Aggregated timestamp information */ - wt_timestamp_t newest_start_ts; + wt_timestamp_t newest_durable_ts; wt_timestamp_t newest_stop_ts; uint8_t *addr; /* Block-manager's cookie */ @@ -990,8 +990,6 @@ struct __wt_col { * of a base pointer. The on-page data is a WT_CELL (same as row-store * pages). * - * If the value is 0, it's a single, deleted record. - * * Obscure the field name, code shouldn't use WT_COL->__col_value, the * public interface is WT_COL_PTR and WT_COL_PTR_SET. */ @@ -1004,8 +1002,7 @@ struct __wt_col { * not exist on the page, return a NULL.) */ #define WT_COL_PTR(page, cip) \ - ((cip)->__col_value == 0 ? \ - NULL : WT_PAGE_REF_OFFSET(page, (cip)->__col_value)) + WT_PAGE_REF_OFFSET(page, (cip)->__col_value) #define WT_COL_PTR_SET(cip, value) \ (cip)->__col_value = (value) diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i index 0bbe3283dee..260e2304034 100644 --- a/src/third_party/wiredtiger/src/include/cell.i +++ b/src/third_party/wiredtiger/src/include/cell.i @@ -147,7 +147,7 @@ struct __wt_cell_unpack { /* Start/stop timestamps for a value */ wt_timestamp_t start_ts, stop_ts; /* Aggregated timestamp information */ - wt_timestamp_t oldest_start_ts, newest_start_ts, newest_stop_ts; + wt_timestamp_t oldest_start_ts, newest_durable_ts, newest_stop_ts; /* * !!! 
@@ -219,37 +219,50 @@ __cell_pack_timestamp_value(WT_SESSION_IMPL *session, */ static inline void __wt_timestamp_addr_check(WT_SESSION_IMPL *session, - wt_timestamp_t oldest_start_ts, - wt_timestamp_t newest_start_ts, wt_timestamp_t newest_stop_ts) + wt_timestamp_t oldest_start_ts, wt_timestamp_t newest_stop_ts) { +#ifdef HAVE_DIAGNOSTIC + char ts_string[2][WT_TS_INT_STRING_SIZE]; + + if (newest_stop_ts == WT_TS_NONE) { + __wt_errx(session, "newest stop timestamp of 0"); + WT_ASSERT(session, newest_stop_ts != WT_TS_NONE); + } + if (oldest_start_ts > newest_stop_ts) { + __wt_timestamp_to_string(oldest_start_ts, ts_string[0]); + __wt_timestamp_to_string(newest_stop_ts, ts_string[1]); + __wt_errx(session, + "an oldest start timestamp %s newer than its newest " + "stop timestamp %s", + ts_string[0], ts_string[1]); + WT_ASSERT(session, oldest_start_ts <= newest_stop_ts); + } +#else + WT_UNUSED(session); WT_UNUSED(oldest_start_ts); - WT_UNUSED(newest_start_ts); WT_UNUSED(newest_stop_ts); - - WT_ASSERT(session, newest_stop_ts != WT_TS_NONE); - WT_ASSERT(session, oldest_start_ts <= newest_start_ts); - WT_ASSERT(session, newest_start_ts <= newest_stop_ts); +#endif } /* * __cell_pack_timestamp_addr -- - * Pack a oldest_start, newest_start, newest_stop timestamp triplet for an - * address. + * Pack a oldest_start, newest_durable_ts, newest_stop timestamp triplet + * for an address. */ static inline void __cell_pack_timestamp_addr(WT_SESSION_IMPL *session, uint8_t **pp, wt_timestamp_t oldest_start_ts, - wt_timestamp_t newest_start_ts, wt_timestamp_t newest_stop_ts) + wt_timestamp_t newest_durable_ts, wt_timestamp_t newest_stop_ts) { - __wt_timestamp_addr_check(session, - oldest_start_ts, newest_start_ts, newest_stop_ts); + __wt_timestamp_addr_check(session, oldest_start_ts, newest_stop_ts); ++*pp; if (__wt_process.page_version_ts) { /* Store differences, not absolutes. 
*/ (void)__wt_vpack_uint(pp, 0, oldest_start_ts); - (void)__wt_vpack_uint(pp, 0, newest_start_ts - oldest_start_ts); - (void)__wt_vpack_uint(pp, 0, newest_stop_ts - newest_start_ts); + (void)__wt_vpack_uint( + pp, 0, newest_durable_ts - oldest_start_ts); + (void)__wt_vpack_uint(pp, 0, newest_stop_ts - oldest_start_ts); } } @@ -260,8 +273,8 @@ __cell_pack_timestamp_addr(WT_SESSION_IMPL *session, static inline size_t __wt_cell_pack_addr(WT_SESSION_IMPL *session, WT_CELL *cell, u_int cell_type, uint64_t recno, - wt_timestamp_t oldest_start_ts, - wt_timestamp_t newest_start_ts, wt_timestamp_t newest_stop_ts, size_t size) + wt_timestamp_t oldest_start_ts, wt_timestamp_t newest_durable_ts, + wt_timestamp_t newest_stop_ts, size_t size) { uint8_t *p; @@ -270,7 +283,7 @@ __wt_cell_pack_addr(WT_SESSION_IMPL *session, *p = '\0'; __cell_pack_timestamp_addr(session, - &p, oldest_start_ts, newest_start_ts, newest_stop_ts); + &p, oldest_start_ts, newest_durable_ts, newest_stop_ts); if (recno == WT_RECNO_OOB) cell->__chunk[0] = (uint8_t)cell_type; /* Type */ @@ -728,10 +741,11 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, * the copied cell must be available from unpack after we return, as our * caller has no way to find the copied cell). */ - WT_CELL_LEN_CHK(cell, 0); unpack->cell = cell; restart: + WT_CELL_LEN_CHK(cell, 0); + /* * This path is performance critical for read-only trees, we're parsing * on-page structures. For that reason we don't clear the unpacked cell @@ -742,7 +756,7 @@ restart: unpack->v = 0; unpack->start_ts = WT_TS_NONE; unpack->stop_ts = WT_TS_MAX; - unpack->oldest_start_ts = unpack->newest_start_ts = WT_TS_NONE; + unpack->oldest_start_ts = unpack->newest_durable_ts = WT_TS_NONE; unpack->newest_stop_ts = WT_TS_MAX; unpack->raw = (uint8_t)__wt_cell_type_raw(cell); unpack->type = (uint8_t)__wt_cell_type(cell); @@ -798,15 +812,14 @@ restart: WT_RET(__wt_vunpack_uint(&p, end == NULL ? 
0 : WT_PTRDIFF(end, p), &unpack->oldest_start_ts)); WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : - WT_PTRDIFF(end, p), &unpack->newest_start_ts)); - unpack->newest_start_ts += unpack->oldest_start_ts; + WT_PTRDIFF(end, p), &unpack->newest_durable_ts)); + unpack->newest_durable_ts += unpack->oldest_start_ts; WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->newest_stop_ts)); - unpack->newest_stop_ts += unpack->newest_start_ts; + unpack->newest_stop_ts += unpack->oldest_start_ts; __wt_timestamp_addr_check(session, - unpack->oldest_start_ts, - unpack->newest_start_ts, unpack->newest_stop_ts); + unpack->oldest_start_ts, unpack->newest_stop_ts); break; case WT_CELL_DEL: case WT_CELL_VALUE: @@ -950,7 +963,7 @@ __wt_cell_unpack_dsk(WT_SESSION_IMPL *session, * somewhere. * unpack->oldest_start_ts - unpack->newest_start_ts + unpack->newest_durable_ts unpack->newest_stop_ts */ unpack->data = ""; diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index b6100ae134d..1f461a06137 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -148,6 +148,16 @@ struct __wt_named_extractor { } while (0) /* + * WT_CONN_HOTBACKUP_START -- + * Macro to set connection data appropriately for when we commence hot + * backup. 
+ */ +#define WT_CONN_HOTBACKUP_START(conn) do { \ + conn->hot_backup = true; \ + conn->hot_backup_list = NULL; \ +} while (0) + +/* * WT_CONNECTION_IMPL -- * Implementation of WT_CONNECTION */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 1ca81b0b4d9..d93deb0a361 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -318,7 +318,7 @@ extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, bool iskey, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor); extern size_t __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); -extern void __wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf); +extern int __wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, const char **tokstart, size_t *toklen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern const char *__wt_json_tokname(int toktype) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, bool iskey, WT_ITEM *item) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -572,6 +572,7 @@ extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) WT_GCC_FUNC_DECL_AT extern bool 
__wt_fsync_background_chk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_fsync_background(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_close_connection_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_file_zero(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t start_off, wt_off_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_os_inmemory(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_fopen(WT_SESSION_IMPL *session, const char *name, uint32_t open_flags, uint32_t flags, WT_FSTREAM **fstrp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_os_stdio(WT_SESSION_IMPL *session); @@ -828,7 +829,7 @@ extern int __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTR extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_checkpoint_progress(WT_SESSION_IMPL *session, bool closing); extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, wt_timestamp_t oldest_start_ts, wt_timestamp_t newest_start_ts, wt_timestamp_t newest_stop_ts); +extern void __wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, wt_timestamp_t oldest_start_ts, wt_timestamp_t newest_durable_ts, wt_timestamp_t newest_stop_ts); extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -857,6 +858,7 @@ 
extern void __wt_timestamp_to_hex_string(wt_timestamp_t ts, char *hex_timestamp) extern void __wt_verbose_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t ts, const char *msg); extern int __wt_txn_parse_timestamp_raw(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_get_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_query_timestamp(WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[], bool global_txn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h index b1f9a557934..23c30e9a031 100644 --- a/src/third_party/wiredtiger/src/include/meta.h +++ b/src/third_party/wiredtiger/src/include/meta.h @@ -77,7 +77,7 @@ struct __wt_ckpt { uint64_t write_gen; /* Write generation */ wt_timestamp_t oldest_start_ts; /* Aggregated timestamp information */ - wt_timestamp_t newest_start_ts; + wt_timestamp_t newest_durable_ts; wt_timestamp_t newest_stop_ts; void *bpriv; /* Block manager private */ diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h index 74af41132f2..cd217fe9c51 100644 --- a/src/third_party/wiredtiger/src/include/schema.h +++ b/src/third_party/wiredtiger/src/include/schema.h @@ -83,6 +83,9 @@ struct __wt_table { #define 
WT_SESSION_LOCKED_TABLE \ (WT_SESSION_LOCKED_TABLE_READ | \ WT_SESSION_LOCKED_TABLE_WRITE) +#define WT_SESSION_LOCKED_HOTBACKUP \ + (WT_SESSION_LOCKED_HOTBACKUP_READ | \ + WT_SESSION_LOCKED_HOTBACKUP_WRITE) /* * WT_WITH_LOCK_WAIT -- @@ -257,6 +260,76 @@ struct __wt_table { } while (0) /* + * WT_WITH_HOTBACKUP_READ_LOCK -- + * Acquire the hot backup read lock and perform an operation provided that + * there is no hot backup in progress. The skipp parameter can be used to + * check whether the operation got skipped or not. + */ +#define WT_WITH_HOTBACKUP_READ_LOCK(session, op, skipp) do { \ + WT_CONNECTION_IMPL *__conn = S2C(session); \ + if ((skipp) != (bool *)NULL) \ + *(bool *)(skipp) = true; \ + if (F_ISSET(session, WT_SESSION_LOCKED_HOTBACKUP)) { \ + if (!__conn->hot_backup) { \ + if ((skipp) != (bool *)NULL) \ + *(bool *)(skipp) = false; \ + op; \ + } \ + } else { \ + __wt_readlock(session, &__conn->hot_backup_lock); \ + F_SET(session, WT_SESSION_LOCKED_HOTBACKUP_READ); \ + if (!__conn->hot_backup) { \ + if ((skipp) != (bool *)NULL) \ + *(bool *)(skipp) = false; \ + op; \ + } \ + F_CLR(session, WT_SESSION_LOCKED_HOTBACKUP_READ); \ + __wt_readunlock(session, &__conn->hot_backup_lock); \ + } \ +} while (0) + +/* + * WT_WITH_HOTBACKUP_WRITE_LOCK -- + * Acquire the hot backup write lock and perform an operation. + */ +#define WT_WITH_HOTBACKUP_WRITE_LOCK(session, op) do { \ + WT_CONNECTION_IMPL *__conn = S2C(session); \ + if (F_ISSET(session, WT_SESSION_LOCKED_HOTBACKUP_WRITE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET( \ + session, WT_SESSION_LOCKED_HOTBACKUP_READ)); \ + __wt_writelock(session, &__conn->hot_backup_lock); \ + F_SET(session, WT_SESSION_LOCKED_HOTBACKUP_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HOTBACKUP_WRITE); \ + __wt_writeunlock(session, &__conn->hot_backup_lock); \ + } \ +} while (0) + +/* + * WT_WITH_HOTBACKUP_READ_LOCK_UNCOND -- + * Acquire the hot backup read lock and perform an operation + * unconditionally. 
This is a specialized macro for a few isolated cases. + * Code that wishes to acquire the read lock should default to using + * WT_WITH_HOTBACKUP_READ_LOCK which checks that there is no hot backup in + * progress. + */ +#define WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session, op) do { \ + WT_CONNECTION_IMPL *__conn = S2C(session); \ + if (F_ISSET(session, WT_SESSION_LOCKED_HOTBACKUP)) { \ + op; \ + } else { \ + __wt_readlock(session, &__conn->hot_backup_lock); \ + F_SET(session, WT_SESSION_LOCKED_HOTBACKUP_READ); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HOTBACKUP_READ); \ + __wt_readunlock(session, &__conn->hot_backup_lock); \ + } \ +} while (0) + +/* * WT_WITHOUT_LOCKS -- * Drop the handle, table and/or schema locks, perform an operation, * re-acquire the lock(s). diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 0d99b4cc6e0..c7ae31b4e54 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -172,23 +172,25 @@ struct __wt_session_impl { #define WT_SESSION_LOCKED_CHECKPOINT 0x0000040u #define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x0000080u #define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x0000100u -#define WT_SESSION_LOCKED_METADATA 0x0000200u -#define WT_SESSION_LOCKED_PASS 0x0000400u -#define WT_SESSION_LOCKED_SCHEMA 0x0000800u -#define WT_SESSION_LOCKED_SLOT 0x0001000u -#define WT_SESSION_LOCKED_TABLE_READ 0x0002000u -#define WT_SESSION_LOCKED_TABLE_WRITE 0x0004000u -#define WT_SESSION_LOCKED_TURTLE 0x0008000u -#define WT_SESSION_LOGGING_INMEM 0x0010000u -#define WT_SESSION_LOOKASIDE_CURSOR 0x0020000u -#define WT_SESSION_NO_DATA_HANDLES 0x0040000u -#define WT_SESSION_NO_LOGGING 0x0080000u -#define WT_SESSION_NO_RECONCILE 0x0100000u -#define WT_SESSION_NO_SCHEMA_LOCK 0x0200000u -#define WT_SESSION_QUIET_CORRUPT_FILE 0x0400000u -#define WT_SESSION_READ_WONT_NEED 0x0800000u -#define WT_SESSION_SCHEMA_TXN 0x1000000u -#define 
WT_SESSION_SERVER_ASYNC 0x2000000u +#define WT_SESSION_LOCKED_HOTBACKUP_READ 0x0000200u +#define WT_SESSION_LOCKED_HOTBACKUP_WRITE 0x0000400u +#define WT_SESSION_LOCKED_METADATA 0x0000800u +#define WT_SESSION_LOCKED_PASS 0x0001000u +#define WT_SESSION_LOCKED_SCHEMA 0x0002000u +#define WT_SESSION_LOCKED_SLOT 0x0004000u +#define WT_SESSION_LOCKED_TABLE_READ 0x0008000u +#define WT_SESSION_LOCKED_TABLE_WRITE 0x0010000u +#define WT_SESSION_LOCKED_TURTLE 0x0020000u +#define WT_SESSION_LOGGING_INMEM 0x0040000u +#define WT_SESSION_LOOKASIDE_CURSOR 0x0080000u +#define WT_SESSION_NO_DATA_HANDLES 0x0100000u +#define WT_SESSION_NO_LOGGING 0x0200000u +#define WT_SESSION_NO_RECONCILE 0x0400000u +#define WT_SESSION_NO_SCHEMA_LOCK 0x0800000u +#define WT_SESSION_QUIET_CORRUPT_FILE 0x1000000u +#define WT_SESSION_READ_WONT_NEED 0x2000000u +#define WT_SESSION_SCHEMA_TXN 0x4000000u +#define WT_SESSION_SERVER_ASYNC 0x8000000u /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index f5d1075581b..e2b2aae3d33 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -708,7 +708,9 @@ struct __wt_connection_stats { int64_t txn_pinned_snapshot_range; int64_t txn_pinned_timestamp; int64_t txn_pinned_timestamp_checkpoint; + int64_t txn_pinned_timestamp_reader; int64_t txn_pinned_timestamp_oldest; + int64_t txn_timestamp_oldest_active_read; int64_t txn_sync; int64_t txn_commit; int64_t txn_rollback; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 7fed51cc76b..c60b1772fe9 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -23,6 +23,12 @@ #define WT_TXN_OLDEST_WAIT 0x2u /* AUTOMATIC FLAG VALUE GENERATION STOP */ +/* AUTOMATIC FLAG VALUE GENERATION START */ +#define WT_TXN_TS_ALREADY_LOCKED 0x1u +#define 
WT_TXN_TS_INCLUDE_CKPT 0x2u +#define WT_TXN_TS_INCLUDE_OLDEST 0x4u +/* AUTOMATIC FLAG VALUE GENERATION STOP */ + /* * Transaction ID comparison dealing with edge cases. * diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 5a091db45a0..890fbb26a74 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -1791,9 +1791,6 @@ struct __wt_session { * @config{read_timestamp, read using the specified timestamp. The * supplied value must not be older than the current oldest timestamp. * See @ref transaction_timestamps., a string; default empty.} - * @config{round_to_oldest, if read timestamp is earlier than oldest - * timestamp\, read timestamp will be rounded to oldest timestamp., a - * boolean flag; default \c false.} * @config{roundup_timestamps = (, round up timestamps of the * transaction. This setting alters the visibility expected in a * transaction. See @ref transaction_timestamps., a set of related @@ -1929,9 +1926,6 @@ struct __wt_session { * supplied value must not be older than the current oldest timestamp. * This can only be set once for a transaction. See @ref * transaction_timestamps., a string; default empty.} - * @config{round_to_oldest, if read timestamp is earlier than oldest - * timestamp\, read timestamp will be rounded to oldest timestamp., a - * boolean flag; default \c false.} * @configend * @errors */ @@ -5805,17 +5799,24 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1383 /*! * transaction: transaction range of timestamps pinned by the oldest + * active read timestamp + */ +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1384 +/*! 
+ * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1384 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1385 +/*! transaction: transaction read timestamp of the oldest active reader */ +#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1386 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1385 +#define WT_STAT_CONN_TXN_SYNC 1387 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1386 +#define WT_STAT_CONN_TXN_COMMIT 1388 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1387 +#define WT_STAT_CONN_TXN_ROLLBACK 1389 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1388 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1390 /*! * @} diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 10b52246987..1dc6c60a137 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -629,68 +629,6 @@ err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); } /* - * __log_zero -- - * Zero a log file. - */ -static int -__log_zero(WT_SESSION_IMPL *session, - WT_FH *fh, wt_off_t start_off, wt_off_t len) -{ - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(zerobuf); - WT_DECL_RET; - WT_LOG *log; - uint32_t allocsize, bufsz, off, partial, wrlen; - - conn = S2C(session); - log = conn->log; - allocsize = log->allocsize; - zerobuf = NULL; - if (allocsize < WT_MEGABYTE) - bufsz = WT_MEGABYTE; - else - bufsz = allocsize; - /* - * If they're using smaller log files, cap it at the file size. - */ - if (conn->log_file_max < bufsz) - bufsz = (uint32_t)conn->log_file_max; - WT_RET(__wt_scr_alloc(session, bufsz, &zerobuf)); - memset(zerobuf->mem, 0, zerobuf->memsize); - WT_STAT_CONN_INCR(session, log_zero_fills); - - /* - * Read in a chunk starting at the end of the file. 
Keep going until - * we reach the beginning or we find a chunk that contains any non-zero - * bytes. Compare against a known zero byte chunk. - */ - off = (uint32_t)start_off; - while (off < (uint32_t)len) { - /* - * Typically we start to zero the file after the log header - * and the bufsz is a sector-aligned size. So we want to - * align our writes when we can. - */ - partial = off % bufsz; - if (partial != 0) - wrlen = bufsz - partial; - else - wrlen = bufsz; - /* - * Check if we're writing a partial amount at the end too. - */ - if ((uint32_t)len - off < bufsz) - wrlen = (uint32_t)len - off; - __wt_capacity_throttle(session, wrlen, WT_THROTTLE_LOG); - WT_ERR(__wt_write(session, - fh, (wt_off_t)off, wrlen, zerobuf->mem)); - off += wrlen; - } -err: __wt_scr_free(session, &zerobuf); - return (ret); -} - -/* * __log_prealloc -- * Pre-allocate a log file. */ @@ -710,7 +648,7 @@ __log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh) * and zero the log file based on what is available. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL)) - return (__log_zero(session, fh, + return (__wt_file_zero(session, fh, log->first_record, conn->log_file_max)); /* If configured to not extend the file, we're done. 
*/ @@ -1235,7 +1173,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) WT_LOG *log; WT_LSN end_lsn, logrec_lsn; u_int yield_cnt; - bool create_log; + bool create_log, skipp; conn = S2C(session); log = conn->log; @@ -1284,13 +1222,11 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) */ create_log = true; if (conn->log_prealloc > 0 && !conn->hot_backup) { - __wt_readlock(session, &conn->hot_backup_lock); - if (conn->hot_backup) - __wt_readunlock(session, &conn->hot_backup_lock); - else { - ret = __log_alloc_prealloc(session, log->fileid); - __wt_readunlock(session, &conn->hot_backup_lock); + WT_WITH_HOTBACKUP_READ_LOCK(session, + ret = __log_alloc_prealloc(session, log->fileid), + &skipp); + if (!skipp) { /* * If ret is 0 it means we found a pre-allocated file. * If ret is WT_NOTFOUND, create the new log file and @@ -1517,24 +1453,23 @@ __log_truncate_file(WT_SESSION_IMPL *session, WT_FH *log_fh, wt_off_t offset) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; + bool skipp; conn = S2C(session); log = conn->log; if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && !conn->hot_backup) { - __wt_readlock(session, &conn->hot_backup_lock); - if (conn->hot_backup) - __wt_readunlock(session, &conn->hot_backup_lock); - else { - ret = __wt_ftruncate(session, log_fh, offset); - __wt_readunlock(session, &conn->hot_backup_lock); + WT_WITH_HOTBACKUP_READ_LOCK(session, + ret = __wt_ftruncate( + session, log_fh, offset), &skipp); + if (!skipp) { if (ret != ENOTSUP) return (ret); F_SET(log, WT_LOG_TRUNCATE_NOTSUP); } } - return (__log_zero(session, log_fh, offset, conn->log_file_max)); + return (__wt_file_zero(session, log_fh, offset, conn->log_file_max)); } /* diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c index 58711cc4e92..c4eb01b2d39 100644 --- a/src/third_party/wiredtiger/src/meta/meta_ckpt.c +++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c @@ -352,16 +352,16 @@ 
__ckpt_load(WT_SESSION_IMPL *session, WT_RET_NOTFOUND_OK(ret); ckpt->oldest_start_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_NONE : (uint64_t)a.val; - ret = __wt_config_subgets(session, v, "newest_start_ts", &a); + ret = __wt_config_subgets(session, v, "newest_durable_ts", &a); WT_RET_NOTFOUND_OK(ret); - ckpt->newest_start_ts = + ckpt->newest_durable_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_NONE : (uint64_t)a.val; ret = __wt_config_subgets(session, v, "newest_stop_ts", &a); WT_RET_NOTFOUND_OK(ret); ckpt->newest_stop_ts = ret == WT_NOTFOUND || a.len == 0 ? WT_TS_MAX : (uint64_t)a.val; __wt_timestamp_addr_check(session, - ckpt->oldest_start_ts, ckpt->newest_start_ts, ckpt->newest_stop_ts); + ckpt->oldest_start_ts, ckpt->newest_stop_ts); WT_RET(__wt_config_subgets(session, v, "write_gen", &a)); if (a.len == 0) @@ -433,8 +433,8 @@ __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, __wt_seconds(session, &ckpt->sec); } - __wt_timestamp_addr_check(session, ckpt->oldest_start_ts, - ckpt->newest_start_ts, ckpt->newest_stop_ts); + __wt_timestamp_addr_check(session, + ckpt->oldest_start_ts, ckpt->newest_stop_ts); WT_ERR(__wt_buf_catfmt(session, buf, "%s%s", sep, ckpt->name)); sep = ","; @@ -452,7 +452,7 @@ __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, ",time=%" PRIu64 ",size=%" PRId64 ",oldest_start_ts=%" PRId64 - ",newest_start_ts=%" PRId64 + ",newest_durable_ts=%" PRId64 ",newest_stop_ts=%" PRId64 ",write_gen=%" PRId64 ")", (int)ckpt->addr.size, (char *)ckpt->addr.data, @@ -460,7 +460,7 @@ __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, ckpt->sec, (int64_t)ckpt->size, (int64_t)ckpt->oldest_start_ts, - (int64_t)ckpt->newest_start_ts, + (int64_t)ckpt->newest_durable_ts, (int64_t)ckpt->newest_stop_ts, (int64_t)ckpt->write_gen)); } diff --git a/src/third_party/wiredtiger/src/os_common/os_fhandle.c b/src/third_party/wiredtiger/src/os_common/os_fhandle.c index df67508c4fe..ca2fe730444 100644 --- a/src/third_party/wiredtiger/src/os_common/os_fhandle.c +++ 
b/src/third_party/wiredtiger/src/os_common/os_fhandle.c @@ -500,3 +500,53 @@ __wt_close_connection_close(WT_SESSION_IMPL *session) } WT_TAILQ_SAFE_REMOVE_END return (ret); } + +/* + * __wt_file_zero -- + * Zero out the file from offset for size bytes. + */ +int +__wt_file_zero(WT_SESSION_IMPL *session, + WT_FH *fh, wt_off_t start_off, wt_off_t size) +{ + WT_DECL_ITEM(zerobuf); + WT_DECL_RET; + WT_THROTTLE_TYPE type; + uint64_t bufsz, off, partial, wrlen; + + zerobuf = NULL; + bufsz = WT_MIN((uint64_t)size, WT_MEGABYTE); + /* + * For now logging is the only type and statistic. This needs + * updating if block manager decides to use this function. + */ + type = WT_THROTTLE_LOG; + WT_STAT_CONN_INCR(session, log_zero_fills); + WT_RET(__wt_scr_alloc(session, bufsz, &zerobuf)); + memset(zerobuf->mem, 0, zerobuf->memsize); + off = (uint64_t)start_off; + while (off < (uint64_t)size) { + /* + * We benefit from aligning our writes when we can. Log files + * will typically want to start to zero after the log header + * and the bufsz is a sector-aligned size. So align when + * we can. + */ + partial = off % bufsz; + if (partial != 0) + wrlen = bufsz - partial; + else + wrlen = bufsz; + /* + * Check if we're writing a partial amount at the end too. 
+ */ + if ((uint64_t)size - off < bufsz) + wrlen = (uint64_t)size - off; + __wt_capacity_throttle(session, wrlen, type); + WT_ERR(__wt_write(session, + fh, (wt_off_t)off, (size_t)wrlen, zerobuf->mem)); + off += wrlen; + } +err: __wt_scr_free(session, &zerobuf); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index f25ada93885..90db828b1a5 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -151,7 +151,7 @@ typedef struct { uint64_t recno; WT_ITEM key; wt_timestamp_t oldest_start_ts; - wt_timestamp_t newest_start_ts; + wt_timestamp_t newest_durable_ts; wt_timestamp_t newest_stop_ts; /* Saved minimum split-size boundary information. */ @@ -159,7 +159,7 @@ typedef struct { uint64_t min_recno; WT_ITEM min_key; wt_timestamp_t min_oldest_start_ts; - wt_timestamp_t min_newest_start_ts; + wt_timestamp_t min_newest_durable_ts; wt_timestamp_t min_newest_stop_ts; size_t min_offset; /* byte offset */ @@ -279,7 +279,7 @@ typedef struct { WT_UPDATE *upd; /* Update to write (or NULL) */ uint64_t txnid; /* Transaction ID, timestamps */ - wt_timestamp_t start_ts, stop_ts; + wt_timestamp_t start_ts, durable_ts, stop_ts; bool upd_saved; /* Updates saved to list */ @@ -318,7 +318,7 @@ static int __rec_las_wrapup_err(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_row_leaf(WT_SESSION_IMPL *, - WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *); + WT_RECONCILE *, WT_REF *, WT_SALVAGE_COOKIE *); static int __rec_row_leaf_insert( WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *); static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); @@ -469,7 +469,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, ret = __rec_row_int(session, r, page)); break; case 
WT_PAGE_ROW_LEAF: - ret = __rec_row_leaf(session, r, page, salvage); + ret = __rec_row_leaf(session, r, ref, salvage); break; default: ret = __wt_illegal_value(session, page->type); @@ -1186,6 +1186,7 @@ __rec_append_orig_value(WT_SESSION_IMPL *session, append->txnid = upd->txnid; append->start_ts = upd->start_ts; append->durable_ts = upd->durable_ts; + append->stop_ts = upd->stop_ts; append->next = upd->next; } @@ -1396,15 +1397,16 @@ __rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, * TIMESTAMP-FIXME * This is waiting on the WT_UPDATE structure's start/stop * timestamp work. For now, if we don't have a timestamp, - * just pretend it's durable, otherwise pretend the start - * and stop timestamps are the same. + * just pretend it's durable, otherwise pretend the durable, + * start and stop timestamps are all the same. * */ if (upd_select->upd->start_ts == WT_TS_NONE) { - upd_select->start_ts = WT_TS_NONE; + upd_select->start_ts = + upd_select->durable_ts = WT_TS_NONE; upd_select->stop_ts = WT_TS_MAX; } else - upd_select->start_ts = + upd_select->start_ts = upd_select->durable_ts = upd_select->stop_ts = upd_select->upd->start_ts; /* @@ -1453,7 +1455,7 @@ __rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, * order), so we track the maximum transaction ID and the newest update * with a timestamp (if any). */ - timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->start_ts; + timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->durable_ts; all_visible = upd == first_txn_upd && !(uncommitted || prepared) && (F_ISSET(r, WT_REC_VISIBLE_ALL) ? __wt_txn_visible_all(session, max_txn, timestamp) : @@ -1515,14 +1517,11 @@ __rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid)) r->unstable_txn = first_upd->txnid; if (first_ts_upd != NULL) { - /* - * FIXME Disable this assertion until fixed by WT-4598. 
- * WT_ASSERT(session, - * first_ts_upd->prepare_state == - * WT_PREPARE_INPROGRESS || - * first_ts_upd->start_ts <= - * first_ts_upd->durable_ts); - */ + WT_ASSERT(session, + first_ts_upd->prepare_state == + WT_PREPARE_INPROGRESS || + first_ts_upd->start_ts <= first_ts_upd->durable_ts); + if (r->unstable_timestamp < first_ts_upd->start_ts) r->unstable_timestamp = first_ts_upd->start_ts; @@ -1545,12 +1544,10 @@ __rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, * to use the two independently and be confident both * will be set. */ - /* - * FIXME Disable this assertion until fixed by WT-4598. - * WT_ASSERT(session, - * upd->prepare_state == WT_PREPARE_INPROGRESS || - * upd->durable_ts >= upd->start_ts); - */ + WT_ASSERT(session, + upd->prepare_state == WT_PREPARE_INPROGRESS || + upd->durable_ts >= upd->start_ts); + if (upd->start_ts < r->unstable_timestamp) r->unstable_timestamp = upd->start_ts; /* @@ -2244,7 +2241,7 @@ __wt_split_page_size(int split_pct, uint32_t maxpagesize, uint32_t allocsize) */ static void __rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *oldest_start_tsp, - wt_timestamp_t *newest_start_tsp, wt_timestamp_t *newest_stop_tsp) + wt_timestamp_t *newest_durable_ts, wt_timestamp_t *newest_stop_tsp) { /* * If the page format supports address timestamps (and not fixed-length @@ -2254,9 +2251,9 @@ __rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *oldest_start_tsp, * the oldest/newest timestamps to simple durability. 
*/ *oldest_start_tsp = WT_TS_MAX; - *newest_start_tsp = *newest_stop_tsp = WT_TS_NONE; + *newest_durable_ts = *newest_stop_tsp = WT_TS_NONE; if (!__wt_process.page_version_ts || r->page->type == WT_PAGE_COL_FIX) { - *oldest_start_tsp = *newest_start_tsp = WT_TS_NONE; + *oldest_start_tsp = *newest_durable_ts = WT_TS_NONE; *newest_stop_tsp = WT_TS_MAX; } } @@ -2267,12 +2264,12 @@ __rec_addr_ts_init(WT_RECONCILE *r, wt_timestamp_t *oldest_start_tsp, */ static inline void __rec_addr_ts_update(WT_RECONCILE *r, wt_timestamp_t oldest_start_ts, - wt_timestamp_t newest_start_ts, wt_timestamp_t newest_stop_ts) + wt_timestamp_t newest_durable_ts, wt_timestamp_t newest_stop_ts) { r->cur_ptr->oldest_start_ts = WT_MIN(oldest_start_ts, r->cur_ptr->oldest_start_ts); - r->cur_ptr->newest_start_ts = - WT_MAX(newest_start_ts, r->cur_ptr->newest_start_ts); + r->cur_ptr->newest_durable_ts = + WT_MAX(newest_durable_ts, r->cur_ptr->newest_durable_ts); r->cur_ptr->newest_stop_ts = WT_MAX(newest_stop_ts, r->cur_ptr->newest_stop_ts); } @@ -2290,14 +2287,14 @@ __rec_split_chunk_init( chunk->key.size = 0; chunk->entries = 0; __rec_addr_ts_init(r, &chunk->oldest_start_ts, - &chunk->newest_start_ts, &chunk->newest_stop_ts); + &chunk->newest_durable_ts, &chunk->newest_stop_ts); chunk->min_recno = WT_RECNO_OOB; /* Don't touch the key item memory, that memory is reused. 
*/ chunk->min_key.size = 0; chunk->min_entries = 0; __rec_addr_ts_init(r, &chunk->min_oldest_start_ts, - &chunk->min_newest_start_ts, &chunk->min_newest_stop_ts); + &chunk->min_newest_durable_ts, &chunk->min_newest_stop_ts); chunk->min_offset = 0; /* @@ -2760,7 +2757,8 @@ __rec_split_crossing_bnd( WT_RET(__rec_split_row_promote( session, r, &r->cur_ptr->min_key, r->page->type)); r->cur_ptr->min_oldest_start_ts = r->cur_ptr->oldest_start_ts; - r->cur_ptr->min_newest_start_ts = r->cur_ptr->newest_start_ts; + r->cur_ptr->min_newest_durable_ts = + r->cur_ptr->newest_durable_ts; r->cur_ptr->min_newest_stop_ts = r->cur_ptr->newest_stop_ts; /* Assert we're not re-entering this code. */ @@ -2818,8 +2816,9 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r) prev_ptr->entries += cur_ptr->entries; prev_ptr->oldest_start_ts = WT_MIN(prev_ptr->oldest_start_ts, cur_ptr->oldest_start_ts); - prev_ptr->newest_start_ts = - WT_MAX(prev_ptr->newest_start_ts, cur_ptr->newest_start_ts); + prev_ptr->newest_durable_ts = + WT_MAX(prev_ptr->newest_durable_ts, + cur_ptr->newest_durable_ts); prev_ptr->newest_stop_ts = WT_MAX(prev_ptr->newest_stop_ts, cur_ptr->newest_stop_ts); dsk = r->cur_ptr->image.mem; @@ -2873,15 +2872,16 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r) prev_ptr->min_key.data, prev_ptr->min_key.size)); cur_ptr->oldest_start_ts = WT_MIN(prev_ptr->oldest_start_ts, cur_ptr->oldest_start_ts); - cur_ptr->newest_start_ts = - WT_MAX(prev_ptr->newest_start_ts, cur_ptr->newest_start_ts); + cur_ptr->newest_durable_ts = + WT_MAX(prev_ptr->newest_durable_ts, + cur_ptr->newest_durable_ts); cur_ptr->newest_stop_ts = WT_MAX(prev_ptr->newest_stop_ts, cur_ptr->newest_stop_ts); cur_ptr->image.size += len_to_move; prev_ptr->entries = prev_ptr->min_entries; prev_ptr->oldest_start_ts = prev_ptr->min_oldest_start_ts; - prev_ptr->newest_start_ts = prev_ptr->min_newest_start_ts; + prev_ptr->newest_durable_ts = prev_ptr->min_newest_durable_ts; 
prev_ptr->newest_stop_ts = prev_ptr->min_newest_stop_ts; prev_ptr->image.size -= len_to_move; } @@ -3040,11 +3040,10 @@ done: if (F_ISSET(r, WT_REC_LOOKASIDE)) { multi->page_las.unstable_txn = r->unstable_txn; WT_ASSERT(session, r->unstable_txn != WT_TXN_NONE); multi->page_las.max_timestamp = r->max_timestamp; - /* - * FIXME Disable this assertion until fixed by WT-4598. - * WT_ASSERT(session, r->all_upd_prepare_in_prog == true || - * r->unstable_durable_timestamp >= r->unstable_timestamp); - */ + + WT_ASSERT(session, r->all_upd_prepare_in_prog == true || + r->unstable_durable_timestamp >= r->unstable_timestamp); + multi->page_las.unstable_timestamp = r->unstable_timestamp; multi->page_las.unstable_durable_timestamp = r->unstable_durable_timestamp; @@ -3297,7 +3296,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, /* Initialize the address (set the addr type for the parent). */ multi->addr.oldest_start_ts = chunk->oldest_start_ts; - multi->addr.newest_start_ts = chunk->newest_start_ts; + multi->addr.newest_durable_ts = chunk->newest_durable_ts; multi->addr.newest_stop_ts = chunk->newest_stop_ts; switch (page->type) { @@ -3765,7 +3764,7 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_KV *val; WT_PAGE *child, *page; WT_REF *ref; - wt_timestamp_t oldest_start_ts, newest_start_ts, newest_stop_ts; + wt_timestamp_t oldest_start_ts, newest_durable_ts, newest_stop_ts; bool hazard; btree = S2BT(session); @@ -3854,13 +3853,13 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) val->cell_len = 0; val->len = val->buf.size; oldest_start_ts = vpack->oldest_start_ts; - newest_start_ts = vpack->newest_start_ts; + newest_durable_ts = vpack->newest_durable_ts; newest_stop_ts = vpack->newest_stop_ts; } else { __rec_cell_build_addr( session, r, addr, false, ref->ref_recno); oldest_start_ts = addr->oldest_start_ts; - newest_start_ts = addr->newest_start_ts; + newest_durable_ts = addr->newest_durable_ts; 
newest_stop_ts = addr->newest_stop_ts; } WT_CHILD_RELEASE_ERR(session, hazard, ref); @@ -3871,8 +3870,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) /* Copy the value onto the page. */ __rec_image_copy(session, r, val); - __rec_addr_ts_update( - r, oldest_start_ts, newest_start_ts, newest_stop_ts); + __rec_addr_ts_update(r, + oldest_start_ts, newest_durable_ts, newest_stop_ts); } WT_INTL_FOREACH_END; /* Write the remnant page. */ @@ -3916,7 +3915,7 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Copy the value onto the page. */ __rec_image_copy(session, r, val); __rec_addr_ts_update(r, addr->oldest_start_ts, - addr->newest_start_ts, addr->newest_stop_ts); + addr->newest_durable_ts, addr->newest_stop_ts); } return (0); } @@ -4128,7 +4127,7 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session, static int __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_SALVAGE_COOKIE *salvage, WT_ITEM *value, - wt_timestamp_t start_ts, wt_timestamp_t stop_ts, + wt_timestamp_t start_ts, wt_timestamp_t durable_ts ,wt_timestamp_t stop_ts, uint64_t rle, bool deleted, bool overflow_type) { WT_BTREE *btree; @@ -4193,7 +4192,7 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_RET(__rec_dict_replace( session, r, start_ts, stop_ts, rle, val)); __rec_image_copy(session, r, val); - __rec_addr_ts_update(r, start_ts, start_ts, stop_ts); + __rec_addr_ts_update(r, start_ts, durable_ts, stop_ts); /* Update the starting record number in case we split. 
*/ r->recno += rle; @@ -4215,6 +4214,7 @@ __rec_col_var(WT_SESSION_IMPL *session, wt_timestamp_t start_ts, stop_ts; /* Timestamps */ bool deleted; /* If deleted */ } last; + WT_ADDR *addr; WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *vpack, _vpack; @@ -4226,7 +4226,7 @@ __rec_col_var(WT_SESSION_IMPL *session, WT_PAGE *page; WT_UPDATE *upd; WT_UPDATE_SELECT upd_select; - wt_timestamp_t start_ts, stop_ts; + wt_timestamp_t start_ts, durable_ts, newest_durable_ts, stop_ts; uint64_t n, nrepeat, repeat_count, rle, skip, src_recno; uint32_t i, size; bool deleted, orig_deleted, update_no_copy; @@ -4240,9 +4240,24 @@ __rec_col_var(WT_SESSION_IMPL *session, size = 0; data = NULL; + /* + * Acquire the newest-durable timestamp for this page so we can roll it + * forward. If it exists, it's in the WT_REF structure or the parent's + * disk image. + */ + if ((addr = pageref->addr) == NULL) + newest_durable_ts = WT_TS_NONE; + else if (__wt_off_page(pageref->home, addr)) + newest_durable_ts = addr->newest_durable_ts; + else { + __wt_cell_unpack(session, pageref->home, pageref->addr, vpack); + newest_durable_ts = vpack->newest_durable_ts; + } + /* Set the "last" values to cause failure if they're not set. 
*/ last.value = r->last; - last.start_ts = last.stop_ts = WT_TS_NONE; + last.start_ts = WT_TS_MAX; + last.stop_ts = WT_TS_NONE; last.deleted = false; /* @@ -4250,7 +4265,8 @@ __rec_col_var(WT_SESSION_IMPL *session, * [-Werror=maybe-uninitialized] */ /* NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) */ - start_ts = stop_ts = WT_TS_NONE; + start_ts = WT_TS_MAX; + durable_ts = stop_ts = WT_TS_NONE; WT_RET(__rec_split_init(session, r, page, pageref->ref_recno, btree->maxleafpage_precomp)); @@ -4283,7 +4299,7 @@ __rec_col_var(WT_SESSION_IMPL *session, salvage->take += salvage->missing; } else WT_ERR(__rec_col_var_helper(session, r, - NULL, NULL, WT_TS_NONE, WT_TS_MAX, + NULL, NULL, WT_TS_NONE, WT_TS_NONE, WT_TS_MAX, salvage->missing, true, false)); } @@ -4303,58 +4319,53 @@ __rec_col_var(WT_SESSION_IMPL *session, /* For each entry in the in-memory page... */ WT_COL_FOREACH(page, cip, i) { ovfl_state = OVFL_IGNORE; - if ((cell = WT_COL_PTR(page, cip)) == NULL) { - nrepeat = 1; - ins = NULL; - orig_deleted = true; - } else { - __wt_cell_unpack(session, page, cell, vpack); - nrepeat = __wt_cell_rle(vpack); - ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip)); - - /* - * If the original value is "deleted", there's no value - * to compare, we're done. - */ - orig_deleted = vpack->type == WT_CELL_DEL; - if (orig_deleted) - goto record_loop; + cell = WT_COL_PTR(page, cip); + __wt_cell_unpack(session, page, cell, vpack); + nrepeat = __wt_cell_rle(vpack); + ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip)); - /* - * Overflow items are tricky: we don't know until we're - * finished processing the set of values if we need the - * overflow value or not. If we don't use the overflow - * item at all, we have to discard it from the backing - * file, otherwise we'll leak blocks on the checkpoint. - * That's safe because if the backing overflow value is - * still needed by any running transaction, we'll cache - * a copy in the update list. 
- * - * Regardless, we avoid copying in overflow records: if - * there's a WT_INSERT entry that modifies a reference - * counted overflow record, we may have to write copies - * of the overflow record, and in that case we'll do the - * comparisons, but we don't read overflow items just to - * see if they match records on either side. - */ - if (vpack->ovfl) { - ovfl_state = OVFL_UNUSED; - goto record_loop; - } + /* + * If the original value is "deleted", there's no value + * to compare, we're done. + */ + orig_deleted = vpack->type == WT_CELL_DEL; + if (orig_deleted) + goto record_loop; - /* - * If data is Huffman encoded, we have to decode it in - * order to compare it with the last item we saw, which - * may have been an update string. This guarantees we - * find every single pair of objects we can RLE encode, - * including applications updating an existing record - * where the new value happens (?) to match a Huffman- - * encoded value in a previous or next record. - */ - WT_ERR(__wt_dsk_cell_data_ref( - session, WT_PAGE_COL_VAR, vpack, orig)); + /* + * Overflow items are tricky: we don't know until we're + * finished processing the set of values if we need the + * overflow value or not. If we don't use the overflow + * item at all, we have to discard it from the backing + * file, otherwise we'll leak blocks on the checkpoint. + * That's safe because if the backing overflow value is + * still needed by any running transaction, we'll cache + * a copy in the update list. + * + * Regardless, we avoid copying in overflow records: if + * there's a WT_INSERT entry that modifies a reference + * counted overflow record, we may have to write copies + * of the overflow record, and in that case we'll do the + * comparisons, but we don't read overflow items just to + * see if they match records on either side. 
+ */ + if (vpack->ovfl) { + ovfl_state = OVFL_UNUSED; + goto record_loop; } + /* + * If data is Huffman encoded, we have to decode it in + * order to compare it with the last item we saw, which + * may have been an update string. This guarantees we + * find every single pair of objects we can RLE encode, + * including applications updating an existing record + * where the new value happens (?) to match a Huffman- + * encoded value in a previous or next record. + */ + WT_ERR(__wt_dsk_cell_data_ref( + session, WT_PAGE_COL_VAR, vpack, orig)); + record_loop: /* * Generate on-page entries: loop repeat records, looking for * WT_INSERT entries matching the record number. The WT_INSERT @@ -4363,6 +4374,7 @@ record_loop: /* for (n = 0; n < nrepeat; n += repeat_count, src_recno += repeat_count) { start_ts = vpack->start_ts; + durable_ts = newest_durable_ts; stop_ts = vpack->stop_ts; upd = NULL; if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) { @@ -4378,9 +4390,11 @@ record_loop: /* * the page. */ start_ts = WT_TS_NONE; + durable_ts = WT_TS_NONE; stop_ts = WT_TS_MAX; } else { start_ts = upd_select.start_ts; + durable_ts = upd_select.durable_ts; stop_ts = upd_select.stop_ts; } ins = WT_SKIP_NEXT(ins); @@ -4471,8 +4485,8 @@ record_loop: /* if (rle != 0) { WT_ERR(__rec_col_var_helper( session, r, salvage, - last.value, - last.start_ts, last.stop_ts, + last.value, last.start_ts, + durable_ts, last.stop_ts, rle, last.deleted, false)); rle = 0; } @@ -4480,8 +4494,8 @@ record_loop: /* last.value->data = vpack->data; last.value->size = vpack->size; WT_ERR(__rec_col_var_helper(session, r, - salvage, - last.value, start_ts, stop_ts, + salvage, last.value, + start_ts, durable_ts, stop_ts, repeat_count, false, true)); /* Track if page has overflow items. 
*/ @@ -4534,8 +4548,8 @@ compare: /* continue; } WT_ERR(__rec_col_var_helper(session, r, salvage, - last.value, last.start_ts, last.stop_ts, - rle, last.deleted, false)); + last.value, last.start_ts, durable_ts, + last.stop_ts, rle, last.deleted, false)); } /* @@ -4623,9 +4637,11 @@ compare: /* * tombstone on the page. */ start_ts = WT_TS_NONE; + durable_ts = WT_TS_NONE; stop_ts = WT_TS_MAX; } else { start_ts = upd_select.start_ts; + durable_ts = upd_select.durable_ts; stop_ts = upd_select.stop_ts; } while (src_recno <= n) { @@ -4665,11 +4681,13 @@ compare: /* * the page. */ start_ts = WT_TS_NONE; + durable_ts = WT_TS_NONE; stop_ts = WT_TS_MAX; deleted = true; } else { start_ts = upd_select.start_ts; + durable_ts = upd_select.durable_ts; stop_ts = upd_select.stop_ts; switch (upd->type) { @@ -4714,8 +4732,8 @@ compare: /* goto next; } WT_ERR(__rec_col_var_helper(session, r, salvage, - last.value, last.start_ts, last.stop_ts, - rle, last.deleted, false)); + last.value, last.start_ts, durable_ts, + last.stop_ts, rle, last.deleted, false)); } /* @@ -4762,8 +4780,9 @@ next: if (src_recno == UINT64_MAX) /* If we were tracking a record, write it. */ if (rle != 0) - WT_ERR(__rec_col_var_helper(session, r, salvage, last.value, - last.start_ts, last.stop_ts, rle, last.deleted, false)); + WT_ERR(__rec_col_var_helper(session, r, salvage, + last.value, last.start_ts, durable_ts, last.stop_ts, + rle, last.deleted, false)); /* Write the remnant page. 
*/ ret = __rec_split_finish(session, r); @@ -4789,7 +4808,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_KV *key, *val; WT_PAGE *child; WT_REF *ref; - wt_timestamp_t oldest_start_ts, newest_start_ts, newest_stop_ts; + wt_timestamp_t oldest_start_ts, newest_durable_ts, newest_stop_ts; size_t size; bool hazard, key_onpage_ovfl, ovfl_key; const void *p; @@ -4943,7 +4962,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __rec_cell_build_addr(session, r, addr, state == WT_CHILD_PROXY, WT_RECNO_OOB); oldest_start_ts = addr->oldest_start_ts; - newest_start_ts = addr->newest_start_ts; + newest_durable_ts = addr->newest_durable_ts; newest_stop_ts = addr->newest_stop_ts; } else { __wt_cell_unpack(session, page, ref->addr, vpack); @@ -4959,7 +4978,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) val->cell_len = 0; val->len = val->buf.size; oldest_start_ts = vpack->oldest_start_ts; - newest_start_ts = vpack->newest_start_ts; + newest_durable_ts = vpack->newest_durable_ts; newest_stop_ts = vpack->newest_stop_ts; } WT_CHILD_RELEASE_ERR(session, hazard, ref); @@ -5002,8 +5021,8 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Copy the key and value onto the page. */ __rec_image_copy(session, r, key); __rec_image_copy(session, r, val); - __rec_addr_ts_update( - r, oldest_start_ts, newest_start_ts, newest_stop_ts); + __rec_addr_ts_update(r, + oldest_start_ts, newest_durable_ts, newest_stop_ts); /* Update compression state. */ __rec_key_state_update(r, ovfl_key); @@ -5056,7 +5075,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __rec_image_copy(session, r, key); __rec_image_copy(session, r, val); __rec_addr_ts_update(r, addr->oldest_start_ts, - addr->newest_start_ts, addr->newest_stop_ts); + addr->newest_durable_ts, addr->newest_stop_ts); /* Update compression state. 
*/ __rec_key_state_update(r, ovfl_key); @@ -5091,8 +5110,9 @@ __rec_row_zero_len(WT_SESSION_IMPL *session, */ static int __rec_row_leaf(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) + WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage) { + WT_ADDR *addr; WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack; @@ -5103,10 +5123,11 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_IKEY *ikey; WT_INSERT *ins; WT_KV *key, *val; + WT_PAGE *page; WT_ROW *rip; WT_UPDATE *upd; WT_UPDATE_SELECT upd_select; - wt_timestamp_t start_ts, stop_ts; + wt_timestamp_t start_ts, durable_ts, newest_durable_ts, stop_ts; size_t size; uint64_t slvg_skip, txnid; uint32_t i; @@ -5116,12 +5137,27 @@ __rec_row_leaf(WT_SESSION_IMPL *session, btree = S2BT(session); cbt = &r->update_modify_cbt; + page = pageref->page; slvg_skip = salvage == NULL ? 0 : salvage->skip; key = &r->k; val = &r->v; vpack = &_vpack; + /* + * Acquire the newest-durable timestamp for this page so we can roll it + * forward. If it exists, it's in the WT_REF structure or the parent's + * disk image. + */ + if ((addr = pageref->addr) == NULL) + newest_durable_ts = WT_TS_NONE; + else if (__wt_off_page(pageref->home, addr)) + newest_durable_ts = addr->newest_durable_ts; + else { + __wt_cell_unpack(session, pageref->home, pageref->addr, vpack); + newest_durable_ts = vpack->newest_durable_ts; + } + WT_RET(__rec_split_init( session, r, page, 0, btree->maxleafpage_precomp)); @@ -5173,6 +5209,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, /* Unpack the on-page value cell, set the default timestamps. 
*/ __wt_row_leaf_value_cell(session, page, rip, NULL, vpack); start_ts = vpack->start_ts; + durable_ts = newest_durable_ts; stop_ts = vpack->stop_ts; txnid = WT_TXN_NONE; @@ -5180,9 +5217,10 @@ __rec_row_leaf(WT_SESSION_IMPL *session, WT_ERR(__rec_upd_select( session, r, NULL, rip, vpack, &upd_select)); if ((upd = upd_select.upd) != NULL) { - txnid = upd_select.txnid; start_ts = upd_select.start_ts; + durable_ts = upd_select.durable_ts; stop_ts = upd_select.stop_ts; + txnid = upd_select.txnid; } /* Build value cell. */ @@ -5450,7 +5488,7 @@ build: session, r, start_ts, stop_ts, 0, val)); __rec_image_copy(session, r, val); } - __rec_addr_ts_update(r, start_ts, start_ts, stop_ts); + __rec_addr_ts_update(r, start_ts, durable_ts, stop_ts); /* Update compression state. */ __rec_key_state_update(r, ovfl_key); @@ -5480,7 +5518,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_KV *key, *val; WT_UPDATE *upd; WT_UPDATE_SELECT upd_select; - wt_timestamp_t start_ts, stop_ts; + wt_timestamp_t start_ts, durable_ts, stop_ts; uint64_t txnid; bool ovfl_key, upd_saved; @@ -5494,9 +5532,10 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_RET(__rec_upd_select( session, r, ins, NULL, NULL, &upd_select)); upd = upd_select.upd; - txnid = upd_select.txnid; start_ts = upd_select.start_ts; + durable_ts = upd_select.durable_ts; stop_ts = upd_select.stop_ts; + txnid = upd_select.txnid; upd_saved = upd_select.upd_saved; if (upd == NULL) { @@ -5580,7 +5619,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) session, r, start_ts, stop_ts, 0, val)); __rec_image_copy(session, r, val); } - __rec_addr_ts_update(r, start_ts, start_ts, stop_ts); + __rec_addr_ts_update(r, start_ts, durable_ts, stop_ts); /* Update compression state. 
*/ __rec_key_state_update(r, ovfl_key); @@ -5837,7 +5876,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->wrapup_checkpoint_compressed)); __wt_checkpoint_tree_reconcile_update(session, r->multi->addr.oldest_start_ts, - r->multi->addr.newest_start_ts, + r->multi->addr.newest_durable_ts, r->multi->addr.newest_stop_ts); } @@ -6179,10 +6218,9 @@ __rec_cell_build_addr(WT_SESSION_IMPL *session, */ val->buf.data = addr->addr; val->buf.size = addr->size; - val->cell_len = __wt_cell_pack_addr(session, - &val->cell, cell_type, recno, - addr->oldest_start_ts, addr->newest_start_ts, addr->newest_stop_ts, - val->buf.size); + val->cell_len = __wt_cell_pack_addr( + session, &val->cell, cell_type, recno, addr->oldest_start_ts, + addr->newest_durable_ts, addr->newest_stop_ts, val->buf.size); val->len = val->cell_len + val->buf.size; } diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c index 9626cf51d13..f3ad28708c9 100644 --- a/src/third_party/wiredtiger/src/schema/schema_util.c +++ b/src/third_party/wiredtiger/src/schema/schema_util.c @@ -9,24 +9,19 @@ #include "wt_internal.h" /* - * __wt_schema_backup_check -- - * Check if a backup cursor is open and give an error if the schema - * operation will conflict. This is called after the schema operations - * have taken the schema lock so no hot backup cursor can be created until - * this is done. + * __schema_backup_check_int -- + * Helper for __wt_schema_backup_check. Intended to be called while + * holding the hot backup read lock. 
*/ -int -__wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) +static int +__schema_backup_check_int(WT_SESSION_IMPL *session, const char *name) { WT_CONNECTION_IMPL *conn; - WT_DECL_RET; int i; char **backup_list; conn = S2C(session); - if (!conn->hot_backup) - return (0); - __wt_readlock(session, &conn->hot_backup_lock); + /* * There is a window at the end of a backup where the list has been * cleared from the connection but the flag is still set. It is safe @@ -34,16 +29,34 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) */ if (!conn->hot_backup || (backup_list = conn->hot_backup_list) == NULL) { - __wt_readunlock(session, &conn->hot_backup_lock); return (0); } for (i = 0; backup_list[i] != NULL; ++i) { - if (strcmp(backup_list[i], name) == 0) { - ret = __wt_set_return(session, EBUSY); - break; - } + if (strcmp(backup_list[i], name) == 0) + return __wt_set_return(session, EBUSY); } - __wt_readunlock(session, &conn->hot_backup_lock); + + return (0); +} + +/* + * __wt_schema_backup_check -- + * Check if a backup cursor is open and give an error if the schema + * operation will conflict. This is called after the schema operations + * have taken the schema lock so no hot backup cursor can be created until + * this is done. 
+ */ +int +__wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C(session); + if (!conn->hot_backup) + return (0); + WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session, + ret = __schema_backup_check_int(session, name)); return (ret); } diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 4a7b50d72e1..3a21171d781 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -1137,7 +1137,9 @@ static const char * const __stats_connection_desc[] = { "transaction: transaction range of IDs currently pinned by named snapshots", "transaction: transaction range of timestamps currently pinned", "transaction: transaction range of timestamps pinned by a checkpoint", + "transaction: transaction range of timestamps pinned by the oldest active read timestamp", "transaction: transaction range of timestamps pinned by the oldest timestamp", + "transaction: transaction read timestamp of the oldest active reader", "transaction: transaction sync calls", "transaction: transactions committed", "transaction: transactions rolled back", @@ -1568,7 +1570,9 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing txn_pinned_snapshot_range */ /* not clearing txn_pinned_timestamp */ /* not clearing txn_pinned_timestamp_checkpoint */ + /* not clearing txn_pinned_timestamp_reader */ /* not clearing txn_pinned_timestamp_oldest */ + /* not clearing txn_timestamp_oldest_active_read */ stats->txn_sync = 0; stats->txn_commit = 0; stats->txn_rollback = 0; @@ -2167,8 +2171,12 @@ __wt_stat_connection_aggregate( to->txn_pinned_timestamp += WT_STAT_READ(from, txn_pinned_timestamp); to->txn_pinned_timestamp_checkpoint += WT_STAT_READ(from, txn_pinned_timestamp_checkpoint); + to->txn_pinned_timestamp_reader += + WT_STAT_READ(from, txn_pinned_timestamp_reader); to->txn_pinned_timestamp_oldest += 
WT_STAT_READ(from, txn_pinned_timestamp_oldest); + to->txn_timestamp_oldest_active_read += + WT_STAT_READ(from, txn_timestamp_oldest_active_read); to->txn_sync += WT_STAT_READ(from, txn_sync); to->txn_commit += WT_STAT_READ(from, txn_commit); to->txn_rollback += WT_STAT_READ(from, txn_rollback); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 9967dc3b2b3..c45afbf5730 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -1304,6 +1304,7 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_TXN_GLOBAL *txn_global; wt_timestamp_t checkpoint_timestamp; wt_timestamp_t commit_timestamp; + wt_timestamp_t oldest_active_read_timestamp; wt_timestamp_t pinned_timestamp; uint64_t checkpoint_pinned, snapshot_pinned; @@ -1329,6 +1330,21 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest, commit_timestamp - txn_global->oldest_timestamp); + if (__wt_txn_get_pinned_timestamp( + session, &oldest_active_read_timestamp, 0) == 0) { + WT_STAT_SET(session, stats, + txn_timestamp_oldest_active_read, + oldest_active_read_timestamp); + WT_STAT_SET(session, stats, + txn_pinned_timestamp_reader, + commit_timestamp - oldest_active_read_timestamp); + } else { + WT_STAT_SET(session, + stats, txn_timestamp_oldest_active_read, 0); + WT_STAT_SET(session, + stats, txn_pinned_timestamp_reader, 0); + } + WT_STAT_SET(session, stats, txn_pinned_snapshot_range, snapshot_pinned == WT_TXN_NONE ? 
0 : txn_global->current - snapshot_pinned); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index a1c700661ce..ced994cbb9b 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -1272,6 +1272,93 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len) } /* + * __checkpoint_lock_dirty_tree_int -- + * Helper for __checkpoint_lock_dirty_tree. Intended to be called while + * holding the hot backup lock. + */ +static int +__checkpoint_lock_dirty_tree_int( + WT_SESSION_IMPL *session, bool is_checkpoint, + bool force, WT_BTREE *btree, WT_CKPT *ckpt, WT_CKPT *ckptbase) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + WT_UNUSED(is_checkpoint); + conn = S2C(session); + + /* + * We can't delete checkpoints if a backup cursor is open. WiredTiger + * checkpoints are uniquely named and it's OK to have multiple of them + * in the system: clear the delete flag for them, and otherwise fail. + * Hold the lock until we're done (blocking hot backups from starting), + * we don't want to race with a future hot backup. + */ + if (conn->hot_backup) + WT_CKPT_FOREACH(ckptbase, ckpt) { + if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { + F_CLR(ckpt, WT_CKPT_DELETE); + continue; + } + WT_RET_MSG(session, EBUSY, + "checkpoint %s blocked by hot backup: it would" + "delete an existing checkpoint, and checkpoints " + "cannot be deleted during a hot backup", + ckpt->name); + } + /* + * Mark old checkpoints that are being deleted and figure out which + * trees we can skip in this checkpoint. + */ + WT_RET(__checkpoint_mark_skip(session, ckptbase, force)); + if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) + return (0); + /* + * Lock the checkpoints that will be deleted. + * + * Checkpoints are only locked when tracking is enabled, which covers + * checkpoint and drop operations, but not close. 
The reasoning is + * there should be no access to a checkpoint during close, because any + * thread accessing a checkpoint will also have the current file handle + * open. + */ + if (WT_META_TRACKING(session)) + WT_CKPT_FOREACH(ckptbase, ckpt) { + if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + /* + * We can't delete checkpoints referenced by a cursor. + * WiredTiger checkpoints are uniquely named and it's + * OK to have multiple in the system: clear the delete + * flag for them, and otherwise fail. + */ + ret = __wt_session_lock_checkpoint(session, ckpt->name); + if (ret == 0) + continue; + if (ret == EBUSY && + WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { + F_CLR(ckpt, WT_CKPT_DELETE); + continue; + } + WT_RET_MSG(session, ret, + "checkpoints cannot be dropped when in-use"); + } + /* + * There are special trees: those being bulk-loaded, salvaged, upgraded + * or verified during the checkpoint. They should never be part of a + * checkpoint: we will fail to lock them because the operations have + * exclusive access to the handles. Named checkpoints will fail in that + * case, ordinary checkpoints skip files that cannot be opened normally. + */ + WT_ASSERT(session, + !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); + + return (0); +} + +/* * __checkpoint_lock_dirty_tree -- * Decide whether the tree needs to be included in the checkpoint and if * so, acquire the necessary locks. @@ -1284,18 +1371,14 @@ __checkpoint_lock_dirty_tree(WT_SESSION_IMPL *session, WT_CKPT *ckpt, *ckptbase; WT_CONFIG dropconf; WT_CONFIG_ITEM cval, k, v; - WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; const char *name; char *name_alloc; - bool hot_backup_locked; btree = S2BT(session); - conn = S2C(session); ckpt = ckptbase = NULL; dhandle = session->dhandle; - hot_backup_locked = false; name_alloc = NULL; /* Only referenced in diagnostic builds. 
*/ @@ -1379,91 +1462,24 @@ __checkpoint_lock_dirty_tree(WT_SESSION_IMPL *session, F_SET(ckpt, WT_CKPT_ADD); /* - * We can't delete checkpoints if a backup cursor is open. WiredTiger - * checkpoints are uniquely named and it's OK to have multiple of them - * in the system: clear the delete flag for them, and otherwise fail. - * Hold the lock until we're done (blocking hot backups from starting), - * we don't want to race with a future hot backup. + * There is some interaction between backups and checkpoints. Perform + * all backup related operations that the checkpoint needs now, while + * holding the hot backup read lock. */ - __wt_readlock(session, &conn->hot_backup_lock); - hot_backup_locked = true; - if (conn->hot_backup) - WT_CKPT_FOREACH(ckptbase, ckpt) { - if (!F_ISSET(ckpt, WT_CKPT_DELETE)) - continue; - if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { - F_CLR(ckpt, WT_CKPT_DELETE); - continue; - } - WT_ERR_MSG(session, EBUSY, - "checkpoint %s blocked by hot backup: it would " - "delete an existing checkpoint, and checkpoints " - "cannot be deleted during a hot backup", - ckpt->name); - } - - /* - * Mark old checkpoints that are being deleted and figure out which - * trees we can skip in this checkpoint. - */ - WT_ERR(__checkpoint_mark_skip(session, ckptbase, force)); + WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session, + ret = __checkpoint_lock_dirty_tree_int( + session, is_checkpoint, force, btree, ckpt, ckptbase)); + WT_ERR(ret); if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) goto err; - /* - * Lock the checkpoints that will be deleted. - * - * Checkpoints are only locked when tracking is enabled, which covers - * checkpoint and drop operations, but not close. The reasoning is - * there should be no access to a checkpoint during close, because any - * thread accessing a checkpoint will also have the current file handle - * open. 
- */ - if (WT_META_TRACKING(session)) - WT_CKPT_FOREACH(ckptbase, ckpt) { - if (!F_ISSET(ckpt, WT_CKPT_DELETE)) - continue; - - /* - * We can't delete checkpoints referenced by a cursor. - * WiredTiger checkpoints are uniquely named and it's - * OK to have multiple in the system: clear the delete - * flag for them, and otherwise fail. - */ - ret = __wt_session_lock_checkpoint(session, ckpt->name); - if (ret == 0) - continue; - if (ret == EBUSY && - WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { - F_CLR(ckpt, WT_CKPT_DELETE); - continue; - } - WT_ERR_MSG(session, ret, - "checkpoints cannot be dropped when in-use"); - } - - /* - * There are special trees: those being bulk-loaded, salvaged, upgraded - * or verified during the checkpoint. They should never be part of a - * checkpoint: we will fail to lock them because the operations have - * exclusive access to the handles. Named checkpoints will fail in that - * case, ordinary checkpoints skip files that cannot be opened normally. - */ - WT_ASSERT(session, - !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); - - __wt_readunlock(session, &conn->hot_backup_lock); - WT_ASSERT(session, btree->ckpt == NULL && !F_ISSET(btree, WT_BTREE_SKIP_CKPT)); btree->ckpt = ckptbase; return (0); -err: if (hot_backup_locked) - __wt_readunlock(session, &conn->hot_backup_lock); - - __wt_meta_ckptlist_free(session, &ckptbase); +err: __wt_meta_ckptlist_free(session, &ckptbase); __wt_free(session, name_alloc); return (ret); @@ -1543,16 +1559,13 @@ __checkpoint_mark_skip( void __wt_checkpoint_tree_reconcile_update( WT_SESSION_IMPL *session, wt_timestamp_t oldest_start_ts, - wt_timestamp_t newest_start_ts, wt_timestamp_t newest_stop_ts) + wt_timestamp_t newest_durable_ts, wt_timestamp_t newest_stop_ts) { WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; btree = S2BT(session); - __wt_timestamp_addr_check(session, - oldest_start_ts, newest_start_ts, newest_stop_ts); - /* * Reconciliation just wrote a checkpoint, everything has been written. 
* Update the checkpoint with reconciliation information. The reason @@ -1564,7 +1577,7 @@ __wt_checkpoint_tree_reconcile_update( if (F_ISSET(ckpt, WT_CKPT_ADD)) { ckpt->write_gen = btree->write_gen; ckpt->oldest_start_ts = oldest_start_ts; - ckpt->newest_start_ts = newest_start_ts; + ckpt->newest_durable_ts = newest_durable_ts; ckpt->newest_stop_ts = newest_stop_ts; } } diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 7a502265602..50d24778ffb 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -8,12 +8,6 @@ #include "wt_internal.h" -/* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_TXN_TS_ALREADY_LOCKED 0x1u -#define WT_TXN_TS_INCLUDE_CKPT 0x2u -#define WT_TXN_TS_INCLUDE_OLDEST 0x4u -/* AUTOMATIC FLAG VALUE GENERATION STOP */ - /* * __wt_timestamp_to_string -- * Convert a timestamp to the MongoDB string representation. @@ -164,11 +158,11 @@ __txn_get_read_timestamp( } /* - * __txn_get_pinned_timestamp -- + * __wt_txn_get_pinned_timestamp -- * Calculate the current pinned timestamp. */ -static int -__txn_get_pinned_timestamp( +int +__wt_txn_get_pinned_timestamp( WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uint32_t flags) { WT_CONNECTION_IMPL *conn; @@ -289,10 +283,10 @@ __txn_global_query_timestamp( return (WT_NOTFOUND); ts = txn_global->oldest_timestamp; } else if (WT_STRING_MATCH("oldest_reader", cval.str, cval.len)) - WT_RET(__txn_get_pinned_timestamp( + WT_RET(__wt_txn_get_pinned_timestamp( session, &ts, WT_TXN_TS_INCLUDE_CKPT)); else if (WT_STRING_MATCH("pinned", cval.str, cval.len)) - WT_RET(__txn_get_pinned_timestamp(session, &ts, + WT_RET(__wt_txn_get_pinned_timestamp(session, &ts, WT_TXN_TS_INCLUDE_CKPT | WT_TXN_TS_INCLUDE_OLDEST)); else if (WT_STRING_MATCH("recovery", cval.str, cval.len)) /* Read-only value forever. No lock needed. 
*/ @@ -381,7 +375,7 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) return (0); /* Scan to find the global pinned timestamp. */ - if ((ret = __txn_get_pinned_timestamp( + if ((ret = __wt_txn_get_pinned_timestamp( session, &pinned_timestamp, WT_TXN_TS_INCLUDE_OLDEST)) != 0) return (ret == WT_NOTFOUND ? 0 : ret); @@ -397,7 +391,7 @@ __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) * Scan the global pinned timestamp again, it's possible that it got * changed after the previous scan. */ - if ((ret = __txn_get_pinned_timestamp(session, &pinned_timestamp, + if ((ret = __wt_txn_get_pinned_timestamp(session, &pinned_timestamp, WT_TXN_TS_ALREADY_LOCKED | WT_TXN_TS_INCLUDE_OLDEST)) != 0) { __wt_writeunlock(session, &txn_global->rwlock); return (ret == WT_NOTFOUND ? 0 : ret); @@ -636,8 +630,7 @@ __wt_txn_set_commit_timestamp( */ if (has_oldest_ts && commit_ts < oldest_ts) { __wt_timestamp_to_string(commit_ts, ts_string[0]); - __wt_timestamp_to_string( - oldest_ts, ts_string[1]); + __wt_timestamp_to_string(oldest_ts, ts_string[1]); WT_RET_MSG(session, EINVAL, "commit timestamp %s is less than the oldest " "timestamp %s", @@ -646,8 +639,7 @@ __wt_txn_set_commit_timestamp( if (has_stable_ts && commit_ts < stable_ts) { __wt_timestamp_to_string(commit_ts, ts_string[0]); - __wt_timestamp_to_string( - oldest_ts, ts_string[1]); + __wt_timestamp_to_string(stable_ts, ts_string[1]); WT_RET_MSG(session, EINVAL, "commit timestamp %s is less than the stable " "timestamp %s", @@ -746,7 +738,7 @@ __wt_txn_set_durable_timestamp( if (has_stable_ts && durable_ts < stable_ts) { __wt_timestamp_to_string(durable_ts, ts_string[0]); - __wt_timestamp_to_string(oldest_ts, ts_string[1]); + __wt_timestamp_to_string(stable_ts, ts_string[1]); WT_RET_MSG(session, EINVAL, "durable timestamp %s is less than the stable timestamp %s", ts_string[0], ts_string[1]); @@ -878,7 +870,7 @@ __wt_txn_set_read_timestamp( WT_TXN_GLOBAL *txn_global = 
&S2C(session)->txn_global; wt_timestamp_t ts_oldest; char ts_string[2][WT_TS_INT_STRING_SIZE]; - bool roundup_to_oldest; + bool did_roundup_to_oldest; WT_RET(__wt_txn_context_prepare_check(session)); @@ -896,45 +888,37 @@ __wt_txn_set_read_timestamp( " may only be set once per transaction"); /* - * The read timestamp could be rounded to the oldest timestamp. - */ - roundup_to_oldest = F_ISSET(txn, WT_TXN_TS_ROUND_READ); - - /* * This code is not using the timestamp validate function to * avoid a race between checking and setting transaction * timestamp. */ __wt_readlock(session, &txn_global->rwlock); ts_oldest = txn_global->oldest_timestamp; + did_roundup_to_oldest = false; if (read_ts < ts_oldest) { /* * If given read timestamp is earlier than oldest * timestamp then round the read timestamp to * oldest timestamp. */ - if (roundup_to_oldest) + if (F_ISSET(txn, WT_TXN_TS_ROUND_READ)) { txn->read_timestamp = ts_oldest; - else { - __wt_readunlock(session, &txn_global->rwlock); - __wt_timestamp_to_string(read_ts, ts_string[0]); - __wt_timestamp_to_string(ts_oldest, ts_string[1]); - WT_RET_MSG(session, EINVAL, "read timestamp " - "%s less than the oldest timestamp %s", - ts_string[0], ts_string[1]); + did_roundup_to_oldest = true; + } else { + __wt_readunlock(session, &txn_global->rwlock); + __wt_timestamp_to_string(read_ts, ts_string[0]); + __wt_timestamp_to_string(ts_oldest, ts_string[1]); + WT_RET_MSG(session, EINVAL, "read timestamp " + "%s less than the oldest timestamp %s", + ts_string[0], ts_string[1]); } - } else { + } else txn->read_timestamp = read_ts; - /* - * Reset to avoid a verbose message as read - * timestamp is not rounded to oldest timestamp. 
- */ - roundup_to_oldest = false; - } __wt_txn_publish_read_timestamp(session); __wt_readunlock(session, &txn_global->rwlock); - if (roundup_to_oldest && WT_VERBOSE_ISSET(session, WT_VERB_TIMESTAMP)) { + if (did_roundup_to_oldest && + WT_VERBOSE_ISSET(session, WT_VERB_TIMESTAMP)) { /* * This message is generated here to reduce the span of * critical section. |