From cc34fa291c577b580d8c3619cc787797a5bc9445 Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Mon, 19 Sep 2022 14:21:52 +1000 Subject: Import wiredtiger: b12072d37a533c8f57b65beb00020ba099e75505 from branch mongodb-6.1 ref: 3b7547744a..b12072d37a for: 6.1.0-rc3 WT-9805 Save the updates need to be deleted from history store and delete them later (#8262) --- src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_cursor.c | 7 + src/third_party/wiredtiger/src/history/hs_rec.c | 285 +++++++++++---------- src/third_party/wiredtiger/src/include/extern.h | 2 + src/third_party/wiredtiger/src/include/reconcile.h | 20 ++ .../wiredtiger/src/include/wt_internal.h | 2 + .../wiredtiger/src/reconcile/rec_visibility.c | 94 +++++-- .../wiredtiger/src/reconcile/rec_write.c | 10 + .../test/format/failure_configs/CONFIG.WT-9805 | 196 ++++++++++++++ .../wiredtiger/test/suite/test_prepare25.py | 2 +- .../wiredtiger/test/suite/test_prepare26.py | 134 ++++++++++ 11 files changed, 597 insertions(+), 157 deletions(-) create mode 100644 src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805 create mode 100644 src/third_party/wiredtiger/test/suite/test_prepare26.py diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index d1c4aa2b577..3e06c3247b2 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-6.1", - "commit": "3b7547744a48b59e8efee92083c4832a550e5340" + "commit": "b12072d37a533c8f57b65beb00020ba099e75505" } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 0d61b0350f3..40d9337c1d6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -612,6 +612,9 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_STAT_CONN_DATA_INCR(session, cursor_reset); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + /* Initialize the update value as we are not pointing to any value. */ + cbt->upd_value->type = WT_UPDATE_INVALID; + WT_TIME_WINDOW_INIT(&cbt->upd_value->tw); return (__cursor_reset(cbt)); } @@ -2205,6 +2208,10 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) cbt->modify_update = &cbt->_modify_update; cbt->upd_value = &cbt->_upd_value; + /* Initialize the value. */ + cbt->upd_value->type = WT_UPDATE_INVALID; + WT_TIME_WINDOW_INIT(&cbt->upd_value->tw); + #ifdef HAVE_DIAGNOSTIC cbt->lastkey = &cbt->_lastkey; cbt->lastrecno = WT_RECNO_OOB; diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c index 236f8856257..eb3697423be 100644 --- a/src/third_party/wiredtiger/src/history/hs_rec.c +++ b/src/third_party/wiredtiger/src/history/hs_rec.c @@ -60,65 +60,6 @@ __hs_verbose_cache_stats(WT_SESSION_IMPL *session, WT_BTREE *btree) cache->hs_verb_gen_write = ckpt_gen_current; } -/* - * __hs_delete_record -- - * Delete the update left in the history store - */ -static int -__hs_delete_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *key, - WT_UPDATE *delete_upd, WT_UPDATE *delete_tombstone) -{ - WT_DECL_RET; - bool hs_read_committed; -#ifdef HAVE_DIAGNOSTIC - WT_TIME_WINDOW *hs_tw; -#endif - - hs_read_committed = F_ISSET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - - /* No need to delete from the history store if it is already obsolete. */ - if (delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone)) { - ret = 0; - goto done; - } - - hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX); - WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true); - /* It's possible the value in the history store becomes obsolete concurrently. */ - if (ret == WT_NOTFOUND) { - WT_ASSERT( - session, delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone)); - ret = 0; - goto done; - } - -#ifdef HAVE_DIAGNOSTIC - __wt_hs_upd_time_window(hs_cursor, &hs_tw); - WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == delete_upd->txnid); - WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == delete_upd->start_ts); - WT_ASSERT(session, - hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == delete_upd->durable_ts); - if (delete_tombstone != NULL) { - WT_ASSERT(session, hs_tw->stop_txn == delete_tombstone->txnid); - WT_ASSERT(session, hs_tw->stop_ts == delete_tombstone->start_ts); - WT_ASSERT(session, hs_tw->durable_stop_ts == delete_tombstone->durable_ts); - } else - WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw)); -#endif - - WT_ERR(hs_cursor->remove(hs_cursor)); -done: - if (delete_tombstone != NULL) - F_CLR(delete_tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); - F_CLR(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); - -err: - if (!hs_read_committed) - F_CLR(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - return (ret); -} - /* * __hs_insert_record -- * A helper function to insert the record into the history store including stop time point. @@ -344,10 +285,46 @@ __hs_next_upd_full_value(WT_SESSION_IMPL *session, WT_UPDATE_VECTOR *updates, return (0); } +/* + * __hs_pack_key -- + * Pack the history store key + */ +static inline int +__hs_pack_key(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT_INSERT *ins, + WT_ROW *rip, WT_ITEM *key) +{ + WT_DECL_RET; + uint8_t *p; + + switch (r->page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = key->mem; + WT_RET(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(ins))); + key->size = WT_PTRDIFF(p, key->data); + break; + case WT_PAGE_ROW_LEAF: + if (ins == NULL) { + WT_WITH_BTREE( + session, btree, ret = __wt_row_leaf_key(session, r->page, rip, key, false)); + WT_RET(ret); + } else { + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + } + break; + default: + WT_RET(__wt_illegal_value(session, r->page->type)); + } + + return (ret); +} + /* * __wt_hs_insert_updates -- - * Copy one set of saved updates into the database's history store table. Whether the function - * fails or succeeds, if there is a successful write to history, cache_write_hs is set to true. + * Copy one set of saved updates into the database's history store table if they haven't been + * moved there. Whether the function fails or succeeds, if there is a successful write to + * history, cache_write_hs is set to true. */ int __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *multi) @@ -367,14 +344,12 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult WT_MODIFY entries[MAX_REVERSE_MODIFY_NUM]; WT_UPDATE_VECTOR updates; WT_SAVE_UPD *list; - WT_UPDATE *delete_tombstone, *delete_upd, *newest_hs, *no_ts_upd, *oldest_upd, *prev_upd, - *ref_upd, *tombstone, *upd; + WT_UPDATE *newest_hs, *no_ts_upd, *oldest_upd, *prev_upd, *ref_upd, *tombstone, *upd; WT_TIME_WINDOW tw; wt_off_t hs_size; uint64_t insert_cnt, max_hs_size, modify_cnt; uint64_t cache_hs_insert_full_update, cache_hs_insert_reverse_modify, cache_hs_write_squash; uint32_t i; - uint8_t *p; int nentries; bool enable_reverse_modify, error_on_ts_ordering, hs_inserted, squashed; @@ -407,65 +382,6 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult if (list->onpage_upd == NULL) continue; - /* History store table key component: source key. */ - switch (r->page->type) { - case WT_PAGE_COL_FIX: - case WT_PAGE_COL_VAR: - p = key->mem; - WT_ERR(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); - key->size = WT_PTRDIFF(p, key->data); - break; - case WT_PAGE_ROW_LEAF: - if (list->ins == NULL) { - WT_WITH_BTREE( - session, btree, ret = __wt_row_leaf_key(session, r->page, list->rip, key, false)); - WT_ERR(ret); - } else { - key->data = WT_INSERT_KEY(list->ins); - key->size = WT_INSERT_KEY_SIZE(list->ins); - } - break; - default: - WT_ERR(__wt_illegal_value(session, r->page->type)); - } - - no_ts_upd = newest_hs = NULL; - ref_upd = list->onpage_upd; - delete_tombstone = delete_upd = NULL; - - __wt_update_vector_clear(&updates); - - /* - * Reverse deltas are only supported on 'S' and 'u' value formats. - */ - enable_reverse_modify = - (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u")); - - /* - * Delete the update that is both on the update chain and the history store from the history - * store. Otherwise, we will trigger out of order fix when the update is inserted to the - * history store again. - */ - for (upd = list->onpage_tombstone != NULL ? list->onpage_tombstone : list->onpage_upd; - upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) { - WT_ASSERT_ALWAYS(session, F_ISSET(upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS), - "Attempting to remove an update from the history store in WiredTiger, but the " - "update was missing."); - if (upd->type == WT_UPDATE_TOMBSTONE) - delete_tombstone = upd; - else { - delete_upd = upd; - WT_ERR( - __hs_delete_record(session, hs_cursor, key, delete_upd, delete_tombstone)); - break; - } - } - } - /* Skip aborted updates. */ for (upd = list->onpage_upd->next; upd != NULL && upd->txnid == WT_TXN_ABORTED; upd = upd->next) @@ -479,6 +395,20 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult if (F_ISSET(upd, WT_UPDATE_HS)) continue; + /* History store table key component: source key. */ + WT_ERR(__hs_pack_key(session, btree, r, list->ins, list->rip, key)); + + no_ts_upd = newest_hs = NULL; + ref_upd = list->onpage_upd; + + __wt_update_vector_clear(&updates); + + /* + * Reverse deltas are only supported on 'S' and 'u' value formats. + */ + enable_reverse_modify = + (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u")); + /* * The algorithm assumes the oldest update on the update chain in memory is either a full * update or a tombstone. @@ -831,7 +761,7 @@ __wt_hs_delete_key(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btre { WT_DECL_RET; WT_ITEM hs_key; - wt_timestamp_t hs_ts; + wt_timestamp_t hs_start_ts; uint64_t hs_counter; uint32_t hs_btree_id; bool hs_read_all_flag; @@ -850,7 +780,7 @@ __wt_hs_delete_key(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btre ret = 0; goto done; } else { - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); ++hs_counter; } @@ -880,8 +810,8 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui WT_CURSOR_BTREE *hs_cbt; WT_DECL_RET; WT_ITEM hs_key, hs_value; - WT_TIME_WINDOW hs_insert_tw, tw, *twp; - wt_timestamp_t hs_ts; + WT_TIME_WINDOW hs_insert_tw, *twp; + wt_timestamp_t hs_durable_start_ts, hs_durable_stop_ts, hs_start_ts; uint64_t cache_hs_order_lose_durable_timestamp, cache_hs_order_reinsert, cache_hs_order_remove; uint64_t hs_counter, hs_upd_type; uint32_t hs_btree_id; @@ -958,7 +888,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui continue; /* We shouldn't have crossed the btree and user key search space. */ - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); WT_ASSERT(session, hs_btree_id == btree_id); #ifdef HAVE_DIAGNOSTIC WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); @@ -970,7 +900,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui * the cell. The cell's start timestamp can be cleared during reconciliation if it is * globally visible. */ - if (hs_ts >= ts || twp->stop_ts >= ts) + if (hs_start_ts >= ts || twp->stop_ts >= ts) break; } if (ret == WT_NOTFOUND) @@ -1026,7 +956,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui */ for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { /* We shouldn't have crossed the btree and user key search space. */ - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); WT_ASSERT(session, hs_btree_id == btree_id); #ifdef HAVE_DIAGNOSTIC WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); @@ -1044,7 +974,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui * by ignoring them. */ __wt_hs_upd_time_window(hs_cursor, &twp); - if (hs_ts < ts && twp->stop_ts < ts) + if (hs_start_ts < ts && twp->stop_ts < ts) continue; if (reinsert) { @@ -1098,10 +1028,10 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui /* Extract the underlying value for reinsertion. */ WT_ERR(hs_cursor->get_value( - hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value)); + hs_cursor, &hs_durable_stop_ts, &hs_durable_start_ts, &hs_upd_type, &hs_value)); /* Reinsert the update with corrected timestamps. */ - if (no_ts_tombstone && hs_ts == ts) + if (no_ts_tombstone && hs_start_ts == ts) *counter = hs_counter; /* Insert the value back with different timestamps. */ @@ -1132,3 +1062,92 @@ err: return (ret); } + +/* + * __hs_delete_record -- + * Delete an update from the history store if it is not obsolete + */ +static int +__hs_delete_record( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, WT_UPDATE *upd, WT_UPDATE *tombstone) +{ + WT_DECL_RET; + bool hs_read_committed; +#ifdef HAVE_DIAGNOSTIC + WT_TIME_WINDOW *hs_tw; +#endif + + if (r->hs_cursor == NULL) + WT_RET(__wt_curhs_open(session, NULL, &r->hs_cursor)); + hs_read_committed = F_ISSET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + /* Ensure we can see all the content in the history store. */ + F_SET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + + /* No need to delete from the history store if it is already obsolete. */ + if (tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone)) + goto done; + + r->hs_cursor->set_key(r->hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, r->hs_cursor), true); + /* It's possible the value in the history store becomes obsolete concurrently. */ + if (ret == WT_NOTFOUND) { + WT_ASSERT(session, tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone)); + ret = 0; + } else { +#ifdef HAVE_DIAGNOSTIC + __wt_hs_upd_time_window(r->hs_cursor, &hs_tw); + WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == upd->txnid); + WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == upd->start_ts); + WT_ASSERT(session, + hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == upd->durable_ts); + if (tombstone != NULL) { + WT_ASSERT(session, hs_tw->stop_txn == tombstone->txnid); + WT_ASSERT(session, hs_tw->stop_ts == tombstone->start_ts); + WT_ASSERT(session, hs_tw->durable_stop_ts == tombstone->durable_ts); + } else + WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw)); +#endif + + WT_ERR(r->hs_cursor->remove(r->hs_cursor)); + } +done: + if (tombstone != NULL) + F_CLR(tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); + F_CLR(upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); + +err: + if (!hs_read_committed) + F_CLR(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + return (ret); +} + +/* + * __wt_hs_delete_updates -- + * Delete the updates from the history store + */ +int +__wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BTREE *btree; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_DELETE_HS_UPD *delete_hs_upd; + uint32_t i; + + /* Nothing to delete from the history store. */ + if (r->delete_hs_upd == NULL) + return (0); + + btree = S2BT(session); + + WT_RET(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); + + for (delete_hs_upd = r->delete_hs_upd, i = 0; i < r->delete_hs_upd_next; ++delete_hs_upd, ++i) { + WT_ERR(__hs_pack_key(session, btree, r, delete_hs_upd->ins, delete_hs_upd->rip, key)); + WT_ERR(__hs_delete_record(session, r, key, delete_hs_upd->upd, delete_hs_upd->tombstone)); + } + +err: + __wt_scr_free(session, &key); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 94a89044eef..8ca84a40fae 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -795,6 +795,8 @@ extern int __wt_hs_config(WT_SESSION_IMPL *session, const char **cfg) extern int __wt_hs_delete_key(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id, const WT_ITEM *key, bool reinsert, bool error_on_ts_ordering) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index 64519f5de01..b644419c6de 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -64,6 +64,17 @@ struct __wt_rec_chunk { uint32_t auxentries; }; +/* + * WT_DELETE_HS_UPD -- + * Update that needs to be deleted from the history store. + */ +struct __wt_delete_hs_upd { + WT_INSERT *ins; /* Insert list reference */ + WT_ROW *rip; /* Original on-page reference */ + WT_UPDATE *upd; + WT_UPDATE *tombstone; +}; + /* * Reconciliation is the process of taking an in-memory page, walking each entry * in the page, building a backing disk image in a temporary buffer representing @@ -227,6 +238,15 @@ struct __wt_reconcile { size_t supd_allocated; size_t supd_memsize; /* Size of saved update structures */ + /* + * List of updates to be deleted from the history store. While reviewing updates for each page, + * we save the updates that needs to be deleted from history store here, and then delete them + * after we have built the disk image. + */ + WT_DELETE_HS_UPD *delete_hs_upd; /* Updates to delete from history store */ + uint32_t delete_hs_upd_next; + size_t delete_hs_upd_allocated; + /* List of pages we've written so far. */ WT_MULTI *multi; uint32_t multi_next; diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index f55cfb05e50..52bcd2d0204 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -187,6 +187,8 @@ struct __wt_data_handle; typedef struct __wt_data_handle WT_DATA_HANDLE; struct __wt_data_handle_cache; typedef struct __wt_data_handle_cache WT_DATA_HANDLE_CACHE; +struct __wt_delete_hs_upd; +typedef struct __wt_delete_hs_upd WT_DELETE_HS_UPD; struct __wt_dlh; typedef struct __wt_dlh WT_DLH; struct __wt_dsrc_stats; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 3e19e383ca2..f34884e6a65 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -19,7 +19,7 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ WT_SAVE_UPD *supd; WT_ASSERT_ALWAYS(session, onpage_upd != NULL || supd_restore, - "If nothing is committed the update chain must be restored"); + "If nothing is committed, the update chain must be restored"); WT_ASSERT_ALWAYS(session, onpage_upd == NULL || onpage_upd->type == WT_UPDATE_STANDARD || onpage_upd->type == WT_UPDATE_MODIFY, @@ -39,6 +39,27 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ return (0); } +/* + * __rec_delete_hs_upd_save -- + * Save an update into a WT_DELETE_HS_UPD list to delete it from the history store later. + */ +static inline int +__rec_delete_hs_upd_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, + WT_UPDATE *upd, WT_UPDATE *tombstone) +{ + WT_DELETE_HS_UPD *delete_hs_upd; + + WT_RET(__wt_realloc_def( + session, &r->delete_hs_upd_allocated, r->delete_hs_upd_next + 1, &r->delete_hs_upd)); + delete_hs_upd = &r->delete_hs_upd[r->delete_hs_upd_next]; + delete_hs_upd->ins = ins; + delete_hs_upd->rip = rip; + delete_hs_upd->upd = upd; + delete_hs_upd->tombstone = tombstone; + ++r->delete_hs_upd_next; + return (0); +} + /* * __rec_append_orig_value -- * Append the key's original value to its update list. It assumes that we have an onpage value, @@ -182,6 +203,44 @@ err: return (ret); } +/* + * __rec_find_and_save_delete_hs_upd -- + * Find and save the update that needs to be deleted from the history store later + */ +static int +__rec_find_and_save_delete_hs_upd(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, + WT_ROW *rip, WT_UPDATE_SELECT *upd_select) +{ + WT_UPDATE *delete_tombstone, *delete_upd; + + delete_tombstone = NULL; + + for (delete_upd = upd_select->tombstone != NULL ? upd_select->tombstone : upd_select->upd; + delete_upd != NULL; delete_upd = delete_upd->next) { + if (delete_upd->txnid == WT_TXN_ABORTED) + continue; + + if (F_ISSET(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS)) { + WT_ASSERT_ALWAYS(session, + F_ISSET(delete_upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS), + "Attempting to remove an update from the history store in WiredTiger, but the " + "update was missing."); + if (delete_upd->type == WT_UPDATE_TOMBSTONE) + delete_tombstone = delete_upd; + else { + WT_RET( + __rec_delete_hs_upd_save(session, r, ins, rip, delete_upd, delete_tombstone)); + break; + } + } + } + + WT_ASSERT_ALWAYS(session, delete_tombstone == NULL || delete_upd != NULL, + "If we delete a tombstone from the history store, we must also delete the update."); + + return (0); +} + /* * __rec_need_save_upd -- * Return if we need to save the update chain @@ -190,8 +249,6 @@ static inline bool __rec_need_save_upd( WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE_SELECT *upd_select, bool has_newer_updates) { - WT_UPDATE *upd; - if (upd_select->tw.prepare) return (true); @@ -202,15 +259,6 @@ __rec_need_save_upd( if (upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE) return (false); - /* Save the update chain to delete the update from the history store later. */ - for (upd = upd_select->upd; upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) - return (true); - } - /* * Don't save updates for any reconciliation that doesn't involve history store (in-memory * database, metadata, and history store reconciliation itself), except when the selected stop @@ -297,15 +345,10 @@ __rec_validate_upd_chain(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *s if (!F_ISSET(r, WT_REC_CHECKPOINT_RUNNING)) return (0); - for (upd = select_upd; upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - /* Cannot delete the update from history store when checkpoint is running. */ - if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) { - WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint); - return (EBUSY); - } + /* Cannot delete the update from history store when checkpoint is running. */ + if (r->delete_hs_upd_next > 0) { + WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint); + return (EBUSY); } /* @@ -795,6 +838,13 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W NULL : upd_select->upd; + /* + * If we have done a prepared rollback, we may have restored a history store value to the update + * chain but the same value is left in the history store. Save it to delete it from the history + * store later. + */ + WT_RET(__rec_find_and_save_delete_hs_upd(session, r, ins, rip, upd_select)); + /* Check the update chain for conditions that could prevent it's eviction. */ WT_RET(__rec_validate_upd_chain(session, r, onpage_upd, &upd_select->tw, vpack)); @@ -836,7 +886,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W /* Catch this case in diagnostic builds. */ WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_no_ts_checkpoint_race_3); WT_ASSERT(session, false); - WT_RET(EBUSY); + return (EBUSY); } /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 2c26d83adf7..d1f363a3e54 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -620,6 +620,9 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO r->supd_next = 0; r->supd_memsize = 0; + /* The list of updates to be deleted from the history store. */ + r->delete_hs_upd_next = 0; + /* The list of pages we've written. */ r->multi = NULL; r->multi_next = 0; @@ -772,6 +775,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) __wt_buf_free(session, &r->chunk_B.image); __wt_free(session, r->supd); + __wt_free(session, r->delete_hs_upd); __wt_rec_dictionary_free(session, r); @@ -2666,6 +2670,12 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* Flag as unused for non diagnostic builds. */ WT_UNUSED(btree); + /* + * Delete the updates left in the history store by prepared rollback first before moving updates + * to the history store. + */ + WT_ERR(__wt_hs_delete_updates(session, r)); + /* Check if there's work to do. */ for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) if (multi->supd != NULL) diff --git a/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805 b/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805 new file mode 100644 index 00000000000..34fd526585b --- /dev/null +++ b/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805 @@ -0,0 +1,196 @@ +############################################ +# RUN PARAMETERS: V3 +############################################ +assert.read_timestamp=0 +backup=0 +backup.incremental=off +backup.incr_granularity=11691 +block_cache=0 +block_cache.cache_on_checkpoint=0 +block_cache.cache_on_writes=1 +block_cache.size=62 +btree.huffman_value=0 +cache=2178 +cache.evict_max=2 +cache.minimum=20 +checkpoint=on +checkpoint.log_size=200 +checkpoint.wait=31 +disk.data_extend=0 +disk.direct_io=0 +disk.encryption=rotn-7 +disk.mmap=1 +disk.mmap_all=0 +format.abort=0 +format.independent_thread_rng=1 +format.major_timeout=0 +import=0 +logging=0 +logging.compression=none +logging.file_max=64891 +logging.prealloc=0 +logging.remove=1 +ops.alter=0 +ops.compaction=0 +ops.hs_cursor=1 +ops.prepare=1 +ops.random_cursor=0 +ops.salvage=0 +ops.verify=1 +quiet=1 +runs.in_memory=0 +runs.ops=0 +runs.rows=1000000 +runs.tables=5 +runs.threads=32 +runs.timer=26 +runs.verify_failure_dump=0 +statistics=0 +statistics.server=0 +stress.aggressive_sweep=0 +stress.checkpoint=0 +stress.checkpoint_evict_page=0 +stress.checkpoint_reserved_txnid_delay=0 +stress.checkpoint_prepare=0 +stress.evict_reposition=0 +stress.failpoint_eviction_fail_after_reconciliation=1 +stress.failpoint_hs_delete_key_from_ts=0 +stress.hs_checkpoint_delay=0 +stress.hs_search=0 +stress.hs_sweep=0 +stress.split_1=0 +stress.split_2=0 +stress.split_3=0 +stress.split_4=0 +stress.split_5=0 +stress.split_6=0 +stress.split_7=0 +transaction.implicit=0 +transaction.timestamps=1 +wiredtiger.config=off +wiredtiger.rwlock=1 +wiredtiger.leak_memory=0 +############################################ +# TABLE PARAMETERS: table 1 +############################################ +table1.btree.compression=snappy +table1.btree.dictionary=0 +table1.btree.internal_key_truncation=1 +table1.btree.internal_page_max=9 +table1.btree.leaf_page_max=10 +table1.btree.memory_page_max=1 +table1.btree.repeat_data_pct=47 +table1.btree.split_pct=77 +table1.btree.value_max=3212 +table1.btree.value_min=0 +table1.disk.checksum=unencrypted +table1.disk.firstfit=0 +table1.ops.pct.delete=55 +table1.ops.pct.insert=42 +table1.ops.pct.modify=3 +table1.ops.pct.read=0 +table1.ops.pct.write=0 +table1.ops.truncate=1 +table1.runs.mirror=0 +table1.runs.source=table +table1.runs.type=variable-length column-store +############################################ +# TABLE PARAMETERS: table 2 +############################################ +table2.btree.compression=zstd +table2.btree.dictionary=0 +table2.btree.internal_key_truncation=1 +table2.btree.internal_page_max=13 +table2.btree.key_max=46 +table2.btree.key_min=25 +table2.btree.leaf_page_max=14 +table2.btree.memory_page_max=6 +table2.btree.prefix_len=0 +table2.btree.prefix_compression=1 +table2.btree.prefix_compression_min=8 +table2.btree.reverse=0 +table2.btree.split_pct=69 +table2.btree.value_max=3706 +table2.btree.value_min=13 +table2.disk.checksum=off +table2.disk.firstfit=0 +table2.ops.pct.delete=72 +table2.ops.pct.insert=0 +table2.ops.pct.modify=5 +table2.ops.pct.read=1 +table2.ops.pct.write=22 +table2.ops.truncate=1 +table2.runs.mirror=0 +table2.runs.source=table +table2.runs.type=row-store +############################################ +# TABLE PARAMETERS: table 3 +############################################ +table3.btree.bitcnt=3 +table3.btree.compression=zlib +table3.btree.internal_key_truncation=1 +table3.btree.internal_page_max=17 +table3.btree.leaf_page_max=11 +table3.btree.memory_page_max=10 +table3.btree.split_pct=81 +table3.disk.checksum=on +table3.disk.firstfit=0 +table3.ops.pct.delete=9 +table3.ops.pct.insert=44 +table3.ops.pct.modify=14 +table3.ops.pct.read=32 +table3.ops.pct.write=1 +table3.ops.truncate=1 +table3.runs.mirror=0 +table3.runs.source=table +table3.runs.type=fixed-length column-store +############################################ +# TABLE PARAMETERS: table 4 +############################################ +table4.btree.compression=zlib +table4.btree.dictionary=1 +table4.btree.internal_key_truncation=1 +table4.btree.internal_page_max=14 +table4.btree.key_max=30 +table4.btree.key_min=27 +table4.btree.leaf_page_max=14 +table4.btree.memory_page_max=9 +table4.btree.prefix_len=67 +table4.btree.prefix_compression=1 +table4.btree.prefix_compression_min=5 +table4.btree.reverse=0 +table4.btree.split_pct=88 +table4.btree.value_max=120 +table4.btree.value_min=20 +table4.disk.checksum=on +table4.disk.firstfit=0 +table4.ops.pct.delete=4 +table4.ops.pct.insert=41 +table4.ops.pct.modify=0 +table4.ops.pct.read=2 +table4.ops.pct.write=53 +table4.ops.truncate=1 +table4.runs.mirror=0 +table4.runs.source=file +table4.runs.type=row-store +############################################ +# TABLE PARAMETERS: table 5 +############################################ +table5.btree.bitcnt=2 +table5.btree.compression=snappy +table5.btree.internal_key_truncation=1 +table5.btree.internal_page_max=11 +table5.btree.leaf_page_max=14 +table5.btree.memory_page_max=7 +table5.btree.split_pct=97 +table5.disk.checksum=on +table5.disk.firstfit=0 +table5.ops.pct.delete=19 +table5.ops.pct.insert=2 +table5.ops.pct.modify=71 +table5.ops.pct.read=7 +table5.ops.pct.write=1 +table5.ops.truncate=1 +table5.runs.mirror=0 +table5.runs.source=table +table5.runs.type=fixed-length column-store diff --git a/src/third_party/wiredtiger/test/suite/test_prepare25.py b/src/third_party/wiredtiger/test/suite/test_prepare25.py index 95fce47181d..e512c3c7fef 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare25.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare25.py @@ -31,7 +31,7 @@ from wtscenario import make_scenarios # test_prepare25.py # Test prepare rollback and then prepare commit with failed eviction. -class test_prepare23(wttest.WiredTigerTestCase): +class test_prepare25(wttest.WiredTigerTestCase): conn_config = 'timing_stress_for_test=[failpoint_eviction_fail_after_reconciliation]' format_values = [ diff --git a/src/third_party/wiredtiger/test/suite/test_prepare26.py b/src/third_party/wiredtiger/test/suite/test_prepare26.py new file mode 100644 index 00000000000..52e08cedff6 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_prepare26.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtscenario import make_scenarios + +# test_prepare26.py +# Test prepare rollback and then delete the key. +class test_prepare26(wttest.WiredTigerTestCase): + format_values = [ + ('column', dict(key_format='r', value_format='S')), + ('column_fix', dict(key_format='r', value_format='8t')), + ('row_integer', dict(key_format='i', value_format='S')), + ] + + scenarios = make_scenarios(format_values) + + def test_prepare26(self): + uri = "table:test_prepare26" + self.session.create(uri, 'key_format=' + self.key_format + ',value_format=' + self.value_format) + + if self.value_format == '8t': + value_a = 97 + value_b = 98 + value_c = 99 + else: + value_a = "a" + value_b = "b" + value_c = "c" + + # Pin oldest timestamp to 1 + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1)) + + # Insert a value + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_a + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + + # Do a prepared update + self.session.begin_transaction() + cursor[1] = value_c + self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20)) + + # Evict the page + session2 = self.conn.open_session() + evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)') + session2.begin_transaction('ignore_prepare=true,read_timestamp=' + self.timestamp_str(10)) + self.assertEquals(evict_cursor[1], value_a) + evict_cursor.reset() + evict_cursor.close() + session2.rollback_transaction() + + # Rollback the prepared transaction + self.session.rollback_transaction() + + # Delete the key + self.session.begin_transaction() + cursor.set_key(1) + cursor.remove() + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30)) + + # Set oldest timestamp to 30 + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(30)) + + # Evict the page again + evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)') + session2.begin_transaction() + evict_cursor.set_key(1) + if self.value_format == '8t': + self.assertEquals(evict_cursor[1], 0) + else: + evict_cursor.set_key(1) + self.assertEquals(evict_cursor.search(), wiredtiger.WT_NOTFOUND) + evict_cursor.reset() + evict_cursor.close() + session2.rollback_transaction() + + # Do another update + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_b + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(40)) + + # Do another update + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_c + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(50)) + + # Evict the page again + evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)') + session2.begin_transaction('read_timestamp=' + self.timestamp_str(50)) + self.assertEquals(evict_cursor[1], value_c) + evict_cursor.reset() + evict_cursor.close() + session2.rollback_transaction() + + # Verify we read nothing at the oldest + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(30)) + if self.value_format == '8t': + self.assertEquals(cursor[1], 0) + else: + cursor.set_key(1) + self.assertEquals(cursor.search(), wiredtiger.WT_NOTFOUND) + self.session.rollback_transaction() + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1